Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -37,6 +37,7 @@
 FunctionPass *createSITypeRewriter();
 FunctionPass *createSIAnnotateControlFlowPass();
 FunctionPass *createSIFoldOperandsPass();
+FunctionPass *createSIPeepholeSDWAPass();
 FunctionPass *createSILowerI1CopiesPass();
 FunctionPass *createSIShrinkInstructionsPass();
 FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
@@ -58,6 +59,9 @@
 void initializeSIFoldOperandsPass(PassRegistry &);
 extern char &SIFoldOperandsID;
 
+void initializeSIPeepholeSDWAPass(PassRegistry &);
+extern char &SIPeepholeSDWAID;
+
 void initializeSIShrinkInstructionsPass(PassRegistry&);
 extern char &SIShrinkInstructionsID;
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -93,6 +93,11 @@
   cl::init(false), cl::Hidden);
 
+static cl::opt<bool> EnableSDWAPeephole(
+  "amdgpu-sdwa-peephole",
+  cl::desc("Enable SDWA peepholer"),
+  cl::init(false));
+
 extern "C" void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -103,6 +108,7 @@
   initializeSIFixSGPRCopiesPass(*PR);
   initializeSIFixVGPRCopiesPass(*PR);
   initializeSIFoldOperandsPass(*PR);
+  initializeSIPeepholeSDWAPass(*PR);
   initializeSIShrinkInstructionsPass(*PR);
   initializeSIFixControlFlowLiveIntervalsPass(*PR);
   initializeSILoadStoreOptimizerPass(*PR);
@@ -664,6 +670,10 @@
 void GCNPassConfig::addPreRegAlloc() {
   addPass(createSIShrinkInstructionsPass());
+  if (EnableSDWAPeephole) {
+    addPass(&SIPeepholeSDWAID);
+    addPass(&DeadMachineInstructionElimID);
+  }
   addPass(createSIWholeQuadModePass());
 }
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -88,6 +88,7 @@
   SIMachineFunctionInfo.cpp
   SIMachineScheduler.cpp
   SIOptimizeExecMasking.cpp
+  SIPeepholeSDWA.cpp
   SIRegisterInfo.cpp
   SIShrinkInstructions.cpp
   SITypeRewriter.cpp
Index: lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.h
+++ lib/Target/AMDGPU/SIInstrInfo.h
@@ -761,6 +761,9 @@
   int getVOPe32(uint16_t Opcode);
 
   LLVM_READONLY
+  int getSDWAOp(uint16_t Opcode);
+
+  LLVM_READONLY
   int getCommuteRev(uint16_t Opcode);
 
   LLVM_READONLY
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -1441,6 +1441,15 @@
   let ValueCols = [["4", "0"]];
 }
 
+// Maps ordinary instructions to their SDWA counterparts
+def getSDWAOp : InstrMapping {
+  let FilterClass = "VOP";
+  let RowFields = ["OpName"];
+  let ColFields = ["AsmVariantName"];
+  let KeyCol = ["Default"];
+  let ValueCols = [["SDWA"]];
+}
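+// From this mapping TableGen emits int AMDGPU::getSDWAOp(uint16_t Opcode),
+// declared in SIInstrInfo.h above. A typical query, sketched with an example
+// opcode (illustration only):
+//   if (AMDGPU::getSDWAOp(AMDGPU::V_ADD_I32_e32) != -1) {
+//     // An SDWA form (V_ADD_I32_sdwa) exists; -1 means no SDWA variant.
+//   }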
+
 def getMaskedMIMGOp : InstrMapping {
   let FilterClass = "MIMG_Mask";
   let RowFields = ["Op"];
Index: lib/Target/AMDGPU/SIPeepholeSDWA.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -0,0 +1,533 @@
+//===-- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass tries to apply several peephole SDWA patterns.
+///
+/// E.g. original:
+///   V_LSHRREV_B32_e32 %vreg0, 16, %vreg1
+///   V_ADD_I32_e32 %vreg2, %vreg0, %vreg3
+///   V_LSHLREV_B32_e32 %vreg4, 16, %vreg2
+///
+/// Replace:
+///   V_ADD_I32_sdwa %vreg4, %vreg1, %vreg3
+///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include <unordered_map>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-peephole-sdwa"
+
+STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
+STATISTIC(NumSDWAInstructionsPeepholed,
+          "Number of instructions converted to SDWA.");
+
+namespace {
+
+class SDWAOperand;
+
+class SIPeepholeSDWA : public MachineFunctionPass {
+private:
+  MachineRegisterInfo *MRI;
+  const SIRegisterInfo *TRI;
+  const SIInstrInfo *TII;
+
+  std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
+
+public:
+  static char ID;
+
+  typedef SmallVector<std::unique_ptr<SDWAOperand>, 4> SDWAOperandsVector;
+
+  SIPeepholeSDWA() : MachineFunctionPass(ID) {
+    initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  void matchSDWAOperands(MachineBasicBlock &MBB);
+  bool isConvertibleToSDWA(const MachineInstr &MI);
+  void convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
+
+  StringRef getPassName() const override { return "SI Peephole SDWA"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+class SDWAOperand {
+private:
+  MachineOperand *Target; // Operand that would be used in converted instruction
+  MachineOperand *Replaced; // Operand that would be replaced by Target
+
+public:
+  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
+      : Target(TargetOp), Replaced(ReplacedOp) {
+    assert(Target->isReg());
+    assert(Replaced->isReg());
+  }
+
+  virtual ~SDWAOperand() {}
+
+  virtual SmallVector<MachineInstr *, 4>
+  potentialsToConvert(const SIInstrInfo *TII) = 0;
+  virtual void convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;
+
+  MachineOperand *getTargetOperand() const { return Target; }
+  MachineOperand *getReplacedOperand() const { return Replaced; }
+  MachineInstr *getParentInst() const { return Target->getParent(); }
+  MachineRegisterInfo *getMRI() const {
+    return &getParentInst()->getParent()->getParent()->getRegInfo();
+  }
+};
+
+using namespace AMDGPU::SDWA;
+
+class SDWASrcOperand : public SDWAOperand {
+private:
+  SdwaSel SrcSel;
+
+public:
+  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
+                 SdwaSel SrcSel_ = DWORD)
+      : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_) {}
+
+  virtual SmallVector<MachineInstr *, 4>
+  potentialsToConvert(const SIInstrInfo *TII) override;
+  virtual void convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
+
+  SdwaSel getSrcSel() const { return SrcSel; }
+};
+
+class SDWADstOperand : public SDWAOperand {
+private:
+  SdwaSel DstSel;
+  DstUnused DstUn;
+
+public:
+  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
+                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
+      : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}
+
+  virtual SmallVector<MachineInstr *, 4>
+  potentialsToConvert(const SIInstrInfo *TII) override;
+  virtual void convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
+
+  SdwaSel getDstSel() const { return DstSel; }
+  DstUnused getDstUnused() const { return DstUn; }
+};
+
+} // End anonymous namespace.
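+
+// A short primer on the SDWA operands used above (illustrative sketch, not
+// code from this patch): the *_sel values choose which byte or word of a
+// 32-bit source is read, and which part of the destination is written. For
+// the example in the file comment,
+//   V_ADD_I32_sdwa %vreg4, %vreg1, %vreg3
+//       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// behaves roughly like this pseudo-C:
+//   uint32_t s0 = src0 >> 16;  // src0_sel:WORD_1 reads the high word
+//   uint32_t s1 = src1;        // src1_sel:DWORD reads the whole dword
+//   uint32_t r  = s0 + s1;
+//   dst = (r & 0xffff) << 16;  // dst_sel:WORD_1 writes the high word and
+//                              // dst_unused:UNUSED_PAD zero-fills the rest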
+
+INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)
+
+char SIPeepholeSDWA::ID = 0;
+
+char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID;
+
+FunctionPass *llvm::createSIPeepholeSDWAPass() {
+  return new SIPeepholeSDWA();
+}
+
+#ifndef NDEBUG
+
+static raw_ostream& operator<<(raw_ostream &OS, const SdwaSel &Sel) {
+  switch(Sel) {
+  case BYTE_0: OS << "BYTE_0"; break;
+  case BYTE_1: OS << "BYTE_1"; break;
+  case BYTE_2: OS << "BYTE_2"; break;
+  case BYTE_3: OS << "BYTE_3"; break;
+  case WORD_0: OS << "WORD_0"; break;
+  case WORD_1: OS << "WORD_1"; break;
+  case DWORD:  OS << "DWORD";  break;
+  }
+  return OS;
+}
+
+static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
+  switch(Un) {
+  case UNUSED_PAD:      OS << "UNUSED_PAD";      break;
+  case UNUSED_SEXT:     OS << "UNUSED_SEXT";     break;
+  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
+  }
+  return OS;
+}
+
+static raw_ostream& operator<<(raw_ostream &OS, const SDWASrcOperand &Src) {
+  OS << "SDWA src: " << *Src.getTargetOperand()
+     << " src_sel:" << Src.getSrcSel() << '\n';
+  return OS;
+}
+
+static raw_ostream& operator<<(raw_ostream &OS, const SDWADstOperand &Dst) {
+  OS << "SDWA dst: " << *Dst.getTargetOperand()
+     << " dst_sel:" << Dst.getDstSel()
+     << " dst_unused:" << Dst.getDstUnused() << '\n';
+  return OS;
+}
+
+#endif
+
+static bool isSameBB(const MachineInstr *FirstMI, const MachineInstr *SecondMI) {
+  assert(FirstMI && SecondMI);
+  const MachineBasicBlock *FirstBB = FirstMI->getParent();
+  const MachineBasicBlock *SecondBB = SecondMI->getParent();
+  assert(FirstBB && SecondBB);
+  int FirstBBNum = FirstBB->getNumber();
+  int SecondBBNum = SecondBB->getNumber();
+  return FirstBBNum >= 0 && SecondBBNum >= 0 && FirstBBNum == SecondBBNum;
+}
+
+static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
+  assert(To.isReg() && From.isReg());
+  To.setReg(From.getReg());
+  To.setSubReg(From.getSubReg());
+  To.setIsUndef(From.isUndef());
+  if (To.isUse()) {
+    To.setIsKill(From.isKill());
+  } else {
+    To.setIsDead(From.isDead());
+  }
+}
+
+static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
+  return LHS.isReg() &&
+         RHS.isReg() &&
+         LHS.getReg() == RHS.getReg() &&
+         LHS.getSubReg() == RHS.getSubReg();
+}
+
+static bool isSubregOf(const MachineOperand &SubReg,
+                       const MachineOperand &SuperReg,
+                       const TargetRegisterInfo *TRI) {
+  if (!SuperReg.isReg() || !SubReg.isReg())
+    return false;
+
+  if (isSameReg(SuperReg, SubReg))
+    return true;
+
+  if (SuperReg.getReg() != SubReg.getReg())
+    return false;
+
+  LaneBitmask::Type SuperMask =
+      TRI->getSubRegIndexLaneMask(SuperReg.getSubReg()).getAsInteger();
+  LaneBitmask::Type SubMask =
+      TRI->getSubRegIndexLaneMask(SubReg.getSubReg()).getAsInteger();
+  return TRI->regmaskSubsetEqual(&SubMask, &SuperMask);
+}
+
+SmallVector<MachineInstr *, 4>
+SDWASrcOperand::potentialsToConvert(const SIInstrInfo *TII) {
+  // For an SDWA src operand, the potential instructions are those that use
+  // the register defined by the parent instruction.
+  MachineRegisterInfo *MRI = getMRI();
+  MachineOperand *Replaced = getReplacedOperand();
+  assert(Replaced->isReg());
+
+  for (MachineOperand &PotentialMO : MRI->use_operands(Replaced->getReg())) {
+    // If this is a use of another subreg of the dst reg then do nothing.
+    if (!isSubregOf(*Replaced, PotentialMO, MRI->getTargetRegisterInfo()))
+      continue;
+
+    // Check that this instruction and ParentMI are in the same basic block
+    // and that this is a use of the same subreg.
+    if (isSameBB(PotentialMO.getParent(), getParentInst()) &&
+        isSameReg(PotentialMO, *Replaced)) {
+      return SmallVector<MachineInstr *, 4>(1, PotentialMO.getParent());
+    }
+
+    // If there exists a use of dst in another basic block, or a use of a
+    // superreg of dst, then we should not combine this operand.
+    break;
+  }
+
+  return SmallVector<MachineInstr *, 4>();
+}
+
+void SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
+  // Find the operand in the instruction that matches the source operand,
+  // replace it with the target operand and set the corresponding src_sel.
+  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
+  assert(Src && Src->isReg());
+  if (!isSameReg(*Src, *getReplacedOperand())) {
+    // If this is not src0 then it should be src1.
+    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
+
+    assert(Src && Src->isReg());
+
+    if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
+         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
+        !isSameReg(*Src, *getReplacedOperand())) {
+      // In case of v_mac_f16/32_sdwa this pass can try to apply the src
+      // operand to src2. This is not allowed.
+      return;
+    }
+
+    assert(isSameReg(*Src, *getReplacedOperand()) && SrcSel);
+  }
+  copyRegOperand(*Src, *getTargetOperand());
+  SrcSel->setImm(getSrcSel());
+}
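+
+// Illustration of the src pattern handled above, in the notation of the file
+// comment (vreg numbers invented):
+//   V_LSHRREV_B32_e32 %vreg0, 16, %vreg1    ; matched as an SDWASrcOperand
+//   V_ADD_I32_e32 %vreg2, %vreg0, %vreg3
+// becomes, once the use of %vreg0 is rewritten (the now-dead shift is left
+// to the DeadMachineInstructionElim run scheduled right after this pass):
+//   V_ADD_I32_sdwa %vreg2, %vreg1, %vreg3
+//       dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD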
+
+SmallVector<MachineInstr *, 4>
+SDWADstOperand::potentialsToConvert(const SIInstrInfo *TII) {
+  // For an SDWA dst operand, the potential instruction is the one that
+  // defines the register that this operand uses.
+  MachineRegisterInfo *MRI = getMRI();
+  MachineInstr *ParentMI = getParentInst();
+  MachineOperand *Replaced = getReplacedOperand();
+  assert(Replaced->isReg());
+
+  for (MachineInstr &PotentialMI : MRI->def_instructions(Replaced->getReg())) {
+    MachineOperand *PotentialMO =
+        TII->getNamedOperand(PotentialMI, AMDGPU::OpName::vdst);
+    if (!PotentialMO || !PotentialMO->isReg())
+      break;
+
+    if (!isSubregOf(*Replaced, *PotentialMO, MRI->getTargetRegisterInfo()))
+      continue;
+
+    if (isSameBB(getParentInst(), &PotentialMI) &&
+        isSameReg(*Replaced, *PotentialMO)) {
+      // Check that ParentMI is the only instruction that uses the replaced
+      // register.
+      for (MachineOperand &UseMO : MRI->use_operands(PotentialMO->getReg())) {
+        if (isSubregOf(UseMO, *PotentialMO, MRI->getTargetRegisterInfo()) &&
+            !UseMO.getParent()->isIdenticalTo(*ParentMI)) {
+          return SmallVector<MachineInstr *, 4>();
+        }
+      }
+      return SmallVector<MachineInstr *, 4>(1, &PotentialMI);
+    }
+
+    break;
+  }
+  return SmallVector<MachineInstr *, 4>();
+}
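+
+// The mirror-image dst pattern, handled by convertToSDWA below (again with
+// invented vreg numbers):
+//   V_MUL_F16_e32 %vreg2, %vreg0, %vreg1
+//   V_LSHLREV_B32_e32 %vreg3, 16, %vreg2    ; matched as an SDWADstOperand
+// becomes:
+//   V_MUL_F16_sdwa %vreg3, %vreg0, %vreg1
+//       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// The shift is erased in convertToSDWA below; the original mul is erased by
+// SIPeepholeSDWA::convertToSDWA once its SDWA replacement has been built.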
+
+void SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
+  // Replace the vdst operand in MI with the target operand and set dst_sel
+  // and dst_unused.
+  if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
+       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
+      getDstSel() != AMDGPU::SDWA::DWORD) {
+    // v_mac_f16/32_sdwa allows dst_sel to be equal only to DWORD.
+    return;
+  }
+
+  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+  assert(Operand &&
+         Operand->isReg() &&
+         isSameReg(*Operand, *getReplacedOperand()));
+  copyRegOperand(*Operand, *getTargetOperand());
+  MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
+  assert(DstSel);
+  DstSel->setImm(getDstSel());
+  MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
+  assert(DstUnused);
+  DstUnused->setImm(getDstUnused());
+
+  // Remove the original instruction because it would conflict with our new
+  // instruction by register definition.
+  getParentInst()->eraseFromParent();
+}
+
+void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
+  for (MachineInstr &MI : MBB) {
+    unsigned Opcode = MI.getOpcode();
+    switch (Opcode) {
+    case AMDGPU::V_LSHRREV_B32_e32: {
+      // from: v_lshrrev_b32_e32 v1, 16, v0
+      // to SDWA src: v0 src_sel:WORD_1
+      MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+      if (!Src0->isImm() || Src0->getImm() != 16)
+        break;
+
+      MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+
+      auto SDWASrc = make_unique<SDWASrcOperand>(Src1, Dst, WORD_1);
+      DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
+      SDWAOperands[&MI] = std::move(SDWASrc);
+      ++NumSDWAPatternsFound;
+      break;
+    }
+
+    case AMDGPU::V_LSHLREV_B32_e32: {
+      // from: v_lshlrev_b32_e32 v1, 16, v0
+      // to SDWA dst: v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD
+      MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+      if (!Src0->isImm() || Src0->getImm() != 16)
+        break;
+
+      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+      MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+
+      auto SDWADst = make_unique<SDWADstOperand>(Dst, Src1, WORD_1, UNUSED_PAD);
+      DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
+      SDWAOperands[&MI] = std::move(SDWADst);
+      ++NumSDWAPatternsFound;
+      break;
+    }
+    }
+  }
+}
+
+bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI) {
+  // Check if this instruction can be converted to SDWA:
+  // 1. Does this opcode support SDWA?
+  if (AMDGPU::getSDWAOp(MI.getOpcode()) == -1)
+    return false;
+
+  // 2. Are all operands VGPRs?
+  for (const MachineOperand &Operand : MI.explicit_operands()) {
+    if (!Operand.isReg() || !TRI->isVGPR(*MRI, Operand.getReg()))
+      return false;
+  }
+
+  return true;
+}
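+
+// Operand order that SIPeepholeSDWA::convertToSDWA below builds positionally
+// for a two-source SDWA instruction (a sketch derived from the asserts in
+// that function):
+//   V_OP_sdwa vdst, src0_modifiers, src0, src1_modifiers, src1,
+//             (src2 - v_mac_f16/32 only,) clamp,
+//             dst_sel, dst_unused, src0_sel, src1_sel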
+
+void SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
+                                   const SDWAOperandsVector &SDWAOperands) {
+  int SDWAOpcode = AMDGPU::getSDWAOp(MI.getOpcode());
+  assert(SDWAOpcode != -1);
+
+  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);
+
+  // Create the SDWA version of instruction MI and initialize its operands.
+  MachineInstrBuilder SDWAInst =
+      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc);
+
+  // Copy dst; if it is present in the original then it should also be
+  // present in the SDWA form.
+  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+  if (Dst) {
+    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
+    SDWAInst.add(*Dst);
+  } else {
+    assert(TII->isVOPC(MI));
+  }
+
+  // Copy src0 and initialize src0_modifiers. All SDWA instructions have src0
+  // and src0_modifiers (except for v_nop_sdwa, but it can't get here).
+  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+  assert(
+      Src0 &&
+      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 &&
+      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1);
+  SDWAInst.addImm(0);
+  SDWAInst.add(*Src0);
+
+  // Copy src1 if present, initialize src1_modifiers.
+  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+  if (Src1) {
+    assert(
+        AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 &&
+        AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1);
+    SDWAInst.addImm(0);
+    SDWAInst.add(*Src1);
+  } else {
+    assert(TII->isVOP1(MI));
+  }
+
+  if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
+      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
+    // v_mac_f16/32 has an additional src2 operand tied to vdst.
+    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+    assert(Src2);
+    SDWAInst.add(*Src2);
+  }
+
+  // Initialize clamp.
+  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1);
+  SDWAInst.addImm(0);
+
+  // Initialize dst_sel and dst_unused if present.
+  if (Dst) {
+    assert(
+        AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1 &&
+        AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1);
+    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+    SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
+  }
+
+  // Initialize src0_sel.
+  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1);
+  SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+
+  // Initialize src1_sel if present.
+  if (Src1) {
+    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1);
+    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+  }
+
+  // Apply all SDWA operand patterns.
+  for (auto &Operand : SDWAOperands) {
+    Operand->convertToSDWA(*SDWAInst, TII);
+  }
+
+  DEBUG(dbgs() << "Convert instruction:" << MI
+               << "Into:" << *SDWAInst << '\n');
+  ++NumSDWAInstructionsPeepholed;
+
+  MI.eraseFromParent();
+}
+
+bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+
+  if (!ST.hasSDWA() ||
+      !AMDGPU::isVI(ST)) { // TODO: Add support for SDWA on gfx9.
+    return false;
+  }
+
+  MRI = &MF.getRegInfo();
+  TRI = ST.getRegisterInfo();
+  TII = ST.getInstrInfo();
+
+  std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;
+
+  // FIXME: For now we only combine instructions in one basic block.
+  for (MachineBasicBlock &MBB : MF) {
+    SDWAOperands.clear();
+    matchSDWAOperands(MBB);
+
+    PotentialMatches.clear();
+    for (auto &OperandPair : SDWAOperands) {
+      auto &Operand = OperandPair.second;
+      for (MachineInstr *PotentialMI : Operand->potentialsToConvert(TII)) {
+        PotentialMatches[PotentialMI].push_back(std::move(Operand));
+      }
+    }
+
+    for (auto &PotentialPair : PotentialMatches) {
+      MachineInstr &PotentialMI = *PotentialPair.first;
+      if (isConvertibleToSDWA(PotentialMI)) {
+        convertToSDWA(PotentialMI, PotentialPair.second);
+      }
+    }
+  }
+  return false;
+}
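+
+// Usage note (a sketch, not part of the pass): the peephole is off by
+// default; it is enabled with the flag added in AMDGPUTargetMachine.cpp, e.g.
+//   llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole < input.ll
+// where input.ll is a placeholder for any suitable module.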
Index: lib/Target/AMDGPU/VOPInstructions.td
===================================================================
--- lib/Target/AMDGPU/VOPInstructions.td
+++ lib/Target/AMDGPU/VOPInstructions.td
@@ -318,7 +318,7 @@
   let SDWA = 1;
   let Uses = [EXEC];
 
-  let SubtargetPredicate = HasSDWA;
+  let SubtargetPredicate = !if(P.HasExt, HasSDWA, DisableInst);
   let AssemblerPredicate = !if(P.HasExt, HasSDWA, DisableInst);
   let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.SDWA,
                            AMDGPUAsmVariants.Disable);
Index: test/CodeGen/AMDGPU/sdwa-peephole.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -0,0 +1,274 @@
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=fiji --amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SDWA -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}add_shr_i32:
+; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_add_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v[[DST]]
+; NOSDWA-NOT: v_add_i32_sdwa
+
+; SDWA: v_add_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+
+define void @add_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %a = load i32, i32 addrspace(1)* %in, align 4
+  %shr = lshr i32 %a, 16
+  %add = add i32 %a, %shr
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}sub_shr_i32:
+; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_subrev_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v[[DST]]
+; NOSDWA-NOT: v_subrev_i32_sdwa
+
+; SDWA: v_subrev_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+
+define void @sub_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %a = load i32, i32 addrspace(1)* %in, align 4
+  %shr = lshr i32 %a, 16
+  %sub = sub i32 %shr, %a
+  store i32 %sub, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_shr_i32:
+; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v[[DST1]], v[[DST0]]
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+
+define void @mul_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in1, i32 addrspace(1)* %in2) {
+  %a = load i32, i32 addrspace(1)* %in1, align 4
+  %b = load i32, i32 addrspace(1)* %in2, align 4
+  %shra = lshr i32 %a, 16
+  %shrb = lshr i32 %b, 16
+  %mul = mul i32 %shra, %shrb
+  store i32 %mul, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_i16:
+; NOSDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+; SDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA-NOT: v_mul_u32_u24_sdwa
+
+define void @mul_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %ina, i16 addrspace(1)* %inb) {
+entry:
+  %a = load i16, i16 addrspace(1)* %ina, align 4
+  %b = load i16, i16 addrspace(1)* %inb, align 4
+  %mul = mul i16 %a, %b
+  store i16 %mul, i16 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_v2i16:
+; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v[[DST_MUL:[0-9]+]], v[[DST1]], v[[DST0]]
+; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]]
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL]], v{{[0-9]+}}
+
+define void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
+entry:
+  %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
+  %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
+  %mul = mul <2 x i16> %a, %b
+  store <2 x i16> %mul, <2 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_v4i16:
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL1]], v{{[0-9]+}}
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL0]], v{{[0-9]+}}
+
+define void @mul_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %ina, <4 x i16> addrspace(1)* %inb) {
+entry:
+  %a = load <4 x i16>, <4 x i16> addrspace(1)* %ina, align 4
+  %b = load <4 x i16>, <4 x i16> addrspace(1)* %inb, align 4
+  %mul = mul <4 x i16> %a, %b
+  store <4 x i16> %mul, <4 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_v8i16:
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL1]], v{{[0-9]+}}
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL0]], v{{[0-9]+}}
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL3]], v{{[0-9]+}}
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL2]], v{{[0-9]+}}
+
+define void @mul_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %ina, <8 x i16> addrspace(1)* %inb) {
+entry:
+  %a = load <8 x i16>, <8 x i16> addrspace(1)* %ina, align 4
+  %b = load <8 x i16>, <8 x i16> addrspace(1)* %inb, align 4
+  %mul = mul <8 x i16> %a, %b
+  store <8 x i16> %mul, <8 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_half:
+; NOSDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_f16_sdwa
+; SDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA-NOT: v_mul_f16_sdwa
+
+define void @mul_half(half addrspace(1)* %out, half addrspace(1)* %ina, half addrspace(1)* %inb) {
+entry:
+  %a = load half, half addrspace(1)* %ina, align 4
+  %b = load half, half addrspace(1)* %inb, align 4
+  %mul = fmul half %a, %b
+  store half %mul, half addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_v2half:
+; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_mul_f16_e32 v[[DST_MUL:[0-9]+]], v[[DST1]], v[[DST0]]
+; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]]
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_f16_sdwa
+
+; SDWA: v_mul_f16_sdwa v[[DST_MUL:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL]], v{{[0-9]+}}
+
+define void @mul_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) {
+entry:
+  %a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4
+  %b = load <2 x half>, <2 x half> addrspace(1)* %inb, align 4
+  %mul = fmul <2 x half> %a, %b
+  store <2 x half> %mul, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_v4half:
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_f16_sdwa
+
+; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+define void @mul_v4half(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %ina, <4 x half> addrspace(1)* %inb) {
+entry:
+  %a = load <4 x half>, <4 x half> addrspace(1)* %ina, align 4
+  %b = load <4 x half>, <4 x half> addrspace(1)* %inb, align 4
+  %mul = fmul <4 x half> %a, %b
+  store <4 x half> %mul, <4 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_v8half:
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_f16_sdwa
+
+; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+define void @mul_v8half(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %ina, <8 x half> addrspace(1)* %inb) {
+entry:
+  %a = load <8 x half>, <8 x half> addrspace(1)* %ina, align 4
+  %b = load <8 x half>, <8 x half> addrspace(1)* %inb, align 4
+  %mul = fmul <8 x half> %a, %b
+  store <8 x half> %mul, <8 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+
+; GCN-LABEL: {{^}}mac_v2half:
+; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_mac_f16_e32 v[[DST_MAC:[0-9]+]], v[[DST1]], v[[DST0]]
+; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]]
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
+; NOSDWA-NOT: v_mac_f16_sdwa
+
+; SDWA: v_mac_f16_sdwa v[[DST_MAC:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]]
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
+
+define void @mac_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) {
+entry:
+  %a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4
+  %b = load <2 x half>, <2 x half> addrspace(1)* %inb, align 4
+  %mul = fmul <2 x half> %a, %b
+  %mac = fadd <2 x half> %mul, %b
+  store <2 x half> %mac, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}immediate_mul_v2i16:
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+; SDWA-NOT: v_mul_u32_u24_sdwa
+
+define void @immediate_mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+entry:
+  %a = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4
+  %mul = mul <2 x i16> %a, <i16 123, i16 321>
+  store <2 x i16> %mul, <2 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_add_v2i16:
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+; NOSDWA-NOT: v_add_i32_sdwa
+; SDWA-NOT: v_mul_u32_u24_sdwa
+; SDWA-NOT: v_add_i32_sdwa
+
+define void @mul_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb, i1 addrspace(1)* %incond) {
+entry:
+  %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
+  %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
+  %cond = load i1, i1 addrspace(1)* %incond, align 4
+  br i1 %cond, label %mul_label, label %add_label
+mul_label:
+  %mul = mul <2 x i16> %a, %b
+  br label %store_label
+add_label:
+  %add = add <2 x i16> %a, %b
+  br label %store_label
+store_label:
+  %store = phi <2 x i16> [%mul, %mul_label], [%add, %add_label]
+  store <2 x i16> %store, <2 x i16> addrspace(1)* %out, align 4
+  ret void
+}
\ No newline at end of file