Index: llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
@@ -37,6 +37,7 @@
 FunctionPass *createSITypeRewriter();
 FunctionPass *createSIAnnotateControlFlowPass();
 FunctionPass *createSIFoldOperandsPass();
+FunctionPass *createSIPeepholeSDWAPass();
 FunctionPass *createSILowerI1CopiesPass();
 FunctionPass *createSIShrinkInstructionsPass();
 FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
@@ -58,6 +59,9 @@
 void initializeSIFoldOperandsPass(PassRegistry &);
 extern char &SIFoldOperandsID;
 
+void initializeSIPeepholeSDWAPass(PassRegistry &);
+extern char &SIPeepholeSDWAID;
+
 void initializeSIShrinkInstructionsPass(PassRegistry&);
 extern char &SIShrinkInstructionsID;
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -94,6 +94,11 @@
   cl::init(false), cl::Hidden);
 
+static cl::opt<bool> EnableSDWAPeephole(
+  "amdgpu-sdwa-peephole",
+  cl::desc("Enable SDWA peepholer"),
+  cl::init(false));
+
 // Enable address space based alias analysis
 static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
   cl::desc("Enable AMDGPU Alias Analysis"),
@@ -109,6 +114,7 @@
   initializeSIFixSGPRCopiesPass(*PR);
   initializeSIFixVGPRCopiesPass(*PR);
   initializeSIFoldOperandsPass(*PR);
+  initializeSIPeepholeSDWAPass(*PR);
   initializeSIShrinkInstructionsPass(*PR);
   initializeSIFixControlFlowLiveIntervalsPass(*PR);
   initializeSILoadStoreOptimizerPass(*PR);
@@ -683,6 +689,10 @@
 void GCNPassConfig::addPreRegAlloc() {
   addPass(createSIShrinkInstructionsPass());
+  if (EnableSDWAPeephole) {
+    addPass(&SIPeepholeSDWAID);
+    addPass(&DeadMachineInstructionElimID);
+  }
   addPass(createSIWholeQuadModePass());
 }
Index: llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt
+++ llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt
@@ -89,6 +89,7 @@
   SIMachineFunctionInfo.cpp
   SIMachineScheduler.cpp
   SIOptimizeExecMasking.cpp
+  SIPeepholeSDWA.cpp
   SIRegisterInfo.cpp
   SIShrinkInstructions.cpp
   SITypeRewriter.cpp
Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
@@ -769,6 +769,9 @@
   int getVOPe32(uint16_t Opcode);
 
   LLVM_READONLY
+  int getSDWAOp(uint16_t Opcode);
+
+  LLVM_READONLY
   int getCommuteRev(uint16_t Opcode);
 
   LLVM_READONLY
Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1441,6 +1441,15 @@
   let ValueCols = [["4", "0"]];
 }
 
+// Maps ordinary instructions to their SDWA counterparts
+def getSDWAOp : InstrMapping {
+  let FilterClass = "VOP";
+  let RowFields = ["OpName"];
+  let ColFields = ["AsmVariantName"];
+  let KeyCol = ["Default"];
+  let ValueCols = [["SDWA"]];
+}
+
 def getMaskedMIMGOp : InstrMapping {
   let FilterClass = "MIMG_Mask";
   let RowFields = ["Op"];
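Note on the getSDWAOp mapping above: TableGen turns an InstrMapping into a
generated lookup table plus the AMDGPU::getSDWAOp() helper declared in
SIInstrInfo.h, keyed here by OpName with the SDWA asm variant as the value
column. A minimal sketch of how the new pass consumes it, mirroring the guard
in SIPeepholeSDWA::convertToSDWA further down:

    // Query the generated table; -1 means the opcode has no _sdwa form.
    int SDWAOpcode = AMDGPU::getSDWAOp(MI.getOpcode());
    if (SDWAOpcode == -1)
      return false;
    const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode); // e.g. V_ADD_I32_sdwa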
Index: llvm/trunk/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -0,0 +1,692 @@
+//===-- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass tries to apply several peephole SDWA patterns.
+///
+/// E.g. original:
+///   V_LSHRREV_B32_e32 %vreg0, 16, %vreg1
+///   V_ADD_I32_e32 %vreg2, %vreg0, %vreg3
+///   V_LSHLREV_B32_e32 %vreg4, 16, %vreg2
+///
+/// Replace:
+///   V_ADD_I32_sdwa %vreg4, %vreg1, %vreg3
+///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include <unordered_map>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-peephole-sdwa"
+
+STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
+STATISTIC(NumSDWAInstructionsPeepholed,
+          "Number of instructions converted to SDWA.");
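A quick reminder of the SDWA (Sub-Dword Addressing) fields used throughout
this file: src0_sel/src1_sel pick which byte or word of a 32-bit source the
ALU reads, dst_sel picks which byte or word of the destination is written, and
dst_unused controls the remaining destination bits (UNUSED_PAD fills them with
zeros). A rough paraphrase of the example in the header comment (semantics
summarized for illustration, not quoted from the ISA manual):

    ; V_ADD_I32_sdwa %vreg4, %vreg1, %vreg3
    ;     dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
    ; reads  src0 = %vreg1[31:16], src1 = %vreg3[31:0]
    ; writes %vreg4[31:16] = result[15:0], %vreg4[15:0] = 0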
+namespace {
+
+class SDWAOperand;
+
+class SIPeepholeSDWA : public MachineFunctionPass {
+private:
+  MachineRegisterInfo *MRI;
+  const SIRegisterInfo *TRI;
+  const SIInstrInfo *TII;
+
+  std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
+
+public:
+  static char ID;
+
+  typedef SmallVector<std::unique_ptr<SDWAOperand>, 4> SDWAOperandsVector;
+
+  SIPeepholeSDWA() : MachineFunctionPass(ID) {
+    initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  void matchSDWAOperands(MachineBasicBlock &MBB);
+  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
+
+  StringRef getPassName() const override { return "SI Peephole SDWA"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+class SDWAOperand {
+private:
+  MachineOperand *Target;   // Operand that will be used in the converted
+                            // instruction
+  MachineOperand *Replaced; // Operand that will be replaced by Target
+
+public:
+  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
+      : Target(TargetOp), Replaced(ReplacedOp) {
+    assert(Target->isReg());
+    assert(Replaced->isReg());
+  }
+
+  virtual ~SDWAOperand() {}
+
+  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
+  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;
+
+  MachineOperand *getTargetOperand() const { return Target; }
+  MachineOperand *getReplacedOperand() const { return Replaced; }
+  MachineInstr *getParentInst() const { return Target->getParent(); }
+  MachineRegisterInfo *getMRI() const {
+    return &getParentInst()->getParent()->getParent()->getRegInfo();
+  }
+};
+
+using namespace AMDGPU::SDWA;
+
+class SDWASrcOperand : public SDWAOperand {
+private:
+  SdwaSel SrcSel;
+  bool Abs;
+  bool Neg;
+  bool Sext;
+
+public:
+  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
+                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
+                 bool Sext_ = false)
+      : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_),
+        Neg(Neg_), Sext(Sext_) {}
+
+  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
+  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
+
+  SdwaSel getSrcSel() const { return SrcSel; }
+  bool getAbs() const { return Abs; }
+  bool getNeg() const { return Neg; }
+  bool getSext() const { return Sext; }
+
+  uint64_t getSrcMods() const;
+};
+
+class SDWADstOperand : public SDWAOperand {
+private:
+  SdwaSel DstSel;
+  DstUnused DstUn;
+
+public:
+  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
+                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
+      : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}
+
+  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
+  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
+
+  SdwaSel getDstSel() const { return DstSel; }
+  DstUnused getDstUnused() const { return DstUn; }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)
+
+char SIPeepholeSDWA::ID = 0;
+
+char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID;
+
+FunctionPass *llvm::createSIPeepholeSDWAPass() {
+  return new SIPeepholeSDWA();
+}
+
+#ifndef NDEBUG
+
+static raw_ostream& operator<<(raw_ostream &OS, const SdwaSel &Sel) {
+  switch(Sel) {
+  case BYTE_0: OS << "BYTE_0"; break;
+  case BYTE_1: OS << "BYTE_1"; break;
+  case BYTE_2: OS << "BYTE_2"; break;
+  case BYTE_3: OS << "BYTE_3"; break;
+  case WORD_0: OS << "WORD_0"; break;
+  case WORD_1: OS << "WORD_1"; break;
+  case DWORD:  OS << "DWORD";  break;
+  }
+  return OS;
+}
+
+static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
+  switch(Un) {
+  case UNUSED_PAD: OS << "UNUSED_PAD"; break;
+  case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
+  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
+  }
+  return OS;
+}
+
+static raw_ostream& operator<<(raw_ostream &OS, const SDWASrcOperand &Src) {
+  OS << "SDWA src: " << *Src.getTargetOperand()
+     << " src_sel:" << Src.getSrcSel()
+     << " abs:" << Src.getAbs() << " neg:" << Src.getNeg()
+     << " sext:" << Src.getSext() << '\n';
+  return OS;
+}
+
+static raw_ostream& operator<<(raw_ostream &OS, const SDWADstOperand &Dst) {
+  OS << "SDWA dst: " << *Dst.getTargetOperand()
+     << " dst_sel:" << Dst.getDstSel()
+     << " dst_unused:" << Dst.getDstUnused() << '\n';
+  return OS;
+}
+
+#endif
+
+static bool isSameBB(const MachineInstr *FirstMI, const MachineInstr *SecondMI) {
+  assert(FirstMI && SecondMI);
+  return FirstMI->getParent() == SecondMI->getParent();
+}
+
+static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
+  assert(To.isReg() && From.isReg());
+  To.setReg(From.getReg());
+  To.setSubReg(From.getSubReg());
+  To.setIsUndef(From.isUndef());
+  if (To.isUse()) {
+    To.setIsKill(From.isKill());
+  } else {
+    To.setIsDead(From.isDead());
+  }
+}
+
+static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
+  return LHS.isReg() &&
+         RHS.isReg() &&
+         LHS.getReg() == RHS.getReg() &&
+         LHS.getSubReg() == RHS.getSubReg();
+}
+
+static bool isSubregOf(const MachineOperand &SubReg,
+                       const MachineOperand &SuperReg,
+                       const TargetRegisterInfo *TRI) {
+
+  if (!SuperReg.isReg() || !SubReg.isReg())
+    return false;
+
+  if (isSameReg(SuperReg, SubReg))
+    return true;
+
+  if (SuperReg.getReg() != SubReg.getReg())
+    return false;
+
+  LaneBitmask::Type SuperMask =
+      TRI->getSubRegIndexLaneMask(SuperReg.getSubReg()).getAsInteger();
+  LaneBitmask::Type SubMask =
+      TRI->getSubRegIndexLaneMask(SubReg.getSubReg()).getAsInteger();
+  return TRI->regmaskSubsetEqual(&SubMask, &SuperMask);
+}
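isSubregOf compares lane masks rather than raw subregister indices, so it
treats a register as a subreg of itself and handles nested indices uniformly.
An illustration with invented lane-mask values (the real masks come from the
target description, these are only for the example):

    // assume %0:vreg_64 with sub0 -> mask 0x3 and sub1 -> mask 0xC
    // isSubregOf(%0.sub0, %0)      == true   (0x3 is a subset of the full mask)
    // isSubregOf(%0.sub0, %0.sub1) == false  (0x3 is not a subset of 0xC)
    // isSubregOf(%0,      %0.sub0) == false  (full mask exceeds 0x3)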
+uint64_t SDWASrcOperand::getSrcMods() const {
+  uint64_t Mods = 0;
+  if (Abs || Neg) {
+    assert(!Sext &&
+           "Float and integer src modifiers can't be set simultaneously");
+    Mods |= Abs ? SISrcMods::ABS : 0;
+    Mods |= Neg ? SISrcMods::NEG : 0;
+  } else if (Sext) {
+    Mods |= SISrcMods::SEXT;
+  }
+
+  return Mods;
+}
+
+MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
+  // For an SDWA src operand, the potential instruction is the one that uses
+  // the register defined by the parent instruction.
+  MachineRegisterInfo *MRI = getMRI();
+  MachineOperand *Replaced = getReplacedOperand();
+  assert(Replaced->isReg());
+
+  MachineInstr *PotentialMI = nullptr;
+  for (MachineOperand &PotentialMO : MRI->use_operands(Replaced->getReg())) {
+    // If this is a use of another subreg of the dst reg then do nothing.
+    if (!isSubregOf(*Replaced, PotentialMO, MRI->getTargetRegisterInfo()))
+      continue;
+
+    // If there is a use of dst in another basic block, or a use of a superreg
+    // of dst, then we should not combine this operand.
+    if (!isSameBB(PotentialMO.getParent(), getParentInst()) ||
+        !isSameReg(PotentialMO, *Replaced))
+      return nullptr;
+
+    // Check that PotentialMI is the only instruction that uses the dst reg.
+    if (PotentialMI == nullptr) {
+      PotentialMI = PotentialMO.getParent();
+    } else if (PotentialMI != PotentialMO.getParent()) {
+      return nullptr;
+    }
+  }
+
+  return PotentialMI;
+}
+
+bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
+  // Find the operand in the instruction that matches the replaced operand and
+  // substitute the target operand for it. Set the corresponding src_sel.
+
+  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
+  MachineOperand *SrcMods =
+      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
+  assert(Src && Src->isReg());
+  if (!isSameReg(*Src, *getReplacedOperand())) {
+    // If this is not src0 then it should be src1.
+    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
+    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
+
+    assert(Src && Src->isReg());
+
+    if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
+         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
+        !isSameReg(*Src, *getReplacedOperand())) {
+      // In case of v_mac_f16/32_sdwa this pass can try to apply the src
+      // operand to src2. This is not allowed.
+      return false;
+    }
+
+    assert(isSameReg(*Src, *getReplacedOperand()) && SrcSel && SrcMods);
+  }
+  copyRegOperand(*Src, *getTargetOperand());
+  SrcSel->setImm(getSrcSel());
+  SrcMods->setImm(getSrcMods());
+  getTargetOperand()->setIsKill(false);
+  return true;
+}
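The two operand kinds search in opposite directions. An SDWASrcOperand starts
from the value defined by the matched shift/bfe/and instruction and looks for
its single same-block user; in pseudo-MIR (virtual register numbers invented
for the illustration):

    %1 = V_LSHRREV_B32_e32 16, %0   ; matched above: SDWA src %0, src_sel:WORD_1
    %2 = V_ADD_I32_e32 %1, %3       ; sole same-block user -> candidate for
                                    ; conversion to V_ADD_I32_sdwa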
+MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
+  // For an SDWA dst operand, the potential instruction is the one that
+  // defines the register that this operand uses.
+  MachineRegisterInfo *MRI = getMRI();
+  MachineInstr *ParentMI = getParentInst();
+  MachineOperand *Replaced = getReplacedOperand();
+  assert(Replaced->isReg());
+
+  for (MachineOperand &PotentialMO : MRI->def_operands(Replaced->getReg())) {
+    if (!isSubregOf(*Replaced, PotentialMO, MRI->getTargetRegisterInfo()))
+      continue;
+
+    if (!isSameBB(getParentInst(), PotentialMO.getParent()) ||
+        !isSameReg(*Replaced, PotentialMO))
+      return nullptr;
+
+    // Check that ParentMI is the only instruction that uses the replaced
+    // register.
+    for (MachineOperand &UseMO : MRI->use_operands(PotentialMO.getReg())) {
+      if (isSubregOf(UseMO, PotentialMO, MRI->getTargetRegisterInfo()) &&
+          UseMO.getParent() != ParentMI) {
+        return nullptr;
+      }
+    }
+
+    // Due to SSA this should be the only def of the replaced register, so
+    // return it.
+    return PotentialMO.getParent();
+  }
+
+  return nullptr;
+}
+
+bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
+  // Replace the vdst operand in MI with the target operand. Set dst_sel and
+  // dst_unused.
+
+  if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
+       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
+      getDstSel() != AMDGPU::SDWA::DWORD) {
+    // v_mac_f16/32_sdwa allows dst_sel to be only DWORD.
+    return false;
+  }
+
+  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+  assert(Operand &&
+         Operand->isReg() &&
+         isSameReg(*Operand, *getReplacedOperand()));
+  copyRegOperand(*Operand, *getTargetOperand());
+  MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
+  assert(DstSel);
+  DstSel->setImm(getDstSel());
+  MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
+  assert(DstUnused);
+  DstUnused->setImm(getDstUnused());
+
+  // Remove the original instruction because its register definition would
+  // conflict with the new instruction.
+  getParentInst()->eraseFromParent();
+  return true;
+}
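An SDWADstOperand is the mirror image: it starts from the value the matched
shift consumes and walks back to its single same-block def. Continuing the
same pseudo-MIR sketch:

    %2 = V_ADD_I32_e32 %1, %3       ; sole same-block def -> candidate
    %4 = V_LSHLREV_B32_e32 16, %2   ; matched: SDWA dst %4, dst_sel:WORD_1,
                                    ; dst_unused:UNUSED_PAD; erased on success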
+void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
+  for (MachineInstr &MI : MBB) {
+    unsigned Opcode = MI.getOpcode();
+    switch (Opcode) {
+    case AMDGPU::V_LSHRREV_B32_e32:
+    case AMDGPU::V_ASHRREV_I32_e32:
+    case AMDGPU::V_LSHLREV_B32_e32: {
+      // from: v_lshrrev_b32_e32 v1, 16/24, v0
+      // to SDWA src:v0 src_sel:WORD_1/BYTE_3
+
+      // from: v_ashrrev_i32_e32 v1, 16/24, v0
+      // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1
+
+      // from: v_lshlrev_b32_e32 v1, 16/24, v0
+      // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
+      MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+      if (!Src0->isImm())
+        break;
+
+      int64_t Imm = Src0->getImm();
+      if (Imm != 16 && Imm != 24)
+        break;
+
+      MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+      if (TRI->isPhysicalRegister(Src1->getReg()) ||
+          TRI->isPhysicalRegister(Dst->getReg()))
+        break;
+
+      if (Opcode == AMDGPU::V_LSHLREV_B32_e32) {
+        auto SDWADst = make_unique<SDWADstOperand>(
+            Dst, Src1, Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
+        DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
+        SDWAOperands[&MI] = std::move(SDWADst);
+        ++NumSDWAPatternsFound;
+      } else {
+        auto SDWASrc = make_unique<SDWASrcOperand>(
+            Src1, Dst, Imm == 16 ? WORD_1 : BYTE_3, false, false,
+            Opcode != AMDGPU::V_LSHRREV_B32_e32);
+        DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
+        SDWAOperands[&MI] = std::move(SDWASrc);
+        ++NumSDWAPatternsFound;
+      }
+      break;
+    }
+
+    case AMDGPU::V_LSHRREV_B16_e32:
+    case AMDGPU::V_ASHRREV_I16_e32:
+    case AMDGPU::V_LSHLREV_B16_e32: {
+      // from: v_lshrrev_b16_e32 v1, 8, v0
+      // to SDWA src:v0 src_sel:BYTE_1
+
+      // from: v_ashrrev_i16_e32 v1, 8, v0
+      // to SDWA src:v0 src_sel:BYTE_1 sext:1
+
+      // from: v_lshlrev_b16_e32 v1, 8, v0
+      // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
+      MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+      if (!Src0->isImm() || Src0->getImm() != 8)
+        break;
+
+      MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+
+      if (TRI->isPhysicalRegister(Src1->getReg()) ||
+          TRI->isPhysicalRegister(Dst->getReg()))
+        break;
+
+      if (Opcode == AMDGPU::V_LSHLREV_B16_e32) {
+        auto SDWADst =
+            make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
+        DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
+        SDWAOperands[&MI] = std::move(SDWADst);
+        ++NumSDWAPatternsFound;
+      } else {
+        auto SDWASrc = make_unique<SDWASrcOperand>(
+            Src1, Dst, BYTE_1, false, false,
+            Opcode != AMDGPU::V_LSHRREV_B16_e32);
+        DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
+        SDWAOperands[&MI] = std::move(SDWASrc);
+        ++NumSDWAPatternsFound;
+      }
+      break;
+    }
+    case AMDGPU::V_BFE_I32:
+    case AMDGPU::V_BFE_U32: {
+      // e.g.:
+      // from: v_bfe_u32 v1, v0, 8, 8
+      // to SDWA src:v0 src_sel:BYTE_1
+
+      // offset | width | src_sel
+      // ------------------------
+      //      0 |     8 | BYTE_0
+      //      0 |    16 | WORD_0
+      //      0 |    32 | DWORD ?
+      //      8 |     8 | BYTE_1
+      //     16 |     8 | BYTE_2
+      //     16 |    16 | WORD_1
+      //     24 |     8 | BYTE_3
+
+      MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+      if (!Src1->isImm())
+        break;
+      int64_t Offset = Src1->getImm();
+
+      MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+      if (!Src2->isImm())
+        break;
+      int64_t Width = Src2->getImm();
+
+      SdwaSel SrcSel = DWORD;
+
+      if (Offset == 0 && Width == 8)
+        SrcSel = BYTE_0;
+      else if (Offset == 0 && Width == 16)
+        SrcSel = WORD_0;
+      else if (Offset == 0 && Width == 32)
+        SrcSel = DWORD;
+      else if (Offset == 8 && Width == 8)
+        SrcSel = BYTE_1;
+      else if (Offset == 16 && Width == 8)
+        SrcSel = BYTE_2;
+      else if (Offset == 16 && Width == 16)
+        SrcSel = WORD_1;
+      else if (Offset == 24 && Width == 8)
+        SrcSel = BYTE_3;
+      else
+        break;
+
+      MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+
+      if (TRI->isPhysicalRegister(Src0->getReg()) ||
+          TRI->isPhysicalRegister(Dst->getReg()))
+        break;
+
+      auto SDWASrc = make_unique<SDWASrcOperand>(
+          Src0, Dst, SrcSel, false, false,
+          Opcode != AMDGPU::V_BFE_U32);
+      DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
+      SDWAOperands[&MI] = std::move(SDWASrc);
+      ++NumSDWAPatternsFound;
+      break;
+    }
+    case AMDGPU::V_AND_B32_e32: {
+      // e.g.:
+      // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
+      // to SDWA src:v0 src_sel:WORD_0/BYTE_0
+
+      MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+      if (!Src0->isImm())
+        break;
+
+      int64_t Imm = Src0->getImm();
+      if (Imm != 0x0000ffff && Imm != 0x000000ff)
+        break;
+
+      MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+
+      if (TRI->isPhysicalRegister(Src1->getReg()) ||
+          TRI->isPhysicalRegister(Dst->getReg()))
+        break;
+
+      auto SDWASrc = make_unique<SDWASrcOperand>(
+          Src1, Dst, Imm == 0x0000ffff ? WORD_0 : BYTE_0);
+      DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
+      SDWAOperands[&MI] = std::move(SDWASrc);
+      ++NumSDWAPatternsFound;
+      break;
+    }
+    }
+  }
+}
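convertToSDWA below builds the new MachineInstr operand by operand, so the
order of add()/addImm() calls has to match the operand list of the _sdwa
MCInstrDesc. As a reading aid, the neutral instruction it constructs before
the matched SDWAOperands overwrite individual fields looks roughly like this
(layout inferred from the sequence of calls that follow, not from the .td
files):

    // vdst?, src0_modifiers, src0, src1_modifiers?, src1?, src2 (v_mac only),
    // clamp, dst_sel?, dst_unused?, src0_sel, src1_sel?   ('?' = if present)
    //   %vdst = V_ADD_I32_sdwa 0, %src0, 0, %src1,
    //           0 /*clamp*/, DWORD, UNUSED_PAD, DWORD, DWORD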
+bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
+                                   const SDWAOperandsVector &SDWAOperands) {
+  // Check if this instruction can be converted to SDWA:
+  // 1. Does this opcode support SDWA?
+  if (AMDGPU::getSDWAOp(MI.getOpcode()) == -1)
+    return false;
+
+  // 2. Are all operands VGPRs?
+  for (const MachineOperand &Operand : MI.explicit_operands()) {
+    if (!Operand.isReg() || !TRI->isVGPR(*MRI, Operand.getReg()))
+      return false;
+  }
+
+  // Convert to SDWA.
+  int SDWAOpcode = AMDGPU::getSDWAOp(MI.getOpcode());
+  assert(SDWAOpcode != -1);
+
+  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);
+
+  // Create the SDWA version of instruction MI and initialize its operands.
+  MachineInstrBuilder SDWAInst =
+      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc);
+
+  // Copy dst; if it is present in the original then it should also be present
+  // in the SDWA instruction.
+  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+  if (Dst) {
+    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
+    SDWAInst.add(*Dst);
+  } else {
+    assert(TII->isVOPC(MI));
+  }
+
+  // Copy src0 and initialize src0_modifiers. All SDWA instructions have src0
+  // and src0_modifiers (except for v_nop_sdwa, but it can't get here).
+  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+  assert(
+      Src0 &&
+      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 &&
+      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1);
+  SDWAInst.addImm(0);
+  SDWAInst.add(*Src0);
+
+  // Copy src1 if present, and initialize src1_modifiers.
+  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+  if (Src1) {
+    assert(
+        AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 &&
+        AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1);
+    SDWAInst.addImm(0);
+    SDWAInst.add(*Src1);
+  } else {
+    assert(TII->isVOP1(MI));
+  }
+
+  if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
+      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
+    // v_mac_f16/32 has an additional src2 operand tied to vdst.
+    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+    assert(Src2);
+    SDWAInst.add(*Src2);
+  }
+
+  // Initialize clamp.
+  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1);
+  SDWAInst.addImm(0);
+
+  // Initialize dst_sel and dst_unused if present.
+  if (Dst) {
+    assert(
+        AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1 &&
+        AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1);
+    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+    SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
+  }
+
+  // Initialize src0_sel.
+  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1);
+  SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+
+  // Initialize src1_sel if present.
+  if (Src1) {
+    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1);
+    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+  }
+
+  // Apply all SDWA operand patterns.
+  bool Converted = false;
+  for (auto &Operand : SDWAOperands) {
+    Converted |= Operand->convertToSDWA(*SDWAInst, TII);
+  }
+  if (!Converted) {
+    SDWAInst->eraseFromParent();
+    return false;
+  }
+
+  DEBUG(dbgs() << "Convert instruction:" << MI
+               << "Into:" << *SDWAInst << '\n');
+  ++NumSDWAInstructionsPeepholed;
+
+  MI.eraseFromParent();
+  return true;
+}
+
+bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+
+  if (!ST.hasSDWA() ||
+      !AMDGPU::isVI(ST)) { // TODO: Add support for SDWA on gfx9
+    return false;
+  }
+
+  MRI = &MF.getRegInfo();
+  TRI = ST.getRegisterInfo();
+  TII = ST.getInstrInfo();
+
+  std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;
+
+  bool Ret = false;
+
+  // FIXME: For now we only combine instructions in one basic block.
+  for (MachineBasicBlock &MBB : MF) {
+    SDWAOperands.clear();
+    matchSDWAOperands(MBB);
+
+    PotentialMatches.clear();
+    for (auto &OperandPair : SDWAOperands) {
+      auto &Operand = OperandPair.second;
+      MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
+      if (PotentialMI) {
+        PotentialMatches[PotentialMI].push_back(std::move(Operand));
+      }
+    }
+
+    for (auto &PotentialPair : PotentialMatches) {
+      MachineInstr &PotentialMI = *PotentialPair.first;
+      // Report modification so the pass manager knows the function changed.
+      Ret |= convertToSDWA(PotentialMI, PotentialPair.second);
+    }
+  }
+  return Ret;
+}
Index: llvm/trunk/lib/Target/AMDGPU/VOPInstructions.td
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/VOPInstructions.td
+++ llvm/trunk/lib/Target/AMDGPU/VOPInstructions.td
@@ -318,7 +318,7 @@
   let SDWA = 1;
   let Uses = [EXEC];
 
-  let SubtargetPredicate = HasSDWA;
+  let SubtargetPredicate = !if(P.HasExt, HasSDWA, DisableInst);
   let AssemblerPredicate = !if(P.HasExt, HasSDWA, DisableInst);
   let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.SDWA,
                            AMDGPUAsmVariants.Disable);
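Because the cl::opt above defaults to off, nothing changes unless the flag is
passed; the test below therefore runs llc twice. A representative invocation
to inspect the SDWA output by hand, mirroring the second RUN line (input.ll
stands for any file containing the functions below):

    llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole \
        -mattr=-fp64-fp16-denormals -verify-machineinstrs < input.ll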
Index: llvm/trunk/test/CodeGen/AMDGPU/sdwa-peephole.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -0,0 +1,372 @@
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=fiji --amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SDWA -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}add_shr_i32:
+; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_add_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v[[DST]]
+; NOSDWA-NOT: v_add_i32_sdwa
+
+; SDWA: v_add_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+
+define void @add_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %a = load i32, i32 addrspace(1)* %in, align 4
+  %shr = lshr i32 %a, 16
+  %add = add i32 %a, %shr
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}sub_shr_i32:
+; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_subrev_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v[[DST]]
+; NOSDWA-NOT: v_subrev_i32_sdwa
+
+; SDWA: v_subrev_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+
+define void @sub_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %a = load i32, i32 addrspace(1)* %in, align 4
+  %shr = lshr i32 %a, 16
+  %sub = sub i32 %shr, %a
+  store i32 %sub, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_shr_i32:
+; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v[[DST1]], v[[DST0]]
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+
+define void @mul_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in1, i32 addrspace(1)* %in2) {
+  %a = load i32, i32 addrspace(1)* %in1, align 4
+  %b = load i32, i32 addrspace(1)* %in2, align 4
+  %shra = lshr i32 %a, 16
+  %shrb = lshr i32 %b, 16
+  %mul = mul i32 %shra, %shrb
+  store i32 %mul, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_i16:
+; NOSDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+; SDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA-NOT: v_mul_u32_u24_sdwa
+
+define void @mul_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %ina, i16 addrspace(1)* %inb) {
+entry:
+  %a = load i16, i16 addrspace(1)* %ina, align 4
+  %b = load i16, i16 addrspace(1)* %inb, align 4
+  %mul = mul i16 %a, %b
+  store i16 %mul, i16 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_v2i16:
+; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v[[DST_MUL:[0-9]+]], v[[DST1]], v[[DST0]]
+; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]]
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL]], v{{[0-9]+}}
+
+define void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
+entry:
+  %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
+  %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
+  %mul = mul <2 x i16> %a, %b
+  store <2 x i16> %mul, <2 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_v4i16:
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL1]], v{{[0-9]+}}
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL0]], v{{[0-9]+}}
+
+define void @mul_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %ina, <4 x i16> addrspace(1)* %inb) {
+entry:
+  %a = load <4 x i16>, <4 x i16> addrspace(1)* %ina, align 4
+  %b = load <4 x i16>, <4 x i16> addrspace(1)* %inb, align 4
+  %mul = mul <4 x i16> %a, %b
+  store <4 x i16> %mul, <4 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_v8i16:
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL1]], v{{[0-9]+}}
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL0]], v{{[0-9]+}}
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL3]], v{{[0-9]+}}
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL2]], v{{[0-9]+}}
+
+define void @mul_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %ina, <8 x i16> addrspace(1)* %inb) {
+entry:
+  %a = load <8 x i16>, <8 x i16> addrspace(1)* %ina, align 4
+  %b = load <8 x i16>, <8 x i16> addrspace(1)* %inb, align 4
+  %mul = mul <8 x i16> %a, %b
+  store <8 x i16> %mul, <8 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_half:
+; NOSDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_f16_sdwa
+; SDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA-NOT: v_mul_f16_sdwa
+
+define void @mul_half(half addrspace(1)* %out, half addrspace(1)* %ina, half addrspace(1)* %inb) {
+entry:
+  %a = load half, half addrspace(1)* %ina, align 4
+  %b = load half, half addrspace(1)* %inb, align 4
+  %mul = fmul half %a, %b
+  store half %mul, half addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_v2half:
+; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_mul_f16_e32 v[[DST_MUL:[0-9]+]], v[[DST1]], v[[DST0]]
+; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]]
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_f16_sdwa
+
+; SDWA: v_mul_f16_sdwa v[[DST_MUL:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+
+define void @mul_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) {
+entry:
+  %a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4
+  %b = load <2 x half>, <2 x half> addrspace(1)* %inb, align 4
+  %mul = fmul <2 x half> %a, %b
+  store <2 x half> %mul, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+; GCN-LABEL: {{^}}mul_v4half:
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_f16_sdwa
+
+; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+define void @mul_v4half(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %ina, <4 x half> addrspace(1)* %inb) {
+entry:
+  %a = load <4 x half>, <4 x half> addrspace(1)* %ina, align 4
+  %b = load <4 x half>, <4 x half> addrspace(1)* %inb, align 4
+  %mul = fmul <4 x half> %a, %b
+  store <4 x half> %mul, <4 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_v8half:
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_f16_sdwa
+
+; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+define void @mul_v8half(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %ina, <8 x half> addrspace(1)* %inb) {
+entry:
+  %a = load <8 x half>, <8 x half> addrspace(1)* %ina, align 4
+  %b = load <8 x half>, <8 x half> addrspace(1)* %inb, align 4
+  %mul = fmul <8 x half> %a, %b
+  store <8 x half> %mul, <8 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_i8:
+; NOSDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+; SDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA-NOT: v_mul_u32_u24_sdwa
+
+define void @mul_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %ina, i8 addrspace(1)* %inb) {
+entry:
+  %a = load i8, i8 addrspace(1)* %ina, align 4
+  %b = load i8, i8 addrspace(1)* %inb, align 4
+  %mul = mul i8 %a, %b
+  store i8 %mul, i8 addrspace(1)* %out, align 4
+  ret void
+}
+; GCN-LABEL: {{^}}mul_v2i8:
+; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+
+define void @mul_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %ina, <2 x i8> addrspace(1)* %inb) {
+entry:
+  %a = load <2 x i8>, <2 x i8> addrspace(1)* %ina, align 4
+  %b = load <2 x i8>, <2 x i8> addrspace(1)* %inb, align 4
+  %mul = mul <2 x i8> %a, %b
+  store <2 x i8> %mul, <2 x i8> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_v4i8:
+; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA: v_mul_u32_u24_sdwa
+; SDWA: v_mul_u32_u24_sdwa
+; SDWA: v_mul_u32_u24_sdwa
+
+define void @mul_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %ina, <4 x i8> addrspace(1)* %inb) {
+entry:
+  %a = load <4 x i8>, <4 x i8> addrspace(1)* %ina, align 4
+  %b = load <4 x i8>, <4 x i8> addrspace(1)* %inb, align 4
+  %mul = mul <4 x i8> %a, %b
+  store <4 x i8> %mul, <4 x i8> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_v8i8:
+; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA: v_mul_u32_u24_sdwa
+; SDWA: v_mul_u32_u24_sdwa
+; SDWA: v_mul_u32_u24_sdwa
+; SDWA: v_mul_u32_u24_sdwa
+; SDWA: v_mul_u32_u24_sdwa
+; SDWA: v_mul_u32_u24_sdwa
+
+define void @mul_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %ina, <8 x i8> addrspace(1)* %inb) {
+entry:
+  %a = load <8 x i8>, <8 x i8> addrspace(1)* %ina, align 4
+  %b = load <8 x i8>, <8 x i8> addrspace(1)* %inb, align 4
+  %mul = mul <8 x i8> %a, %b
+  store <8 x i8> %mul, <8 x i8> addrspace(1)* %out, align 4
+  ret void
+}
+
+
+; GCN-LABEL: {{^}}mac_v2half:
+; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_mac_f16_e32 v[[DST_MAC:[0-9]+]], v[[DST1]], v[[DST0]]
+; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]]
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
+; NOSDWA-NOT: v_mac_f16_sdwa
+
+; SDWA: v_mac_f16_sdwa v[[DST_MAC:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]]
+
+define void @mac_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) {
+entry:
+  %a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4
+  %b = load <2 x half>, <2 x half> addrspace(1)* %inb, align 4
+  %mul = fmul <2 x half> %a, %b
+  %mac = fadd <2 x half> %mul, %b
+  store <2 x half> %mac, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}immediate_mul_v2i16:
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+; SDWA-NOT: v_mul_u32_u24_sdwa
+
+define void @immediate_mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+entry:
+  %a = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4
+  %mul = mul <2 x i16> %a, <i16 123, i16 321>
+  store <2 x i16> %mul, <2 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+; Double use of the same src - should not convert it.
+; GCN-LABEL: {{^}}mulmul_v2i16:
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+
+define void @mulmul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
+entry:
+  %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
+  %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
+  %mul = mul <2 x i16> %a, %b
+  %mul2 = mul <2 x i16> %mul, %b
+  store <2 x i16> %mul2, <2 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_add_v2i16:
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+; NOSDWA-NOT: v_add_i32_sdwa
+; SDWA-NOT: v_mul_u32_u24_sdwa
+; SDWA-NOT: v_add_i32_sdwa
+
+define void @mul_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb, i1 addrspace(1)* %incond) {
+entry:
+  %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
+  %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
+  %cond = load i1, i1 addrspace(1)* %incond, align 4
+  br i1 %cond, label %mul_label, label %add_label
+mul_label:
+  %mul = mul <2 x i16> %a, %b
+  br label %store_label
+add_label:
+  %add = add <2 x i16> %a, %b
+  br label %store_label
+store_label:
+  %store = phi <2 x i16> [%mul, %mul_label], [%add, %add_label]
+  store <2 x i16> %store, <2 x i16> addrspace(1)* %out, align 4
+  ret void
+}