Index: lib/Target/AMDGPU/SIPeepholeSDWA.cpp
===================================================================
--- lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -61,6 +61,7 @@
 namespace {
 
 class SDWAOperand;
+class SDWADstOperand;
 
 class SIPeepholeSDWA : public MachineFunctionPass {
 public:
@@ -86,6 +87,7 @@
   bool runOnMachineFunction(MachineFunction &MF) override;
 
   void matchSDWAOperands(MachineFunction &MF);
+  std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
   bool isConvertibleToSDWA(const MachineInstr &MI, const SISubtarget &ST) const;
   bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
   void legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const;
@@ -122,6 +124,14 @@
   MachineRegisterInfo *getMRI() const {
     return &getParentInst()->getParent()->getParent()->getRegInfo();
   }
+
+  const SIRegisterInfo *getTRI() const {
+    return static_cast<const SIRegisterInfo *>(getMRI()->getTargetRegisterInfo());
+  }
+
+#ifndef NDEBUG
+  virtual void print(raw_ostream &OS) const = 0;
+#endif
 };
 
 using namespace AMDGPU::SDWA;
@@ -137,8 +147,8 @@
   SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                  SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
                  bool Sext_ = false)
-      : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_),
-        Neg(Neg_), Sext(Sext_) {}
+      : SDWAOperand(TargetOp, ReplacedOp),
+        SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}
 
   MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
   bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
@@ -150,6 +160,10 @@
 
   uint64_t getSrcMods(const SIInstrInfo *TII,
                       const MachineOperand *SrcOp) const;
+
+#ifndef NDEBUG
+  void print(raw_ostream &OS) const override;
+#endif
 };
 
 class SDWADstOperand : public SDWAOperand {
@@ -158,15 +172,39 @@
   DstUnused DstUn;
 
 public:
+
   SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                  SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
-    : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}
+      : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}
 
   MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
   bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
 
   SdwaSel getDstSel() const { return DstSel; }
   DstUnused getDstUnused() const { return DstUn; }
+
+#ifndef NDEBUG
+  void print(raw_ostream &OS) const override;
+#endif
+};
+
+class SDWADstPreserveOperand : public SDWADstOperand {
+private:
+  MachineOperand *Preserve;
+
+public:
+  SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
+                         MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
+      : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
+        Preserve(PreserveOp) {}
+
+  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
+
+  MachineOperand *getPreservedOperand() const { return Preserve; }
+
+#ifndef NDEBUG
+  void print(raw_ostream &OS) const override;
+#endif
 };
 
 } // end anonymous namespace
@@ -204,20 +242,30 @@
   return OS;
 }
 
-static raw_ostream& operator<<(raw_ostream &OS, const SDWASrcOperand &Src) {
-  OS << "SDWA src: " << *Src.getTargetOperand()
-     << " src_sel:" << Src.getSrcSel()
-     << " abs:" << Src.getAbs() << " neg:" << Src.getNeg()
-     << " sext:" << Src.getSext() << '\n';
+static raw_ostream &operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
+  Operand.print(OS);
   return OS;
 }
 
-static raw_ostream& operator<<(raw_ostream &OS, const SDWADstOperand &Dst) {
-  OS << "SDWA dst: " << *Dst.getTargetOperand()
-     << " dst_sel:" << Dst.getDstSel()
-     << " dst_unused:" << Dst.getDstUnused() << '\n';
-  return OS;
+void SDWASrcOperand::print(raw_ostream &OS) const {
+  OS << "SDWA src: " << *getTargetOperand()
+     << " src_sel:" << getSrcSel()
+     << " abs:" << getAbs() << " neg:" << getNeg()
+     << " sext:" << getSext() << '\n';
+}
+
+void SDWADstOperand::print(raw_ostream &OS) const {
+  OS << "SDWA dst: " << *getTargetOperand()
+     << " dst_sel:" << getDstSel()
+     << " dst_unused:" << getDstUnused() << '\n';
+}
+
+void SDWADstPreserveOperand::print(raw_ostream &OS) const {
+  OS << "SDWA preserve dst: " << *getTargetOperand()
+     << " dst_sel:" << getDstSel()
+     << " preserve:" << *getPreservedOperand() << '\n';
 }
+
 #endif
 
 static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
@@ -232,30 +280,63 @@
   }
 }
 
-static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
-  return LHS.isReg() &&
-         RHS.isReg() &&
-         LHS.getReg() == RHS.getReg() &&
-         LHS.getSubReg() == RHS.getSubReg();
+static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
+                                        const MachineRegisterInfo *MRI) {
+  if (!Reg->isReg())
+    return nullptr;
+
+  auto TRI = static_cast<const SIRegisterInfo *>(MRI->getTargetRegisterInfo());
+
+  MachineOperand *DefUseMO = nullptr;
+  for (MachineOperand &DefUse : MRI->use_operands(Reg->getReg())) {
+    // If this is a def/use of another subreg of Reg then do nothing
+    if (!TRI->isSubregOf(DefUse, *Reg) ||
+        !TRI->isSubregOf(*Reg, DefUse))
+      continue;
+
+    // If there exists a def/use of a superreg of Reg then return nullptr
+    if (!TRI->isSameReg(DefUse, *Reg))
+      return nullptr;
+
+    // Check that only one instruction defs/uses Reg
+    if (!DefUseMO) {
+      DefUseMO = &DefUse;
+    } else if (DefUseMO->getParent() != DefUse.getParent()) {
+      return nullptr;
+    }
+  }
+  return DefUseMO;
 }
 
-static bool isSubregOf(const MachineOperand &SubReg,
-                       const MachineOperand &SuperReg,
-                       const TargetRegisterInfo *TRI) {
-  if (!SuperReg.isReg() || !SubReg.isReg())
-    return false;
+static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
+                                        const MachineRegisterInfo *MRI) {
+  if (!Reg->isReg())
+    return nullptr;
 
-  if (isSameReg(SuperReg, SubReg))
-    return true;
+  auto TRI = static_cast<const SIRegisterInfo *>(MRI->getTargetRegisterInfo());
 
-  if (SuperReg.getReg() != SubReg.getReg())
-    return false;
+  MachineOperand *DefUseMO = nullptr;
+  for (MachineOperand &DefUse : MRI->def_operands(Reg->getReg())) {
+    // If this is a def/use of another subreg of Reg then do nothing
+    if (!TRI->isSubregOf(DefUse, *Reg) ||
+        !TRI->isSubregOf(*Reg, DefUse))
+      continue;
+
+    // If there exists a def/use of a superreg of Reg then return nullptr
+    if (!TRI->isSameReg(DefUse, *Reg))
+      return nullptr;
 
-  LaneBitmask SuperMask = TRI->getSubRegIndexLaneMask(SuperReg.getSubReg());
-  LaneBitmask SubMask = TRI->getSubRegIndexLaneMask(SubReg.getSubReg());
-  SuperMask |= ~SubMask;
-  return SuperMask.all();
+    // Check that only one instruction defs/uses Reg
+    if (!DefUseMO) {
+      DefUseMO = &DefUse;
+    } else if (DefUseMO->getParent() != DefUse.getParent()) {
+      return nullptr;
+    }
+  }
+  return DefUseMO;
 }
 
 uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
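Note that findSingleRegUse and findSingleRegDef above are identical except for
the operand range they walk (MRI->use_operands vs. MRI->def_operands). If a
follow-up cleanup is wanted, both could forward to one helper taking the range
as a parameter. A minimal sketch, illustrative only and not part of this patch;
it assumes nothing beyond the isSameReg/isSubregOf helpers the patch moves onto
SIRegisterInfo:

  template <typename Range>
  static MachineOperand *findSingleRegDefUse(const MachineOperand *Reg,
                                             Range &&DefUses,
                                             const SIRegisterInfo *TRI) {
    if (!Reg->isReg())
      return nullptr;
    MachineOperand *Found = nullptr;
    for (MachineOperand &DefUse : DefUses) {
      // Skip operands that touch a different subreg of the same register.
      if (!TRI->isSubregOf(DefUse, *Reg) || !TRI->isSubregOf(*Reg, DefUse))
        continue;
      // A def/use of a superreg of Reg makes the fold unsafe.
      if (!TRI->isSameReg(DefUse, *Reg))
        return nullptr;
      // Require that a single instruction defs/uses Reg.
      if (!Found)
        Found = &DefUse;
      else if (Found->getParent() != DefUse.getParent())
        return nullptr;
    }
    return Found;
  }

findSingleRegUse(Reg, MRI) would then become
findSingleRegDefUse(Reg, MRI->use_operands(Reg->getReg()), TRI), and the def
variant the same with MRI->def_operands.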
@@ -286,42 +367,25 @@
 MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
   // For SDWA src operand potential instruction is one that use register
   // defined by parent instruction
-  MachineRegisterInfo *MRI = getMRI();
-  MachineOperand *Replaced = getReplacedOperand();
-  assert(Replaced->isReg());
-
-  MachineInstr *PotentialMI = nullptr;
-  for (MachineOperand &PotentialMO : MRI->use_operands(Replaced->getReg())) {
-    // If this is use of another subreg of dst reg then do nothing
-    if (!isSubregOf(*Replaced, PotentialMO, MRI->getTargetRegisterInfo()))
-      continue;
-
-    // If there exist use of superreg of dst then we should not combine this
-    // opernad
-    if (!isSameReg(PotentialMO, *Replaced))
-      return nullptr;
-
-    // Check that PotentialMI is only instruction that uses dst reg
-    if (PotentialMI == nullptr) {
-      PotentialMI = PotentialMO.getParent();
-    } else if (PotentialMI != PotentialMO.getParent()) {
-      return nullptr;
-    }
-  }
+  MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
+  if (!PotentialMO)
+    return nullptr;
 
-  return PotentialMI;
+  return PotentialMO->getParent();
 }
 
 bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
   // Find operand in instruction that matches source operand and replace it with
   // target operand. Set corresponding src_sel
+  const SIRegisterInfo *TRI = getTRI();
+
   MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
   MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
   MachineOperand *SrcMods =
       TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
   assert(Src && (Src->isReg() || Src->isImm()));
-  if (!isSameReg(*Src, *getReplacedOperand())) {
+  if (!TRI->isSameReg(*Src, *getReplacedOperand())) {
     // If this is not src0 then it should be src1
     Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
     SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
@@ -331,13 +395,13 @@
 
     if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
          MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
-        !isSameReg(*Src, *getReplacedOperand())) {
+        !TRI->isSameReg(*Src, *getReplacedOperand())) {
       // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to
       // src2. This is not allowed.
       return false;
     }
 
-    assert(isSameReg(*Src, *getReplacedOperand()) && SrcSel && SrcMods);
+    assert(TRI->isSameReg(*Src, *getReplacedOperand()) && SrcSel && SrcMods);
   }
   copyRegOperand(*Src, *getTargetOperand());
   SrcSel->setImm(getSrcSel());
@@ -351,29 +415,20 @@
   // that this operand uses
   MachineRegisterInfo *MRI = getMRI();
   MachineInstr *ParentMI = getParentInst();
-  MachineOperand *Replaced = getReplacedOperand();
-  assert(Replaced->isReg());
 
-  for (MachineOperand &PotentialMO : MRI->def_operands(Replaced->getReg())) {
-    if (!isSubregOf(*Replaced, PotentialMO, MRI->getTargetRegisterInfo()))
-      continue;
+  MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
+  if (!PotentialMO)
+    return nullptr;
 
-    if (!isSameReg(*Replaced, PotentialMO))
+  // Check that ParentMI is the only instruction that uses replaced register
+  for (MachineOperand &UseMO : MRI->use_operands(PotentialMO->getReg())) {
+    if (getTRI()->isSubregOf(UseMO, *PotentialMO) &&
+        UseMO.getParent() != ParentMI) {
       return nullptr;
-
-    // Check that ParentMI is the only instruction that uses replaced register
-    for (MachineOperand &UseMO : MRI->use_operands(PotentialMO.getReg())) {
-      if (isSubregOf(UseMO, PotentialMO, MRI->getTargetRegisterInfo()) &&
-          UseMO.getParent() != ParentMI) {
-        return nullptr;
-      }
     }
-
-    // Due to SSA this should be onle def of replaced register, so return it
-    return PotentialMO.getParent();
   }
 
-  return nullptr;
+  return PotentialMO->getParent();
 }
 
 bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
@@ -389,7 +444,7 @@
 
   MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
   assert(Operand &&
          Operand->isReg() &&
-         isSameReg(*Operand, *getReplacedOperand()));
+         getTRI()->isSameReg(*Operand, *getReplacedOperand()));
   copyRegOperand(*Operand, *getTargetOperand());
   MachineOperand *DstSel= TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
   assert(DstSel);
@@ -404,6 +459,38 @@
   return true;
 }
 
+bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
+                                           const SIInstrInfo *TII) {
+  // MI should be moved right before v_or_b32.
+  // For this we should clear all kill flags on uses of MI src-operands or else
+  // we can encounter a problem with a use of a killed operand.
+  for (MachineOperand &MO : MI.uses()) {
+    if (!MO.isReg())
+      continue;
+    for (MachineOperand &Use : getMRI()->use_operands(MO.getReg())) {
+      Use.setIsKill(false);
+    }
+  }
+
+  // Move MI before v_or_b32
+  auto MBB = MI.getParent();
+  MBB->remove(&MI);
+  MBB->insert(getParentInst(), &MI);
+
+  // Add implicit use of preserved register
+  MachineInstrBuilder MIB(*MBB->getParent(), MI);
+  MIB.addReg(getPreservedOperand()->getReg(),
+             RegState::ImplicitKill,
+             getPreservedOperand()->getSubReg());
+
+  // Tie dst to implicit use
+  MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst),
+                 MI.getNumOperands() - 1);
+
+  // Convert MI as any other SDWADstOperand and remove v_or_b32
+  return SDWADstOperand::convertToSDWA(MI, TII);
+}
+
 Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
   if (Op.isImm()) {
     return Op.getImm();
   }
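With UNUSED_PRESERVE the converted instruction performs a read-modify-write of
its destination, so vdst must be allocated to the same physical register as the
preserved value; the tieOperands call above is what encodes that constraint for
the register allocator. A small sanity check that could be run on the converted
instruction (an illustrative sketch, not part of the patch; it only relies on
the stock MachineInstr/MachineOperand tie APIs):

  // Verify that vdst ended up tied to the trailing implicit use of the
  // preserved register after SDWADstPreserveOperand::convertToSDWA.
  static bool vdstTiedToPreserve(const MachineInstr &MI) {
    int VDstIdx =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst);
    if (VDstIdx == -1 || !MI.getOperand(VDstIdx).isTied())
      return false;
    unsigned PreserveIdx =
        MI.findTiedOperandIdx(static_cast<unsigned>(VDstIdx));
    const MachineOperand &Preserve = MI.getOperand(PreserveIdx);
    // The preserve operand is the implicit, killed use added above.
    return Preserve.isReg() && Preserve.isImplicit() && Preserve.isUse();
  }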
@@ -413,7 +500,7 @@
   // %vreg1 = S_MOV_B32 255;
   if (Op.isReg()) {
     for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
-      if (!isSameReg(Op, Def))
+      if (!TRI->isSameReg(Op, Def))
         continue;
 
       const MachineInstr *DefInst = Def.getParent();
@@ -431,195 +518,316 @@
   return None;
 }
 
-void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
-  for (MachineBasicBlock &MBB : MF) {
-    for (MachineInstr &MI : MBB) {
-      unsigned Opcode = MI.getOpcode();
-      switch (Opcode) {
-      case AMDGPU::V_LSHRREV_B32_e32:
-      case AMDGPU::V_ASHRREV_I32_e32:
-      case AMDGPU::V_LSHLREV_B32_e32:
-      case AMDGPU::V_LSHRREV_B32_e64:
-      case AMDGPU::V_ASHRREV_I32_e64:
-      case AMDGPU::V_LSHLREV_B32_e64: {
-        // from: v_lshrrev_b32_e32 v1, 16/24, v0
-        // to SDWA src:v0 src_sel:WORD_1/BYTE_3
-
-        // from: v_ashrrev_i32_e32 v1, 16/24, v0
-        // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1
-
-        // from: v_lshlrev_b32_e32 v1, 16/24, v0
-        // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
-        MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
-        auto Imm = foldToImm(*Src0);
-        if (!Imm)
-          break;
-
-        if (*Imm != 16 && *Imm != 24)
-          break;
-
-        MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
-        MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
-        if (TRI->isPhysicalRegister(Src1->getReg()) ||
-            TRI->isPhysicalRegister(Dst->getReg()))
-          break;
-
-        if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
-            Opcode == AMDGPU::V_LSHLREV_B32_e64) {
-          auto SDWADst = make_unique<SDWADstOperand>(
-              Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
-          DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
-          SDWAOperands[&MI] = std::move(SDWADst);
-          ++NumSDWAPatternsFound;
-        } else {
-          auto SDWASrc = make_unique<SDWASrcOperand>(
-              Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
-              Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
-              Opcode != AMDGPU::V_LSHRREV_B32_e64);
-          DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
-          SDWAOperands[&MI] = std::move(SDWASrc);
-          ++NumSDWAPatternsFound;
-        }
-        break;
-      }
+std::unique_ptr<SDWAOperand>
+SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
+  unsigned Opcode = MI.getOpcode();
+  switch (Opcode) {
+  case AMDGPU::V_LSHRREV_B32_e32:
+  case AMDGPU::V_ASHRREV_I32_e32:
+  case AMDGPU::V_LSHLREV_B32_e32:
+  case AMDGPU::V_LSHRREV_B32_e64:
+  case AMDGPU::V_ASHRREV_I32_e64:
+  case AMDGPU::V_LSHLREV_B32_e64: {
+    // from: v_lshrrev_b32_e32 v1, 16/24, v0
+    // to SDWA src:v0 src_sel:WORD_1/BYTE_3
+
+    // from: v_ashrrev_i32_e32 v1, 16/24, v0
+    // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1
+
+    // from: v_lshlrev_b32_e32 v1, 16/24, v0
+    // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
+    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+    auto Imm = foldToImm(*Src0);
+    if (!Imm)
+      break;
+
+    if (*Imm != 16 && *Imm != 24)
+      break;
+
+    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+    if (TRI->isPhysicalRegister(Src1->getReg()) ||
+        TRI->isPhysicalRegister(Dst->getReg()))
+      break;
+
+    if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
+        Opcode == AMDGPU::V_LSHLREV_B32_e64) {
+      return make_unique<SDWADstOperand>(
+          Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
+    } else {
+      return make_unique<SDWASrcOperand>(
+          Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
+          Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
+          Opcode != AMDGPU::V_LSHRREV_B32_e64);
+    }
+    break;
+  }
 
-      case AMDGPU::V_LSHRREV_B16_e32:
-      case AMDGPU::V_ASHRREV_I16_e32:
-      case AMDGPU::V_LSHLREV_B16_e32:
-      case AMDGPU::V_LSHRREV_B16_e64:
-      case AMDGPU::V_ASHRREV_I16_e64:
-      case AMDGPU::V_LSHLREV_B16_e64: {
-        // from: v_lshrrev_b16_e32 v1, 8, v0
-        // to SDWA src:v0 src_sel:BYTE_1
-
-        // from: v_ashrrev_i16_e32 v1, 8, v0
-        // to SDWA src:v0 src_sel:BYTE_1 sext:1
-
-        // from: v_lshlrev_b16_e32 v1, 8, v0
-        // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
-        MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
-        auto Imm = foldToImm(*Src0);
-        if (!Imm || *Imm != 8)
-          break;
-
-        MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
-        MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
-
-        if (TRI->isPhysicalRegister(Src1->getReg()) ||
-            TRI->isPhysicalRegister(Dst->getReg()))
-          break;
-
-        if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
-            Opcode == AMDGPU::V_LSHLREV_B16_e64) {
-          auto SDWADst =
-              make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
-          DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
-          SDWAOperands[&MI] = std::move(SDWADst);
-          ++NumSDWAPatternsFound;
-        } else {
-          auto SDWASrc = make_unique<SDWASrcOperand>(
-              Src1, Dst, BYTE_1, false, false,
-              Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
-              Opcode != AMDGPU::V_LSHRREV_B16_e64);
-          DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
-          SDWAOperands[&MI] = std::move(SDWASrc);
-          ++NumSDWAPatternsFound;
-        }
-        break;
-      }
+  case AMDGPU::V_LSHRREV_B16_e32:
+  case AMDGPU::V_ASHRREV_I16_e32:
+  case AMDGPU::V_LSHLREV_B16_e32:
+  case AMDGPU::V_LSHRREV_B16_e64:
+  case AMDGPU::V_ASHRREV_I16_e64:
+  case AMDGPU::V_LSHLREV_B16_e64: {
+    // from: v_lshrrev_b16_e32 v1, 8, v0
+    // to SDWA src:v0 src_sel:BYTE_1
+
+    // from: v_ashrrev_i16_e32 v1, 8, v0
+    // to SDWA src:v0 src_sel:BYTE_1 sext:1
+
+    // from: v_lshlrev_b16_e32 v1, 8, v0
+    // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
+    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+    auto Imm = foldToImm(*Src0);
+    if (!Imm || *Imm != 8)
+      break;
+
+    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+
+    if (TRI->isPhysicalRegister(Src1->getReg()) ||
+        TRI->isPhysicalRegister(Dst->getReg()))
+      break;
+
+    if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
+        Opcode == AMDGPU::V_LSHLREV_B16_e64) {
+      return make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
+    } else {
+      return make_unique<SDWASrcOperand>(
+          Src1, Dst, BYTE_1, false, false,
+          Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
+          Opcode != AMDGPU::V_LSHRREV_B16_e64);
+    }
+    break;
+  }
 
-      case AMDGPU::V_BFE_I32:
-      case AMDGPU::V_BFE_U32: {
-        // e.g.:
-        // from: v_bfe_u32 v1, v0, 8, 8
-        // to SDWA src:v0 src_sel:BYTE_1
-
-        // offset | width | src_sel
-        // ------------------------
-        // 0      | 8     | BYTE_0
-        // 0      | 16    | WORD_0
-        // 0      | 32    | DWORD ?
-        // 8      | 8     | BYTE_1
-        // 16     | 8     | BYTE_2
-        // 16     | 16    | WORD_1
-        // 24     | 8     | BYTE_3
-
-        MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
-        auto Offset = foldToImm(*Src1);
-        if (!Offset)
-          break;
-
-        MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
-        auto Width = foldToImm(*Src2);
-        if (!Width)
-          break;
-
-        SdwaSel SrcSel = DWORD;
-
-        if (*Offset == 0 && *Width == 8)
-          SrcSel = BYTE_0;
-        else if (*Offset == 0 && *Width == 16)
-          SrcSel = WORD_0;
-        else if (*Offset == 0 && *Width == 32)
-          SrcSel = DWORD;
-        else if (*Offset == 8 && *Width == 8)
-          SrcSel = BYTE_1;
-        else if (*Offset == 16 && *Width == 8)
-          SrcSel = BYTE_2;
-        else if (*Offset == 16 && *Width == 16)
-          SrcSel = WORD_1;
-        else if (*Offset == 24 && *Width == 8)
-          SrcSel = BYTE_3;
-        else
-          break;
-
-        MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
-        MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
-
-        if (TRI->isPhysicalRegister(Src0->getReg()) ||
-            TRI->isPhysicalRegister(Dst->getReg()))
-          break;
-
-        auto SDWASrc = make_unique<SDWASrcOperand>(
-            Src0, Dst, SrcSel, false, false,
-            Opcode != AMDGPU::V_BFE_U32);
-        DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
-        SDWAOperands[&MI] = std::move(SDWASrc);
-        ++NumSDWAPatternsFound;
+  case AMDGPU::V_BFE_I32:
+  case AMDGPU::V_BFE_U32: {
+    // e.g.:
+    // from: v_bfe_u32 v1, v0, 8, 8
+    // to SDWA src:v0 src_sel:BYTE_1
+
+    // offset | width | src_sel
+    // ------------------------
+    // 0      | 8     | BYTE_0
+    // 0      | 16    | WORD_0
+    // 0      | 32    | DWORD ?
+    // 8      | 8     | BYTE_1
+    // 16     | 8     | BYTE_2
+    // 16     | 16    | WORD_1
+    // 24     | 8     | BYTE_3
+
+    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+    auto Offset = foldToImm(*Src1);
+    if (!Offset)
+      break;
+
+    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+    auto Width = foldToImm(*Src2);
+    if (!Width)
+      break;
+
+    SdwaSel SrcSel = DWORD;
+
+    if (*Offset == 0 && *Width == 8)
+      SrcSel = BYTE_0;
+    else if (*Offset == 0 && *Width == 16)
+      SrcSel = WORD_0;
+    else if (*Offset == 0 && *Width == 32)
+      SrcSel = DWORD;
+    else if (*Offset == 8 && *Width == 8)
+      SrcSel = BYTE_1;
+    else if (*Offset == 16 && *Width == 8)
+      SrcSel = BYTE_2;
+    else if (*Offset == 16 && *Width == 16)
+      SrcSel = WORD_1;
+    else if (*Offset == 24 && *Width == 8)
+      SrcSel = BYTE_3;
+    else
+      break;
+
+    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+
+    if (TRI->isPhysicalRegister(Src0->getReg()) ||
+        TRI->isPhysicalRegister(Dst->getReg()))
+      break;
+
+    return make_unique<SDWASrcOperand>(
+        Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32);
+  }
+
+  case AMDGPU::V_AND_B32_e32:
+  case AMDGPU::V_AND_B32_e64: {
+    // e.g.:
+    // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
+    // to SDWA src:v0 src_sel:WORD_0/BYTE_0
+
+    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+    auto ValSrc = Src1;
+    auto Imm = foldToImm(*Src0);
+
+    if (!Imm) {
+      Imm = foldToImm(*Src1);
+      ValSrc = Src0;
+    }
+
+    if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
+      break;
+
+    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+
+    if (TRI->isPhysicalRegister(Src1->getReg()) ||
+        TRI->isPhysicalRegister(Dst->getReg()))
+      break;
+
+    return make_unique<SDWASrcOperand>(
+        ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
+  }
+
+  case AMDGPU::V_OR_B32_e32:
+  case AMDGPU::V_OR_B32_e64: {
+    // Patterns for dst_unused:UNUSED_PRESERVE.
+    // e.g., from:
+    // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
+    //                           src0_sel:WORD_1 src1_sel:WORD_1
+    // v_add_f16_e32 v3, v1, v2
+    // v_or_b32_e32 v4, v0, v3
+    // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3
+
+    // Check if one of the operands of v_or_b32 is an SDWA instruction
+    using CheckRetType =
+        Optional<std::pair<MachineOperand *, MachineOperand *>>;
+    auto CheckOROperandsForSDWA =
+        [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType {
+      if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
+        return CheckRetType(None);
+
+      MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
+      if (!Op1Def)
+        return CheckRetType(None);
+
+      MachineInstr *Op1Inst = Op1Def->getParent();
+      if (!TII->isSDWA(*Op1Inst))
+        return CheckRetType(None);
+
+      MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
+      if (!Op2Def)
+        return CheckRetType(None);
+
+      return CheckRetType(std::make_pair(Op1Def, Op2Def));
+    };
+
+    MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+    MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+    assert(OrSDWA && OrOther);
+    auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
+    if (!Res) {
+      OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+      OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+      assert(OrSDWA && OrOther);
+      Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
+      if (!Res)
         break;
-      }
-      case AMDGPU::V_AND_B32_e32:
-      case AMDGPU::V_AND_B32_e64: {
-        // e.g.:
-        // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
-        // to SDWA src:v0 src_sel:WORD_0/BYTE_0
-
-        MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
-        MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
-        auto ValSrc = Src1;
-        auto Imm = foldToImm(*Src0);
-
-        if (!Imm) {
-          Imm = foldToImm(*Src1);
-          ValSrc = Src0;
-        }
-
-        if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
-          break;
-
-        MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
-
-        if (TRI->isPhysicalRegister(Src1->getReg()) ||
-            TRI->isPhysicalRegister(Dst->getReg()))
-          break;
-
-        auto SDWASrc = make_unique<SDWASrcOperand>(
-            ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
-        DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
-        SDWAOperands[&MI] = std::move(SDWASrc);
+    }
+
+    MachineOperand *OrSDWADef = Res->first;
+    MachineOperand *OrOtherDef = Res->second;
+    assert(OrSDWADef && OrOtherDef);
+
+    MachineInstr *SDWAInst = OrSDWADef->getParent();
+    MachineInstr *OtherInst = OrOtherDef->getParent();
+
+    // Check that OtherInst is actually bitwise-compatible with SDWAInst, i.e.
+    // that their destination bit patterns don't overlap. A compatible
+    // instruction can be either a regular instruction with compatible bitness
+    // or an SDWA instruction with the correct dst_sel:
+    // SDWAInst | OtherInst bitness / OtherInst dst_sel
+    // -----------------------------------------------------
+    // DWORD    | no                    / no
+    // WORD_0   | no                    / BYTE_2/3, WORD_1
+    // WORD_1   | 8/16-bit instructions / BYTE_0/1, WORD_0
+    // BYTE_0   | no                    / BYTE_1/2/3, WORD_1
+    // BYTE_1   | 8-bit                 / BYTE_0/2/3, WORD_1
+    // BYTE_2   | 8/16-bit              / BYTE_0/1/3, WORD_0
+    // BYTE_3   | 8/16/24-bit           / BYTE_0/1/2, WORD_0
+    // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
+    // but v_add_f32 is not.
+
+    // TODO: add support for non-SDWA instructions as OtherInst.
+    // For now this only works with SDWA instructions. For regular instructions
+    // there is no way to determine if the instruction writes only 8/16/24 bits
+    // out of the full register size, and all registers are at least 32 bits
+    // wide.
+    if (!TII->isSDWA(*OtherInst))
+      break;
+
+    SdwaSel DstSel = static_cast<SdwaSel>(
+        TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));
+    SdwaSel OtherDstSel = static_cast<SdwaSel>(
+        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));
+
+    bool DstSelAgree = false;
+    switch (DstSel) {
+    case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) ||
+                                (OtherDstSel == BYTE_3) ||
+                                (OtherDstSel == WORD_1));
+      break;
+    case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
+                                (OtherDstSel == BYTE_1) ||
+                                (OtherDstSel == WORD_0));
+      break;
+    case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) ||
+                                (OtherDstSel == BYTE_2) ||
+                                (OtherDstSel == BYTE_3) ||
+                                (OtherDstSel == WORD_1));
+      break;
+    case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
+                                (OtherDstSel == BYTE_2) ||
+                                (OtherDstSel == BYTE_3) ||
+                                (OtherDstSel == WORD_1));
+      break;
+    case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) ||
+                                (OtherDstSel == BYTE_1) ||
+                                (OtherDstSel == BYTE_3) ||
+                                (OtherDstSel == WORD_0));
+      break;
+    case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) ||
+                                (OtherDstSel == BYTE_1) ||
+                                (OtherDstSel == BYTE_2) ||
+                                (OtherDstSel == WORD_0));
+      break;
+    default: DstSelAgree = false;
+    }
+
+    if (!DstSelAgree)
+      break;
+
+    // Also OtherInst dst_unused should be UNUSED_PAD
+    DstUnused OtherDstUnused = static_cast<DstUnused>(
+        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
+    if (OtherDstUnused != DstUnused::UNUSED_PAD)
+      break;
+
+    // Create DstPreserveOperand
+    MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+    assert(OrDst && OrDst->isReg());
+
+    return make_unique<SDWADstPreserveOperand>(
+        OrDst, OrSDWADef, OrOtherDef, DstSel);
+
+  }
+  }
+
+  return std::unique_ptr<SDWAOperand>(nullptr);
+}
+
+void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      if (auto Operand = matchSDWAOperand(MI)) {
+        DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
+        SDWAOperands[&MI] = std::move(Operand);
         ++NumSDWAPatternsFound;
-        break;
-      }
+      }
     }
   }
 }
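The dst_sel agreement table above can also be read as a disjointness condition:
the two SDWA instructions may be merged through v_or_b32 exactly when the sets
of destination byte lanes they write do not intersect. A sketch expressing the
switch as a mask computation (illustrative only, not part of the patch; it uses
the same SdwaSel enumerators as the code above):

  // Byte lanes of the 32-bit destination written for a given dst_sel.
  static unsigned writtenByteMask(SdwaSel Sel) {
    switch (Sel) {
    case BYTE_0: return 0x1;
    case BYTE_1: return 0x2;
    case BYTE_2: return 0x4;
    case BYTE_3: return 0x8;
    case WORD_0: return 0x3;
    case WORD_1: return 0xC;
    case DWORD:  return 0xF;
    }
    llvm_unreachable("covered switch");
  }

  // Equivalent to the DstSelAgree switch: true iff the written lanes are
  // disjoint (DWORD therefore never agrees with anything).
  static bool dstSelAgree(SdwaSel LHS, SdwaSel RHS) {
    return (writtenByteMask(LHS) & writtenByteMask(RHS)) == 0;
  }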
@@ -627,8 +835,12 @@
 
 bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI,
                                          const SISubtarget &ST) const {
+  // Check if this is already an SDWA instruction
+  unsigned Opc = MI.getOpcode();
+  if (TII->isSDWA(Opc))
+    return true;
+
   // Check if this instruction has opcode that supports SDWA
-  int Opc = MI.getOpcode();
   if (AMDGPU::getSDWAOp(Opc) == -1)
     Opc = AMDGPU::getVOPe32(Opc);
 
@@ -665,9 +877,15 @@
 bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                    const SDWAOperandsVector &SDWAOperands) {
   // Convert to sdwa
-  int SDWAOpcode = AMDGPU::getSDWAOp(MI.getOpcode());
-  if (SDWAOpcode == -1)
-    SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(MI.getOpcode()));
+  int SDWAOpcode;
+  unsigned Opcode = MI.getOpcode();
+  if (TII->isSDWA(Opcode)) {
+    SDWAOpcode = Opcode;
+  } else {
+    SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
+    if (SDWAOpcode == -1)
+      SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
+  }
   assert(SDWAOpcode != -1);
 
   const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);
@@ -743,25 +961,44 @@
     }
   }
 
-  // Initialize dst_sel if present
+  // Copy dst_sel if present, initialize otherwise if needed
   if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1) {
-    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+    MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
+    if (DstSel) {
+      SDWAInst.add(*DstSel);
+    } else {
+      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+    }
   }
 
-  // Initialize dst_unused if present
+  // Copy dst_unused if present, initialize otherwise if needed
   if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1) {
-    SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
+    MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
+    if (DstUnused) {
+      SDWAInst.add(*DstUnused);
+    } else {
+      SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
+    }
   }
 
-  // Initialize src0_sel
+  // Copy src0_sel if present, initialize otherwise
   assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1);
-  SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
-
+  MachineOperand *Src0Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
+  if (Src0Sel) {
+    SDWAInst.add(*Src0Sel);
+  } else {
+    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+  }
 
-  // Initialize src1_sel if present
+  // Copy src1_sel if present, initialize otherwise if needed
   if (Src1) {
     assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1);
-    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+    MachineOperand *Src1Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
+    if (Src1Sel) {
+      SDWAInst.add(*Src1Sel);
+    } else {
+      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+    }
   }
 
   // Apply all sdwa operand pattenrs
@@ -777,6 +1014,7 @@
   // (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that was
   // already destroyed). So if SDWAOperand is also a potential MI then do not
   // apply it.
+
   if (PotentialMatches.count(Operand->getParentInst()) == 0)
     Converted |= Operand->convertToSDWA(*SDWAInst, TII);
 }
@@ -838,27 +1076,35 @@
   TII = ST.getInstrInfo();
 
   // Find all SDWA operands in MF.
-  matchSDWAOperands(MF);
+  bool Changed = false;
+  bool Ret = false;
+  do {
+    matchSDWAOperands(MF);
+
+    for (const auto &OperandPair : SDWAOperands) {
+      const auto &Operand = OperandPair.second;
+      MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
+      if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
+        PotentialMatches[PotentialMI].push_back(Operand.get());
+      }
+    }
 
-  for (const auto &OperandPair : SDWAOperands) {
-    const auto &Operand = OperandPair.second;
-    MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
-    if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
-      PotentialMatches[PotentialMI].push_back(Operand.get());
+    for (auto &PotentialPair : PotentialMatches) {
+      MachineInstr &PotentialMI = *PotentialPair.first;
+      convertToSDWA(PotentialMI, PotentialPair.second);
     }
-  }
 
-  for (auto &PotentialPair : PotentialMatches) {
-    MachineInstr &PotentialMI = *PotentialPair.first;
-    convertToSDWA(PotentialMI, PotentialPair.second);
-  }
+    PotentialMatches.clear();
+    SDWAOperands.clear();
+
+    Changed = !ConvertedInstructions.empty();
 
-  PotentialMatches.clear();
-  SDWAOperands.clear();
+    if (Changed)
+      Ret = true;
 
-  bool Ret = !ConvertedInstructions.empty();
-  while (!ConvertedInstructions.empty())
-    legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
+    while (!ConvertedInstructions.empty())
+      legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
+  } while (Changed);
 
   return Ret;
 }
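The driver now iterates to a fixpoint instead of making a single pass. The
reason appears to be the new UNUSED_PRESERVE pattern: CheckOROperandsForSDWA
only accepts a v_or_b32 whose operand is defined by an instruction that is
already SDWA, so that match can become available only after an earlier round
has converted the feeding adds. A condensed, illustrative paraphrase of the
loop above (not additional code):

  bool Ret = false, Changed = false;
  do {
    matchSDWAOperands(MF);                     // one matching round
    // ... fill PotentialMatches via potentialToConvert() and call
    // convertToSDWA() on each candidate ...
    PotentialMatches.clear();
    SDWAOperands.clear();
    Changed = !ConvertedInstructions.empty();  // converted anything this round?
    Ret |= Changed;
    while (!ConvertedInstructions.empty())     // legalize before re-matching
      legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
  } while (Changed);                           // stop at a fixpoint
  return Ret;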
Index: lib/Target/AMDGPU/SIRegisterInfo.h
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.h
+++ lib/Target/AMDGPU/SIRegisterInfo.h
@@ -166,6 +166,11 @@
   const TargetRegisterClass *getSubRegClass(const TargetRegisterClass *RC,
                                             unsigned SubIdx) const;
 
+  bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) const;
+
+  bool isSubregOf(const MachineOperand &SubReg,
+                  const MachineOperand &SuperReg) const;
+
   bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
                             unsigned DefSubReg,
                             const TargetRegisterClass *SrcRC,
Index: lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1314,6 +1314,31 @@
   }
 }
 
+bool SIRegisterInfo::isSameReg(const MachineOperand &LHS,
+                               const MachineOperand &RHS) const {
+  return LHS.isReg() &&
+         RHS.isReg() &&
+         LHS.getReg() == RHS.getReg() &&
+         LHS.getSubReg() == RHS.getSubReg();
+}
+
+bool SIRegisterInfo::isSubregOf(const MachineOperand &SubReg,
+                                const MachineOperand &SuperReg) const {
+  if (!SuperReg.isReg() || !SubReg.isReg())
+    return false;
+
+  if (isSameReg(SuperReg, SubReg))
+    return true;
+
+  if (SuperReg.getReg() != SubReg.getReg())
+    return false;
+
+  LaneBitmask SuperMask = getSubRegIndexLaneMask(SuperReg.getSubReg());
+  LaneBitmask SubMask = getSubRegIndexLaneMask(SubReg.getSubReg());
+  SuperMask |= ~SubMask;
+  return SuperMask.all();
+}
+
 bool SIRegisterInfo::shouldRewriteCopySrc(
   const TargetRegisterClass *DefRC,
   unsigned DefSubReg,
Index: test/CodeGen/AMDGPU/fabs.f16.ll
===================================================================
--- test/CodeGen/AMDGPU/fabs.f16.ll
+++ test/CodeGen/AMDGPU/fabs.f16.ll
@@ -127,8 +127,7 @@
 ; CI: v_mul_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}}
 ; CI: v_cvt_f16_f32
 
-; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16,
-; VI: v_mul_f16_sdwa v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI: v_mul_f16_sdwa v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI: v_mul_f16_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}}
 
 ; GFX9: v_and_b32_e32 [[FABS:v[0-9]+]], 0x7fff7fff, [[VAL]]
Index: test/CodeGen/AMDGPU/fcanonicalize.f16.ll
===================================================================
--- test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -207,7 +207,7 @@
 }
 
 ; GCN-LABEL: {{^}}v_test_canonicalize_var_v2f16:
-; VI-DAG: v_max_f16_sdwa [[REG0:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-DAG: v_max_f16_sdwa [[REG0:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-DAG: v_max_f16_e32 [[REG1:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
 ; VI-NOT: v_and_b32
 
@@ -246,7 +246,7 @@
 
 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_v2f16:
 ; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}}
-; VI-DAG: v_max_f16_sdwa [[REG0:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-DAG: v_max_f16_sdwa [[REG0:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-DAG: v_max_f16_e32 [[REG1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
 ; VI: v_or_b32
 
@@ -266,8 +266,7 @@
 
 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_v2f16:
 ; VI: v_xor_b32_e32 [[FNEG:v[0-9]+]], 0x80008000, v{{[0-9]+}}
-; VI: v_lshrrev_b32_e32 [[FNEGHI:v[0-9]+]], 16, [[FNEG]]
-; VI-DAG: v_max_f16_sdwa [[REG1:v[0-9]+]], [[FNEG]], [[FNEGHI]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-DAG: v_max_f16_sdwa [[REG1:v[0-9]+]], [[FNEG]], [[FNEG]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-DAG: v_max_f16_e32 [[REG0:v[0-9]+]], [[FNEG]], [[FNEG]]
 ; VI-NOT: 0xffff
Index: test/CodeGen/AMDGPU/fneg.f16.ll
===================================================================
--- test/CodeGen/AMDGPU/fneg.f16.ll
+++ test/CodeGen/AMDGPU/fneg.f16.ll
@@ -116,8 +116,7 @@
 ; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; CI: v_cvt_f16_f32
 
-; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16,
-; VI: v_mul_f16_sdwa v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI: v_mul_f16_sdwa v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI: v_mul_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
 
 ; GFX9: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} neg_lo:[1,0] neg_hi:[1,0]{{$}}
Index: test/CodeGen/AMDGPU/sdwa-peephole-instr.mir
===================================================================
--- test/CodeGen/AMDGPU/sdwa-peephole-instr.mir
+++ test/CodeGen/AMDGPU/sdwa-peephole-instr.mir
@@ -148,13 +148,13 @@
 
 # GCN-LABEL: {{^}}name: vop2_instructions
 
-# VI: %{{[0-9]+}} = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 0, 6, 5, implicit %exec
+# VI: %{{[0-9]+}} = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 5, 0, 6, 5, implicit %exec
 # VI: %{{[0-9]+}} = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit %exec
 # VI: %{{[0-9]+}} = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit %exec
 # VI: %{{[0-9]+}} = V_MAC_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, %{{[0-9]+}}, 0, 0, 6, 0, 6, 1, implicit %exec
 # VI: %{{[0-9]+}} = V_MAC_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit %exec
 
-# GFX9: %{{[0-9]+}} = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 0, 6, 5, implicit %exec
+# GFX9: %{{[0-9]+}} = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 5, 0, 6, 5, implicit %exec
 # GFX9: %{{[0-9]+}} = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit %exec
 # GFX9: %{{[0-9]+}} = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit %exec
 # GFX9: %{{[0-9]+}} = V_MAC_F32_e32 %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}, implicit %exec
Index: test/CodeGen/AMDGPU/sdwa-preserve.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/sdwa-preserve.mir
@@ -0,0 +1,56 @@
+# RUN: llc -march=amdgcn -mcpu=fiji -start-before=si-peephole-sdwa -verify-machineinstrs -o - %s | FileCheck -check-prefix=SDWA %s
+# RUN: llc -march=amdgcn -mcpu=gfx900 -start-before=si-peephole-sdwa -verify-machineinstrs -o - %s | FileCheck -check-prefix=SDWA %s
+
+# SDWA-LABEL: {{^}}add_f16_u32_preserve
+
+# SDWA: flat_load_dword [[FIRST:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]
+# SDWA: flat_load_dword [[SECOND:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]
+
+# SDWA: v_add_u32_sdwa [[RES:v[0-9]+]], [[FIRST]], [[SECOND]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_3
+# SDWA: v_add_f16_sdwa [[RES:v[0-9]+]], [[FIRST]], [[SECOND]] dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:WORD_1
+
+# SDWA: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], [[RES]]
+
+---
+name: add_f16_u32_preserve
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vreg_64 }
+  - { id: 1, class: vreg_64 }
+  - { id: 2, class: sreg_64 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: vgpr_32 }
+  - { id: 5, class: vgpr_32 }
+  - { id: 6, class: vgpr_32 }
+  - { id: 7, class: vgpr_32 }
+  - { id: 8, class: vgpr_32 }
+  - { id: 9, class: vgpr_32 }
+  - { id: 10, class: vgpr_32 }
+  - { id: 11, class: vgpr_32 }
+  - { id: 12, class: vgpr_32 }
+  - { id: 13, class: vgpr_32 }
+body: |
+  bb.0:
+    liveins: %vgpr0_vgpr1, %vgpr2_vgpr3, %sgpr30_sgpr31
+
+    %2 = COPY %sgpr30_sgpr31
+    %1 = COPY %vgpr2_vgpr3
+    %0 = COPY %vgpr0_vgpr1
+    %3 = FLAT_LOAD_DWORD %0, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4)
+    %4 = FLAT_LOAD_DWORD %1, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4)
+
+    %5 = V_AND_B32_e32 65535, %3, implicit %exec
+    %6 = V_LSHRREV_B32_e64 16, %4, implicit %exec
+    %7 = V_BFE_U32 %3, 8, 8, implicit %exec
+    %8 = V_LSHRREV_B32_e32 24, %4, implicit %exec
+
+    %9 = V_ADD_F16_e64 0, %5, 0, %6, 0, 0, implicit %exec
+    %10 = V_LSHLREV_B16_e64 8, %9, implicit %exec
+    %11 = V_ADD_U32_e64 %7, %8, implicit %exec
+    %12 = V_LSHLREV_B32_e64 16, %11, implicit %exec
+
+    %13 = V_OR_B32_e64 %10, %12, implicit %exec
+
+    FLAT_STORE_DWORD %0, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4)
+    %sgpr30_sgpr31 = COPY %2
+    S_SETPC_B64_return %sgpr30_sgpr31