Index: lib/Target/AMDGPU/AMDGPU.td =================================================================== --- lib/Target/AMDGPU/AMDGPU.td +++ lib/Target/AMDGPU/AMDGPU.td @@ -238,6 +238,36 @@ "Support SDWA (Sub-DWORD Addressing) extension" >; +def FeatureSDWAOmod : SubtargetFeature<"sdwa-omod", + "HasSDWAOmod", + "true", + "Support OMod with SDWA (Sub-DWORD Addressing) extension" +>; + +def FeatureSDWAScalar : SubtargetFeature<"sdwa-scalar", + "HasSDWAScalar", + "true", + "Support scalar register with SDWA (Sub-DWORD Addressing) extension" +>; + +def FeatureSDWASdst : SubtargetFeature<"sdwa-sdst", + "HasSDWASdst", + "true", + "Support scalar dst for VOPC with SDWA (Sub-DWORD Addressing) extension" +>; + +def FeatureSDWAMac : SubtargetFeature<"sdwa-mav", + "HasSDWAMac", + "true", + "Support v_mac_f32/f16 with SDWA (Sub-DWORD Addressing) extension" +>; + +def FeatureSDWAClampVOPC : SubtargetFeature<"sdwa-clamp-vopc", + "HasSDWAClampVOPC", + "true", + "Support clamp for VOPC with SDWA (Sub-DWORD Addressing) extension" +>; + def FeatureDPP : SubtargetFeature<"dpp", "HasDPP", "true", @@ -421,8 +451,8 @@ FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel, - FeatureScalarStores, FeatureInv2PiInlineImm, FeatureSDWA, - FeatureDPP + FeatureScalarStores, FeatureInv2PiInlineImm, + FeatureSDWA, FeatureSDWAClampVOPC, FeatureSDWAMac, FeatureDPP ] >; @@ -432,7 +462,8 @@ FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm, FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode, - FeatureFastFMAF32, FeatureSDWA, FeatureDPP, + FeatureFastFMAF32, FeatureDPP, + FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst, FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts ] >; Index: lib/Target/AMDGPU/AMDGPUInstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUInstrInfo.cpp +++ lib/Target/AMDGPU/AMDGPUInstrInfo.cpp @@ -66,7 +66,9 @@ // This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td enum SIEncodingFamily { SI = 0, - VI = 1 + VI = 1, + SDWA = 2, + SDWA9 = 3 }; // Wrapper for Tablegen'd function. enum Subtarget is not defined in any @@ -101,7 +103,12 @@ } int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const { - int MCOp = AMDGPU::getMCOpcode(Opcode, subtargetEncodingFamily(ST)); + SIEncodingFamily Gen = subtargetEncodingFamily(ST); + if (get(Opcode).TSFlags & SIInstrFlags::SDWA) + Gen = ST.getGeneration() == AMDGPUSubtarget::GFX9 ? SIEncodingFamily::SDWA9 + : SIEncodingFamily::SDWA; + + int MCOp = AMDGPU::getMCOpcode(Opcode, Gen); // -1 means that Opcode is already a native instruction. 
  if (MCOp == -1)
Index: lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -149,6 +149,11 @@
  bool HasScalarStores;
  bool HasInv2PiInlineImm;
  bool HasSDWA;
+  bool HasSDWAOmod;
+  bool HasSDWAScalar;
+  bool HasSDWASdst;
+  bool HasSDWAMac;
+  bool HasSDWAClampVOPC;
  bool HasDPP;
  bool FlatAddressSpace;
  bool FlatInstOffsets;
@@ -431,6 +436,26 @@
    return HasSDWA;
  }

+  bool hasSDWAOmod() const {
+    return HasSDWAOmod;
+  }
+
+  bool hasSDWAScalar() const {
+    return HasSDWAScalar;
+  }
+
+  bool hasSDWASdst() const {
+    return HasSDWASdst;
+  }
+
+  bool hasSDWAMac() const {
+    return HasSDWAMac;
+  }
+
+  bool hasSDWAClampVOPC() const {
+    return HasSDWAClampVOPC;
+  }
+
  /// \brief Returns the offset in bytes from the start of the input buffer
  /// of the first explicit kernel argument.
  unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const {
Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -124,6 +124,11 @@
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
+    HasSDWAOmod(false),
+    HasSDWAScalar(false),
+    HasSDWASdst(false),
+    HasSDWAMac(false),
+    HasSDWAClampVOPC(false),
    HasDPP(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
Index: lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
===================================================================
--- lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -260,6 +260,8 @@
    return isOff() || isRegClass(AMDGPU::VGPR_32RegClassID);
  }

+  bool isSDWARegKind() const;
+
  bool isImmTy(ImmTy ImmT) const {
    return isImm() && Imm.Type == ImmT;
  }
@@ -1243,6 +1245,15 @@
  return isRegKind() && AsmParser->getMRI()->getRegClass(RCID).contains(getReg());
}

+bool AMDGPUOperand::isSDWARegKind() const {
+  if (AsmParser->isVI())
+    return isVReg();
+  else if (AsmParser->isGFX9())
+    return isRegKind();
+  else
+    return false;
+}
+
uint64_t AMDGPUOperand::applyInputFPModifiers(uint64_t Val, unsigned Size) const {
  assert(isImmTy(ImmTyNone) && Imm.Mods.hasFPModifiers());
@@ -4475,12 +4486,11 @@
  if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_gfx9 &&
      Inst.getOpcode() != AMDGPU::V_NOP_sdwa_vi) {
-    // V_NOP_sdwa_vi has no optional sdwa arguments
+    // V_NOP_sdwa_vi/V_NOP_sdwa_gfx9 have no optional sdwa arguments
    switch (BasicInstType) {
    case SIInstrFlags::VOP1:
      addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
-      if (isGFX9() &&
-          AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) {
+      if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) {
        addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0);
      }
      addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD);
@@ -4490,8 +4500,7 @@
    case SIInstrFlags::VOP2:
      addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
-      if (isGFX9() &&
-          AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) {
+      if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) {
        addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0);
      }
      addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD);
@@ -4501,9 +4510,7 @@
      break;
    case SIInstrFlags::VOPC:
-      if
(isVI()) { - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); - } + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, SdwaSel::DWORD); break; Index: lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h =================================================================== --- lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -65,6 +65,8 @@ uint64_t Inst, uint64_t Address) const; + DecodeStatus convertSDWAInst(MCInst &MI) const; + MCOperand decodeOperand_VGPR_32(unsigned Val) const; MCOperand decodeOperand_VS_32(unsigned Val) const; MCOperand decodeOperand_VS_64(unsigned Val) const; @@ -105,10 +107,10 @@ MCOperand decodeSpecialReg32(unsigned Val) const; MCOperand decodeSpecialReg64(unsigned Val) const; - MCOperand decodeSDWA9Src(const OpWidthTy Width, unsigned Val) const; - MCOperand decodeSDWA9Src16(unsigned Val) const; - MCOperand decodeSDWA9Src32(unsigned Val) const; - MCOperand decodeSDWA9VopcDst(unsigned Val) const; + MCOperand decodeSDWASrc(const OpWidthTy Width, unsigned Val) const; + MCOperand decodeSDWASrc16(unsigned Val) const; + MCOperand decodeSDWASrc32(unsigned Val) const; + MCOperand decodeSDWAVopcDst(unsigned Val) const; }; //===----------------------------------------------------------------------===// Index: lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp =================================================================== --- lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -49,6 +49,17 @@ MCDisassembler::SoftFail; } +static int insertNamedMCOperand(MCInst &MI, const MCOperand &Op, + uint16_t NameIdx) { + int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), NameIdx); + if (OpIdx != -1) { + auto I = MI.begin(); + std::advance(I, OpIdx); + MI.insert(I, Op); + } + return OpIdx; +} + static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder) { auto DAsm = static_cast(Decoder); @@ -106,12 +117,12 @@ return addOperand(Inst, DAsm->decodeOperand_VSrcV216(Imm)); } -#define DECODE_SDWA9(DecName) \ -DECODE_OPERAND(decodeSDWA9##DecName, decodeSDWA9##DecName) +#define DECODE_SDWA(DecName) \ +DECODE_OPERAND(decodeSDWA##DecName, decodeSDWA##DecName) -DECODE_SDWA9(Src32) -DECODE_SDWA9(Src16) -DECODE_SDWA9(VopcDst) +DECODE_SDWA(Src32) +DECODE_SDWA(Src16) +DECODE_SDWA(VopcDst) #include "AMDGPUGenDisassemblerTables.inc" @@ -149,6 +160,7 @@ raw_ostream &WS, raw_ostream &CS) const { CommentStream = &CS; + bool IsSDWA = false; // ToDo: AMDGPUDisassembler supports only VI ISA. if (!STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding]) @@ -170,10 +182,10 @@ if (Res) break; Res = tryDecodeInst(DecoderTableSDWA64, MI, QW, Address); - if (Res) break; + if (Res) { IsSDWA = true; break; } Res = tryDecodeInst(DecoderTableSDWA964, MI, QW, Address); - if (Res) break; + if (Res) { IsSDWA = true; break; } } // Reinitialize Bytes as DPP64 could have eaten too much @@ -200,17 +212,36 @@ MI.getOpcode() == AMDGPU::V_MAC_F32_e64_si || MI.getOpcode() == AMDGPU::V_MAC_F16_e64_vi)) { // Insert dummy unused src2_modifiers. 
- int Src2ModIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), - AMDGPU::OpName::src2_modifiers); - auto I = MI.begin(); - std::advance(I, Src2ModIdx); - MI.insert(I, MCOperand::createImm(0)); + insertNamedMCOperand(MI, MCOperand::createImm(0), + AMDGPU::OpName::src2_modifiers); } + if (Res && IsSDWA) + Res = convertSDWAInst(MI); + Size = Res ? (MaxInstBytesNum - Bytes.size()) : 0; return Res; } +DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const { + if (STI.getFeatureBits()[AMDGPU::FeatureGFX9]) { + if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sdst) != -1) + // VOPC - insert clamp + insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::clamp); + } else if (STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]) { + int SDst = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sdst); + if (SDst != -1) { + // VOPC - insert VCC register as sdst + insertNamedMCOperand(MI, MCOperand::createReg(AMDGPU::VCC), + AMDGPU::OpName::sdst); + } else { + // VOP1/2 - insert omod if present in instruction + insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::omod); + } + } + return MCDisassembler::Success; +} + const char* AMDGPUDisassembler::getRegClassName(unsigned RegClassID) const { return getContext().getRegisterInfo()-> getRegClassName(&AMDGPUMCRegisterClasses[RegClassID]); @@ -592,36 +623,43 @@ return errOperand(Val, "unknown operand encoding " + Twine(Val)); } -MCOperand AMDGPUDisassembler::decodeSDWA9Src(const OpWidthTy Width, - unsigned Val) const { +MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width, + unsigned Val) const { using namespace AMDGPU::SDWA; - if (SDWA9EncValues::SRC_VGPR_MIN <= Val && - Val <= SDWA9EncValues::SRC_VGPR_MAX) { - return createRegOperand(getVgprClassId(Width), - Val - SDWA9EncValues::SRC_VGPR_MIN); - } - if (SDWA9EncValues::SRC_SGPR_MIN <= Val && - Val <= SDWA9EncValues::SRC_SGPR_MAX) { - return createSRegOperand(getSgprClassId(Width), - Val - SDWA9EncValues::SRC_SGPR_MIN); - } + if (STI.getFeatureBits()[AMDGPU::FeatureGFX9]) { + if (SDWA9EncValues::SRC_VGPR_MIN <= Val && + Val <= SDWA9EncValues::SRC_VGPR_MAX) { + return createRegOperand(getVgprClassId(Width), + Val - SDWA9EncValues::SRC_VGPR_MIN); + } + if (SDWA9EncValues::SRC_SGPR_MIN <= Val && + Val <= SDWA9EncValues::SRC_SGPR_MAX) { + return createSRegOperand(getSgprClassId(Width), + Val - SDWA9EncValues::SRC_SGPR_MIN); + } - return decodeSpecialReg32(Val - SDWA9EncValues::SRC_SGPR_MIN); + return decodeSpecialReg32(Val - SDWA9EncValues::SRC_SGPR_MIN); + } else if (STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]) { + return createRegOperand(getVgprClassId(Width), Val); + } + llvm_unreachable("unsupported target"); } -MCOperand AMDGPUDisassembler::decodeSDWA9Src16(unsigned Val) const { - return decodeSDWA9Src(OPW16, Val); +MCOperand AMDGPUDisassembler::decodeSDWASrc16(unsigned Val) const { + return decodeSDWASrc(OPW16, Val); } -MCOperand AMDGPUDisassembler::decodeSDWA9Src32(unsigned Val) const { - return decodeSDWA9Src(OPW32, Val); +MCOperand AMDGPUDisassembler::decodeSDWASrc32(unsigned Val) const { + return decodeSDWASrc(OPW32, Val); } -MCOperand AMDGPUDisassembler::decodeSDWA9VopcDst(unsigned Val) const { +MCOperand AMDGPUDisassembler::decodeSDWAVopcDst(unsigned Val) const { using namespace AMDGPU::SDWA; + assert(STI.getFeatureBits()[AMDGPU::FeatureGFX9] && + "SDWAVopcDst should be present only on GFX9"); if (Val & SDWA9EncValues::VOPC_DST_VCC_MASK) { Val &= SDWA9EncValues::VOPC_DST_SGPR_MASK; if (Val > AMDGPU::EncValues::SGPR_MAX) 
{ Index: lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h =================================================================== --- lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h +++ lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h @@ -52,15 +52,15 @@ return 0; } - virtual unsigned getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { + virtual unsigned getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { return 0; } - virtual unsigned getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { + virtual unsigned getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { return 0; } Index: lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp =================================================================== --- lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -69,14 +69,14 @@ unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const override; - - unsigned getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const override; - - unsigned getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const override; + + unsigned getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const override; + + unsigned getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const override; }; } // end anonymous namespace @@ -328,11 +328,11 @@ } unsigned -SIMCCodeEmitter::getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { +SIMCCodeEmitter::getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { using namespace AMDGPU::SDWA; - + uint64_t RegEnc = 0; const MCOperand &MO = MI.getOperand(OpNo); @@ -347,9 +347,9 @@ } unsigned -SIMCCodeEmitter::getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { +SIMCCodeEmitter::getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { using namespace AMDGPU::SDWA; uint64_t RegEnc = 0; Index: lib/Target/AMDGPU/SIDefines.h =================================================================== --- lib/Target/AMDGPU/SIDefines.h +++ lib/Target/AMDGPU/SIDefines.h @@ -118,9 +118,9 @@ // Operand for source modifiers for VOP instructions OPERAND_INPUT_MODS, - // Operand for GFX9 SDWA instructions - OPERAND_SDWA9_SRC, - OPERAND_SDWA9_VOPC_DST, + // Operand for SDWA instructions + OPERAND_SDWA_SRC, + OPERAND_SDWA_VOPC_DST, /// Operand with 32-bit immediate that uses the constant bus. 
  OPERAND_KIMM32,
Index: lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.h
+++ lib/Target/AMDGPU/SIInstrInfo.h
@@ -814,6 +814,9 @@
  int getSDWAOp(uint16_t Opcode);

  LLVM_READONLY
+  int getBasicFromSDWAOp(uint16_t Opcode);
+
+  LLVM_READONLY
  int getCommuteRev(uint16_t Opcode);

  LLVM_READONLY
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2108,7 +2108,9 @@
bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
                                   uint8_t OperandType) const {
-  if (!MO.isImm() || OperandType < MCOI::OPERAND_FIRST_TARGET)
+  if (!MO.isImm() ||
+      OperandType < AMDGPU::OPERAND_SRC_FIRST ||
+      OperandType > AMDGPU::OPERAND_SRC_LAST)
    return false;

  // MachineOperand provides no way to tell the true operand size, since it only
@@ -2433,8 +2435,73 @@
    }
  }

+  // Verify SDWA
+  if (isSDWA(MI)) {
+
+    if (!ST.hasSDWA()) {
+      ErrInfo = "SDWA is not supported on this target";
+      return false;
+    }
+
+    int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
+    if (DstIdx == -1)
+      DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::sdst);
+
+    const int OpIndices[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx };
+
+    for (int OpIdx : OpIndices) {
+      if (OpIdx == -1)
+        continue;
+      const MachineOperand &MO = MI.getOperand(OpIdx);
+
+      if (!ST.hasSDWAScalar()) {
+        // Only VGPRs on VI
+        if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
+          ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
+          return false;
+        }
+      } else {
+        // No immediates on GFX9
+        if (!MO.isReg()) {
+          ErrInfo = "Only regs allowed as operands in SDWA instructions on GFX9";
+          return false;
+        }
+      }
+    }
+
+    if (!ST.hasSDWAOmod()) {
+      // No omod allowed on VI
+      const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
+      if (OMod != nullptr &&
+          (!OMod->isImm() || OMod->getImm() != 0)) {
+        ErrInfo = "OMod not allowed in SDWA instructions on VI";
+        return false;
+      }
+    }
+
+    uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
+    if (isVOPC(BasicOpcode)) {
+      if (!ST.hasSDWASdst() && DstIdx != -1) {
+        // Only vcc allowed as dst on VI for VOPC
+        const MachineOperand &Dst = MI.getOperand(DstIdx);
+        if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
+          ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
+          return false;
+        }
+      } else if (!ST.hasSDWAClampVOPC()) {
+        // No clamp allowed on GFX9 for VOPC
+        const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
+        if (Clamp != nullptr &&
+            (!Clamp->isImm() || Clamp->getImm() != 0)) {
+          ErrInfo = "Clamp not allowed in VOPC SDWA instructions on GFX9";
+          return false;
+        }
+      }
+    }
+  }
+
  // Verify VOP*
-  if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI)) {
+  if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI)) {
    // Only look at the true operands. Only a real operand can use the constant
    // bus, and we don't want to check pseudo-operands like the source modifier
    // flags.
Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -20,6 +20,8 @@ int NONE = -1; int SI = 0; int VI = 1; + int SDWA = 2; + int SDWA9 = 3; } //===----------------------------------------------------------------------===// @@ -452,25 +454,25 @@ let ParserMatchClass = VReg32OrOffClass; } -class SDWA9Src : RegisterOperand { +class SDWASrc : RegisterOperand { let OperandNamespace = "AMDGPU"; - let OperandType = "OPERAND_SDWA9_SRC"; - let EncoderMethod = "getSDWA9SrcEncoding"; + let OperandType = "OPERAND_SDWA_SRC"; + let EncoderMethod = "getSDWASrcEncoding"; } -def SDWA9Src32 : SDWA9Src { - let DecoderMethod = "decodeSDWA9Src32"; +def SDWASrc32 : SDWASrc { + let DecoderMethod = "decodeSDWASrc32"; } -def SDWA9Src16 : SDWA9Src { - let DecoderMethod = "decodeSDWA9Src16"; +def SDWASrc16 : SDWASrc { + let DecoderMethod = "decodeSDWASrc16"; } -def SDWA9VopcDst : VOPDstOperand { +def SDWAVopcDst : VOPDstOperand { let OperandNamespace = "AMDGPU"; - let OperandType = "OPERAND_SDWA9_VOPC_DST"; - let EncoderMethod = "getSDWA9VopcDstEncoding"; - let DecoderMethod = "decodeSDWA9VopcDst"; + let OperandType = "OPERAND_SDWA_VOPC_DST"; + let EncoderMethod = "getSDWAVopcDstEncoding"; + let DecoderMethod = "decodeSDWAVopcDst"; } class NamedMatchClass : AsmOperandClass { @@ -634,13 +636,13 @@ def Int32InputMods : IntInputMods; def Int64InputMods : IntInputMods; -def FPRegInputModsMatchClass : AsmOperandClass { - let Name = "RegWithFPInputMods"; +def FPRegSDWAInputModsMatchClass : AsmOperandClass { + let Name = "SDWARegWithFPInputMods"; let ParserMethod = "parseRegWithFPInputMods"; - let PredicateMethod = "isRegKind"; + let PredicateMethod = "isSDWARegKind"; } -def FPRegInputMods : InputMods { +def FPRegSDWAInputMods : InputMods { let PrintMethod = "printOperandAndFPInputMods"; } @@ -655,13 +657,13 @@ } -def IntRegInputModsMatchClass : AsmOperandClass { - let Name = "RegWithIntInputMods"; +def IntRegSDWAInputModsMatchClass : AsmOperandClass { + let Name = "SDWARegWithIntInputMods"; let ParserMethod = "parseRegWithIntInputMods"; - let PredicateMethod = "isRegKind"; + let PredicateMethod = "isSDWARegKind"; } -def IntRegInputMods : InputMods { +def IntRegSDWAInputMods : InputMods { let PrintMethod = "printOperandAndIntInputMods"; } @@ -851,10 +853,10 @@ } // Returns the register class to use for the destination of VOP[12C] -// instructions with GFX9 SDWA extension -class getSDWA9DstForVT { +// instructions with SDWA extension +class getSDWADstForVT { RegisterOperand ret = !if(!eq(VT.Size, 1), - SDWA9VopcDst, // VOPC + SDWAVopcDst, // VOPC VOPDstOperand); // VOP1/2 32-bit dst } @@ -898,8 +900,8 @@ !if(!eq(VT.Size, 64), VReg_64, VGPR_32)); } -class getSDWA9SrcForVT { - RegisterOperand ret = !if(!eq(VT.Size, 16), SDWA9Src16, SDWA9Src32); +class getSDWASrcForVT { + RegisterOperand ret = !if(!eq(VT.Size, 16), SDWASrc16, SDWASrc32); } // Returns the register class to use for sources of VOP3 instructions for the @@ -995,7 +997,7 @@ ); } -// Return type of input modifiers operand specified input operand for SDWA/DPP +// Return type of input modifiers operand specified input operand for DPP class getSrcModExt { bit isFP = !if(!eq(VT.Value, f16.Value), 1, !if(!eq(VT.Value, f32.Value), 1, @@ -1004,13 +1006,13 @@ Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods); } -// Return type of input modifiers operand specified input operand for SDWA 9 -class getSrcModSDWA9 { +// Return type of input 
modifiers operand specified input operand for SDWA +class getSrcModSDWA { bit isFP = !if(!eq(VT.Value, f16.Value), 1, !if(!eq(VT.Value, f32.Value), 1, !if(!eq(VT.Value, f64.Value), 1, 0))); - Operand ret = !if(isFP, FPRegInputMods, IntRegInputMods); + Operand ret = !if(isFP, FPRegSDWAInputMods, IntRegSDWAInputMods); } // Returns the input arguments for VOP[12C] instructions for the given SrcVT. @@ -1141,36 +1143,12 @@ /* endif */))); } -class getInsSDWA { - dag ret = !if(!eq(NumSrcArgs, 0), - // VOP1 without input operands (V_NOP) - (ins), - !if(!eq(NumSrcArgs, 1), - // VOP1_SDWA - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, - clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, - src0_sel:$src0_sel), - !if(!eq(NumSrcArgs, 2), - !if(!eq(DstVT.Size, 1), - // VOPC_SDWA with modifiers - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, - Src1Mod:$src1_modifiers, Src1RC:$src1, - clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel), - // VOP2_SDWA with modifiers - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, - Src1Mod:$src1_modifiers, Src1RC:$src1, - clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, - src0_sel:$src0_sel, src1_sel:$src1_sel)), - (ins)/* endif */))); -} -// Ins for GFX9 SDWA -class getInsSDWA9 { +// Ins for SDWA +class getInsSDWA { dag ret = !if(!eq(NumSrcArgs, 0), // VOP1 without input operands (V_NOP) @@ -1178,31 +1156,31 @@ !if(!eq(NumSrcArgs, 1), // VOP1 !if(!eq(HasSDWAOMod, 0), - // VOP1_SDWA9 without omod + // VOP1_SDWA without omod (ins Src0Mod:$src0_modifiers, Src0RC:$src0, clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel), - // VOP1_SDWA9 with omod + // VOP1_SDWA with omod (ins Src0Mod:$src0_modifiers, Src0RC:$src0, clampmod:$clamp, omod:$omod, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel)), !if(!eq(NumSrcArgs, 2), !if(!eq(DstVT.Size, 1), - // VOPC_SDWA9 + // VOPC_SDWA (ins Src0Mod:$src0_modifiers, Src0RC:$src0, Src1Mod:$src1_modifiers, Src1RC:$src1, - src0_sel:$src0_sel, src1_sel:$src1_sel), - // VOP2_SDWA9 + clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel), + // VOP2_SDWA !if(!eq(HasSDWAOMod, 0), - // VOP2_SDWA9 without omod + // VOP2_SDWA without omod (ins Src0Mod:$src0_modifiers, Src0RC:$src0, Src1Mod:$src1_modifiers, Src1RC:$src1, clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel, src1_sel:$src1_sel), - // VOP1_SDWA9 with omod + // VOP2_SDWA with omod (ins Src0Mod:$src0_modifiers, Src0RC:$src0, Src1Mod:$src1_modifiers, Src1RC:$src1, clampmod:$clamp, omod:$omod, @@ -1220,12 +1198,12 @@ (outs)); // V_NOP } -// Outs for GFX9 SDWA -class getOutsSDWA9 { +// Outs for SDWA +class getOutsSDWA { dag ret = !if(HasDst, !if(!eq(DstVT.Size, 1), - (outs DstRCSDWA9:$sdst), - (outs DstRCSDWA9:$vdst)), + (outs DstRCSDWA:$sdst), + (outs DstRCSDWA:$vdst)), (outs)); // V_NOP } @@ -1387,8 +1365,7 @@ field ValueType Src2VT = ArgVT[3]; field RegisterOperand DstRC = getVALUDstForVT.ret; field RegisterOperand DstRCDPP = getVALUDstForVT.ret; - field RegisterOperand DstRCSDWA = getVALUDstForVT.ret; - field RegisterOperand DstRCSDWA9 = getSDWA9DstForVT.ret; + field RegisterOperand DstRCSDWA = getSDWADstForVT.ret; field RegisterOperand Src0RC32 = getVOPSrc0ForVT.ret; field RegisterClass Src1RC32 = getVregSrcForVT.ret; field RegisterOperand Src0RC64 = getVOP3SrcForVT.ret; @@ -1396,19 +1373,15 @@ field RegisterOperand Src2RC64 = getVOP3SrcForVT.ret; field RegisterClass Src0DPP = getVregSrcForVT.ret; field RegisterClass Src1DPP = getVregSrcForVT.ret; - field RegisterClass Src0SDWA = getVregSrcForVT.ret; - 
field RegisterClass Src1SDWA = getVregSrcForVT.ret; - field RegisterOperand Src0SDWA9 = getSDWA9SrcForVT.ret; - field RegisterOperand Src1SDWA9 = getSDWA9SrcForVT.ret; + field RegisterOperand Src0SDWA = getSDWASrcForVT.ret; + field RegisterOperand Src1SDWA = getSDWASrcForVT.ret; field Operand Src0Mod = getSrcMod.ret; field Operand Src1Mod = getSrcMod.ret; field Operand Src2Mod = getSrcMod.ret; field Operand Src0ModDPP = getSrcModExt.ret; field Operand Src1ModDPP = getSrcModExt.ret; - field Operand Src0ModSDWA = getSrcModExt.ret; - field Operand Src1ModSDWA = getSrcModExt.ret; - field Operand Src0ModSDWA9 = getSrcModSDWA9.ret; - field Operand Src1ModSDWA9 = getSrcModSDWA9.ret; + field Operand Src0ModSDWA = getSrcModSDWA.ret; + field Operand Src1ModSDWA = getSrcModSDWA.ret; field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1); @@ -1457,8 +1430,7 @@ field dag Outs32 = Outs; field dag Outs64 = Outs; field dag OutsDPP = getOutsExt.ret; - field dag OutsSDWA = getOutsExt.ret; - field dag OutsSDWA9 = getOutsSDWA9.ret; + field dag OutsSDWA = getOutsSDWA.ret; field dag Ins32 = getIns32.ret; field dag Ins64 = getIns64.ret; field dag InsSDWA = getInsSDWA.ret; - field dag InsSDWA9 = getInsSDWA9.ret; + field string Asm32 = getAsm32.ret; field string Asm64 = getAsm64.ret; @@ -1628,13 +1598,13 @@ let ValueCols = [["SDWA"]]; } -// Maps ordinary instructions to their SDWA GFX9 counterparts -def getSDWA9Op : InstrMapping { +// Maps SDWA instructions to their ordinary counterparts +def getBasicFromSDWAOp : InstrMapping { let FilterClass = "VOP"; let RowFields = ["OpName"]; let ColFields = ["AsmVariantName"]; - let KeyCol = ["Default"]; - let ValueCols = [["SDWA9"]]; + let KeyCol = ["SDWA"]; + let ValueCols = [["Default"]]; } def getMaskedMIMGOp : InstrMapping { @@ -1669,7 +1639,9 @@ let ColFields = ["Subtarget"]; let KeyCol = [!cast(SIEncodingFamily.NONE)]; let ValueCols = [[!cast(SIEncodingFamily.SI)], - [!cast(SIEncodingFamily.VI)]]; + [!cast(SIEncodingFamily.VI)], + [!cast(SIEncodingFamily.SDWA)], + [!cast(SIEncodingFamily.SDWA9)]]; } // Get equivalent SOPK instruction. 
Index: lib/Target/AMDGPU/SIPeepholeSDWA.cpp =================================================================== --- lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -67,9 +67,9 @@ bool runOnMachineFunction(MachineFunction &MF) override; void matchSDWAOperands(MachineFunction &MF); - bool isConvertibleToSDWA(const MachineInstr &MI) const; + bool isConvertibleToSDWA(const MachineInstr &MI, const SISubtarget &ST) const; bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands); - void legalizeScalarOperands(MachineInstr &MI) const; + void legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const; StringRef getPassName() const override { return "SI Peephole SDWA"; } @@ -224,7 +224,7 @@ static bool isSubregOf(const MachineOperand &SubReg, const MachineOperand &SuperReg, const TargetRegisterInfo *TRI) { - + if (!SuperReg.isReg() || !SubReg.isReg()) return false; @@ -557,7 +557,7 @@ MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - + if (TRI->isPhysicalRegister(Src0->getReg()) || TRI->isPhysicalRegister(Dst->getReg())) break; @@ -590,7 +590,7 @@ break; MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - + if (TRI->isPhysicalRegister(Src1->getReg()) || TRI->isPhysicalRegister(Dst->getReg())) break; @@ -607,16 +607,38 @@ } } -bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI) const { +bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI, + const SISubtarget &ST) const { // Check if this instruction has opcode that supports SDWA - unsigned Opc = MI.getOpcode(); - if (AMDGPU::getSDWAOp(Opc) != -1) - return true; - int Opc32 = AMDGPU::getVOPe32(Opc); - if (Opc32 != -1 && AMDGPU::getSDWAOp(Opc32) != -1) - return !TII->hasModifiersSet(MI, AMDGPU::OpName::omod) && - !TII->getNamedOperand(MI, AMDGPU::OpName::sdst); - return false; + int Opc = MI.getOpcode(); + if (AMDGPU::getSDWAOp(Opc) == -1) + Opc = AMDGPU::getVOPe32(Opc); + + if (Opc == -1 || AMDGPU::getSDWAOp(Opc) == -1) + return false; + + if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod)) + return false; + + if (TII->isVOPC(Opc)) { + if (!ST.hasSDWASdst()) { + const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); + if (SDst && SDst->getReg() != AMDGPU::VCC) + return false; + } + + if (!ST.hasSDWAClampVOPC() && TII->hasModifiersSet(MI, AMDGPU::OpName::clamp)) + return false; + + } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) { + return false; + } + + if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_MAC_F16_e32 || + Opc == AMDGPU::V_MAC_F32_e32)) + return false; + + return true; } bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, @@ -641,6 +663,11 @@ if (Dst) { assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1); SDWAInst.add(*Dst); + } else { + Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); + assert(Dst && + AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1); + SDWAInst.add(*Dst); } // Copy src0, initialize src0_modifiers. All sdwa instructions has src0 and @@ -677,9 +704,23 @@ SDWAInst.add(*Src2); } - // Initialize clamp. 
+ // Copy clamp if present, initialize otherwise assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1); - SDWAInst.addImm(0); + MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp); + if (Clamp) { + SDWAInst.add(*Clamp); + } else { + SDWAInst.addImm(0); + } + + // Copy omod if present, initialize otherwise if needed + MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod); + if (OMod) { + assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1); + SDWAInst.add(*OMod); + } else if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) { + SDWAInst.addImm(0); + } // Initialize dst_sel and dst_unused if present if (Dst) { @@ -733,16 +774,25 @@ } // If an instruction was converted to SDWA it should not have immediates or SGPR -// operands. Copy its scalar operands into VGPRs. -void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI) const { +// operands (allowed one SGPR on GFX9). Copy its scalar operands into VGPRs. +void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const { const MCInstrDesc &Desc = TII->get(MI.getOpcode()); - for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) { - MachineOperand &Op = MI.getOperand(I); + unsigned ConstantBusCount = 0; + for (MachineOperand &Op: MI.explicit_uses()) { if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg()))) continue; + + unsigned I = MI.getOperandNo(&Op); if (Desc.OpInfo[I].RegClass == -1 || !TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass))) continue; + + if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() && + TRI->isSGPRReg(*MRI, Op.getReg())) { + ++ConstantBusCount; + continue; + } + unsigned VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), VGPR); @@ -758,22 +808,20 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { const SISubtarget &ST = MF.getSubtarget(); - if (!ST.hasSDWA() || - !AMDGPU::isVI(ST)) { // TODO: Add support for SDWA on gfx9 + if (!ST.hasSDWA()) return false; - } MRI = &MF.getRegInfo(); TRI = ST.getRegisterInfo(); TII = ST.getInstrInfo(); - + // Find all SDWA operands in MF. 
matchSDWAOperands(MF); for (const auto &OperandPair : SDWAOperands) { const auto &Operand = OperandPair.second; MachineInstr *PotentialMI = Operand->potentialToConvert(TII); - if (PotentialMI && isConvertibleToSDWA(*PotentialMI)) { + if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) { PotentialMatches[PotentialMI].push_back(Operand.get()); } } @@ -788,7 +836,7 @@ bool Ret = !ConvertedInstructions.empty(); while (!ConvertedInstructions.empty()) - legalizeScalarOperands(*ConvertedInstructions.pop_back_val()); + legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST); return Ret; } Index: lib/Target/AMDGPU/VOP1Instructions.td =================================================================== --- lib/Target/AMDGPU/VOP1Instructions.td +++ lib/Target/AMDGPU/VOP1Instructions.td @@ -93,11 +93,6 @@ let AsmMatchConverter = "cvtSdwaVOP1"; } -class VOP1_SDWA9_Pseudo pattern=[]> : - VOP_SDWA9_Pseudo { - let AsmMatchConverter = "cvtSdwaVOP1"; -} - class getVOP1Pat64 : LetDummies { list ret = !if(P.HasModifiers, @@ -117,7 +112,6 @@ def _e32 : VOP1_Pseudo ; def _e64 : VOP3_Pseudo .ret>; def _sdwa : VOP1_SDWA_Pseudo ; - def _sdwa9 : VOP1_SDWA9_Pseudo ; } // Special profile for instructions which have clamp @@ -274,12 +268,10 @@ let Ins64 = (ins Src0RC64:$vdst, VSrc_b32:$src0); let InsDPP = (ins Src0RC32:$vdst, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, - clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, + clampmod:$clamp, omod:$omod, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel); - let InsSDWA9 = (ins Src0RC32:$vdst, Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0, - clampmod:$clamp, omod:$omod, dst_sel:$dst_sel, dst_unused:$dst_unused, - src0_sel:$src0_sel); let Asm32 = getAsm32<1, 1>.ret; let Asm64 = getAsm64<1, 1, 0, 1>.ret; @@ -545,8 +537,8 @@ VOP1_SDWAe (NAME#"_sdwa").Pfl>; def _sdwa_gfx9 : - VOP_SDWA9_Real (NAME#"_sdwa9")>, - VOP1_SDWA9Ae (NAME#"_sdwa9").Pfl>; + VOP_SDWA9_Real (NAME#"_sdwa")>, + VOP1_SDWA9Ae (NAME#"_sdwa").Pfl>; // For now left dpp only for asm/dasm // TODO: add corresponding pseudo Index: lib/Target/AMDGPU/VOP2Instructions.td =================================================================== --- lib/Target/AMDGPU/VOP2Instructions.td +++ lib/Target/AMDGPU/VOP2Instructions.td @@ -114,11 +114,6 @@ let AsmMatchConverter = "cvtSdwaVOP2"; } -class VOP2_SDWA9_Pseudo pattern=[]> : - VOP_SDWA9_Pseudo { - let AsmMatchConverter = "cvtSdwaVOP2"; -} - class getVOP2Pat64 : LetDummies { list ret = !if(P.HasModifiers, [(set P.DstVT:$vdst, @@ -139,7 +134,6 @@ Commutable_REV; def _sdwa : VOP2_SDWA_Pseudo ; - def _sdwa9 : VOP2_SDWA9_Pseudo ; } multiclass VOP2bInst { let AsmMatchConverter = "cvtSdwaVOP2b"; } - - def _sdwa9 : VOP2_SDWA9_Pseudo { - let AsmMatchConverter = "cvtSdwaVOP2b"; - } } def _e64 : VOP3_Pseudo .ret>, @@ -221,17 +211,13 @@ VGPR_32:$src2, // stub argument dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, VGPR_32:$src2, // stub argument - clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, + clampmod:$clamp, omod:$omod, + dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel, src1_sel:$src1_sel); - let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0, - Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1, - VGPR_32:$src2, // stub 
argument - clampmod:$clamp, omod:$omod, - dst_sel:$dst_sel, dst_unused:$dst_unused, - src0_sel:$src0_sel, src1_sel:$src1_sel); let Asm32 = getAsm32<1, 2, vt>.ret; let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, vt>.ret; let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt>.ret; @@ -289,15 +275,10 @@ let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, - clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, + clampmod:$clamp, omod:$omod, + dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel, src1_sel:$src1_sel); - let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0, - Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1, - clampmod:$clamp, omod:$omod, - dst_sel:$dst_sel, dst_unused:$dst_unused, - src0_sel:$src0_sel, src1_sel:$src1_sel); - let InsDPP = (ins Src0Mod:$src0_modifiers, Src0DPP:$src0, Src1Mod:$src1_modifiers, Src1DPP:$src1, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, @@ -728,8 +709,8 @@ multiclass VOP2_SDWA9_Real op> { def _sdwa_gfx9 : - VOP_SDWA9_Real (NAME#"_sdwa9")>, - VOP2_SDWA9Ae (NAME#"_sdwa9").Pfl>; + VOP_SDWA9_Real (NAME#"_sdwa")>, + VOP2_SDWA9Ae (NAME#"_sdwa").Pfl>; } multiclass VOP2be_Real_e32e64_vi op> : Index: lib/Target/AMDGPU/VOPCInstructions.td =================================================================== --- lib/Target/AMDGPU/VOPCInstructions.td +++ lib/Target/AMDGPU/VOPCInstructions.td @@ -113,11 +113,6 @@ let AsmMatchConverter = "cvtSdwaVOPC"; } -class VOPC_SDWA9_Pseudo pattern=[]> : - VOP_SDWA9_Pseudo { - let AsmMatchConverter = "cvtSdwaVOPC"; -} - // This class is used only with VOPC instructions. Use $sdst for out operand class VOPCInstAlias : InstAlias , PredicateControl { @@ -189,13 +184,6 @@ let isConvergent = DefExec; let isCompare = 1; } - - def _sdwa9 : VOPC_SDWA9_Pseudo { - let Defs = !if(DefExec, [VCC, EXEC], [VCC]); - let SchedRW = P.Schedule; - let isConvergent = DefExec; - let isCompare = 1; - } } def VOPC_I1_F16_F16 : VOPC_Profile<[Write32Bit], f16>; @@ -540,14 +528,12 @@ VOPC_Profile { let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); let Asm64 = "$sdst, $src0_modifiers, $src1"; + let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel); - let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0, - Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1, - src0_sel:$src0_sel, src1_sel:$src1_sel); + let AsmSDWA = " vcc, $src0_modifiers, $src1_modifiers$clamp $src0_sel $src1_sel"; - //let AsmSDWA9 = " $sdst, $src0_modifiers, $src1_modifiers $src0_sel $src1_sel"; let HasSrc1Mods = 0; let HasClamp = 0; let HasOMod = 0; @@ -580,12 +566,6 @@ let SchedRW = p.Schedule; let isConvergent = DefExec; } - - def _sdwa9 : VOPC_SDWA9_Pseudo { - let Defs = !if(DefExec, [VCC, EXEC], [VCC]); - let SchedRW = p.Schedule; - let isConvergent = DefExec; - } } def VOPC_I1_F16_I32 : VOPC_Class_Profile<[Write32Bit], f16>; @@ -954,8 +934,8 @@ VOPC_SDWAe (NAME#"_sdwa").Pfl>; def _sdwa_gfx9 : - VOP_SDWA9_Real (NAME#"_sdwa9")>, - VOPC_SDWA9e (NAME#"_sdwa9").Pfl>; + VOP_SDWA9_Real (NAME#"_sdwa")>, + VOPC_SDWA9e (NAME#"_sdwa").Pfl>; def : VOPCInstAlias (NAME#"_e64"), !cast(NAME#"_e32_vi")> { Index: lib/Target/AMDGPU/VOPInstructions.td =================================================================== --- lib/Target/AMDGPU/VOPInstructions.td +++ lib/Target/AMDGPU/VOPInstructions.td @@ -300,6 +300,19 @@ let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0); } +// 
GFX9 adds two features to SDWA: +// 1. Add 3 fields to the SDWA microcode word: S0, S1 and OMOD. +// a. S0 and S1 indicate that source 0 and 1 respectively are SGPRs rather +// than VGPRs (at most 1 can be an SGPR); +// b. OMOD is the standard output modifier (result *2, *4, /2) +// 2. Add a new version of the SDWA microcode word for VOPC: SDWAB. This +// replaces OMOD and the dest fields with SD and SDST (SGPR destination) +// field. +// a. When SD=1, the SDST is used as the destination for the compare result; +// b. When SD=0, VCC is used. +// +// In GFX9, V_MAC_F16, V_MAC_F32 opcodes cannot be used with SDWA + // gfx9 SDWA basic encoding class VOP_SDWA9e : Enc64 { bits<9> src0; // {src0_sgpr{0}, src0{7-0}} @@ -353,6 +366,7 @@ string Mnemonic = opName; string AsmOperands = P.AsmSDWA; + string AsmOperands9 = P.AsmSDWA9; let Size = 8; let mayLoad = 0; @@ -372,53 +386,9 @@ VOPProfile Pfl = P; } -// GFX9 adds two features to SDWA: -// 1. Add 3 fields to the SDWA microcode word: S0, S1 and OMOD. -// a. S0 and S1 indicate that source 0 and 1 respectively are SGPRs rather -// than VGPRs (at most 1 can be an SGPR); -// b. OMOD is the standard output modifier (result *2, *4, /2) -// 2. Add a new version of the SDWA microcode word for VOPC: SDWAB. This -// replaces OMOD and the dest fields with SD and SDST (SGPR destination) -// field. -// a. When SD=1, the SDST is used as the destination for the compare result; -// b.when SD=0, VCC is used. -// -// In GFX9, V_MAC_F16, V_MAC_F32 opcodes cannot be used with SDWA - -class VOP_SDWA9_Pseudo pattern=[]> : - InstSI , - VOP , - SIMCInstr , - MnemonicAlias { - - let isPseudo = 1; - let isCodeGenOnly = 1; - let UseNamedOperandTable = 1; - - string Mnemonic = opName; - string AsmOperands = P.AsmSDWA9; - - let Size = 8; - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - - let VALU = 1; - let SDWA = 1; - let Uses = [EXEC]; - - let SubtargetPredicate = !if(P.HasSDWA9, HasSDWA9, DisableInst); - let AssemblerPredicate = !if(P.HasSDWA9, HasSDWA9, DisableInst); - let AsmVariantName = !if(P.HasSDWA9, AMDGPUAsmVariants.SDWA9, - AMDGPUAsmVariants.Disable); - let DecoderNamespace = "SDWA9"; - - VOPProfile Pfl = P; -} - class VOP_SDWA_Real : InstSI , - SIMCInstr { + SIMCInstr { let isPseudo = 0; let isCodeGenOnly = 0; @@ -431,6 +401,10 @@ let Constraints = ps.Constraints; let DisableEncoding = ps.DisableEncoding; + // string Mnemonic = ps.Mnemonic; + // string AsmOperands = ps.AsmOperands; + // string AsmOperands9 = ps.AsmOperands9; + // Copy relevant pseudo op flags let SubtargetPredicate = ps.SubtargetPredicate; let AssemblerPredicate = ps.AssemblerPredicate; @@ -443,9 +417,9 @@ let TSFlags = ps.TSFlags; } -class VOP_SDWA9_Real : - InstSI , - SIMCInstr { +class VOP_SDWA9_Real : + InstSI , + SIMCInstr { let isPseudo = 0; let isCodeGenOnly = 0; @@ -458,13 +432,15 @@ let Constraints = ps.Constraints; let DisableEncoding = ps.DisableEncoding; + let SubtargetPredicate = !if(ps.Pfl.HasSDWA9, HasSDWA9, DisableInst); + let AssemblerPredicate = !if(ps.Pfl.HasSDWA9, HasSDWA9, DisableInst); + let AsmVariantName = !if(ps.Pfl.HasSDWA9, AMDGPUAsmVariants.SDWA9, + AMDGPUAsmVariants.Disable); + let DecoderNamespace = "SDWA9"; + // Copy relevant pseudo op flags - let SubtargetPredicate = ps.SubtargetPredicate; - let AssemblerPredicate = ps.AssemblerPredicate; let AsmMatchConverter = ps.AsmMatchConverter; - let AsmVariantName = ps.AsmVariantName; let UseNamedOperandTable = ps.UseNamedOperandTable; - let DecoderNamespace = ps.DecoderNamespace; let Constraints = 
ps.Constraints; let DisableEncoding = ps.DisableEncoding; let TSFlags = ps.TSFlags; Index: test/CodeGen/AMDGPU/fneg.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fneg.f16.ll +++ test/CodeGen/AMDGPU/fneg.f16.ll @@ -134,11 +134,10 @@ ; CI-DAG: v_mul_f32_e32 v{{[0-9]+}}, -4.0, v{{[0-9]+}} ; CI-DAG: v_sub_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} -; GFX9: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VAL]] ; GFX89-DAG: v_mul_f16_e32 v{{[0-9]+}}, -4.0, [[VAL]] -; GFX9-DAG: v_sub_f16_e32 v{{[0-9]+}}, 2.0, [[ELT1]] -; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000 -; VI-DAG: v_sub_f16_sdwa v{{[0-9]+}}, [[CONST2]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX89-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000 +; GFX89-DAG: v_sub_f16_sdwa v{{[0-9]+}}, [[CONST2]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 + define amdgpu_kernel void @v_extract_fneg_fold_v2f16(<2 x half> addrspace(1)* %in) #0 { %val = load <2 x half>, <2 x half> addrspace(1)* %in %fneg = fsub <2 x half> , %val Index: test/CodeGen/AMDGPU/fpext.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fpext.f16.ll +++ test/CodeGen/AMDGPU/fpext.f16.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI -check-prefix=SIGFX9 %s +; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=SIGFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s ; GCN-LABEL: {{^}}fpext_f16_to_f32 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] @@ -35,11 +35,10 @@ ; GCN-LABEL: {{^}}fpext_v2f16_to_v2f32 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GFX9-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; GCN-DAG: v_cvt_f32_f16_e32 v[[R_F32_0:[0-9]+]], v[[A_V2_F16]] ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SIGFX9: v_cvt_f32_f16_e32 v[[R_F32_1:[0-9]+]], v[[A_F16_1]] -; VI: v_cvt_f32_f16_sdwa v[[R_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SI: v_cvt_f32_f16_e32 v[[R_F32_1:[0-9]+]], v[[A_F16_1]] +; GFX89: v_cvt_f32_f16_sdwa v[[R_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_F32_0]]:[[R_F32_1]]{{\]}} ; GCN: s_endpgm @@ -55,9 +54,9 @@ ; GCN-LABEL: {{^}}fpext_v2f16_to_v2f64 ; GCN: buffer_load_dword -; SIGFX9-DAG: v_lshrrev_b32_e32 -; SIGFX9-DAG: v_cvt_f32_f16_e32 -; VI: v_cvt_f32_f16_sdwa +; SI-DAG: v_lshrrev_b32_e32 +; SI-DAG: v_cvt_f32_f16_e32 +; GFX89: v_cvt_f32_f16_sdwa ; GCN: v_cvt_f32_f16_e32 ; GCN: v_cvt_f64_f32_e32 Index: test/CodeGen/AMDGPU/llvm.rint.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.rint.f16.ll +++ test/CodeGen/AMDGPU/llvm.rint.f16.ll @@ -42,13 +42,13 @@ ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], 
v[[R_F16_1]], v[[R_F16_0]] ; GFX9: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] -; GFX9: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; GFX9: v_rndne_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]] +; GFX9: v_rndne_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] ; GFX9: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_LO]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm + define amdgpu_kernel void @rint_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { Index: test/CodeGen/AMDGPU/sdwa-gfx9.mir =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/sdwa-gfx9.mir @@ -0,0 +1,88 @@ +# RUN: llc -march=amdgcn -mcpu=kaveri -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=CI -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=VI -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s + +# GCN-LABEL: {{^}}name: add_shr_i32 +# GCN: [[SMOV:%[0-9]+]] = S_MOV_B32 123 + +# CI: [[SHIFT:%[0-9]+]] = V_LSHRREV_B32_e64 16, %{{[0-9]+}}, implicit %exec +# CI: %{{[0-9]+}} = V_ADD_I32_e32 [[SMOV]], killed [[SHIFT]], implicit-def %vcc, implicit %exec + +# VI: [[VMOV:%[0-9]+]] = V_MOV_B32_e32 [[SMOV]], implicit %exec +# VI: %{{[0-9]+}} = V_ADD_I32_sdwa 0, [[VMOV]], 0, %{{[0-9]+}}, 0, 6, 0, 6, 5, implicit-def %vcc, implicit %exec + +# GFX9: %{{[0-9]+}} = V_ADD_I32_sdwa 0, [[SMOV]], 0, %{{[0-9]+}}, 0, 6, 0, 6, 5, implicit-def %vcc, implicit %exec + +--- +name: add_shr_i32 +tracksRegLiveness: true +registers: + - { id: 0, class: vreg_64 } + - { id: 1, class: vreg_64 } + - { id: 2, class: sreg_64 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_32_xm0 } + - { id: 5, class: sreg_32_xm0 } + - { id: 6, class: sreg_32 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_32 } + - { id: 9, class: vgpr_32 } + - { id: 10, class: vgpr_32 } + - { id: 11, class: vgpr_32 } + - { id: 12, class: sreg_32_xm0 } +body: | + bb.0: + liveins: %vgpr0_vgpr1, %vgpr2_vgpr3, %sgpr30_sgpr31 + + %2 = COPY %sgpr30_sgpr31 + %1 = COPY %vgpr2_vgpr3 + %0 = COPY %vgpr0_vgpr1 + %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4) + %12 = S_MOV_B32 123 + %10 = V_LSHRREV_B32_e64 16, %3, implicit %exec + %11 = V_ADD_I32_e32 %12, killed %10, implicit-def %vcc, implicit %exec + FLAT_STORE_DWORD %0, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4) + %sgpr30_sgpr31 = COPY %2 + S_SETPC_B64_return %sgpr30_sgpr31 + +... 
+
+# GCN-LABEL: {{^}}name: trunc_shr_f32
+
+# CI: [[SHIFT:%[0-9]+]] = V_LSHRREV_B32_e64 16, %{{[0-9]+}}, implicit %exec
+# CI: %{{[0-9]+}} = V_TRUNC_F32_e64 0, killed [[SHIFT]], 1, 2, implicit-def %vcc, implicit %exec
+
+# VI: [[SHIFT:%[0-9]+]] = V_LSHRREV_B32_e64 16, %{{[0-9]+}}, implicit %exec
+# VI: %{{[0-9]+}} = V_TRUNC_F32_e64 0, killed [[SHIFT]], 1, 2, implicit-def %vcc, implicit %exec
+
+# GFX9: %{{[0-9]+}} = V_TRUNC_F32_sdwa 0, %{{[0-9]+}}, 1, 2, 6, 0, 5, implicit %exec
+
+---
+name: trunc_shr_f32
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vreg_64 }
+  - { id: 1, class: vreg_64 }
+  - { id: 2, class: sreg_64 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_32_xm0 }
+  - { id: 5, class: sreg_32_xm0 }
+  - { id: 6, class: sreg_32 }
+  - { id: 7, class: sreg_32_xm0 }
+  - { id: 8, class: sreg_32 }
+  - { id: 9, class: vgpr_32 }
+  - { id: 10, class: vgpr_32 }
+  - { id: 11, class: vgpr_32 }
+body: |
+  bb.0:
+    liveins: %vgpr0_vgpr1, %vgpr2_vgpr3, %sgpr30_sgpr31
+
+    %2 = COPY %sgpr30_sgpr31
+    %1 = COPY %vgpr2_vgpr3
+    %0 = COPY %vgpr0_vgpr1
+    %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4)
+    %10 = V_LSHRREV_B32_e64 16, %3, implicit %exec
+    %11 = V_TRUNC_F32_e64 0, killed %10, 1, 2, implicit-def %vcc, implicit %exec
+    FLAT_STORE_DWORD %0, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4)
+    %sgpr30_sgpr31 = COPY %2
+    S_SETPC_B64_return %sgpr30_sgpr31
Index: test/CodeGen/AMDGPU/sdwa-peephole.ll
===================================================================
--- test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -1,5 +1,6 @@
 ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SDWA -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=SDWA -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=SDWA -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}add_shr_i32:
 ; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}}
@@ -72,9 +73,11 @@
 ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
 ; NOSDWA-NOT: v_mul_u32_u24_sdwa
 
-; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL_LO]], v[[DST_MUL_HI]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL_LO]], v[[DST_MUL_HI]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+
+; GFX9: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 
 define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
 entry:
@@ -93,12 +96,15 @@
 ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; NOSDWA-NOT: v_mul_u32_u24_sdwa
 
-; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL2]], v[[DST_MUL3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL0]], v[[DST_MUL1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL2]], v[[DST_MUL3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL0]], v[[DST_MUL1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+
+; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 
 define amdgpu_kernel void @mul_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %ina, <4 x i16> addrspace(1)* %inb) {
 entry:
@@ -117,18 +123,23 @@
 ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; NOSDWA-NOT: v_mul_u32_u24_sdwa
 
-; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL4:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL5:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL6:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL7:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL6]], v[[DST_MUL7]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL4]], v[[DST_MUL5]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL2]], v[[DST_MUL3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL0]], v[[DST_MUL1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL4:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL5:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL6:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL7:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL6]], v[[DST_MUL7]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL4]], v[[DST_MUL5]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL2]], v[[DST_MUL3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL0]], v[[DST_MUL1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+
+; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 
 define amdgpu_kernel void @mul_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %ina, <8 x i16> addrspace(1)* %inb) {
 entry:
@@ -162,9 +173,12 @@
 ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
 ; NOSDWA-NOT: v_mul_f16_sdwa
 
-; SDWA-DAG: v_mul_f16_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA-DAG: v_mul_f16_e32 v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
-; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL_HI]], v[[DST_MUL_LO]]
+; VI-DAG: v_mul_f16_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_mul_f16_e32 v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL_HI]], v[[DST_MUL_LO]]
+
+; GFX9: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
 define amdgpu_kernel void @mul_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) {
 entry:
   %a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4
@@ -182,10 +196,13 @@
 ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; NOSDWA-NOT: v_mul_f16_sdwa
 
-; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 
 define amdgpu_kernel void @mul_v4half(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %ina, <4 x half> addrspace(1)* %inb) {
 entry:
@@ -204,14 +221,19 @@
 ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; NOSDWA-NOT: v_mul_f16_sdwa
 
-; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 
 define amdgpu_kernel void @mul_v8half(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %ina, <8 x half> addrspace(1)* %inb) {
 entry:
@@ -245,7 +267,11 @@
 ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; NOSDWA-NOT: v_mul_u32_u24_sdwa
 
-; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; VI: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+
+; GFX9-DAG: v_mul_lo_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-DAG: v_mul_lo_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX9: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 
 define amdgpu_kernel void @mul_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %ina, <2 x i8> addrspace(1)* %inb) {
 entry:
@@ -264,9 +290,13 @@
 ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; NOSDWA-NOT: v_mul_u32_u24_sdwa
 
-; SDWA-DAG: v_mul_u32_u24_sdwa
-; SDWA-DAG: v_mul_u32_u24_sdwa
-; SDWA-DAG: v_mul_u32_u24_sdwa
+; VI-DAG: v_mul_u32_u24_sdwa
+; VI-DAG: v_mul_u32_u24_sdwa
+; VI-DAG: v_mul_u32_u24_sdwa
+
+; GFX9-DAG: v_mul_lo_u16_sdwa
+; GFX9-DAG: v_mul_lo_u16_sdwa
+; GFX9-DAG: v_mul_lo_u16_sdwa
 
 define amdgpu_kernel void @mul_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %ina, <4 x i8> addrspace(1)* %inb) {
 entry:
@@ -285,12 +315,19 @@
 ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; NOSDWA-NOT: v_mul_u32_u24_sdwa
 
-; SDWA-DAG: v_mul_u32_u24_sdwa
-; SDWA-DAG: v_mul_u32_u24_sdwa
-; SDWA-DAG: v_mul_u32_u24_sdwa
-; SDWA-DAG: v_mul_u32_u24_sdwa
-; SDWA-DAG: v_mul_u32_u24_sdwa
-; SDWA-DAG: v_mul_u32_u24_sdwa
+; VI-DAG: v_mul_u32_u24_sdwa
+; VI-DAG: v_mul_u32_u24_sdwa
+; VI-DAG: v_mul_u32_u24_sdwa
+; VI-DAG: v_mul_u32_u24_sdwa
+; VI-DAG: v_mul_u32_u24_sdwa
+; VI-DAG: v_mul_u32_u24_sdwa
+
+; GFX9-DAG: v_mul_lo_u16_sdwa
+; GFX9-DAG: v_mul_lo_u16_sdwa
+; GFX9-DAG: v_mul_lo_u16_sdwa
+; GFX9-DAG: v_mul_lo_u16_sdwa
+; GFX9-DAG: v_mul_lo_u16_sdwa
+; GFX9-DAG: v_mul_lo_u16_sdwa
 
 define amdgpu_kernel void @mul_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %ina, <8 x i8> addrspace(1)* %inb) {
 entry:
@@ -330,8 +367,11 @@
 ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
 ; NOSDWA-NOT: v_mac_f16_sdwa
 
-; SDWA: v_mac_f16_sdwa v[[DST_MAC:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]]
+; VI: v_mac_f16_sdwa v[[DST_MAC:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]]
+
+; GFX9: v_pk_mul_f16 v[[DST_MUL:[0-9]+]], v{{[0-9]+}}, v[[SRC:[0-9]+]]
+; GFX9: v_pk_add_f16 v{{[0-9]+}}, v[[DST_MUL]], v[[SRC]]
 
 define amdgpu_kernel void @mac_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) {
 entry:
@@ -345,10 +385,13 @@
 ; GCN-LABEL: {{^}}immediate_mul_v2i16:
 ; NOSDWA-NOT: v_mul_u32_u24_sdwa
 
-; SDWA-DAG: v_mov_b32_e32 v[[M321:[0-9]+]], 0x141
-; SDWA-DAG: v_mov_b32_e32 v[[M123:[0-9]+]], 0x7b
-; SDWA-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[M123]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDWA-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[M321]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-DAG: v_mov_b32_e32 v[[M321:[0-9]+]], 0x141
+; VI-DAG: v_mov_b32_e32 v[[M123:[0-9]+]], 0x7b
+; VI-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[M123]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[M321]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+
+; GFX9: s_mov_b32 s[[IMM:[0-9]+]], 0x141007b
+; GFX9: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, s[[IMM]]
 
 define amdgpu_kernel void @immediate_mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
 entry:
@@ -367,7 +410,10 @@
 ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; NOSDWA-NOT: v_mul_u32_u24_sdwa
 
-; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+
+; GFX9: v_pk_mul_lo_u16 v[[DST1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
+; GFX9: v_pk_mul_lo_u16 v{{[0-9]+}}, v[[DST1]], v{{[0-9]+}}
 
 define amdgpu_kernel void @mulmul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
 entry:
@@ -382,7 +428,9 @@
 ; GCN-LABEL: {{^}}add_bb_v2i16:
 ; NOSDWA-NOT: v_add_i32_sdwa
 
-; SDWA: v_add_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI: v_add_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+
+; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 
 define amdgpu_kernel void @add_bb_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
 entry:
Index: test/CodeGen/AMDGPU/sdwa-scalar-ops.mir
===================================================================
--- test/CodeGen/AMDGPU/sdwa-scalar-ops.mir
+++ test/CodeGen/AMDGPU/sdwa-scalar-ops.mir
@@ -1,4 +1,5 @@
-# RUN: llc -march=amdgcn -mcpu=fiji -start-before si-peephole-sdwa -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -march=amdgcn -mcpu=fiji -start-before si-peephole-sdwa -o - %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+# RUN: llc -march=amdgcn -mcpu=gfx900 -start-before si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
 
 # GCN-LABEL: {{^}}sdwa_imm_operand:
 # GCN: v_mov_b32_e32 v[[SHIFT:[0-9]+]], 2
@@ -8,11 +9,17 @@
 # GCN: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 
 # GCN-LABEL: {{^}}sdwa_sgpr_operand:
-# GCN: v_mov_b32_e32 v[[SHIFT:[0-9]+]], 2
-# GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 2
-# GCN: BB1_1:
-# GCN: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-# GCN: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+# VI: v_mov_b32_e32 v[[SHIFT:[0-9]+]], 2
+# VI-NOT: v_mov_b32_e32 v{{[0-9]+}}, 2
+# VI: BB1_1:
+# VI: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+# VI: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+
+# GFX9: s_mov_b32 s[[SHIFT:[0-9]+]], 2
+# GFX9-NOT: v_mov_b32_e32 v{{[0-9]+}}, 2
+# GFX9: BB1_1:
+# GFX9: v_lshlrev_b32_sdwa v{{[0-9]+}}, s[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+# GFX9: v_lshlrev_b32_sdwa v{{[0-9]+}}, s[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 
 --- |
   ; ModuleID = 'sdwa-scalar-ops.opt.ll'