Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1449,6 +1449,14 @@
             [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
             [IntrNoMem, IntrConvergent]>;
 
+// llvm.amdgcn.mov.dpp8.i32 <src> <sel>
+// <sel> is a 32-bit constant whose high 8 bits must be zero which selects
+// the lanes to read from.
+def int_amdgcn_mov_dpp8 :
+  Intrinsic<[llvm_anyint_ty],
+            [LLVMMatchType<0>, llvm_i32_ty],
+            [IntrNoMem, IntrConvergent]>;
+
 def int_amdgcn_s_get_waveid_in_workgroup :
   GCCBuiltin<"__builtin_amdgcn_s_get_waveid_in_workgroup">,
   Intrinsic<[llvm_i32_ty], [], [IntrReadMem]>;
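For illustration, a minimal IR-level use of the new intrinsic (value names are
hypothetical). <sel> packs eight 3-bit lane indices, least-significant first,
so reversing each group of 8 lanes is 7|6<<3|5<<6|4<<9|3<<12|2<<15|1<<18|0<<21
= 0x53977 = 342391:

  declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32)

  ; lane i of %r receives %v from lane (7 - i) of its group of 8
  %r = call i32 @llvm.amdgcn.mov.dpp8.i32(i32 %v, i32 342391)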
Index: lib/Target/AMDGPU/AMDGPU.td
===================================================================
--- lib/Target/AMDGPU/AMDGPU.td
+++ lib/Target/AMDGPU/AMDGPU.td
@@ -329,6 +329,13 @@
   "Support DPP (Data Parallel Primitives) extension"
 >;
 
+// DPP8 allows arbitrary cross-lane swizzling within groups of 8 lanes.
+def FeatureDPP8 : SubtargetFeature<"dpp8",
+  "HasDPP8",
+  "true",
+  "Support DPP8 (Data Parallel Primitives) extension"
+>;
+
 def FeatureR128A16 : SubtargetFeature<"r128-a16",
   "HasR128A16",
   "true",
@@ -610,7 +617,7 @@
   FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,
   FeatureAddNoCarryInsts, FeatureFmaMixInsts, FeatureGFX8Insts,
   FeatureNoSdstCMPX, FeatureVscnt, FeatureRegisterBanking,
-  FeatureVOP3Literal, FeatureNoDataDepHazard,
+  FeatureVOP3Literal, FeatureDPP8, FeatureNoDataDepHazard,
   FeatureDoesNotSupportSRAMECC
 ]
>;
@@ -962,9 +969,15 @@
 def HasDPP : Predicate<"Subtarget->hasDPP()">,
   AssemblerPredicate<"FeatureGCN3Encoding,FeatureDPP">;
 
+def HasDPP8 : Predicate<"Subtarget->hasDPP8()">,
+  AssemblerPredicate<"!FeatureGCN3Encoding,FeatureGFX10Insts,FeatureDPP8">;
+
 def HasR128A16 : Predicate<"Subtarget->hasR128A16()">,
   AssemblerPredicate<"FeatureR128A16">;
 
+def HasDPP16 : Predicate<"Subtarget->hasDPP()">,
+  AssemblerPredicate<"!FeatureGCN3Encoding,FeatureGFX10Insts,FeatureDPP">;
+
 def HasIntClamp : Predicate<"Subtarget->hasIntClamp()">,
   AssemblerPredicate<"FeatureIntClamp">;
Index: lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -331,6 +331,7 @@
   bool HasSDWAMac;
   bool HasSDWAOutModsVOPC;
   bool HasDPP;
+  bool HasDPP8;
   bool HasR128A16;
   bool HasNSAEncoding;
   bool HasDLInsts;
@@ -836,6 +837,10 @@
     return HasDPP;
   }
 
+  bool hasDPP8() const {
+    return HasDPP8;
+  }
+
   bool hasR128A16() const {
     return HasR128A16;
   }
Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -218,6 +218,7 @@
     HasSDWAMac(false),
     HasSDWAOutModsVOPC(false),
     HasDPP(false),
+    HasDPP8(false),
     HasR128A16(false),
     HasNSAEncoding(false),
     HasDLInsts(false),
Index: lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
===================================================================
--- lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -147,10 +147,12 @@
     ImmTyD16,
     ImmTyClampSI,
     ImmTyOModSI,
+    ImmTyDPP8,
     ImmTyDppCtrl,
     ImmTyDppRowMask,
     ImmTyDppBankMask,
     ImmTyDppBoundCtrl,
+    ImmTyDppFi,
     ImmTySdwaDstSel,
     ImmTySdwaSrc0Sel,
     ImmTySdwaSrc1Sel,
@@ -327,6 +329,7 @@
   bool isBankMask() const { return isImmTy(ImmTyDppBankMask); }
   bool isRowMask() const { return isImmTy(ImmTyDppRowMask); }
   bool isBoundCtrl() const { return isImmTy(ImmTyDppBoundCtrl); }
+  bool isFI() const { return isImmTy(ImmTyDppFi); }
   bool isSDWADstSel() const { return isImmTy(ImmTySdwaDstSel); }
   bool isSDWASrc0Sel() const { return isImmTy(ImmTySdwaSrc0Sel); }
   bool isSDWASrc1Sel() const { return isImmTy(ImmTySdwaSrc1Sel); }
@@ -520,6 +523,7 @@
   bool isSMRDOffset8() const;
   bool isSMRDOffset20() const;
   bool isSMRDLiteralOffset() const;
+  bool isDPP8() const;
   bool isDPPCtrl() const;
   bool isGPRIdxMode() const;
   bool isS16Imm() const;
@@ -687,10 +691,12 @@
     case ImmTyFORMAT: OS << "FORMAT"; break;
     case ImmTyClampSI: OS << "ClampSI"; break;
     case ImmTyOModSI: OS << "OModSI"; break;
+    case ImmTyDPP8: OS << "DPP8"; break;
     case ImmTyDppCtrl: OS << "DppCtrl"; break;
     case ImmTyDppRowMask: OS << "DppRowMask"; break;
     case ImmTyDppBankMask: OS << "DppBankMask"; break;
     case ImmTyDppBoundCtrl: OS << "DppBoundCtrl"; break;
+    case ImmTyDppFi: OS << "FI"; break;
     case ImmTySdwaDstSel: OS << "SdwaDstSel"; break;
     case ImmTySdwaSrc0Sel: OS << "SdwaSrc0Sel"; break;
     case ImmTySdwaSrc1Sel: OS << "SdwaSrc1Sel"; break;
@@ -1228,11 +1234,14 @@
   void cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands);
 
   OperandMatchResultTy parseDim(OperandVector &Operands);
+  OperandMatchResultTy parseDPP8(OperandVector &Operands);
   OperandMatchResultTy parseDPPCtrl(OperandVector &Operands);
   AMDGPUOperand::Ptr defaultRowMask() const;
   AMDGPUOperand::Ptr defaultBankMask() const;
   AMDGPUOperand::Ptr defaultBoundCtrl() const;
-  void cvtDPP(MCInst &Inst, const OperandVector &Operands);
+  AMDGPUOperand::Ptr defaultFI() const;
+  void cvtDPP(MCInst &Inst, const OperandVector &Operands, bool IsDPP8 = false);
+  void cvtDPP8(MCInst &Inst, const OperandVector &Operands) { cvtDPP(Inst, Operands, true); }
 
   OperandMatchResultTy parseSDWASel(OperandVector &Operands, StringRef Prefix,
                                     AMDGPUOperand::ImmTy Type);
@@ -5692,6 +5701,7 @@
   {"row_mask",   AMDGPUOperand::ImmTyDppRowMask, false, nullptr},
   {"bank_mask",  AMDGPUOperand::ImmTyDppBankMask, false, nullptr},
   {"bound_ctrl", AMDGPUOperand::ImmTyDppBoundCtrl, false, ConvertBoundCtrl},
+  {"fi",         AMDGPUOperand::ImmTyDppFi, false, nullptr},
   {"dst_sel",    AMDGPUOperand::ImmTySdwaDstSel, false, nullptr},
   {"src0_sel",   AMDGPUOperand::ImmTySdwaSrc0Sel, false, nullptr},
   {"src1_sel",   AMDGPUOperand::ImmTySdwaSrc1Sel, false, nullptr},
@@ -6015,6 +6025,10 @@
 // dpp
 //===----------------------------------------------------------------------===//
 
+bool AMDGPUOperand::isDPP8() const {
+  return isImmTy(ImmTyDPP8);
+}
+
 bool AMDGPUOperand::isDPPCtrl() const {
   using namespace AMDGPU::DPP;
 
@@ -6032,7 +6046,9 @@
            (Imm == DppCtrl::ROW_MIRROR) ||
            (Imm == DppCtrl::ROW_HALF_MIRROR) ||
            (Imm == DppCtrl::BCAST15) ||
-           (Imm == DppCtrl::BCAST31);
+           (Imm == DppCtrl::BCAST31) ||
+           (Imm >= DppCtrl::ROW_SHARE_FIRST && Imm <= DppCtrl::ROW_SHARE_LAST) ||
+           (Imm >= DppCtrl::ROW_XMASK_FIRST && Imm <= DppCtrl::ROW_XMASK_LAST);
   }
   return false;
 }
@@ -6091,6 +6107,62 @@
   return MatchOperand_Success;
 }
 
+OperandMatchResultTy AMDGPUAsmParser::parseDPP8(OperandVector &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  StringRef Prefix;
+
+  if (getLexer().getKind() == AsmToken::Identifier) {
+    Prefix = Parser.getTok().getString();
+  } else {
+    return MatchOperand_NoMatch;
+  }
+
+  if (Prefix != "dpp8")
+    return parseDPPCtrl(Operands);
+  if (!isGFX10())
+    return MatchOperand_NoMatch;
+
+  // dpp8:[%d,%d,%d,%d,%d,%d,%d,%d]
+
+  int64_t Sels[8];
+
+  Parser.Lex();
+  if (getLexer().isNot(AsmToken::Colon))
+    return MatchOperand_ParseFail;
+
+  Parser.Lex();
+  if (getLexer().isNot(AsmToken::LBrac))
+    return MatchOperand_ParseFail;
+
+  Parser.Lex();
+  if (getParser().parseAbsoluteExpression(Sels[0]))
+    return MatchOperand_ParseFail;
+  if (0 > Sels[0] || 7 < Sels[0])
+    return MatchOperand_ParseFail;
+
+  for (size_t i = 1; i < 8; ++i) {
+    if (getLexer().isNot(AsmToken::Comma))
+      return MatchOperand_ParseFail;
+
+    Parser.Lex();
+    if (getParser().parseAbsoluteExpression(Sels[i]))
+      return MatchOperand_ParseFail;
+    if (0 > Sels[i] || 7 < Sels[i])
+      return MatchOperand_ParseFail;
+  }
+
+  if (getLexer().isNot(AsmToken::RBrac))
+    return MatchOperand_ParseFail;
+  Parser.Lex();
+
+  unsigned DPP8 = 0;
+  for (size_t i = 0; i < 8; ++i)
+    DPP8 |= (Sels[i] << (i * 3));
+
+  Operands.push_back(AMDGPUOperand::CreateImm(this, DPP8, S, AMDGPUOperand::ImmTyDPP8));
+  return MatchOperand_Success;
+}
+
 OperandMatchResultTy AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) {
   using namespace AMDGPU::DPP;
 
@@ -6121,10 +6193,21 @@
       && Prefix != "wave_rol"
       && Prefix != "wave_shr"
      && Prefix != "wave_ror"
-      && Prefix != "row_bcast") {
+      && Prefix != "row_bcast"
+      && Prefix != "row_share"
+      && Prefix != "row_xmask") {
     return MatchOperand_NoMatch;
   }
 
+  if (!isGFX10() && (Prefix == "row_share" || Prefix == "row_xmask"))
+    return MatchOperand_NoMatch;
+
+  if (!isVI() && !isGFX9() &&
+      (Prefix == "wave_shl" || Prefix == "wave_shr" ||
+       Prefix == "wave_rol" || Prefix == "wave_ror" ||
+       Prefix == "row_bcast"))
+    return MatchOperand_NoMatch;
+
   Parser.Lex();
   if (getLexer().isNot(AsmToken::Colon))
     return MatchOperand_ParseFail;
@@ -6182,6 +6265,10 @@
         } else {
           return MatchOperand_ParseFail;
         }
+      } else if (Prefix == "row_share" && 0 <= Int && Int <= 15) {
+        Int |= DppCtrl::ROW_SHARE_FIRST;
+      } else if (Prefix == "row_xmask" && 0 <= Int && Int <= 15) {
+        Int |= DppCtrl::ROW_XMASK_FIRST;
       } else {
        return MatchOperand_ParseFail;
      }
@@ -6208,7 +6295,11 @@
   return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDppBoundCtrl);
 }
 
-void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) {
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultFI() const {
+  return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDppFi);
+}
+
+void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands, bool IsDPP8) {
   OptionalImmIndexMap OptionalIdx;
 
   unsigned I = 1;
@@ -6217,6 +6308,7 @@
     ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
   }
 
+  int Fi = 0;
   for (unsigned E = Operands.size(); I != E; ++I) {
     auto TiedTo = Desc.getOperandConstraint(Inst.getNumOperands(),
                                             MCOI::TIED_TO);
@@ -6232,21 +6324,44 @@
       // Skip it.
       continue;
     }
-    if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
-      Op.addRegWithFPInputModsOperands(Inst, 2);
-    } else if (Op.isDPPCtrl()) {
-      Op.addImmOperands(Inst, 1);
-    } else if (Op.isImm()) {
-      // Handle optional arguments
-      OptionalIdx[Op.getImmTy()] = I;
+
+    if (IsDPP8) {
+      if (Op.isDPP8()) {
+        Op.addImmOperands(Inst, 1);
+      } else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
+        Op.addRegWithFPInputModsOperands(Inst, 2);
+      } else if (Op.isFI()) {
+        Fi = Op.getImm();
+      } else if (Op.isReg()) {
+        Op.addRegOperands(Inst, 1);
+      } else {
+        llvm_unreachable("Invalid operand type");
+      }
     } else {
-      llvm_unreachable("Invalid operand type");
+      if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
+        Op.addRegWithFPInputModsOperands(Inst, 2);
+      } else if (Op.isDPPCtrl()) {
+        Op.addImmOperands(Inst, 1);
+      } else if (Op.isImm()) {
+        // Handle optional arguments
+        OptionalIdx[Op.getImmTy()] = I;
+      } else {
+        llvm_unreachable("Invalid operand type");
+      }
     }
   }
 
-  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppRowMask, 0xf);
-  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBankMask, 0xf);
-  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBoundCtrl);
+  if (IsDPP8) {
+    using namespace llvm::AMDGPU::DPP;
+    Inst.addOperand(MCOperand::createImm(Fi ? DPP8_FI_1 : DPP8_FI_0));
+  } else {
+    addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppRowMask, 0xf);
+    addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBankMask, 0xf);
+    addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBoundCtrl);
+    if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::fi) != -1) {
+      addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppFi);
+    }
+  }
 }
 
 //===----------------------------------------------------------------------===//
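As a worked example of the selector packing in parseDPP8 above (operands are
illustrative):

  v_mov_b32_dpp v0, v1 dpp8:[7,6,5,4,3,2,1,0]

parses into a single ImmTyDPP8 operand of 7|6<<3|5<<6|4<<9|3<<12|2<<15|1<<18|0<<21
= 0x53977, and cvtDPP with IsDPP8 set then appends the fi operand as DPP8_FI_0
(0xE9), since no fi:1 modifier is present.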
Index: lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
===================================================================
--- lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -67,6 +67,7 @@
                          uint64_t Address) const;
 
   DecodeStatus convertSDWAInst(MCInst &MI) const;
+  DecodeStatus convertDPP8Inst(MCInst &MI) const;
   DecodeStatus convertMIMGInst(MCInst &MI) const;
 
   MCOperand decodeOperand_VGPR_32(unsigned Val) const;
@@ -127,7 +128,7 @@
   bool isVI() const;
   bool isGFX9() const;
   bool isGFX10() const;
-  };
+};
 
 //===----------------------------------------------------------------------===//
 // AMDGPUSymbolizer
Index: lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
===================================================================
--- lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -186,6 +186,16 @@
   return MCDisassembler::Fail;
 }
 
+static bool isValidDPP8(const MCInst &MI) {
+  using namespace llvm::AMDGPU::DPP;
+  int FiIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::fi);
+  assert(FiIdx != -1);
+  if ((unsigned)FiIdx >= MI.getNumOperands())
+    return false;
+  unsigned Fi = MI.getOperand(FiIdx).getImm();
+  return Fi == DPP8_FI_0 || Fi == DPP8_FI_1;
+}
+
 DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
                                                 ArrayRef<uint8_t> Bytes_,
                                                 uint64_t Address,
@@ -206,6 +216,13 @@
     // encodings
     if (Bytes.size() >= 8) {
       const uint64_t QW = eatBytes<uint64_t>(Bytes);
+
+      Res = tryDecodeInst(DecoderTableDPP864, MI, QW, Address);
+      if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
+        break;
+
+      MI = MCInst(); // clear
+
       Res = tryDecodeInst(DecoderTableDPP64, MI, QW, Address);
       if (Res) break;
@@ -363,6 +380,24 @@
   return MCDisassembler::Success;
 }
 
+DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
+  unsigned Opc = MI.getOpcode();
+  unsigned DescNumOps = MCII->get(Opc).getNumOperands();
+
+  // Insert dummy unused src modifiers.
+  if (MI.getNumOperands() < DescNumOps &&
+      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers) != -1)
+    insertNamedMCOperand(MI, MCOperand::createImm(0),
+                         AMDGPU::OpName::src0_modifiers);
+
+  if (MI.getNumOperands() < DescNumOps &&
+      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers) != -1)
+    insertNamedMCOperand(MI, MCOperand::createImm(0),
+                         AMDGPU::OpName::src1_modifiers);
+
+  return isValidDPP8(MI) ? MCDisassembler::Success : MCDisassembler::SoftFail;
+}
+
 // Note that before gfx10, the MIMG encoding provided no information about
 // VADDR size. Consequently, decoded instructions always show address as if it
 // has 1 dword, which may not really be the case.
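A note on the decode order above (a sketch of the mechanics, assuming the
usual VOP DPP src0 magic of 0xFA): the 64-bit DPP16 and DPP8 encodings are
distinguished only by the src0 field of the first dword, which holds a magic
value rather than a register:

  // first dword, bits [8:0]:
  //   0x0FA         -> DPP16
  //   0x0E9 / 0x0EA -> DPP8 (decoded into fi as DPP8_FI_0 / DPP8_FI_1)

The DPP8 table is tried first; when the captured fi is not one of the two DPP8
values, convertDPP8Inst returns SoftFail and decoding falls through to the
DPP16 table.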
llvm_unreachable("dpp8 is not supported on ASICs earlier than GFX10"); + + unsigned Imm = MI->getOperand(OpNo).getImm(); + O << " dpp8:[" << formatDec(Imm & 0x7); + for (size_t i = 1; i < 8; ++i) { + O << ',' << formatDec((Imm >> (3 * i)) & 0x7); + } + O << ']'; +} + void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -775,21 +801,61 @@ O << " row_ror:"; printU4ImmDecOperand(MI, OpNo, O); } else if (Imm == DppCtrl::WAVE_SHL1) { + if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) { + O << " /* wave_shl is not supported starting from GFX10 */"; + return; + } O << " wave_shl:1"; } else if (Imm == DppCtrl::WAVE_ROL1) { + if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) { + O << " /* wave_rol is not supported starting from GFX10 */"; + return; + } O << " wave_rol:1"; } else if (Imm == DppCtrl::WAVE_SHR1) { + if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) { + O << " /* wave_shr is not supported starting from GFX10 */"; + return; + } O << " wave_shr:1"; } else if (Imm == DppCtrl::WAVE_ROR1) { + if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) { + O << " /* wave_ror is not supported starting from GFX10 */"; + return; + } O << " wave_ror:1"; } else if (Imm == DppCtrl::ROW_MIRROR) { O << " row_mirror"; } else if (Imm == DppCtrl::ROW_HALF_MIRROR) { O << " row_half_mirror"; } else if (Imm == DppCtrl::BCAST15) { + if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) { + O << " /* row_bcast is not supported starting from GFX10 */"; + return; + } O << " row_bcast:15"; } else if (Imm == DppCtrl::BCAST31) { + if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) { + O << " /* row_bcast is not supported starting from GFX10 */"; + return; + } O << " row_bcast:31"; + } else if ((Imm >= DppCtrl::ROW_SHARE_FIRST) && + (Imm <= DppCtrl::ROW_SHARE_LAST)) { + if (!AMDGPU::isGFX10(STI)) { + O << " /* row_share is not supported on ASICs earlier than GFX10 */"; + return; + } + O << " row_share:"; + printU4ImmDecOperand(MI, OpNo, O); + } else if ((Imm >= DppCtrl::ROW_XMASK_FIRST) && + (Imm <= DppCtrl::ROW_XMASK_LAST)) { + if (!AMDGPU::isGFX10(STI)) { + O << " /* row_xmask is not supported on ASICs earlier than GFX10 */"; + return; + } + O << "row_xmask:"; + printU4ImmDecOperand(MI, OpNo, O); } else { O << " /* Invalid dpp_ctrl value */"; } @@ -818,6 +884,16 @@ } } +void AMDGPUInstPrinter::printFI(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + using namespace llvm::AMDGPU::DPP; + unsigned Imm = MI->getOperand(OpNo).getImm(); + if (Imm == DPP_FI_1 || Imm == DPP8_FI_1) { + O << " fi:1"; + } +} + void AMDGPUInstPrinter::printSDWASel(const MCInst *MI, unsigned OpNo, raw_ostream &O) { using namespace llvm::AMDGPU::SDWA; Index: lib/Target/AMDGPU/SIDefines.h =================================================================== --- lib/Target/AMDGPU/SIDefines.h +++ lib/Target/AMDGPU/SIDefines.h @@ -454,7 +454,20 @@ ROW_HALF_MIRROR = 0x141, BCAST15 = 0x142, BCAST31 = 0x143, - DPP_LAST = BCAST31 + DPP_UNUSED8_FIRST = 0x144, + DPP_UNUSED8_LAST = 0x14F, + ROW_SHARE_FIRST = 0x150, + ROW_SHARE_LAST = 0x15F, + ROW_XMASK_FIRST = 0x160, + ROW_XMASK_LAST = 0x16F, + DPP_LAST = ROW_XMASK_LAST +}; + +enum DppFiMode { + DPP_FI_0 = 0, + DPP_FI_1 = 1, + DPP8_FI_0 = 0xE9, + DPP8_FI_1 = 0xEA, }; } // namespace DPP Index: lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.cpp +++ lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3377,10 +3377,29 @@ (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC 
<= DppCtrl::DPP_UNUSED4_LAST) || (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) || (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) || - (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST)) { + (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) || + (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) { ErrInfo = "Invalid dpp_ctrl value"; return false; } + if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 && + ST.getGeneration() >= AMDGPUSubtarget::GFX10) { + ErrInfo = "Invalid dpp_ctrl value: " + "wavefront shifts are not supported on GFX10+"; + return false; + } + if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 && + ST.getGeneration() >= AMDGPUSubtarget::GFX10) { + ErrInfo = "Invalid dpp_ctrl value: " + "broadcats are not supported on GFX10+"; + return false; + } + if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST && + ST.getGeneration() < AMDGPUSubtarget::GFX10) { + ErrInfo = "Invalid dpp_ctrl value: " + "row_share and row_xmask are not supported before GFX10"; + return false; + } } return true; Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -847,10 +847,13 @@ def DMask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>; def Dim : NamedOperandU8<"Dim", NamedMatchClass<"Dim", 0>>; +def dpp8 : NamedOperandU32<"DPP8", NamedMatchClass<"DPP8", 0>>; + def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>; def row_mask : NamedOperandU32<"RowMask", NamedMatchClass<"RowMask">>; def bank_mask : NamedOperandU32<"BankMask", NamedMatchClass<"BankMask">>; def bound_ctrl : NamedOperandBit<"BoundCtrl", NamedMatchClass<"BoundCtrl">>; +def FI : NamedOperandU32<"FI", NamedMatchClass<"FI">>; def dst_sel : NamedOperandU32<"SDWADstSel", NamedMatchClass<"SDWADstSel">>; def src0_sel : NamedOperandU32<"SDWASrc0Sel", NamedMatchClass<"SDWASrc0Sel">>; @@ -1525,6 +1528,42 @@ /* endif */))); } +class getInsDPP16 { + dag ret = !con(getInsDPP.ret, + (ins FI:$fi)); +} + +class getInsDPP8 { + dag ret = !if (!eq(NumSrcArgs, 0), + // VOP1 without input operands (V_NOP) + (ins dpp8:$dpp8, FI:$fi), + !if (!eq(NumSrcArgs, 1), + !if (!eq(HasModifiers, 1), + // VOP1_DPP with modifiers + (ins DstRC:$old, Src0Mod:$src0_modifiers, + Src0RC:$src0, dpp8:$dpp8, FI:$fi) + /* else */, + // VOP1_DPP without modifiers + (ins DstRC:$old, Src0RC:$src0, dpp8:$dpp8, FI:$fi) + /* endif */) + /* NumSrcArgs == 2 */, + !if (!eq(HasModifiers, 1), + // VOP2_DPP with modifiers + (ins DstRC:$old, + Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + dpp8:$dpp8, FI:$fi) + /* else */, + // VOP2_DPP without modifiers + (ins DstRC:$old, + Src0RC:$src0, Src1RC:$src1, dpp8:$dpp8, FI:$fi) + /* endif */))); +} // Ins for SDWA @@ -1683,6 +1722,26 @@ string ret = dst#args#" $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; } +class getAsmDPP16 { + string ret = getAsmDPP.ret#"$fi"; +} + +class getAsmDPP8 { + string dst = !if(HasDst, + !if(!eq(DstVT.Size, 1), + "$sdst", + "$vdst"), + ""); // use $sdst for VOPC + string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); + string src1 = !if(!eq(NumSrcArgs, 1), "", + !if(!eq(NumSrcArgs, 2), " $src1_modifiers", + " $src1_modifiers,")); + string args = !if(!eq(HasModifiers, 0), + getAsm32<0, NumSrcArgs, DstVT>.ret, + ", "#src0#src1); + string ret = dst#args#"$dpp8$fi"; +} + class getAsmSDWA 
Index: lib/Target/AMDGPU/VOP1Instructions.td
===================================================================
--- lib/Target/AMDGPU/VOP1Instructions.td
+++ lib/Target/AMDGPU/VOP1Instructions.td
@@ -272,6 +272,7 @@
   let InsDPP = (ins DstRC:$vdst, DstRC:$old, Src0RC32:$src0,
                     dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
                     bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
+  let InsDPP16 = !con(InsDPP, (ins FI:$fi));
   let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
                      clampmod:$clamp, omod:$omod, dst_sel:$dst_sel, dst_unused:$dst_unused,
@@ -280,6 +281,7 @@
   let Asm32 = getAsm32<1, 1>.ret;
   let Asm64 = getAsm64<1, 1, 0, 0, 1>.ret;
   let AsmDPP = getAsmDPP<1, 1, 0>.ret;
+  let AsmDPP16 = getAsmDPP16<1, 1, 0>.ret;
   let AsmSDWA = getAsmSDWA<1, 1>.ret;
   let AsmSDWA9 = getAsmSDWA9<1, 0, 1>.ret;
@@ -432,8 +434,8 @@
 // Target-specific instruction encodings.
 //===----------------------------------------------------------------------===//
 
-class VOP1_DPP <bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> :
-  VOP_DPP <ps.OpName, p> {
+class VOP1_DPP <bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl, bit isDPP16 = 0> :
+  VOP_DPP <ps.OpName, p, isDPP16> {
   let hasSideEffects = ps.hasSideEffects;
   let Defs = ps.Defs;
   let SchedRW = ps.SchedRW;
@@ -446,6 +448,29 @@
   let Inst{31-25} = 0x3f;
 }
 
+class VOP1_DPP16 <bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> :
+  VOP1_DPP <op, ps, p, 1> {
+  let AssemblerPredicate = !if(p.HasExt, HasDPP16, DisableInst);
+  let SubtargetPredicate = HasDPP16;
+}
+
+class VOP1_DPP8 <bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> :
+  VOP_DPP8 <ps.OpName, p> {
+  let hasSideEffects = ps.hasSideEffects;
+  let Defs = ps.Defs;
+  let SchedRW = ps.SchedRW;
+  let Uses = ps.Uses;
+
+  bits<8> vdst;
+  let Inst{8-0} = fi;
+  let Inst{16-9} = op;
+  let Inst{24-17} = !if(p.EmitDst, vdst{7-0}, 0);
+  let Inst{31-25} = 0x3f;
+
+  let AssemblerPredicate = !if(p.HasExt, HasDPP8, DisableInst);
+  let SubtargetPredicate = HasDPP8;
+}
+
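Reading the fields above together with VOP_DPP8e further down, a VOP1 DPP8
instruction is one 64-bit word laid out as (sketch):

  // dword 0: [31:25] 0x3f (VOP1)  [24:17] vdst  [16:9] op  [8:0] fi = 0xE9/0xEA
  // dword 1: [63:40] dpp8 (eight 3-bit lane selects)        [39:32] src0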
//===----------------------------------------------------------------------===//
// GFX10.
//===----------------------------------------------------------------------===//
@@ -473,15 +498,28 @@
       let DecoderNamespace = "SDWA10";
     }
   }
+  multiclass VOP1_Real_dpp_gfx10<bits<9> op> {
+    def _dpp_gfx10 : VOP1_DPP16<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")> {
+      let DecoderNamespace = "SDWA10";
+    }
+  }
+  multiclass VOP1_Real_dpp8_gfx10<bits<9> op> {
+    def _dpp8_gfx10 : VOP1_DPP8<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")> {
+      let DecoderNamespace = "DPP8";
+    }
+  }
 } // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
 
 multiclass VOP1_Real_gfx10_no_dpp<bits<9> op> :
   VOP1_Real_e32_gfx10<op>, VOP1_Real_e64_gfx10<op>,
   VOP1_Real_sdwa_gfx10<op>;
 
-multiclass VOP1_Real_gfx10<bits<9> op> :
+multiclass VOP1_Real_gfx10_no_dpp8<bits<9> op> :
   VOP1_Real_e32_gfx10<op>, VOP1_Real_e64_gfx10<op>,
-  VOP1_Real_sdwa_gfx10<op>;
+  VOP1_Real_sdwa_gfx10<op>, VOP1_Real_dpp_gfx10<op>;
+
+multiclass VOP1_Real_gfx10<bits<9> op> :
+  VOP1_Real_gfx10_no_dpp8<op>, VOP1_Real_dpp8_gfx10<op>;
 
 defm V_PIPEFLUSH      : VOP1_Real_gfx10<0x01b>;
 defm V_MOVRELSD_2_B32 : VOP1_Real_gfx10<0x048>;
@@ -564,6 +602,9 @@
 multiclass VOP1_Real_gfx6_gfx7_gfx10<bits<9> op> :
   VOP1_Real_gfx6_gfx7<op>, VOP1_Real_gfx10<op>;
 
+multiclass VOP1_Real_gfx6_gfx7_gfx10_no_dpp8<bits<9> op> :
+  VOP1_Real_gfx6_gfx7<op>, VOP1_Real_gfx10_no_dpp8<op>;
+
 multiclass VOP1_Real_gfx6_gfx7_gfx10_no_dpp<bits<9> op> :
   VOP1_Real_gfx6_gfx7<op>, VOP1_Real_gfx10_no_dpp<op>;
 
@@ -625,8 +666,8 @@
 defm V_FREXP_MANT_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x040>;
 defm V_CLREXCP        : VOP1_Real_gfx6_gfx7_gfx10<0x041>;
 defm V_MOVRELD_B32    : VOP1_Real_gfx6_gfx7_gfx10_no_dpp<0x042>;
-defm V_MOVRELS_B32    : VOP1_Real_gfx6_gfx7_gfx10_no_dpp<0x043>;
-defm V_MOVRELSD_B32   : VOP1_Real_gfx6_gfx7_gfx10_no_dpp<0x044>;
+defm V_MOVRELS_B32    : VOP1_Real_gfx6_gfx7_gfx10_no_dpp8<0x043>;
+defm V_MOVRELSD_B32   : VOP1_Real_gfx6_gfx7_gfx10_no_dpp8<0x044>;
 
 //===----------------------------------------------------------------------===//
 // GFX8, GFX9 (VI).
 //===----------------------------------------------------------------------===//
@@ -856,3 +897,30 @@
 }
 
 defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>;
+
+//===----------------------------------------------------------------------===//
+// GFX10
+//===----------------------------------------------------------------------===//
+
+let OtherPredicates = [isGFX10Plus] in {
+def : GCNPat <
+  (i32 (int_amdgcn_mov_dpp8 i32:$src, imm:$dpp8)),
+  (V_MOV_B32_dpp8_gfx10 $src, $src, (as_i32imm $dpp8), (i32 DPP8Mode.FI_0))
+>;
+
+def : GCNPat <
+  (i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask,
+                           imm:$bound_ctrl)),
+  (V_MOV_B32_dpp_gfx10 $src, $src, (as_i32imm $dpp_ctrl),
+                       (as_i32imm $row_mask), (as_i32imm $bank_mask),
+                       (as_i1imm $bound_ctrl), (i32 0))
+>;
+
+def : GCNPat <
+  (i32 (int_amdgcn_update_dpp i32:$old, i32:$src, imm:$dpp_ctrl, imm:$row_mask,
+                              imm:$bank_mask, imm:$bound_ctrl)),
+  (V_MOV_B32_dpp_gfx10 $old, $src, (as_i32imm $dpp_ctrl),
+                       (as_i32imm $row_mask), (as_i32imm $bank_mask),
+                       (as_i1imm $bound_ctrl), (i32 0))
+>;
+} // End OtherPredicates = [isGFX10Plus]
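Tying the patterns to the CodeGen tests below, selection rewrites

  // %r = call i32 @llvm.amdgcn.mov.dpp8.i32(i32 %x, i32 1)

into V_MOV_B32_dpp8_gfx10 with fi tied off to DPP8Mode.FI_0, which prints as

  // v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]

(register numbers here are illustrative).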
getAsm64<1, 2, 0, HasModifiers, HasOMod, vt0>.ret; let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt0>.ret; + let AsmDPP16 = getAsmDPP16<1, 2, HasModifiers, vt0>.ret; + let AsmDPP8 = getAsmDPP8<1, 2, 0, vt0>.ret; let AsmSDWA = getAsmSDWA<1, 2, vt0>.ret; let AsmSDWA9 = getAsmSDWA9<1, 1, 2, vt0>.ret; let HasSrc2 = 0; @@ -307,6 +315,8 @@ let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel"; let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel"; let AsmDPP = "$vdst, vcc, $src0, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; + let AsmDPP8 = "$vdst, vcc, $src0, $src1 $dpp8$fi"; + let AsmDPP16 = AsmDPP#"$fi"; let Outs32 = (outs DstRC:$vdst); let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); } @@ -319,6 +329,8 @@ let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel"; let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel"; let AsmDPP = "$vdst, vcc, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; + let AsmDPP8 = "$vdst, vcc, $src0, $src1, vcc $dpp8$fi"; + let AsmDPP16 = AsmDPP#"$fi"; let Outs32 = (outs DstRC:$vdst); let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); @@ -337,6 +349,8 @@ Src1DPP:$src1, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + let InsDPP16 = !con(InsDPP, (ins FI:$fi)); + let HasExt = 1; let HasExtDPP = 1; let HasExtSDWA = 1; @@ -350,6 +364,8 @@ let AsmSDWA = "$vdst, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel"; let AsmSDWA9 = "$vdst, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel"; let AsmDPP = "$vdst, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; + let AsmDPP8 = "$vdst, $src0, $src1, vcc $dpp8$fi"; + let AsmDPP16 = AsmDPP#"$fi"; let Outs32 = (outs DstRC:$vdst); let Outs64 = (outs DstRC:$vdst); @@ -369,6 +385,8 @@ Src1ModDPP:$src1_modifiers, Src1DPP:$src1, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + let InsDPP16 = !con(InsDPP, (ins FI:$fi)); + let HasExt = 1; let HasExtDPP = 1; let HasExtSDWA = 1; @@ -760,8 +778,9 @@ //===----------------------------------------------------------------------===// class VOP2_DPP op, VOP2_Pseudo ps, - string opName = ps.OpName, VOPProfile p = ps.Pfl> : - VOP_DPP { + string opName = ps.OpName, VOPProfile p = ps.Pfl, + bit IsDPP16 = 0> : + VOP_DPP { let hasSideEffects = ps.hasSideEffects; let Defs = ps.Defs; let SchedRW = ps.SchedRW; @@ -776,6 +795,34 @@ let Inst{31} = 0x0; } +class VOP2_DPP16 op, VOP2_Pseudo ps, + string opName = ps.OpName, VOPProfile p = ps.Pfl> : + VOP2_DPP { + let AssemblerPredicate = !if(p.HasExt, HasDPP16, DisableInst); + let SubtargetPredicate = HasDPP16; +} + +class VOP2_DPP8 op, VOP2_Pseudo ps, + string opName = ps.OpName, VOPProfile p = ps.Pfl> : + VOP_DPP8 { + let hasSideEffects = ps.hasSideEffects; + let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let Uses = ps.Uses; + + bits<8> vdst; + bits<8> src1; + + let Inst{8-0} = fi; + let Inst{16-9} = !if(p.HasSrc1, src1{7-0}, 0); + let Inst{24-17} = !if(p.EmitDst, vdst{7-0}, 0); + let Inst{30-25} = op; + let Inst{31} = 0x0; + + let AssemblerPredicate = !if(p.HasExt, HasDPP8, DisableInst); + let SubtargetPredicate = HasDPP8; +} + //===----------------------------------------------------------------------===// // GFX10. 
//===----------------------------------------------------------------------===// @@ -813,6 +860,16 @@ let DecoderNamespace = "SDWA10"; } } + multiclass VOP2_Real_dpp_gfx10 op> { + def _dpp_gfx10 : VOP2_DPP16(NAME#"_e32")> { + let DecoderNamespace = "SDWA10"; + } + } + multiclass VOP2_Real_dpp8_gfx10 op> { + def _dpp8_gfx10 : VOP2_DPP8(NAME#"_e32")> { + let DecoderNamespace = "DPP8"; + } + } //===------------------------- VOP2 (with name) -------------------------===// multiclass VOP2_Real_e32_gfx10_with_name op, string opName, @@ -844,6 +901,21 @@ let AsmString = asmName # ps.AsmOperands; } } + multiclass VOP2_Real_dpp_gfx10_with_name op, string opName, + string asmName> { + def _dpp_gfx10 : VOP2_DPP16(opName#"_e32")> { + VOP2_Pseudo ps = !cast(opName#"_e32"); + let AsmString = asmName # ps.Pfl.AsmDPP16; + } + } + multiclass VOP2_Real_dpp8_gfx10_with_name op, string opName, + string asmName> { + def _dpp8_gfx10 : VOP2_DPP8(opName#"_e32")> { + VOP2_Pseudo ps = !cast(opName#"_e32"); + let AsmString = asmName # ps.Pfl.AsmDPP8; + let DecoderNamespace = "DPP8"; + } + } } // End DecoderNamespace = "SDWA10" //===------------------------------ VOP2be ------------------------------===// @@ -868,6 +940,18 @@ let AsmString = asmName # !subst(", vcc", "", Ps.AsmOperands); let DecoderNamespace = "SDWA10"; } + def _dpp_gfx10 : + VOP2_DPP16(opName#"_e32"), asmName> { + string AsmDPP = !cast(opName#"_e32").Pfl.AsmDPP16; + let AsmString = asmName # !subst(", vcc", "", AsmDPP); + let DecoderNamespace = "SDWA10"; + } + def _dpp8_gfx10 : + VOP2_DPP8(opName#"_e32"), asmName> { + string AsmDPP8 = !cast(opName#"_e32").Pfl.AsmDPP8; + let AsmString = asmName # !subst(", vcc", "", AsmDPP8); + let DecoderNamespace = "DPP8"; + } def _sdwa_w64_gfx10 : Base_VOP_SDWA10_Real(opName#"_sdwa")>, @@ -877,6 +961,18 @@ let isAsmParserOnly = 1; let DecoderNamespace = "SDWA10"; } + def _dpp_w64_gfx10 : + VOP2_DPP16(opName#"_e32"), asmName> { + string AsmDPP = !cast(opName#"_e32").Pfl.AsmDPP16; + let AsmString = asmName # AsmDPP; + let isAsmParserOnly = 1; + } + def _dpp8_w64_gfx10 : + VOP2_DPP8(opName#"_e32"), asmName> { + string AsmDPP8 = !cast(opName#"_e32").Pfl.AsmDPP8; + let AsmString = asmName # AsmDPP8; + let isAsmParserOnly = 1; + } } //===----------------------------- VOP3Only -----------------------------===// @@ -902,13 +998,15 @@ multiclass VOP2_Real_gfx10 op> : VOP2_Real_e32_gfx10, VOP2_Real_e64_gfx10, - VOP2_Real_sdwa_gfx10; + VOP2_Real_sdwa_gfx10, VOP2_Real_dpp_gfx10, VOP2_Real_dpp8_gfx10; multiclass VOP2_Real_gfx10_with_name op, string opName, string asmName> : VOP2_Real_e32_gfx10_with_name, VOP2_Real_e64_gfx10_with_name, - VOP2_Real_sdwa_gfx10_with_name; + VOP2_Real_sdwa_gfx10_with_name, + VOP2_Real_dpp_gfx10_with_name, + VOP2_Real_dpp8_gfx10_with_name; defm V_CNDMASK_B32 : Base_VOP2_Real_gfx10<0x001>; defm V_XNOR_B32 : VOP2_Real_gfx10<0x01e>; Index: lib/Target/AMDGPU/VOPInstructions.td =================================================================== --- lib/Target/AMDGPU/VOPInstructions.td +++ lib/Target/AMDGPU/VOPInstructions.td @@ -539,7 +539,7 @@ class VOP_SDWA10_Real : Base_VOP_SDWA10_Real, SIMCInstr; -class VOP_DPPe : Enc64 { +class VOP_DPPe : Enc64 { bits<2> src0_modifiers; bits<8> src0; bits<2> src1_modifiers; @@ -551,6 +551,7 @@ let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0); let Inst{48-40} = dpp_ctrl; + let Inst{50} = !if(IsDPP16, fi, ?); let Inst{51} = bound_ctrl; let Inst{52} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // src0_neg let Inst{53} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // 
src0_abs @@ -623,11 +624,11 @@ let TSFlags = ps.TSFlags; } -class VOP_DPP : +class VOP_DPP : InstSI , - VOP_DPPe

{ + VOP_DPPe { let mayLoad = 0; let mayStore = 0; @@ -648,6 +649,42 @@ let DecoderNamespace = "DPP"; } +class VOP_DPP8e : Enc64 { + bits<8> src0; + bits<24> dpp8; + bits<9> fi; + + let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{63-40} = dpp8{23-0}; +} + +class VOP_DPP8 : + InstSI, + VOP_DPP8e

{ + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let UseNamedOperandTable = 1; + + let VALU = 1; + let DPP = 1; + let Size = 8; + + let AsmMatchConverter = "cvtDPP8"; + let SubtargetPredicate = HasDPP8; + let AssemblerPredicate = !if(P.HasExt, HasDPP8, DisableInst); + let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.DPP, + AMDGPUAsmVariants.Disable); + let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", ""); + let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, ""); +} + +def DPP8Mode { + int FI_0 = 0xE9; + int FI_1 = 0xEA; +} + class getNumNodeArgs { SDNode N = !cast(Op); SDTypeProfile TP = N.TypeProfile; Index: test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll @@ -1,14 +1,15 @@ -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=VI-OPT %s -; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOOPT %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=VI,VI-OPT,PREGFX10,PREGFX10-OPT %s +; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=VI,VI-NOOPT,PREGFX10,PREGFX10-NOOPT %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=VI,VI-OPT %s ; FIXME: The register allocator / scheduler should be able to avoid these hazards. 
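One cross-reference worth noting: DPP8Mode.FI_0/FI_1 above deliberately mirror
DPP8_FI_0 (0xE9) and DPP8_FI_1 (0xEA) from the DppFiMode enum added to
SIDefines.h — the two src0-field magic values that mark a 64-bit encoding as
DPP8.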
Index: test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
@@ -1,14 +1,15 @@
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=VI-OPT %s
-; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOOPT %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=VI,VI-OPT,PREGFX10,PREGFX10-OPT %s
+; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=VI,VI-NOOPT,PREGFX10,PREGFX10-NOOPT %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=VI,VI-OPT %s
 
 ; FIXME: The register allocator / scheduler should be able to avoid these hazards.
 
 ; VI-LABEL: {{^}}dpp_test:
 ; VI: v_mov_b32_e32 v0, s{{[0-9]+}}
 ; VI-NOOPT: v_mov_b32_e32 v1, s{{[0-9]+}}
-; VI-OPT: s_nop 1
-; VI-NOOPT: s_nop 0
-; VI-NOOPT: s_nop 0
+; PREGFX10-OPT: s_nop 1
+; PREGFX10-NOOPT: s_nop 0
+; PREGFX10-NOOPT: s_nop 0
 ; VI-OPT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11]
 ; VI-NOOPT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x01,0x01,0x08,0x11]
 define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in) {
@@ -20,14 +21,14 @@
 ; VI-LABEL: {{^}}dpp_wait_states:
 ; VI-NOOPT: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s{{[0-9]+}}
 ; VI: v_mov_b32_e32 [[VGPR0:v[0-9]+]], s{{[0-9]+}}
-; VI-OPT: s_nop 1
-; VI-NOOPT: s_nop 0
-; VI-NOOPT: s_nop 0
+; PREGFX10-OPT: s_nop 1
+; PREGFX10-NOOPT: s_nop 0
+; PREGFX10-NOOPT: s_nop 0
 ; VI-OPT: v_mov_b32_dpp [[VGPR0]], [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
 ; VI-NOOPT: v_mov_b32_dpp [[VGPR1]], [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
-; VI-OPT: s_nop 1
-; VI-NOOPT: s_nop 0
-; VI-NOOPT: s_nop 0
+; PREGFX10-OPT: s_nop 1
+; PREGFX10-NOOPT: s_nop 0
+; PREGFX10-NOOPT: s_nop 0
 ; VI-OPT: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
 ; VI-NOOPT: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR1]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
 define amdgpu_kernel void @dpp_wait_states(i32 addrspace(1)* %out, i32 %in) {
@@ -39,16 +40,16 @@
 
 ; VI-LABEL: {{^}}dpp_first_in_bb:
 ; VI: ; %endif
-; VI-OPT: s_mov_b32
-; VI-OPT: s_mov_b32
-; VI-NOOPT: s_waitcnt
-; VI-NOOPT-NEXT: s_nop 0
+; PREGFX10-OPT: s_mov_b32
+; PREGFX10-OPT: s_mov_b32
+; PREGFX10-NOOPT: s_waitcnt
+; PREGFX10-NOOPT-NEXT: s_nop 0
 ; VI: v_mov_b32_dpp [[VGPR0:v[0-9]+]], v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
-; VI-OPT: s_nop 1
+; PREGFX10-OPT: s_nop 1
 ; VI: v_mov_b32_dpp [[VGPR1:v[0-9]+]], [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
-; VI-OPT: s_nop 1
-; VI-NOOPT: s_nop 0
-; VI-NOOPT: s_nop 0
+; PREGFX10-OPT: s_nop 1
+; PREGFX10-NOOPT: s_nop 0
+; PREGFX10-NOOPT: s_nop 0
 ; VI: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR1]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
 define amdgpu_kernel void @dpp_first_in_bb(float addrspace(1)* %out, float addrspace(1)* %in, float %cond, float %a, float %b) {
   %cmp = fcmp oeq float %cond, 0.0
Index: test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll
@@ -0,0 +1,26 @@
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+
+; GFX10-LABEL: {{^}}dpp8_test:
+; GFX10: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX10: v_mov_b32_dpp [[SRC]], [[SRC]] dpp8:[1,0,0,0,0,0,0,0]{{$}}
+define amdgpu_kernel void @dpp8_test(i32 addrspace(1)* %out, i32 %in) {
+  %tmp0 = call i32 @llvm.amdgcn.mov.dpp8.i32(i32 %in, i32 1) #0
+  store i32 %tmp0, i32 addrspace(1)* %out
+  ret void
+}
+
+; GFX10-LABEL: {{^}}dpp8_wait_states:
+; GFX10-NOOPT: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s{{[0-9]+}}
+; GFX10: v_mov_b32_e32 [[VGPR0:v[0-9]+]], s{{[0-9]+}}
+; GFX10: v_mov_b32_dpp [[VGPR0]], [[VGPR0]] dpp8:[1,0,0,0,0,0,0,0]{{$}}
+; GFX10: v_mov_b32_dpp [[VGPR0]], [[VGPR0]] dpp8:[5,0,0,0,0,0,0,0]{{$}}
+define amdgpu_kernel void @dpp8_wait_states(i32 addrspace(1)* %out, i32 %in) {
+  %tmp0 = call i32 @llvm.amdgcn.mov.dpp8.i32(i32 %in, i32 1) #0
+  %tmp1 = call i32 @llvm.amdgcn.mov.dpp8.i32(i32 %tmp0, i32 5) #0
+  store i32 %tmp1, i32 addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #0
+
+attributes #0 = { nounwind readnone convergent }
Index: test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
@@ -1,26 +1,37 @@
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=VI-OPT %s
-; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOOPT %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10 %s
 
-; VI-LABEL: {{^}}dpp_test:
-; VI: v_mov_b32_e32 v0, s{{[0-9]+}}
-; VI: v_mov_b32_e32 v1, s{{[0-9]+}}
-; VI-OPT: s_nop 1
-; VI-NOOPT: s_nop 0
-; VI-NOOPT: s_nop 0
-; VI: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x01,0x01,0x08,0x11]
+; GCN-LABEL: {{^}}dpp_test:
+; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8: s_nop 1
+; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
 define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2) {
-  %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 1) #0
+  %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 0) #0
   store i32 %tmp0, i32 addrspace(1)* %out
   ret void
 }
 
+; GCN-LABEL: {{^}}dpp_test_bc:
+; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8: s_nop 1
+; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[2,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0{{$}}
+define amdgpu_kernel void @dpp_test_bc(i32 addrspace(1)* %out, i32 %in1, i32 %in2) {
+  %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 2, i32 1, i32 1, i1 1) #0
+  store i32 %tmp0, i32 addrspace(1)* %out
+  ret void
+}
+
-; VI-LABEL: {{^}}dpp_test1:
-; VI-OPT: v_add_u32_e32 [[REG:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}
-; VI-NOOPT: v_add_u32_e64 [[REG:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{[0-9]+}}
-; VI-NOOPT: v_mov_b32_e32 v{{[0-9]+}}, 0
-; VI-NEXT: s_nop 0
-; VI-NEXT: s_nop 0
-; VI-NEXT: v_mov_b32_dpp {{v[0-9]+}}, [[REG]] quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
+; GCN-LABEL: {{^}}dpp_test1:
+; GFX10: v_add_nc_u32_e32 [[REG:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
+; GFX8-OPT: v_add_u32_e32 [[REG:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX8-NOOPT: v_add_u32_e64 [[REG:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX8-NOOPT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GFX8: s_nop 0
+; GFX8-NEXT: s_nop 0
+; GFX8-OPT-NEXT: v_mov_b32_dpp {{v[0-9]+}}, [[REG]] quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
 @0 = internal unnamed_addr addrspace(3) global [448 x i32] undef, align 4
 define weak_odr amdgpu_kernel void @dpp_test1(i32* %arg) local_unnamed_addr {
 bb:
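For reference when reading the checks above, the update.dpp arguments map onto
the printed modifiers as follows (first test; registers illustrative):

  ; %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %old, i32 %src,
  ;     i32 1,  ; dpp_ctrl -> quad_perm:[1,0,0,0]
  ;     i32 1,  ; row_mask:0x1
  ;     i32 1,  ; bank_mask:0x1
  ;     i1 0)   ; bound_ctrl omitted from the printed form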
Index: test/MC/AMDGPU/dpp-err.s
===================================================================
--- /dev/null
+++ test/MC/AMDGPU/dpp-err.s
@@ -0,0 +1,38 @@
+// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga   -show-encoding < %s 2>&1 | FileCheck -check-prefix=GFX89 %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx900  -show-encoding < %s 2>&1 | FileCheck -check-prefix=GFX89 %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1010 -show-encoding < %s 2>&1 | FileCheck -check-prefix=GFX10 %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga   -show-encoding < %s 2>&1 | FileCheck -check-prefix=GFX89-ERR %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx900  -show-encoding < %s 2>&1 | FileCheck -check-prefix=GFX89-ERR %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1010 -show-encoding < %s 2>&1 | FileCheck -check-prefix=GFX10-ERR %s
+
+v_mov_b32_dpp v0, v1 row_share:1 row_mask:0x1 bank_mask:0x1
+// GFX89-ERR: not a valid operand.
+// GFX10: v_mov_b32_dpp v0, v1 row_share:1 row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x00,0x7e,0x01,0x51,0x01,0x11]
+
+v_mov_b32_dpp v0, v1 row_xmask:1 row_mask:0x1 bank_mask:0x1
+// GFX89-ERR: not a valid operand.
+// GFX10: v_mov_b32_dpp v0, v1 row_xmask:1 row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x00,0x7e,0x01,0x61,0x01,0x11]
+
+v_mov_b32_dpp v0, v1 wave_shl:1 row_mask:0x1 bank_mask:0x1
+// GFX89: v0, v1 wave_shl:1 row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x00,0x7e,0x01,0x30,0x01,0x11]
+// GFX10-ERR: not a valid operand.
+
+v_mov_b32_dpp v0, v1 wave_shr:1 row_mask:0x1 bank_mask:0x1
+// GFX89: v0, v1 wave_shr:1 row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x00,0x7e,0x01,0x38,0x01,0x11]
+// GFX10-ERR: not a valid operand.
+
+v_mov_b32_dpp v0, v1 wave_rol:1 row_mask:0x1 bank_mask:0x1
+// GFX89: v0, v1 wave_rol:1 row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x00,0x7e,0x01,0x34,0x01,0x11]
+// GFX10-ERR: not a valid operand.
+
+v_mov_b32_dpp v0, v1 wave_ror:1 row_mask:0x1 bank_mask:0x1
+// GFX89: v0, v1 wave_ror:1 row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x00,0x7e,0x01,0x3c,0x01,0x11]
+// GFX10-ERR: not a valid operand.
+
+v_mov_b32_dpp v0, v1 row_bcast:15 row_mask:0x1 bank_mask:0x1
+// GFX89: v0, v1 row_bcast:15 row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x00,0x7e,0x01,0x42,0x01,0x11]
+// GFX10-ERR: not a valid operand.
+
+v_mov_b32_dpp v0, v1 row_bcast:31 row_mask:0x1 bank_mask:0x1
+// GFX89: v0, v1 row_bcast:31 row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x00,0x7e,0x01,0x43,0x01,0x11]
+// GFX10-ERR: not a valid operand.