Index: lib/Target/AMDGPU/AMDGPU.td =================================================================== --- lib/Target/AMDGPU/AMDGPU.td +++ lib/Target/AMDGPU/AMDGPU.td @@ -190,6 +190,12 @@ "Has i16/f16 instructions" >; +def FeatureVOP3P : SubtargetFeature<"vop3p", + "HasVOP3PInsts", + "true", + "Has VOP3P packed instructions" +>; + def FeatureMovrel : SubtargetFeature<"movrel", "HasMovrel", "true", @@ -394,7 +400,7 @@ FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm, - FeatureApertureRegs, FeatureGFX9Insts + FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P ] >; @@ -569,7 +575,10 @@ def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">; -def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">; +def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">, + AssemblerPredicate<"Feature16BitInsts">; +def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">, + AssemblerPredicate<"FeatureVOP3P">; def HasSDWA : Predicate<"Subtarget->hasSDWA()">, AssemblerPredicate<"FeatureSDWA">; Index: lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.h +++ lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -131,6 +131,7 @@ bool SGPRInitBug; bool HasSMemRealTime; bool Has16BitInsts; + bool HasVOP3PInsts; bool HasMovrel; bool HasVGPRIndexMode; bool HasScalarStores; @@ -211,6 +212,10 @@ return Has16BitInsts; } + bool hasVOP3PInsts() const { + return HasVOP3PInsts; + } + bool hasHWFP64() const { return FP64; } Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -116,6 +116,7 @@ SGPRInitBug(false), HasSMemRealTime(false), Has16BitInsts(false), + HasVOP3PInsts(false), HasMovrel(false), HasVGPRIndexMode(false), HasScalarStores(false), Index: lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp =================================================================== --- lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -157,7 +157,11 @@ ImmTySendMsg, ImmTyInterpSlot, ImmTyInterpAttr, - ImmTyAttrChan + ImmTyAttrChan, + ImmTyOpSel, + ImmTyOpSelHi, + ImmTyNegLo, + ImmTyNegHi }; struct TokOp { @@ -294,6 +298,10 @@ bool isInterpSlot() const { return isImmTy(ImmTyInterpSlot); } bool isInterpAttr() const { return isImmTy(ImmTyInterpAttr); } bool isAttrChan() const { return isImmTy(ImmTyAttrChan); } + bool isOpSel() const { return isImmTy(ImmTyOpSel); } + bool isOpSelHi() const { return isImmTy(ImmTyOpSelHi); } + bool isNegLo() const { return isImmTy(ImmTyNegLo); } + bool isNegHi() const { return isImmTy(ImmTyNegHi); } bool isMod() const { return isClampSI() || isOModSI(); @@ -313,6 +321,10 @@ return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::i16); } + bool isSCSrcV2B16() const { + return isSCSrcB16(); + } + bool isSCSrcB32() const { return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::i32); } @@ -325,6 +337,10 @@ return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::f16); } + bool isSCSrcV2F16() const { + return isSCSrcF16(); + } + bool isSCSrcF32() const { return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::f32); } @@ -341,6 +357,11 @@ return isSCSrcB16() || isLiteralImm(MVT::i16); } + bool isSSrcV2B16() const { + llvm_unreachable("cannot happen"); + return isSSrcB16(); + } + 
bool isSSrcB64() const { // TODO: Find out how SALU supports extension of 32-bit literals to 64 bits. // See isVSrc64(). @@ -359,6 +380,11 @@ return isSCSrcB16() || isLiteralImm(MVT::f16); } + bool isSSrcV2F16() const { + llvm_unreachable("cannot happen"); + return isSSrcF16(); + } + bool isVCSrcB32() const { return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i32); } @@ -371,6 +397,10 @@ return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i16); } + bool isVCSrcV2B16() const { + return isVCSrcB16(); + } + bool isVCSrcF32() const { return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::f32); } @@ -383,6 +413,10 @@ return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::f16); } + bool isVCSrcV2F16() const { + return isVCSrcF16(); + } + bool isVSrcB32() const { return isVCSrcF32() || isLiteralImm(MVT::i32); } @@ -395,6 +429,11 @@ return isVCSrcF16() || isLiteralImm(MVT::i16); } + bool isVSrcV2B16() const { + llvm_unreachable("cannot happen"); + return isVSrcB16(); + } + bool isVSrcF32() const { return isVCSrcF32() || isLiteralImm(MVT::f32); } @@ -407,6 +446,11 @@ return isVCSrcF16() || isLiteralImm(MVT::f16); } + bool isVSrcV2F16() const { + llvm_unreachable("cannot happen"); + return isVSrcF16(); + } + bool isKImmFP32() const { return isLiteralImm(MVT::f32); } @@ -607,6 +651,10 @@ case ImmTyInterpSlot: OS << "InterpSlot"; break; case ImmTyInterpAttr: OS << "InterpAttr"; break; case ImmTyAttrChan: OS << "AttrChan"; break; + case ImmTyOpSel: OS << "OpSel"; break; + case ImmTyOpSelHi: OS << "OpSelHi"; break; + case ImmTyNegLo: OS << "NegLo"; break; + case ImmTyNegHi: OS << "NegHi"; break; } } @@ -783,6 +831,8 @@ Match_PreferE32 = FIRST_TARGET_MATCH_RESULT_TY }; + typedef std::map OptionalImmIndexMap; + AMDGPUAsmParser(const MCSubtargetInfo &STI, MCAsmParser &_Parser, const MCInstrInfo &MII, const MCTargetOptions &Options) @@ -877,10 +927,18 @@ //bool ProcessInstruction(MCInst &Inst); OperandMatchResultTy parseIntWithPrefix(const char *Prefix, int64_t &Int); + OperandMatchResultTy parseIntWithPrefix(const char *Prefix, OperandVector &Operands, AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone, bool (*ConvertResult)(int64_t &) = nullptr); + + OperandMatchResultTy parseOperandArrayWithPrefix( + const char *Prefix, + OperandVector &Operands, + AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone, + bool (*ConvertResult)(int64_t&) = nullptr); + OperandMatchResultTy parseNamedBit(const char *Name, OperandVector &Operands, AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone); @@ -947,7 +1005,12 @@ void cvtId(MCInst &Inst, const OperandVector &Operands); void cvtVOP3_2_mod(MCInst &Inst, const OperandVector &Operands); + + void cvtVOP3Impl(MCInst &Inst, + const OperandVector &Operands, + OptionalImmIndexMap &OptionalIdx); void cvtVOP3(MCInst &Inst, const OperandVector &Operands); + void cvtVOP3P(MCInst &Inst, const OperandVector &Operands); void cvtMIMG(MCInst &Inst, const OperandVector &Operands); void cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands); @@ -995,6 +1058,30 @@ return getFltSemantics(VT.getSizeInBits() / 8); } +static const fltSemantics *getOpFltSemantics(uint8_t OperandType) { + switch (OperandType) { + case AMDGPU::OPERAND_REG_IMM_INT32: + case AMDGPU::OPERAND_REG_IMM_FP32: + case AMDGPU::OPERAND_REG_INLINE_C_INT32: + case AMDGPU::OPERAND_REG_INLINE_C_FP32: + return &APFloat::IEEEsingle(); + case AMDGPU::OPERAND_REG_IMM_INT64: + case AMDGPU::OPERAND_REG_IMM_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_INT64: + case AMDGPU::OPERAND_REG_INLINE_C_FP64: + return 
&APFloat::IEEEdouble(); + case AMDGPU::OPERAND_REG_IMM_INT16: + case AMDGPU::OPERAND_REG_IMM_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_INT16: + case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + return &APFloat::IEEEhalf(); + default: + llvm_unreachable("unsupported fp type"); + } +} + //===----------------------------------------------------------------------===// // Operand //===----------------------------------------------------------------------===// @@ -1040,7 +1127,7 @@ if (type.getScalarSizeInBits() == 16) { return AMDGPU::isInlinableLiteral16( - static_cast(FPLiteral.bitcastToAPInt().getZExtValue()), + static_cast(FPLiteral.bitcastToAPInt().getZExtValue()), AsmParser->hasInv2PiInlineImm()); } @@ -1132,13 +1219,15 @@ // Check that this operand accepts literals assert(AMDGPU::isSISrcOperand(InstDesc, OpNum)); - auto OpSize = AMDGPU::getOperandSize(InstDesc, OpNum); // expected operand size + APInt Literal(64, Val); + uint8_t OpTy = InstDesc.OpInfo[OpNum].OperandType; if (Imm.IsFPImm) { // We got fp literal token - APInt Literal(64, Val); - - switch (OpSize) { - case 8: + switch (OpTy) { + case AMDGPU::OPERAND_REG_IMM_INT64: + case AMDGPU::OPERAND_REG_IMM_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_INT64: + case AMDGPU::OPERAND_REG_INLINE_C_FP64: { if (AMDGPU::isInlinableLiteral64(Literal.getZExtValue(), AsmParser->hasInv2PiInlineImm())) { Inst.addOperand(MCOperand::createImm(Literal.getZExtValue())); @@ -1162,17 +1251,32 @@ // unclear how we should encode them. This case should be checked earlier // in predicate methods (isLiteralImm()) llvm_unreachable("fp literal in 64-bit integer instruction."); - - case 4: - case 2: { + } + case AMDGPU::OPERAND_REG_IMM_INT32: + case AMDGPU::OPERAND_REG_IMM_FP32: + case AMDGPU::OPERAND_REG_INLINE_C_INT32: + case AMDGPU::OPERAND_REG_INLINE_C_FP32: + case AMDGPU::OPERAND_REG_IMM_INT16: + case AMDGPU::OPERAND_REG_IMM_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_INT16: + case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: { bool lost; APFloat FPLiteral(APFloat::IEEEdouble(), Literal); // Convert literal to single precision - FPLiteral.convert(*getFltSemantics(OpSize), + FPLiteral.convert(*getOpFltSemantics(OpTy), APFloat::rmNearestTiesToEven, &lost); // We allow precision lost but not overflow or underflow. This should be // checked earlier in isLiteralImm() - Inst.addOperand(MCOperand::createImm(FPLiteral.bitcastToAPInt().getZExtValue())); + + uint64_t ImmVal = FPLiteral.bitcastToAPInt().getZExtValue(); + if (OpTy == AMDGPU::OPERAND_REG_INLINE_C_V2INT16 || + OpTy == AMDGPU::OPERAND_REG_INLINE_C_V2FP16) { + ImmVal |= (ImmVal << 16); + } + + Inst.addOperand(MCOperand::createImm(ImmVal)); return; } default: @@ -1185,8 +1289,11 @@ // We got int literal token. // Only sign extend inline immediates. 
// FIXME: No errors on truncation - switch (OpSize) { - case 4: + switch (OpTy) { + case AMDGPU::OPERAND_REG_IMM_INT32: + case AMDGPU::OPERAND_REG_IMM_FP32: + case AMDGPU::OPERAND_REG_INLINE_C_INT32: + case AMDGPU::OPERAND_REG_INLINE_C_FP32: { if (isInt<32>(Val) && AMDGPU::isInlinableLiteral32(static_cast(Val), AsmParser->hasInv2PiInlineImm())) { @@ -1196,18 +1303,23 @@ Inst.addOperand(MCOperand::createImm(Val & 0xffffffff)); return; - - case 8: - if (AMDGPU::isInlinableLiteral64(Val, - AsmParser->hasInv2PiInlineImm())) { + } + case AMDGPU::OPERAND_REG_IMM_INT64: + case AMDGPU::OPERAND_REG_IMM_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_INT64: + case AMDGPU::OPERAND_REG_INLINE_C_FP64: { + if (AMDGPU::isInlinableLiteral64(Val, AsmParser->hasInv2PiInlineImm())) { Inst.addOperand(MCOperand::createImm(Val)); return; } Inst.addOperand(MCOperand::createImm(Lo_32(Val))); return; - - case 2: + } + case AMDGPU::OPERAND_REG_IMM_INT16: + case AMDGPU::OPERAND_REG_IMM_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_INT16: + case AMDGPU::OPERAND_REG_INLINE_C_FP16: { if (isInt<16>(Val) && AMDGPU::isInlinableLiteral16(static_cast(Val), AsmParser->hasInv2PiInlineImm())) { @@ -1217,7 +1329,18 @@ Inst.addOperand(MCOperand::createImm(Val & 0xffff)); return; + } + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: { + auto LiteralVal = static_cast(Literal.getLoBits(16).getZExtValue()); + assert(AMDGPU::isInlinableLiteral16(LiteralVal, + AsmParser->hasInv2PiInlineImm())); + uint32_t ImmVal = static_cast(LiteralVal) << 16 | + static_cast(LiteralVal); + Inst.addOperand(MCOperand::createImm(ImmVal)); + return; + } default: llvm_unreachable("invalid operand size"); } @@ -2263,6 +2386,56 @@ return MatchOperand_Success; } +OperandMatchResultTy AMDGPUAsmParser::parseOperandArrayWithPrefix( + const char *Prefix, + OperandVector &Operands, + AMDGPUOperand::ImmTy ImmTy, + bool (*ConvertResult)(int64_t&)) { + StringRef Name = Parser.getTok().getString(); + if (!Name.equals(Prefix)) + return MatchOperand_NoMatch; + + Parser.Lex(); + if (getLexer().isNot(AsmToken::Colon)) + return MatchOperand_ParseFail; + + Parser.Lex(); + if (getLexer().isNot(AsmToken::LBrac)) + return MatchOperand_ParseFail; + Parser.Lex(); + + unsigned Val = 0; + SMLoc S = Parser.getTok().getLoc(); + + // FIXME: How to verify the number of elements matches the number of src + // operands? 
+ for (int I = 0; I < 3; ++I) { + if (I != 0) { + if (getLexer().is(AsmToken::RBrac)) + break; + + if (getLexer().isNot(AsmToken::Comma)) + return MatchOperand_ParseFail; + Parser.Lex(); + } + + if (getLexer().isNot(AsmToken::Integer)) + return MatchOperand_ParseFail; + + int64_t Op; + if (getParser().parseAbsoluteExpression(Op)) + return MatchOperand_ParseFail; + + if (Op != 0 && Op != 1) + return MatchOperand_ParseFail; + Val |= (Op << I); + } + + Parser.Lex(); + Operands.push_back(AMDGPUOperand::CreateImm(this, Val, S, ImmTy)); + return MatchOperand_Success; +} + OperandMatchResultTy AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands, AMDGPUOperand::ImmTy ImmTy) { @@ -2295,12 +2468,11 @@ return MatchOperand_Success; } -typedef std::map OptionalImmIndexMap; - -static void addOptionalImmOperand(MCInst& Inst, const OperandVector& Operands, - OptionalImmIndexMap& OptionalIdx, - AMDGPUOperand::ImmTy ImmT, - int64_t Default = 0) { +static void addOptionalImmOperand( + MCInst& Inst, const OperandVector& Operands, + AMDGPUAsmParser::OptionalImmIndexMap& OptionalIdx, + AMDGPUOperand::ImmTy ImmT, + int64_t Default = 0) { auto i = OptionalIdx.find(ImmT); if (i != OptionalIdx.end()) { unsigned Idx = i->second; @@ -3209,6 +3381,10 @@ {"src1_sel", AMDGPUOperand::ImmTySdwaSrc1Sel, false, nullptr}, {"dst_unused", AMDGPUOperand::ImmTySdwaDstUnused, false, nullptr}, {"vm", AMDGPUOperand::ImmTyExpVM, true, nullptr}, + {"op_sel", AMDGPUOperand::ImmTyOpSel, false, nullptr}, + {"op_sel_hi", AMDGPUOperand::ImmTyOpSelHi, false, nullptr}, + {"neg_lo", AMDGPUOperand::ImmTyNegLo, false, nullptr}, + {"neg_hi", AMDGPUOperand::ImmTyNegHi, false, nullptr} }; OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operands) { @@ -3225,6 +3401,12 @@ res = parseSDWASel(Operands, Op.Name, Op.Type); } else if (Op.Type == AMDGPUOperand::ImmTySdwaDstUnused) { res = parseSDWADstUnused(Operands); + } else if (Op.Type == AMDGPUOperand::ImmTyOpSel || + Op.Type == AMDGPUOperand::ImmTyOpSelHi || + Op.Type == AMDGPUOperand::ImmTyNegLo || + Op.Type == AMDGPUOperand::ImmTyNegHi) { + res = parseOperandArrayWithPrefix(Op.Name, Operands, Op.Type, + Op.ConvertResult); } else { res = parseIntWithPrefix(Op.Name, Operands, Op.Type, Op.ConvertResult); } @@ -3280,8 +3462,8 @@ && Desc.getOperandConstraint(OpNum + 1, MCOI::OperandConstraint::TIED_TO) == -1; } -void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) { - OptionalImmIndexMap OptionalIdx; +void AMDGPUAsmParser::cvtVOP3Impl(MCInst &Inst, const OperandVector &Operands, + OptionalImmIndexMap &OptionalIdx) { unsigned I = 1; const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); for (unsigned J = 0; J < Desc.getNumDefs(); ++J) { @@ -3298,6 +3480,12 @@ llvm_unreachable("unhandled operand type"); } } +} + +void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) { + OptionalImmIndexMap OptionalIdx; + + cvtVOP3Impl(Inst, Operands, OptionalIdx); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI); @@ -3322,6 +3510,74 @@ } } +void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands) { + OptionalImmIndexMap OptIdx; + + cvtVOP3Impl(Inst, Operands, OptIdx); + + // FIXME: This is messy. 
Parse the modifiers as if it was a normal VOP3 + // instruction, and then figure out where to actually put the modifiers + int Opc = Inst.getOpcode(); + + if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp) != -1) { + addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyClampSI); + } + + addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyOpSel); + addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyOpSelHi, -1); + + int NegLoIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_lo); + if (NegLoIdx != -1) { + addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegLo); + addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegHi); + } + + const int Ops[] = { AMDGPU::OpName::src0, + AMDGPU::OpName::src1, + AMDGPU::OpName::src2 }; + const int ModOps[] = { AMDGPU::OpName::src0_modifiers, + AMDGPU::OpName::src1_modifiers, + AMDGPU::OpName::src2_modifiers }; + + int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel); + int OpSelHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi); + + unsigned OpSel = Inst.getOperand(OpSelIdx).getImm(); + unsigned OpSelHi = Inst.getOperand(OpSelHiIdx).getImm(); + unsigned NegLo = 0; + unsigned NegHi = 0; + + if (NegLoIdx != -1) { + int NegHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi); + NegLo = Inst.getOperand(NegLoIdx).getImm(); + NegHi = Inst.getOperand(NegHiIdx).getImm(); + } + + for (int J = 0; J < 3; ++J) { + int OpIdx = AMDGPU::getNamedOperandIdx(Opc, Ops[J]); + if (OpIdx == -1) + break; + + uint32_t ModVal = 0; + + if ((OpSel & (1 << J)) != 0) + ModVal |= SISrcMods::OP_SEL_0; + + if ((OpSelHi & (1 << J)) != 0) + ModVal |= SISrcMods::OP_SEL_1; + + if ((NegLo & (1 << J)) != 0) + ModVal |= SISrcMods::NEG; + + if ((NegHi & (1 << J)) != 0) + ModVal |= SISrcMods::NEG_HI; + + int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]); + + Inst.getOperand(ModIdx).setImm(ModVal); + } +} + //===----------------------------------------------------------------------===// // dpp //===----------------------------------------------------------------------===// Index: lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h =================================================================== --- lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -67,6 +67,7 @@ MCOperand decodeOperand_VS_32(unsigned Val) const; MCOperand decodeOperand_VS_64(unsigned Val) const; MCOperand decodeOperand_VSrc16(unsigned Val) const; + MCOperand decodeOperand_VSrcV216(unsigned Val) const; MCOperand decodeOperand_VReg_64(unsigned Val) const; MCOperand decodeOperand_VReg_96(unsigned Val) const; @@ -85,6 +86,7 @@ OPW64, OPW128, OPW16, + OPWV216, OPW_LAST_, OPW_FIRST_ = OPW32 }; Index: lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp =================================================================== --- lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -97,6 +97,14 @@ return addOperand(Inst, DAsm->decodeOperand_VSrc16(Imm)); } +static DecodeStatus decodeOperand_VSrcV216(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + auto DAsm = static_cast(Decoder); + return addOperand(Inst, DAsm->decodeOperand_VSrcV216(Imm)); +} + #define GET_SUBTARGETINFO_ENUM #include "AMDGPUGenSubtargetInfo.inc" #undef GET_SUBTARGETINFO_ENUM @@ -264,6 +272,10 @@ return decodeSrcOp(OPW16, Val); } +MCOperand 
AMDGPUDisassembler::decodeOperand_VSrcV216(unsigned Val) const { + return decodeSrcOp(OPWV216, Val); +} + MCOperand AMDGPUDisassembler::decodeOperand_VGPR_32(unsigned Val) const { // Some instructions have operand restrictions beyond what the encoding // allows. Some ordinarily VSrc_32 operands are VGPR_32, so clear the extra @@ -424,6 +436,7 @@ case OPW64: return MCOperand::createImm(getInlineImmVal64(Imm)); case OPW16: + case OPWV216: return MCOperand::createImm(getInlineImmVal16(Imm)); default: llvm_unreachable("implement me"); @@ -437,6 +450,7 @@ default: // fall case OPW32: case OPW16: + case OPWV216: return VGPR_32RegClassID; case OPW64: return VReg_64RegClassID; case OPW128: return VReg_128RegClassID; @@ -450,6 +464,7 @@ default: // fall case OPW32: case OPW16: + case OPWV216: return SGPR_32RegClassID; case OPW64: return SGPR_64RegClassID; case OPW128: return SGPR_128RegClassID; @@ -463,6 +478,7 @@ default: // fall case OPW32: case OPW16: + case OPWV216: return TTMP_32RegClassID; case OPW64: return TTMP_64RegClassID; case OPW128: return TTMP_128RegClassID; @@ -498,6 +514,7 @@ switch (Width) { case OPW32: case OPW16: + case OPWV216: return decodeSpecialReg32(Val); case OPW64: return decodeSpecialReg64(Val); Index: lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h =================================================================== --- lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h +++ lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h @@ -90,6 +90,8 @@ raw_ostream &O); void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O); + void printImmediateV216(uint32_t Imm, const MCSubtargetInfo &STI, + raw_ostream &O); void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O); void printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI, @@ -117,6 +119,14 @@ const MCSubtargetInfo &STI, raw_ostream &O); void printSDWADstUnused(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printOpSel(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printOpSelHi(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printNegLo(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printNegHi(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printInterpSlot(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printInterpAttr(const MCInst *MI, unsigned OpNo, Index: lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp =================================================================== --- lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -375,6 +375,14 @@ O << formatHex(static_cast(Imm)); } +void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, + const MCSubtargetInfo &STI, + raw_ostream &O) { + uint16_t Lo16 = static_cast(Imm); + assert(Lo16 == static_cast(Imm >> 16)); + printImmediate16(Lo16, STI, O); +} + void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -489,6 +497,10 @@ case AMDGPU::OPERAND_REG_IMM_FP16: printImmediate16(Op.getImm(), STI, O); break; + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + printImmediateV216(Op.getImm(), STI, O); + break; case MCOI::OPERAND_UNKNOWN: case MCOI::OPERAND_PCREL: O << formatDec(Op.getImm()); @@ -730,6 +742,71 @@ } } +static bool allOpsDefaultValue(const int* Ops, int 
NumOps, int Mod) { + int DefaultValue = (Mod == SISrcMods::OP_SEL_1); + + for (int I = 0; I < NumOps; ++I) { + if (!!(Ops[I] & Mod) != DefaultValue) + return false; + } + + return true; +} + +static void printPackedModifier(const MCInst *MI, StringRef Name, unsigned Mod, + raw_ostream &O) { + unsigned Opc = MI->getOpcode(); + int NumOps = 0; + int Ops[3]; + + for (int OpName : { AMDGPU::OpName::src0_modifiers, + AMDGPU::OpName::src1_modifiers, + AMDGPU::OpName::src2_modifiers }) { + int Idx = AMDGPU::getNamedOperandIdx(Opc, OpName); + if (Idx == -1) + break; + + Ops[NumOps++] = MI->getOperand(Idx).getImm(); + } + + if (allOpsDefaultValue(Ops, NumOps, Mod)) + return; + + O << Name; + for (int I = 0; I < NumOps; ++I) { + if (I != 0) + O << ','; + + O << !!(Ops[I] & Mod); + } + + O << ']'; +} + +void AMDGPUInstPrinter::printOpSel(const MCInst *MI, unsigned, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printPackedModifier(MI, " op_sel:[", SISrcMods::OP_SEL_0, O); +} + +void AMDGPUInstPrinter::printOpSelHi(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printPackedModifier(MI, " op_sel_hi:[", SISrcMods::OP_SEL_1, O); +} + +void AMDGPUInstPrinter::printNegLo(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printPackedModifier(MI, " neg_lo:[", SISrcMods::NEG, O); +} + +void AMDGPUInstPrinter::printNegHi(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printPackedModifier(MI, " neg_hi:[", SISrcMods::NEG_HI, O); +} + void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { Index: lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp =================================================================== --- lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -220,15 +220,35 @@ Imm = MO.getImm(); } - switch (AMDGPU::getOperandSize(OpInfo)) { - case 4: + switch (OpInfo.OperandType) { + case AMDGPU::OPERAND_REG_IMM_INT32: + case AMDGPU::OPERAND_REG_IMM_FP32: + case AMDGPU::OPERAND_REG_INLINE_C_INT32: + case AMDGPU::OPERAND_REG_INLINE_C_FP32: return getLit32Encoding(static_cast(Imm), STI); - case 8: + + case AMDGPU::OPERAND_REG_IMM_INT64: + case AMDGPU::OPERAND_REG_IMM_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_INT64: + case AMDGPU::OPERAND_REG_INLINE_C_FP64: return getLit64Encoding(static_cast(Imm), STI); - case 2: + + case AMDGPU::OPERAND_REG_IMM_INT16: + case AMDGPU::OPERAND_REG_IMM_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_INT16: + case AMDGPU::OPERAND_REG_INLINE_C_FP16: // FIXME Is this correct? What do inline immediates do on SI for f16 src // which does not have f16 support? return getLit16Encoding(static_cast(Imm), STI); + + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: { + uint16_t Lo16 = static_cast(Imm); + assert(Lo16 == static_cast(Imm >> 16)); + uint32_t Encoding = getLit16Encoding(Lo16, STI); + assert(Encoding != 255 && "packed constants can only be inline immediates"); + return Encoding; + } default: llvm_unreachable("invalid operand size"); } Index: lib/Target/AMDGPU/SIDefines.h =================================================================== --- lib/Target/AMDGPU/SIDefines.h +++ lib/Target/AMDGPU/SIDefines.h @@ -36,6 +36,7 @@ // TODO: Should this be spilt into VOP3 a and b? 
VOP3 = 1 << 10, + VOP3P = 1 << 12, VINTRP = 1 << 13, SDWA = 1 << 14, @@ -102,12 +103,14 @@ OPERAND_REG_INLINE_C_FP16, OPERAND_REG_INLINE_C_FP32, OPERAND_REG_INLINE_C_FP64, + OPERAND_REG_INLINE_C_V2FP16, + OPERAND_REG_INLINE_C_V2INT16, OPERAND_REG_IMM_FIRST = OPERAND_REG_IMM_INT32, OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_FP16, OPERAND_REG_INLINE_C_FIRST = OPERAND_REG_INLINE_C_INT16, - OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_C_FP64, + OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_C_V2INT16, OPERAND_SRC_FIRST = OPERAND_REG_IMM_INT32, OPERAND_SRC_LAST = OPERAND_REG_INLINE_C_LAST, @@ -125,9 +128,12 @@ // NEG and SEXT share same bit-mask because they can't be set simultaneously. namespace SISrcMods { enum { - NEG = 1 << 0, // Floating-point negate modifier - ABS = 1 << 1, // Floating-point absolute modifier - SEXT = 1 << 0 // Integer sign-extend modifier + NEG = 1 << 0, // Floating-point negate modifier + ABS = 1 << 1, // Floating-point absolute modifier + SEXT = 1 << 0, // Integer sign-extend modifier + NEG_HI = ABS, // Floating-point negate high packed component modifier. + OP_SEL_0 = 1 << 2, + OP_SEL_1 = 1 << 3 }; } Index: lib/Target/AMDGPU/SIInstrFormats.td =================================================================== --- lib/Target/AMDGPU/SIInstrFormats.td +++ lib/Target/AMDGPU/SIInstrFormats.td @@ -31,6 +31,7 @@ field bit VOP2 = 0; field bit VOPC = 0; field bit VOP3 = 0; + field bit VOP3P = 0; field bit VINTRP = 0; field bit SDWA = 0; field bit DPP = 0; @@ -92,6 +93,7 @@ let TSFlags{8} = VOP2; let TSFlags{9} = VOPC; let TSFlags{10} = VOP3; + let TSFlags{12} = VOP3P; let TSFlags{13} = VINTRP; let TSFlags{14} = SDWA; Index: lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.h +++ lib/Target/AMDGPU/SIInstrInfo.h @@ -440,6 +440,14 @@ return get(Opcode).TSFlags & SIInstrFlags::DPP; } + static bool isVOP3P(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VOP3P; + } + + bool isVOP3P(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VOP3P; + } + static bool isScalarUnit(const MachineInstr &MI) { return MI.getDesc().TSFlags & (SIInstrFlags::SALU | SIInstrFlags::SMRD); } Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -458,6 +458,12 @@ let ParserMatchClass = MatchClass; } +class NamedOperandU32Default0 : + OperandWithDefaultOps { + let PrintMethod = "print"#Name; + let ParserMatchClass = MatchClass; +} + let OperandType = "OPERAND_IMMEDIATE" in { def offen : NamedOperandBit<"Offen", NamedMatchClass<"Offen">>; @@ -495,6 +501,11 @@ def src1_sel : NamedOperandU32<"SDWASrc1Sel", NamedMatchClass<"SDWASrc1Sel">>; def dst_unused : NamedOperandU32<"SDWADstUnused", NamedMatchClass<"SDWADstUnused">>; +def op_sel : NamedOperandU32Default0<"OpSel", NamedMatchClass<"OpSel">>; +def op_sel_hi : NamedOperandU32Default0<"OpSelHi", NamedMatchClass<"OpSelHi">>; +def neg_lo : NamedOperandU32Default0<"NegLo", NamedMatchClass<"NegLo">>; +def neg_hi : NamedOperandU32Default0<"NegHi", NamedMatchClass<"NegHi">>; + def hwreg : NamedOperandU16<"Hwreg", NamedMatchClass<"Hwreg", 0>>; def exp_tgt : NamedOperandU8<"ExpTgt", NamedMatchClass<"ExpTgt", 0>> { @@ -534,6 +545,7 @@ let ParserMethod = "parseRegOrImmWithFPInputMods"; let PredicateMethod = "isRegOrImmWithFP"#opSize#"InputMods"; } + def FP16InputModsMatchClass : FPInputModsMatchClass<16>; def 
FP32InputModsMatchClass : FPInputModsMatchClass<32>; def FP64InputModsMatchClass : FPInputModsMatchClass<64>; @@ -586,6 +598,33 @@ let PrintMethod = "printOperandAndIntInputMods"; } +class PackedFPInputModsMatchClass : AsmOperandClass { + let Name = "PackedFP"#opSize#"InputMods"; + let ParserMethod = "parseRegOrImm"; + let PredicateMethod = "isRegOrImm"; +// let PredicateMethod = "isPackedFP"#opSize#"InputMods"; +} + +class PackedIntInputModsMatchClass : AsmOperandClass { + let Name = "PackedInt"#opSize#"InputMods"; + let ParserMethod = "parseRegOrImm"; + let PredicateMethod = "isRegOrImm"; +// let PredicateMethod = "isPackedInt"#opSize#"InputMods"; +} + +def PackedF16InputModsMatchClass : PackedFPInputModsMatchClass<16>; +def PackedI16InputModsMatchClass : PackedIntInputModsMatchClass<16>; + +class PackedFPInputMods : InputMods { +// let PrintMethod = "printPackedFPInputMods"; +} + +class PackedIntInputMods : InputMods { + //let PrintMethod = "printPackedIntInputMods"; +} + +def PackedF16InputMods : PackedFPInputMods; +def PackedI16InputMods : PackedIntInputMods; //===----------------------------------------------------------------------===// // Complex patterns @@ -602,10 +641,13 @@ def VOP3Mods0Clamp0OMod : ComplexPattern; def VOP3Mods : ComplexPattern; def VOP3NoMods : ComplexPattern; - // VOP3Mods, but the input source is known to never be NaN. def VOP3Mods_nnan : ComplexPattern; +def VOP3PMods : ComplexPattern; +def VOP3PMods0 : ComplexPattern; + + //===----------------------------------------------------------------------===// // SI assembler operands //===----------------------------------------------------------------------===// @@ -728,12 +770,34 @@ // instructions for the given VT. class getVOPSrc0ForVT { bit isFP = !if(!eq(VT.Value, f16.Value), 1, + !if(!eq(VT.Value, v2f16.Value), 1, !if(!eq(VT.Value, f32.Value), 1, !if(!eq(VT.Value, f64.Value), 1, - 0))); - RegisterOperand ret = !if(isFP, - !if(!eq(VT.Size, 64), VSrc_f64, !if(!eq(VT.Size, 16), VSrc_f16, VSrc_f32)), - !if(!eq(VT.Size, 64), VSrc_b64, !if(!eq(VT.Size, 16), VSrc_b16, VSrc_b32))); + 0)))); + + RegisterOperand ret = + !if(isFP, + !if(!eq(VT.Size, 64), + VSrc_f64, + !if(!eq(VT.Value, f16.Value), + VSrc_f16, + !if(!eq(VT.Value, v2f16.Value), + VCSrc_v2f16, + VSrc_f32 + ) + ) + ), + !if(!eq(VT.Size, 64), + VSrc_b64, + !if(!eq(VT.Value, i16.Value), + VSrc_b16, + !if(!eq(VT.Value, v2i16.Value), + VCSrc_v2b16, + VSrc_b32 + ) + ) + ) + ); } // Returns the vreg register class to use for source operand given VT @@ -747,25 +811,38 @@ // given VT. class getVOP3SrcForVT { bit isFP = !if(!eq(VT.Value, f16.Value), 1, + !if(!eq(VT.Value, v2f16.Value), 1, !if(!eq(VT.Value, f32.Value), 1, !if(!eq(VT.Value, f64.Value), 1, - 0))); + 0)))); RegisterOperand ret = !if(!eq(VT.Size, 128), - VSrc_128, - !if(!eq(VT.Size, 64), + VSrc_128, + !if(!eq(VT.Size, 64), !if(isFP, - VCSrc_f64, - VCSrc_b64), + VCSrc_f64, + VCSrc_b64), !if(!eq(VT.Value, i1.Value), - SCSrc_b64, - !if(isFP, - !if(!eq(VT.Size, 16), VCSrc_f16, VCSrc_f32), - !if(!eq(VT.Size, 16), VCSrc_b16, VCSrc_b32) - ) - ) - ) - ); + SCSrc_b64, + !if(isFP, + !if(!eq(VT.Value, f16.Value), + VCSrc_f16, + !if(!eq(VT.Value, v2f16.Value), + VCSrc_v2f16, + VCSrc_f32 + ) + ), + !if(!eq(VT.Value, i16.Value), + VCSrc_b16, + !if(!eq(VT.Value, v2i16.Value), + VCSrc_v2b16, + VCSrc_b32 + ) + ) + ) + ) + ) + ); } // Returns 1 if the source arguments have modifiers, 0 if they do not. 
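The op_sel/op_sel_hi/neg_lo/neg_hi operands defined above are parsed as small bit arrays and are only carriers: the assembler's cvtVOP3P folds bit J of each array into srcJ_modifiers using the SISrcMods bits added in SIDefines.h. A minimal standalone sketch of that folding (foldPackedMods is a hypothetical helper, not an LLVM API):

    // Sketch: how one source operand's modifier word is assembled from the
    // parsed op_sel/op_sel_hi/neg_lo/neg_hi arrays, mirroring cvtVOP3P.
    // The bit values match the SISrcMods enum in this patch.
    #include <cstdint>

    namespace SISrcMods {
    enum : uint32_t {
      NEG      = 1u << 0, // neg_lo (shares a bit with SEXT)
      NEG_HI   = 1u << 1, // shares a bit with ABS
      OP_SEL_0 = 1u << 2,
      OP_SEL_1 = 1u << 3
    };
    } // namespace SISrcMods

    // foldPackedMods is a hypothetical helper, not an LLVM API.
    static uint32_t foldPackedMods(unsigned J, unsigned OpSel, unsigned OpSelHi,
                                   unsigned NegLo, unsigned NegHi) {
      uint32_t ModVal = 0;
      if (OpSel   & (1u << J)) ModVal |= SISrcMods::OP_SEL_0;
      if (OpSelHi & (1u << J)) ModVal |= SISrcMods::OP_SEL_1;
      if (NegLo   & (1u << J)) ModVal |= SISrcMods::NEG;
      if (NegHi   & (1u << J)) ModVal |= SISrcMods::NEG_HI;
      return ModVal;
    }

    int main() {
      // "op_sel:[1,0] op_sel_hi:[1,1] neg_lo:[0,1]" gives, for src1 (J = 1),
      // OP_SEL_1 | NEG in src1_modifiers.
      return foldPackedMods(1, 0b01, 0b11, 0b10, 0b00) ==
                 (SISrcMods::OP_SEL_1 | SISrcMods::NEG) ? 0 : 1;
    }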
@@ -775,7 +852,8 @@ !if(!eq(SrcVT.Value, f16.Value), 1, !if(!eq(SrcVT.Value, f32.Value), 1, !if(!eq(SrcVT.Value, f64.Value), 1, - 0))); + !if(!eq(SrcVT.Value, v2f16.Value), 1, + 0)))); } class isIntType { @@ -786,6 +864,23 @@ 0))); } +class isPackedType { + bit ret = + !if(!eq(SrcVT.Value, v2i16.Value), 1, + !if(!eq(SrcVT.Value, v2f16.Value), 1, 0) + ); +} + +// Float or packed int +class isModifierType { + bit ret = + !if(!eq(SrcVT.Value, f16.Value), 1, + !if(!eq(SrcVT.Value, f32.Value), 1, + !if(!eq(SrcVT.Value, f64.Value), 1, + !if(!eq(SrcVT.Value, v2f16.Value), 1, + !if(!eq(SrcVT.Value, v2i16.Value), 1, + 0))))); +} // Return type of input modifiers operand for specified input operand class getSrcMod { @@ -793,6 +888,7 @@ !if(!eq(VT.Value, f32.Value), 1, !if(!eq(VT.Value, f64.Value), 1, 0))); + bit isPacked = isPackedType.ret; Operand ret = !if(!eq(VT.Size, 64), !if(isFP, FP64InputMods, Int64InputMods), !if(isFP, @@ -823,8 +919,8 @@ // Returns the input arguments for VOP3 instructions for the given SrcVT. class getIns64 { + bit HasModifiers, bit HasOMod, + Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> { dag ret = !if (!eq(NumSrcArgs, 0), @@ -843,9 +939,13 @@ !if (!eq(NumSrcArgs, 2), !if (!eq(HasModifiers, 1), // VOP 2 with modifiers - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, - Src1Mod:$src1_modifiers, Src1RC:$src1, - clampmod:$clamp, omod:$omod) + !if( !eq(HasOMod, 1), + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + clampmod:$clamp, omod:$omod), + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + clampmod:$clamp)) /* else */, // VOP2 without modifiers (ins Src0RC:$src0, Src1RC:$src1) @@ -853,16 +953,57 @@ /* NumSrcArgs == 3 */, !if (!eq(HasModifiers, 1), // VOP3 with modifiers - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, - Src1Mod:$src1_modifiers, Src1RC:$src1, - Src2Mod:$src2_modifiers, Src2RC:$src2, - clampmod:$clamp, omod:$omod) + !if (!eq(HasOMod, 1), + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + Src2Mod:$src2_modifiers, Src2RC:$src2, + clampmod:$clamp, omod:$omod), + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + Src2Mod:$src2_modifiers, Src2RC:$src2, + clampmod:$clamp)) /* else */, // VOP3 without modifiers (ins Src0RC:$src0, Src1RC:$src1, Src2RC:$src2) /* endif */ )))); } +/// XXX - src1 may only allow VGPRs? + +// The modifiers (except clamp) are dummy operands for the benefit of +// printing and parsing. They defer their values to looking at the +// srcN_modifiers for what to print. 
+class getInsVOP3P { + dag ret = !if (!eq(NumSrcArgs, 2), + !if (HasClamp, + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + clampmod:$clamp, + op_sel:$op_sel, op_sel_hi:$op_sel_hi, + neg_lo:$neg_lo, neg_hi:$neg_hi), + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + op_sel:$op_sel, op_sel_hi:$op_sel_hi, + neg_lo:$neg_lo, neg_hi:$neg_hi)), + // else NumSrcArgs == 3 + !if (HasClamp, + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + Src2Mod:$src2_modifiers, Src2RC:$src2, + clampmod:$clamp, + op_sel:$op_sel, op_sel_hi:$op_sel_hi, + neg_lo:$neg_lo, neg_hi:$neg_hi), + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + Src2Mod:$src2_modifiers, Src2RC:$src2, + op_sel:$op_sel, op_sel_hi:$op_sel_hi, + neg_lo:$neg_lo, neg_hi:$neg_hi)) + ); +} + class getInsDPP { @@ -946,7 +1087,8 @@ // Returns the assembly string for the inputs and outputs of a VOP3 // instruction. -class getAsm64 { +class getAsm64 { string dst = !if(!eq(DstVT.Size, 1), "$sdst", "$vdst"); // use $sdst for VOPC string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); string src1 = !if(!eq(NumSrcArgs, 1), "", @@ -956,7 +1098,26 @@ string ret = !if(!eq(HasModifiers, 0), getAsm32.ret, - dst#", "#src0#src1#src2#"$clamp"#"$omod"); + dst#", "#src0#src1#src2#"$clamp"#!if(HasOMod, "$omod", "")); +} + +// Returns the assembly string for the inputs and outputs of a VOP3P +// instruction. +class getAsmVOP3P { + string dst = " $vdst"; + string src0 = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,"); + string src1 = !if(!eq(NumSrcArgs, 1), "", + !if(!eq(NumSrcArgs, 2), " $src1", + " $src1,")); + string src2 = !if(!eq(NumSrcArgs, 3), " $src2", ""); + + string mods = !if(HasModifiers, "$neg_lo$neg_hi", ""); + string clamp = !if(HasClamp, "$clamp", ""); + + // Each modifier is printed as an array of bits for each operand, so + // all operands are printed as part of src0_modifiers. 
+ string ret = dst#", "#src0#src1#src2#"$op_sel$op_sel_hi"#mods#clamp; } class getAsmDPP { @@ -1068,7 +1229,7 @@ field bit HasSrc2 = !if(!eq(Src2VT.Value, untyped.Value), 0, 1); // TODO: Modifiers logic is somewhat adhoc here, to be refined later - field bit HasModifiers = isFloatType.ret; + field bit HasModifiers = isModifierType.ret; field bit HasSrc0FloatMods = isFloatType.ret; field bit HasSrc1FloatMods = isFloatType.ret; @@ -1082,12 +1243,18 @@ field bit HasSrc1Mods = !if(HasModifiers, BitOr.ret, 0); field bit HasSrc2Mods = !if(HasModifiers, BitOr.ret, 0); - field bit HasOMod = HasModifiers; field bit HasClamp = HasModifiers; field bit HasSDWAClamp = HasSrc0; + field bit IsPacked = isPackedType.ret; + field bit HasOpSel = IsPacked; + field bit HasOMod = !if(HasOpSel, 0, HasModifiers); field bit HasExt = getHasExt.ret; + field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods); + field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods); + field Operand Src2PackedMod = !if(HasSrc2FloatMods, PackedF16InputMods, PackedI16InputMods); + field dag Outs = !if(HasDst,(outs DstRC:$vdst),(outs)); // VOP3b instructions are a special case with a second explicit @@ -1099,7 +1266,12 @@ field dag Ins32 = getIns32.ret; field dag Ins64 = getIns64.ret; + HasModifiers, HasOMod, Src0Mod, Src1Mod, + Src2Mod>.ret; + field dag InsVOP3P = getInsVOP3P.ret; + field dag InsDPP = getInsDPP.ret; field dag InsSDWA = getInsSDWA.ret; field string Asm32 = getAsm32.ret; - field string Asm64 = getAsm64.ret; + field string Asm64 = getAsm64.ret; + field string AsmVOP3P = getAsmVOP3P.ret; field string AsmDPP = getAsmDPP.ret; field string AsmSDWA = getAsmSDWA.ret; } @@ -1128,6 +1301,13 @@ def VOP_I16_I16_I16_I16 : VOPProfile <[i32, i32, i32, i32, untyped]>; def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>; +def VOP_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, untyped]>; +def VOP_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, untyped]>; +def VOP_B32_F16_F16 : VOPProfile <[i32, f16, f16, untyped]>; + +def VOP_V2F16_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, v2f16]>; +def VOP_V2I16_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, v2i16]>; + def VOP_NONE : VOPProfile <[untyped, untyped, untyped, untyped]>; def VOP_F32_F32 : VOPProfile <[f32, f32, untyped, untyped]>; Index: lib/Target/AMDGPU/SIInstructions.td =================================================================== --- lib/Target/AMDGPU/SIInstructions.td +++ lib/Target/AMDGPU/SIInstructions.td @@ -606,6 +606,12 @@ def : BitConvert ; def : BitConvert ; def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; // 64-bit bitcast def : BitConvert ; Index: lib/Target/AMDGPU/SIRegisterInfo.td =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.td +++ lib/Target/AMDGPU/SIRegisterInfo.td @@ -133,7 +133,7 @@ // TODO: Do we need to set DwarfRegAlias on register tuples? // SGPR 32-bit registers -def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, +def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add (sequence "SGPR%u", 0, 103))> { // Give all SGPR classes higher priority than VGPR classes, because // we want to spill SGPRs to VGPRs. 
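The register-class changes above (and the bitconverts added in SIInstructions.td) work because a v2i16/v2f16 value is simply one 32-bit SGPR/VGPR holding two 16-bit elements. A small sketch of that layout, assuming the low element occupies bits 15:0 as in the Lo16/Hi16 handling elsewhere in this patch (packV2x16/unpackV2x16 are hypothetical helpers):

    #include <cstdint>
    #include <utility>

    // Two 16-bit elements share one 32-bit register: element 0 in bits 15:0,
    // element 1 in bits 31:16 (the Lo16/Hi16 convention used by
    // printImmediateV216 and isInlinableLiteralV216 in this patch).
    static uint32_t packV2x16(uint16_t Elt0, uint16_t Elt1) {
      return (uint32_t(Elt1) << 16) | Elt0;
    }

    static std::pair<uint16_t, uint16_t> unpackV2x16(uint32_t V) {
      return {uint16_t(V), uint16_t(V >> 16)};
    }

    int main() {
      // Splatting an f16 1.0 (0x3C00) into both halves gives 0x3C003C00,
      // the value a packed inline-constant operand expands to.
      return packV2x16(0x3C00, 0x3C00) == 0x3C003C00u ? 0 : 1;
    }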
@@ -184,7 +184,7 @@ (add (decimate (shl SGPR_32, 15), 4))]>; // Trap handler TMP 32-bit registers -def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32], 32, +def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32, (add (sequence "TTMP%u", 0, 11))> { let isAllocatable = 0; } @@ -202,7 +202,8 @@ (add (decimate (shl TTMP_32, 3), 4))]>; // VGPR 32-bit registers -def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, +// i16/f16 only on VI+ +def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add (sequence "VGPR%u", 0, 255))> { let AllocationPriority = 1; let Size = 32; @@ -263,7 +264,7 @@ // Subset of SReg_32 without M0 for SMRD instructions and alike. // See comments in SIInstructions.td for more info. -def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, +def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT)> { @@ -276,7 +277,7 @@ } // Register class for all scalar registers (SGPRs + Special Registers) -def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, +def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI)> { let AllocationPriority = 7; } @@ -372,7 +373,7 @@ let Size = 32; } -def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, +def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add VGPR_32, SReg_32)> { let isAllocatable = 0; } @@ -423,6 +424,18 @@ let OperandType = opType#"_FP64"; let ParserMatchClass = RegImmMatcher; } + + def _v2b16 : RegisterOperand(rc#"_32")> { + let OperandType = opType#"_V2INT16"; + let ParserMatchClass = RegImmMatcher; + let DecoderMethod = "decodeOperand_VSrcV216"; + } + + def _v2f16 : RegisterOperand(rc#"_32")> { + let OperandType = opType#"_V2FP16"; + let ParserMatchClass = RegImmMatcher; + let DecoderMethod = "decodeOperand_VSrcV216"; + } } } Index: lib/Target/AMDGPU/SOPInstructions.td =================================================================== --- lib/Target/AMDGPU/SOPInstructions.td +++ lib/Target/AMDGPU/SOPInstructions.td @@ -438,6 +438,11 @@ def S_ABSDIFF_I32 : SOP2_32 <"s_absdiff_i32">; } // End Defs = [SCC] +let SubtargetPredicate = isGFX9 in { + def S_PACK_LL_B32_B16 : SOP2_32<"s_pack_ll_b32_b16">; + def S_PACK_LH_B32_B16 : SOP2_32<"s_pack_lh_b32_b16">; + def S_PACK_HH_B32_B16 : SOP2_32<"s_pack_hh_b32_b16">; +} //===----------------------------------------------------------------------===// // SOPK Instructions @@ -1207,6 +1212,9 @@ def S_BFE_I64_vi : SOP2_Real_vi <0x28, S_BFE_I64>; def S_CBRANCH_G_FORK_vi : SOP2_Real_vi <0x29, S_CBRANCH_G_FORK>; def S_ABSDIFF_I32_vi : SOP2_Real_vi <0x2a, S_ABSDIFF_I32>; +def S_PACK_LL_B32_B16_vi : SOP2_Real_vi <0x32, S_PACK_LL_B32_B16>; +def S_PACK_LH_B32_B16_vi : SOP2_Real_vi <0x33, S_PACK_LH_B32_B16>; +def S_PACK_HH_B32_B16_vi : SOP2_Real_vi <0x34, S_PACK_HH_B32_B16>; def S_MOVK_I32_vi : SOPK_Real_vi <0x00, S_MOVK_I32>; def S_CMOVK_I32_vi : SOPK_Real_vi <0x01, S_CMOVK_I32>; Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h =================================================================== --- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -301,6 +301,8 @@ case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_INLINE_C_INT16: case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case 
AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: return 2; default: @@ -323,6 +325,9 @@ LLVM_READNONE bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi); +LLVM_READNONE +bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi); + bool isUniformMMO(const MachineMemOperand *MMO); /// \returns The encoding that will be used for \p ByteOffset in the SMRD Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp =================================================================== --- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -564,6 +564,7 @@ case AMDGPU::OPERAND_REG_INLINE_C_FP32: case AMDGPU::OPERAND_REG_INLINE_C_FP64: case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: return true; default: return false; @@ -682,6 +683,14 @@ Val == 0x3118; // 1/2pi } +bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi) { + assert(HasInv2Pi); + + int16_t Lo16 = static_cast(Literal); + int16_t Hi16 = static_cast(Literal >> 16); + return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi); +} + bool isUniformMMO(const MachineMemOperand *MMO) { const Value *Ptr = MMO->getValue(); // UndefValue means this is a load of a kernel input. These are uniform. Index: lib/Target/AMDGPU/VOP1Instructions.td =================================================================== --- lib/Target/AMDGPU/VOP1Instructions.td +++ lib/Target/AMDGPU/VOP1Instructions.td @@ -237,7 +237,7 @@ src0_sel:$src0_sel); let Asm32 = getAsm32<1, 1>.ret; - let Asm64 = getAsm64<1, 1, 0>.ret; + let Asm64 = getAsm64<1, 1, 0, 1>.ret; let AsmDPP = getAsmDPP<1, 1, 0>.ret; let AsmSDWA = getAsmSDWA<1, 1, 0>.ret; Index: lib/Target/AMDGPU/VOP2Instructions.td =================================================================== --- lib/Target/AMDGPU/VOP2Instructions.td +++ lib/Target/AMDGPU/VOP2Instructions.td @@ -182,7 +182,7 @@ class VOP_MAC : VOPProfile <[vt, vt, vt, vt]> { let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2); let Ins64 = getIns64, 3, - HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret; + HasModifiers, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret; let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1ModDPP:$src1_modifiers, Src1DPP:$src1, VGPR_32:$src2, // stub argument @@ -194,6 +194,7 @@ clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel, src1_sel:$src1_sel); let Asm32 = getAsm32<1, 2, vt>.ret; + let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, vt>.ret; let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt>.ret; let AsmSDWA = getAsmSDWA<1, 2, HasModifiers, vt>.ret; let HasSrc2 = 0; @@ -204,13 +205,13 @@ def VOP_MAC_F16 : VOP_MAC { // FIXME: Move 'Asm64' definition to VOP_MAC, and use 'vt'. Currently it gives // 'not a string initializer' error. - let Asm64 = getAsm64<1, 2, HasModifiers, f16>.ret; + let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, f16>.ret; } def VOP_MAC_F32 : VOP_MAC { // FIXME: Move 'Asm64' definition to VOP_MAC, and use 'vt'. Currently it gives // 'not a string initializer' error. - let Asm64 = getAsm64<1, 2, HasModifiers, f32>.ret; + let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, f32>.ret; } // Write out to vcc or arbitrary SGPR. 
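isInlinableLiteralV216 above captures the rule the new literalv216 tests exercise: a packed v2i16/v2f16 operand can only be an inline constant when both 16-bit halves are identical and that half is itself a legal 16-bit inline constant. A self-contained sketch of the same check, where isInlinableHalf stands in for AMDGPU::isInlinableLiteral16 with the 1/(2*pi) constant assumed available, as on gfx9:

    #include <cstdint>

    static bool isInlinableHalf(int16_t V) {
      if (V >= -16 && V <= 64)          // small integers
        return true;
      switch (uint16_t(V)) {            // f16 bit patterns
      case 0x3800: case 0xB800:         // +-0.5
      case 0x3C00: case 0xBC00:         // +-1.0
      case 0x4000: case 0xC000:         // +-2.0
      case 0x4400: case 0xC400:         // +-4.0
      case 0x3118:                      // 1/(2*pi)
        return true;
      default:
        return false;
      }
    }

    static bool isInlinableV216(int32_t Literal) {
      int16_t Lo16 = int16_t(Literal);
      int16_t Hi16 = int16_t(Literal >> 16);
      return Lo16 == Hi16 && isInlinableHalf(Lo16);
    }

    int main() {
      // 1.0 splatted into both halves is inline; 65 in both halves is not,
      // matching the literalv216-err.s case that rejects 65.
      return (isInlinableV216(0x3C003C00) && !isInlinableV216(0x00410041)) ? 0 : 1;
    }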
Index: lib/Target/AMDGPU/VOP3Instructions.td =================================================================== --- lib/Target/AMDGPU/VOP3Instructions.td +++ lib/Target/AMDGPU/VOP3Instructions.td @@ -29,6 +29,26 @@ ret1)); } +class getVOP3PModPat { + list ret3 = [(set P.DstVT:$vdst, + (node (P.Src0VT !if(P.HasClamp, (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp), + (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers))), + (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers)), + (P.Src2VT (VOP3PMods P.Src2VT:$src2, i32:$src2_modifiers))))]; + + list ret2 = [(set P.DstVT:$vdst, + (node !if(P.HasClamp, (P.Src0VT (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)), + (P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers))), + (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers))))]; + + list ret1 = [(set P.DstVT:$vdst, + (node (P.Src0VT (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))]; + + list ret = !if(!eq(P.NumSrcArgs, 3), ret3, + !if(!eq(P.NumSrcArgs, 2), ret2, + ret1)); +} + class getVOP3Pat { list ret3 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2))]; list ret2 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]; @@ -263,6 +283,10 @@ } // End Predicates = [isVI] +let SubtargetPredicate = isGFX9 in { +def V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile>; +} + //===----------------------------------------------------------------------===// // Target @@ -449,3 +473,5 @@ defm V_LSHRREV_B64 : VOP3_Real_vi <0x290>; defm V_ASHRREV_I64 : VOP3_Real_vi <0x291>; defm V_TRIG_PREOP_F64 : VOP3_Real_vi <0x292>; + +defm V_PACK_B32_F16 : VOP3_Real_vi <0x2a0>; Index: lib/Target/AMDGPU/VOP3PInstructions.td =================================================================== --- /dev/null +++ lib/Target/AMDGPU/VOP3PInstructions.td @@ -0,0 +1,82 @@ +//===-- VOP3PInstructions.td - Vector Instruction Defintions --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// VOP3P Classes +//===----------------------------------------------------------------------===// + +class VOP3PInst : + VOP3P_Pseudo.ret, getVOP3Pat.ret) +>; + +// Non-packed instructions that use the VOP3P encoding. i.e. where +// omod/abs are used. 
+class VOP3_VOP3PInst : + VOP3P_Pseudo.ret, getVOP3Pat.ret) +>; + +let isCommutable = 1 in { +def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile>; +def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile>; +def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile>; +def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile>; +def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile>; + +def V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile>; +def V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile>; +def V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile>; +def V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3_Profile>; + +def V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3_Profile>; +def V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3_Profile>; +def V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3_Profile>; +def V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile>; +} + +def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile>; +def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile>; +def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile>; + +// XXX - Commutable? +def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile>; +def V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3_Profile>; +def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile>; + + +multiclass VOP3P_Real_vi op> { + def _vi : VOP3P_Real(NAME), SIEncodingFamily.VI>, + VOP3Pe (NAME).Pfl> { + let AssemblerPredicates = [HasVOP3PInsts]; + let DecoderNamespace = "VI"; + } +} + +defm V_PK_MUL_LO_U16 : VOP3P_Real_vi <0x381>; +defm V_PK_ADD_I16 : VOP3P_Real_vi <0x382>; +defm V_PK_SUB_I16 : VOP3P_Real_vi <0x383>; +defm V_PK_LSHLREV_B16 : VOP3P_Real_vi <0x384>; +defm V_PK_LSHRREV_B16 : VOP3P_Real_vi <0x385>; +defm V_PK_ASHRREV_I16 : VOP3P_Real_vi <0x386>; +defm V_PK_MAX_I16 : VOP3P_Real_vi <0x387>; +defm V_PK_MIN_I16 : VOP3P_Real_vi <0x388>; + +defm V_PK_ADD_U16 : VOP3P_Real_vi <0x38a>; +defm V_PK_MAX_U16 : VOP3P_Real_vi <0x38c>; +defm V_PK_MIN_U16 : VOP3P_Real_vi <0x38d>; +defm V_PK_FMA_F16 : VOP3P_Real_vi <0x38e>; +defm V_PK_ADD_F16 : VOP3P_Real_vi <0x38f>; +defm V_PK_MUL_F16 : VOP3P_Real_vi <0x390>; +defm V_PK_MIN_F16 : VOP3P_Real_vi <0x391>; +defm V_PK_MAX_F16 : VOP3P_Real_vi <0x392>; + +defm V_MAD_MIX_F32 : VOP3P_Real_vi <0x3a0>; +defm V_MAD_MIXLO_F16 : VOP3P_Real_vi <0x3a1>; +defm V_MAD_MIXHI_F16 : VOP3P_Real_vi <0x3a2>; Index: lib/Target/AMDGPU/VOPInstructions.td =================================================================== --- lib/Target/AMDGPU/VOPInstructions.td +++ lib/Target/AMDGPU/VOPInstructions.td @@ -68,8 +68,9 @@ let hasPostISelHook = 1; } -class VOP3_Pseudo pattern=[], bit VOP3Only = 0> : - InstSI , +class VOP3_Pseudo pattern = [], + bit VOP3Only = 0, bit isVOP3P = 0> : + InstSI , VOP , SIMCInstr, MnemonicAlias { @@ -79,7 +80,7 @@ let UseNamedOperandTable = 1; string Mnemonic = opName; - string AsmOperands = P.Asm64; + string AsmOperands = !if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64); let Size = 8; let mayLoad = 0; @@ -105,18 +106,24 @@ let AsmVariantName = AMDGPUAsmVariants.VOP3; let AsmMatchConverter = !if(!eq(VOP3Only,1), - "cvtVOP3", + !if(!and(P.IsPacked, isVOP3P), "cvtVOP3P", "cvtVOP3"), !if(!eq(P.HasModifiers, 1), "cvtVOP3_2_mod", "")); VOPProfile Pfl = P; } +class VOP3P_Pseudo pattern = []> : + VOP3_Pseudo { + let VOP3P = 1; +} + class VOP3_Real : InstSI , SIMCInstr { let isPseudo = 0; let isCodeGenOnly = 0; + let UseNamedOperandTable = 1; let Constraints = ps.Constraints; let DisableEncoding = ps.DisableEncoding; @@ 
-130,6 +137,11 @@
   let TSFlags = ps.TSFlags;
 }
 
+// XXX - Is there any reason to distinguish this from regular VOP3
+// here?
+class VOP3P_Real<VOP3_Pseudo ps, int EncodingFamily> :
+  VOP3_Real<ps, EncodingFamily>;
+
 class VOP3a <VOPProfile P> : Enc64 {
   bits<2> src0_modifiers;
   bits<9> src0;
@@ -197,6 +209,42 @@
   let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0);
 }
 
+class VOP3Pe <bits<10> op, VOPProfile P> : Enc64 {
+  bits<8> vdst;
+  // neg, neg_hi, op_sel put in srcN_modifiers
+  bits<4> src0_modifiers;
+  bits<9> src0;
+  bits<4> src1_modifiers;
+  bits<9> src1;
+  bits<4> src2_modifiers;
+  bits<9> src2;
+  bits<1> clamp;
+
+  let Inst{7-0} = vdst;
+  let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0
+  let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // neg_hi src1
+  let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); // neg_hi src2
+
+  let Inst{11} = !if(P.HasOpSel, src0_modifiers{2}, 0); // op_sel(0)
+  let Inst{12} = !if(P.HasOpSel, src1_modifiers{2}, 0); // op_sel(1)
+  let Inst{13} = !if(P.HasOpSel, src2_modifiers{2}, 0); // op_sel(2)
+
+  let Inst{14} = !if(P.HasOpSel, src2_modifiers{3}, 0); // op_sel_hi(2)
+
+  let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
+
+  let Inst{25-16} = op;
+  let Inst{31-26} = 0x34; //encoding
+  let Inst{40-32} = !if(P.HasSrc0, src0, 0);
+  let Inst{49-41} = !if(P.HasSrc1, src1, 0);
+  let Inst{58-50} = !if(P.HasSrc2, src2, 0);
+  let Inst{59} = !if(P.HasOpSel, src0_modifiers{3}, 0); // op_sel_hi(0)
+  let Inst{60} = !if(P.HasOpSel, src1_modifiers{3}, 0); // op_sel_hi(1)
+  let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // neg (lo)
+  let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); // neg (lo)
+  let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo)
+}
+
 class VOP3be_si <bits<9> op, VOPProfile P> : VOP3be<P>

{ let Inst{25-17} = op; }
@@ -348,3 +396,4 @@
 include "VOP1Instructions.td"
 include "VOP2Instructions.td"
 include "VOP3Instructions.td"
+include "VOP3PInstructions.td"
Index: test/MC/AMDGPU/literalv216-err.s
===================================================================
--- /dev/null
+++ test/MC/AMDGPU/literalv216-err.s
@@ -0,0 +1,22 @@
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx901 -show-encoding %s 2>&1 | FileCheck -check-prefix=GFX9 %s
+
+v_pk_add_f16 v1, -17, v2
+// GFX9: :19: error: invalid operand for instruction
+
+v_pk_add_f16 v1, 65, v2
+// GFX9: :18: error: invalid operand for instruction
+
+v_pk_add_f16 v1, 64.0, v2
+// GFX9: :18: error: invalid operand for instruction
+
+v_pk_add_f16 v1, -0.15915494, v2
+// GFX9: :19: error: invalid operand for instruction
+
+v_pk_add_f16 v1, -0.0, v2
+// GFX9: :19: error: invalid operand for instruction
+
+v_pk_add_f16 v1, -32768, v2
+// GFX9: :19: error: invalid operand for instruction
+
+v_pk_add_f16 v1, 32767, v2
+// GFX9: :18: error: invalid operand for instruction
Index: test/MC/AMDGPU/literalv216.s
===================================================================
--- /dev/null
+++ test/MC/AMDGPU/literalv216.s
@@ -0,0 +1,112 @@
+// RUN: llvm-mc -arch=amdgcn -mcpu=gfx901 -show-encoding %s | FileCheck -check-prefix=GFX9 %s
+
+v_pk_add_f16 v1, 0, v2
+// GFX9: v_pk_add_f16 v1, 0, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0x80,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 0.0, v2
+// GFX9: v_pk_add_f16 v1, 0, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0x80,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, v2, 0
+// GFX9: v_pk_add_f16 v1, v2, 0 ; encoding: [0x01,0x00,0x8f,0xd3,0x02,0x01,0x01,0x18]
+
+v_pk_add_f16 v1, v2, 0.0
+// GFX9: v_pk_add_f16 v1, v2, 0 ; encoding: [0x01,0x00,0x8f,0xd3,0x02,0x01,0x01,0x18]
+
+v_pk_add_f16 v1, 1.0, v2
+// GFX9: v_pk_add_f16 v1, 1.0, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf2,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, -1.0, v2
+// GFX9: v_pk_add_f16 v1, -1.0, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf3,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, -0.5, v2
+// GFX9: v_pk_add_f16 v1, -0.5, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf1,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 0.5, v2
+// GFX9: v_pk_add_f16 v1, 0.5, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf0,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 2.0, v2
+// GFX9: v_pk_add_f16 v1, 2.0, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf4,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, -2.0, v2
+// GFX9: v_pk_add_f16 v1, -2.0, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf5,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 4.0, v2
+// GFX9: v_pk_add_f16 v1, 4.0, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf6,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, -4.0, v2
+// GFX9: v_pk_add_f16 v1, -4.0, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf7,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 0.15915494, v2
+// GFX9: v_pk_add_f16 v1, 0.15915494, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf8,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, -1, v2
+// GFX9: v_pk_add_f16 v1, -1, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xc1,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, -2, v2
+// GFX9: v_pk_add_f16 v1, -2, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xc2,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, -3, v2
+// GFX9: v_pk_add_f16 v1, -3, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xc3,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, -16, v2
+// GFX9: v_pk_add_f16 v1, -16, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xd0,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 1, v2
+// GFX9: v_pk_add_f16 v1, 1, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0x81,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 2, v2
+// GFX9: v_pk_add_f16 v1, 2, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0x82,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 3, v2
+// GFX9: v_pk_add_f16 v1, 3, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0x83,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 4, v2
+// GFX9: v_pk_add_f16 v1, 4, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0x84,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 15, v2
+// GFX9: v_pk_add_f16 v1, 15, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0x8f,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 16, v2
+// GFX9: v_pk_add_f16 v1, 16, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0x90,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 63, v2
+// GFX9: v_pk_add_f16 v1, 63, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xbf,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 64, v2
+// GFX9: v_pk_add_f16 v1, 64, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xc0,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 0x0001, v2
+// GFX9: v_pk_add_f16 v1, 1, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0x81,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 0xffff, v2
+// GFX9: v_pk_add_f16 v1, -1, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xc1,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 0x3c00, v2
+// GFX9: v_pk_add_f16 v1, 1.0, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf2,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 0xbc00, v2
+// GFX9: v_pk_add_f16 v1, -1.0, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf3,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 0x3800, v2
+// GFX9: v_pk_add_f16 v1, 0.5, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf0,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 0xb800, v2
+// GFX9: v_pk_add_f16 v1, -0.5, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf1,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 0x4000, v2
+// GFX9: v_pk_add_f16 v1, 2.0, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf4,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 0xc000, v2
+// GFX9: v_pk_add_f16 v1, -2.0, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf5,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 0x4400, v2
+// GFX9: v_pk_add_f16 v1, 4.0, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf6,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 0xc400, v2
+// GFX9: v_pk_add_f16 v1, -4.0, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf7,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 0x3118, v2
+// GFX9: v_pk_add_f16 v1, 0.15915494, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf8,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 65535, v2
+// GFX9: v_pk_add_f16 v1, -1, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xc1,0x04,0x02,0x18]
Index: test/MC/AMDGPU/vop3p-err.s
===================================================================
--- /dev/null
+++ test/MC/AMDGPU/vop3p-err.s
@@ -0,0 +1,113 @@
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx901 -show-encoding %s 2>&1 | FileCheck -check-prefix=GFX9 %s
+
+// GFX9: 31: error: failed parsing operand.
+v_pk_add_u16 v1, v2, v3 op_sel
+
+// GFX9: 32: error: failed parsing operand.
+v_pk_add_u16 v1, v2, v3 op_sel:
+
+// GFX9: 33: error: failed parsing operand.
+v_pk_add_u16 v1, v2, v3 op_sel:[
+
+// GFX9: 33: error: failed parsing operand.
+v_pk_add_u16 v1, v2, v3 op_sel:[]
+
+// GFX9: 34: error: failed parsing operand.
+v_pk_add_u16 v1, v2, v3 op_sel:[,]
+
+// XXGFX9: 34: error: failed parsing operand.
+// v_pk_add_u16 v1, v2, v3 op_sel:[0]
+
+// GFX9: 35: error: failed parsing operand.
+v_pk_add_u16 v1, v2, v3 op_sel:[0,]
+
+// XXGFX9: 36: error: failed parsing operand.
+// v_pk_add_u16 v1, v2, v3 op_sel:[,0]
+
+// GFX9: 36: error: failed parsing operand.
+v_pk_add_u16 v1, v2, v3 op_sel:[0,2]
+
+// GFX9: 35: error: failed parsing operand.
+v_pk_add_u16 v1, v2, v3 op_sel:[2,0]
+
+// GFX9: 33: error: failed parsing operand.
+v_pk_add_u16 v1, v2, v3 op_sel:[-1,0]
+
+// GFX9: 35: error: failed parsing operand.
+v_pk_add_u16 v1, v2, v3 op_sel:[0,-1]
+
+// GFX9: 40: error: not a valid operand.
+v_pk_add_u16 v1, v2, v3 op_sel:[0,0,0,0]
+
+// XXGFX9: invalid operand for instruction
+v_pk_add_u16 v1, v2, v3 neg_lo:[0,0]
+
+//
+// Regular modifiers on packed instructions
+//
+
+// FIXME: should be invalid operand for instruction
+// GFX9: :18: error: not a valid operand.
+v_pk_add_f16 v1, |v2|, v3
+
+// GFX9: :21: error: not a valid operand.
+v_pk_add_f16 v1, abs(v2), v3
+
+// GFX9: :22: error: not a valid operand.
+v_pk_add_f16 v1, v2, |v3|
+
+// GFX9: :25: error: not a valid operand.
+v_pk_add_f16 v1, v2, abs(v3)
+
+// GFX9: :19: error: invalid operand for instruction
+v_pk_add_f16 v1, -v2, v3
+
+// GFX9: :23: error: invalid operand for instruction
+v_pk_add_f16 v1, v2, -v3
+
+// GFX9: :21: error: not a valid operand.
+v_pk_add_u16 v1, abs(v2), v3
+
+// GFX9: :19: error: invalid operand for instruction
+v_pk_add_u16 v1, -v2, v3
+
+
+//
+// Packed operands on the non-packed VOP3P instructions
+//
+
+// GFX9: invalid operand for instruction
+v_mad_mix_f32 v1, v2, v3, v4 op_sel:[0,0,0]
+
+// GFX9: invalid operand for instruction
+v_mad_mix_f32 v1, v2, v3, v4 op_sel_hi:[0,0,0]
+
+// GFX9: invalid operand for instruction
+v_mad_mix_f32 v1, v2, v3, v4 neg_lo:[0,0,0]
+
+// GFX9: invalid operand for instruction
+v_mad_mix_f32 v1, v2, v3, v4 neg_hi:[0,0,0]
+
+// GFX9: invalid operand for instruction
+v_mad_mixlo_f16 v1, v2, v3, v4 op_sel:[0,0,0]
+
+// GFX9: invalid operand for instruction
+v_mad_mixlo_f16 v1, v2, v3, v4 op_sel_hi:[0,0,0]
+
+// GFX9: invalid operand for instruction
+v_mad_mixlo_f16 v1, v2, v3, v4 neg_lo:[0,0,0]
+
+// GFX9: invalid operand for instruction
+v_mad_mixlo_f16 v1, v2, v3, v4 neg_hi:[0,0,0]
+
+// GFX9: invalid operand for instruction
+v_mad_mixhi_f16 v1, v2, v3, v4 op_sel:[0,0,0]
+
+// GFX9: invalid operand for instruction
+v_mad_mixhi_f16 v1, v2, v3, v4 op_sel_hi:[0,0,0]
+
+// GFX9: invalid operand for instruction
+v_mad_mixhi_f16 v1, v2, v3, v4 neg_lo:[0,0,0]
+
+// GFX9: invalid operand for instruction
+v_mad_mixhi_f16 v1, v2, v3, v4 neg_hi:[0,0,0]
Index: test/MC/AMDGPU/vop3p.s
===================================================================
--- /dev/null
+++ test/MC/AMDGPU/vop3p.s
@@ -0,0 +1,216 @@
+// RUN: llvm-mc -arch=amdgcn -mcpu=gfx901 -show-encoding %s | FileCheck -check-prefix=GFX9 %s
+
+//
+// Test op_sel/op_sel_hi
+//
+
+v_pk_add_u16 v1, v2, v3
+// GFX9: v_pk_add_u16 v1, v2, v3 ; encoding: [0x01,0x00,0x8a,0xd3,0x02,0x07,0x02,0x18]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[0,0]
+// GFX9: v_pk_add_u16 v1, v2, v3 ; encoding: [0x01,0x00,0x8a,0xd3,0x02,0x07,0x02,0x18]
+
+v_pk_add_u16 v1, v2, v3 op_sel_hi:[1,1]
+// GFX9: v_pk_add_u16 v1, v2, v3 ; encoding: [0x01,0x00,0x8a,0xd3,0x02,0x07,0x02,0x18]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,1]
+// GFX9: v_pk_add_u16 v1, v2, v3 ; encoding: [0x01,0x00,0x8a,0xd3,0x02,0x07,0x02,0x18]
+
+v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,0]
+// GFX9: v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,0] ; encoding: [0x01,0x00,0x8a,0xd3,0x02,0x07,0x02,0x00]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,0]
+// GFX9: v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,0] ; encoding: [0x01,0x00,0x8a,0xd3,0x02,0x07,0x02,0x00]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[1,0]
+// GFX9: v_pk_add_u16 v1, v2, v3 op_sel:[1,0] ; encoding: [0x01,0x08,0x8a,0xd3,0x02,0x07,0x02,0x18]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[0,1]
+// GFX9: v_pk_add_u16 v1, v2, v3 op_sel:[0,1] ; encoding: [0x01,0x10,0x8a,0xd3,0x02,0x07,0x02,0x18]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[1,1]
+// GFX9: v_pk_add_u16 v1, v2, v3 op_sel:[1,1] ; encoding: [0x01,0x18,0x8a,0xd3,0x02,0x07,0x02,0x18]
+
+v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,1]
+// GFX9: v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,1] ; encoding: [0x01,0x00,0x8a,0xd3,0x02,0x07,0x02,0x10]
+
+v_pk_add_u16 v1, v2, v3 op_sel_hi:[1,0]
+// GFX9: v_pk_add_u16 v1, v2, v3 op_sel_hi:[1,0] ; encoding: [0x01,0x00,0x8a,0xd3,0x02,0x07,0x02,0x08]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
+// GFX9: v_pk_add_u16 v1, v2, v3 op_sel:[1,1] ; encoding: [0x01,0x18,0x8a,0xd3,0x02,0x07,0x02,0x18]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0]
+// GFX9: v_pk_add_u16 v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0] ; encoding: [0x01,0x08,0x8a,0xd3,0x02,0x07,0x02,0x08]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1]
+// GFX9: v_pk_add_u16 v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1] ; encoding: [0x01,0x10,0x8a,0xd3,0x02,0x07,0x02,0x10]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1]
+// GFX9: v_pk_add_u16 v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x01,0x08,0x8a,0xd3,0x02,0x07,0x02,0x10]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0]
+// GFX9: v_pk_add_u16 v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0] ; encoding: [0x01,0x10,0x8a,0xd3,0x02,0x07,0x02,0x08]
+
+//
+// Test src2 op_sel/op_sel_hi
+//
+
+v_pk_fma_f16 v8, v0, s0, v1
+// GFX9: v_pk_fma_f16 v8, v0, s0, v1 ; encoding: [0x08,0x40,0x8e,0xd3,0x00,0x01,0x04,0x1c]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,0,0] neg_hi:[0,0,0]
+// GFX9: v_pk_fma_f16 v8, v0, s0, v1 ; encoding: [0x08,0x40,0x8e,0xd3,0x00,0x01,0x04,0x1c]
+
+v_pk_fma_f16 v8, v0, s0, v1 op_sel:[0,0,0] op_sel_hi:[1,1,1] neg_lo:[0,0,0] neg_hi:[0,0,0]
+// GFX9: v_pk_fma_f16 v8, v0, s0, v1 ; encoding: [0x08,0x40,0x8e,0xd3,0x00,0x01,0x04,0x1c]
+
+v_pk_fma_f16 v8, v0, s0, v1 op_sel:[0,0,0] op_sel_hi:[1,1,1]
+// GFX9: v_pk_fma_f16 v8, v0, s0, v1 ; encoding: [0x08,0x40,0x8e,0xd3,0x00,0x01,0x04,0x1c]
+
+v_pk_fma_f16 v8, v0, s0, v1 op_sel:[0,0,0] op_sel_hi:[0,0,0]
+// GFX9: v_pk_fma_f16 v8, v0, s0, v1 op_sel_hi:[0,0,0] ; encoding: [0x08,0x00,0x8e,0xd3,0x00,0x01,0x04,0x04]
+
+v_pk_fma_f16 v8, v0, s0, v1 op_sel:[0,0,1] op_sel_hi:[0,0,1]
+// GFX9: v_pk_fma_f16 v8, v0, s0, v1 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; encoding: [0x08,0x60,0x8e,0xd3,0x00,0x01,0x04,0x04]
+
+//
+// Test neg_lo/neg_hi
+//
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,1,1]
+// GFX9: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,1,1] ; encoding: [0x08,0x40,0x8e,0xd3,0x00,0x01,0x04,0xfc]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[1,1,1]
+// GFX9: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x8e,0xd3,0x00,0x01,0x04,0x1c]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,1,1] neg_hi:[1,1,1]
+// GFX9: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,1,1] neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x8e,0xd3,0x00,0x01,0x04,0xfc]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,0,0]
+// GFX9: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x8e,0xd3,0x00,0x01,0x04,0x3c]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,1,0]
+// GFX9: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x8e,0xd3,0x00,0x01,0x04,0x5c]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,0,1]
+// GFX9: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x8e,0xd3,0x00,0x01,0x04,0x9c]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[1,0,0]
+// GFX9: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x8e,0xd3,0x00,0x01,0x04,0x1c]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[0,1,0]
+// GFX9: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x8e,0xd3,0x00,0x01,0x04,0x1c]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[0,0,1]
+// GFX9: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x8e,0xd3,0x00,0x01,0x04,0x1c]
+
+
+// Test clamp
+v_pk_fma_f16 v8, v0, s0, v1 clamp
+// GFX9: v_pk_fma_f16 v8, v0, s0, v1 clamp ; encoding: [0x08,0xc0,0x8e,0xd3,0x00,0x01,0x04,0x1c]
+
+v_pk_add_u16 v1, v2, v3 clamp
+// GFX9: v_pk_add_u16 v1, v2, v3 clamp ; encoding: [0x01,0x80,0x8a,0xd3,0x02,0x07,0x02,0x18]
+
+v_pk_min_i16 v0, v1, v2 clamp
+// GFX9: v_pk_min_i16 v0, v1, v2 clamp ; encoding: [0x00,0x80,0x88,0xd3,0x01,0x05,0x02,0x18]
+
+//
+// Instruction tests:
+//
+
+v_pk_mul_lo_u16 v0, v1, v2
+// GFX9: v_pk_mul_lo_u16 v0, v1, v2 ; encoding: [0x00,0x00,0x81,0xd3,0x01,0x05,0x02,0x18]
+
+v_pk_add_i16 v0, v1, v2
+// GFX9: v_pk_add_i16 v0, v1, v2 ; encoding: [0x00,0x00,0x82,0xd3,0x01,0x05,0x02,0x18]
+
+v_pk_sub_i16 v0, v1, v2
+// GFX9: v_pk_sub_i16 v0, v1, v2 ; encoding: [0x00,0x00,0x83,0xd3,0x01,0x05,0x02,0x18]
+
+v_pk_lshlrev_b16 v0, v1, v2
+// GFX9: v_pk_lshlrev_b16 v0, v1, v2 ; encoding: [0x00,0x00,0x84,0xd3,0x01,0x05,0x02,0x18]
+
+v_pk_lshrrev_b16 v0, v1, v2
+// GFX9: v_pk_lshrrev_b16 v0, v1, v2 ; encoding: [0x00,0x00,0x85,0xd3,0x01,0x05,0x02,0x18]
+
+v_pk_ashrrev_i16 v0, v1, v2
+// GFX9: v_pk_ashrrev_i16 v0, v1, v2 ; encoding: [0x00,0x00,0x86,0xd3,0x01,0x05,0x02,0x18]
+
+v_pk_max_i16 v0, v1, v2
+// GFX9: v_pk_max_i16 v0, v1, v2 ; encoding: [0x00,0x00,0x87,0xd3,0x01,0x05,0x02,0x18]
+
+v_pk_min_i16 v0, v1, v2
+// GFX9: v_pk_min_i16 v0, v1, v2 ; encoding: [0x00,0x00,0x88,0xd3,0x01,0x05,0x02,0x18]
+
+v_pk_add_u16 v0, v1, v2
+// GFX9: v_pk_add_u16 v0, v1, v2 ; encoding: [0x00,0x00,0x8a,0xd3,0x01,0x05,0x02,0x18]
+
+v_pk_max_u16 v0, v1, v2
+// GFX9: v_pk_max_u16 v0, v1, v2 ; encoding: [0x00,0x00,0x8c,0xd3,0x01,0x05,0x02,0x18]
+
+v_pk_min_u16 v0, v1, v2
+// GFX9: v_pk_min_u16 v0, v1, v2 ; encoding: [0x00,0x00,0x8d,0xd3,0x01,0x05,0x02,0x18]
+
+v_pk_fma_f16 v0, v1, v2, v3
+// GFX9: v_pk_fma_f16 v0, v1, v2, v3 ; encoding: [0x00,0x40,0x8e,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_pk_add_f16 v0, v1, v2
+// GFX9: v_pk_add_f16 v0, v1, v2 ; encoding: [0x00,0x00,0x8f,0xd3,0x01,0x05,0x02,0x18]
+
+v_pk_mul_f16 v0, v1, v2
+// GFX9: v_pk_mul_f16 v0, v1, v2 ; encoding: [0x00,0x00,0x90,0xd3,0x01,0x05,0x02,0x18]
+
+v_pk_min_f16 v0, v1, v2
+// GFX9: v_pk_min_f16 v0, v1, v2 ; encoding: [0x00,0x00,0x91,0xd3,0x01,0x05,0x02,0x18]
+
+v_pk_max_f16 v0, v1, v2
+// GFX9: v_pk_max_f16 v0, v1, v2 ; encoding: [0x00,0x00,0x92,0xd3,0x01,0x05,0x02,0x18]
+
+v_mad_mix_f32 v0, v1, v2, v3
+// GFX9: v_mad_mix_f32 v0, v1, v2, v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x04]
+
+v_mad_mixlo_f16 v0, v1, v2, v3
+// GFX9: v_mad_mixlo_f16 v0, v1, v2, v3 ; encoding: [0x00,0x00,0xa1,0xd3,0x01,0x05,0x0e,0x04]
+
+v_mad_mixhi_f16 v0, v1, v2, v3
+// GFX9: v_mad_mixhi_f16 v0, v1, v2, v3 ; encoding: [0x00,0x00,0xa2,0xd3,0x01,0x05,0x0e,0x04]
+
+
+//
+// Regular source modifiers on non-packed instructions
+//
+
+v_mad_mix_f32 v0, abs(v1), v2, v3
+// GFX9: v_mad_mix_f32 v0, |v1|, v2, v3 ; encoding: [0x00,0x01,0xa0,0xd3,0x01,0x05,0x0e,0x04]
+
+v_mad_mix_f32 v0, v1, abs(v2), v3
+// GFX9: v_mad_mix_f32 v0, v1, |v2|, v3 ; encoding: [0x00,0x02,0xa0,0xd3,0x01,0x05,0x0e,0x04]
+
+v_mad_mix_f32 v0, v1, v2, abs(v3)
+// GFX9: v_mad_mix_f32 v0, v1, v2, |v3| ; encoding: [0x00,0x04,0xa0,0xd3,0x01,0x05,0x0e,0x04]
+
+v_mad_mix_f32 v0, -v1, v2, v3
+// GFX9: v_mad_mix_f32 v0, -v1, v2, v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x24]
+
+v_mad_mix_f32 v0, v1, -v2, v3
+// GFX9: v_mad_mix_f32 v0, v1, -v2, v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x44]
+
+v_mad_mix_f32 v0, v1, v2, -v3
+// GFX9: v_mad_mix_f32 v0, v1, v2, -v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x84]
+
+v_mad_mix_f32 v0, -abs(v1), v2, v3
+// GFX9: v_mad_mix_f32 v0, -|v1|, v2, v3 ; encoding: [0x00,0x01,0xa0,0xd3,0x01,0x05,0x0e,0x24]
+
+v_mad_mix_f32 v0, v1, -abs(v2), v3
+// GFX9: v_mad_mix_f32 v0, v1, -|v2|, v3 ; encoding: [0x00,0x02,0xa0,0xd3,0x01,0x05,0x0e,0x44]
+
+v_mad_mix_f32 v0, v1, v2, -abs(v3)
+// GFX9: v_mad_mix_f32 v0, v1, v2, -|v3| ; encoding: [0x00,0x04,0xa0,0xd3,0x01,0x05,0x0e,0x84]
+
+v_mad_mixlo_f16 v0, abs(v1), -v2, abs(v3)
+// GFX9: v_mad_mixlo_f16 v0, |v1|, -v2, |v3| ; encoding: [0x00,0x05,0xa1,0xd3,0x01,0x05,0x0e,0x44]
+
+v_mad_mixhi_f16 v0, -v1, abs(v2), -abs(v3)
+// GFX9: v_mad_mixhi_f16 v0, -v1, |v2|, -|v3| ; encoding: [0x00,0x06,0xa2,0xd3,0x01,0x05,0x0e,0xa4]