diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -8255,6 +8255,11 @@ if (OpIdx == -1) break; + int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]); + + if (ModIdx == -1) + continue; + uint32_t ModVal = 0; if ((OpSel & (1 << J)) != 0) @@ -8269,8 +8274,6 @@ if ((NegHi & (1 << J)) != 0) ModVal |= SISrcMods::NEG_HI; - int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]); - Inst.getOperand(ModIdx).setImm(Inst.getOperand(ModIdx).getImm() | ModVal); } } @@ -8636,7 +8639,9 @@ if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod) != -1) { addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI); } - if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel) != -1) { + if (Desc.TSFlags & SIInstrFlags::VOP3P) + cvtVOP3P(Inst, Operands, OptionalIdx); + else if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel) != -1) { addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOpSel); } diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -162,6 +162,7 @@ DecodeStatus convertSDWAInst(MCInst &MI) const; DecodeStatus convertDPP8Inst(MCInst &MI) const; DecodeStatus convertMIMGInst(MCInst &MI) const; + DecodeStatus convertVOP3PDPPInst(MCInst &MI) const; MCOperand decodeOperand_VGPR_32(unsigned Val) const; MCOperand decodeOperand_VRegOrLds_32(unsigned Val) const; diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -441,8 +441,11 @@ MI = MCInst(); // clear Res = tryDecodeInst(DecoderTableDPPGFX1196, MI, DecW, Address); - if (Res) + if (Res) { + if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P) + convertVOP3PDPPInst(MI); break; + } } // Reinitialize Bytes Bytes = Bytes_.slice(0, MaxInstBytesNum); @@ -729,18 +732,20 @@ DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const { unsigned Opc = MI.getOpcode(); unsigned DescNumOps = MCII->get(Opc).getNumOperands(); - - // Insert dummy unused src modifiers. - if (MI.getNumOperands() < DescNumOps && - AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers) != -1) - insertNamedMCOperand(MI, MCOperand::createImm(0), - AMDGPU::OpName::src0_modifiers); - - if (MI.getNumOperands() < DescNumOps && - AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers) != -1) - insertNamedMCOperand(MI, MCOperand::createImm(0), - AMDGPU::OpName::src1_modifiers); - + if (MCII->get(Opc).TSFlags & SIInstrFlags::VOP3P) { + convertVOP3PDPPInst(MI); + } else { + // Insert dummy unused src modifiers. + if (MI.getNumOperands() < DescNumOps && + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers) != -1) + insertNamedMCOperand(MI, MCOperand::createImm(0), + AMDGPU::OpName::src0_modifiers); + + if (MI.getNumOperands() < DescNumOps && + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers) != -1) + insertNamedMCOperand(MI, MCOperand::createImm(0), + AMDGPU::OpName::src1_modifiers); + } return isValidDPP8(MI) ? MCDisassembler::Success : MCDisassembler::SoftFail; } @@ -882,6 +887,56 @@ return MCDisassembler::Success; } +// Opsel and neg bits are used in src_modifiers and standalone operands. Autogen +// decoder only adds to src_modifiers, so manually add the bits to the other +// operands. +DecodeStatus AMDGPUDisassembler::convertVOP3PDPPInst(MCInst &MI) const { + unsigned Opc = MI.getOpcode(); + unsigned DescNumOps = MCII->get(Opc).getNumOperands(); + + if (MI.getNumOperands() < DescNumOps && + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in) != -1) + insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::vdst_in); + + const int ModOps[] = {AMDGPU::OpName::src0_modifiers, + AMDGPU::OpName::src1_modifiers, + AMDGPU::OpName::src2_modifiers}; + unsigned OpSel = 0; + unsigned OpSelHi = 0; + unsigned NegLo = 0; + unsigned NegHi = 0; + for (int J = 0; J < 3; ++J) { + int OpIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]); + if (OpIdx == -1) + break; + unsigned Val = MI.getOperand(OpIdx).getImm(); + + OpSel |= !!(Val & SISrcMods::OP_SEL_0) << J; + OpSelHi |= !!(Val & SISrcMods::OP_SEL_1) << J; + NegLo |= !!(Val & SISrcMods::NEG) << J; + NegHi |= !!(Val & SISrcMods::NEG_HI) << J; + } + + if (MI.getNumOperands() < DescNumOps && + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel) != -1) + insertNamedMCOperand(MI, MCOperand::createImm(OpSel), + AMDGPU::OpName::op_sel); + if (MI.getNumOperands() < DescNumOps && + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi) != -1) + insertNamedMCOperand(MI, MCOperand::createImm(OpSelHi), + AMDGPU::OpName::op_sel_hi); + if (MI.getNumOperands() < DescNumOps && + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_lo) != -1) + insertNamedMCOperand(MI, MCOperand::createImm(NegLo), + AMDGPU::OpName::neg_lo); + if (MI.getNumOperands() < DescNumOps && + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi) != -1) + insertNamedMCOperand(MI, MCOperand::createImm(NegHi), + AMDGPU::OpName::neg_hi); + + return MCDisassembler::Success; +} + DecodeStatus AMDGPUDisassembler::convertFMAanyK(MCInst &MI, int ImmLitIdx) const { assert(HasLiteral && "Should have decoded a literal"); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1496,6 +1496,7 @@ def VOP3PMods : ComplexPattern; def VOP3PModsDOT : ComplexPattern; +def DotIUVOP3PMods : ComplexPattern; def VOP3OpSel : ComplexPattern; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -10,19 +10,33 @@ // VOP3P Classes //===----------------------------------------------------------------------===// +class VOP3P_Profile : VOP3_Profile { + let IsVOP3P = 1; + let HasExtVOP3DPP = HasDPP; + // We do not want to print src modifiers for vop3p because the bits are + // overloaded in meaning and the logic in printOperandAndFPInputMods is + // wrong for vop3p + let AsmVOP3DPPBase = AsmVOP3P; +} + // Used for FMA_MIX* and MAD_MIX* insts // Their operands are only sort of f16 operands. Depending on // op_sel_hi, these may be interpreted as f32. The inline immediate // values are really f16 converted to f32, so we treat these as f16 // operands. class VOP3P_Mix_Profile : VOP3_Profile { + bit useTiedOutput = 0> : VOP3P_Profile { bit UseTiedOutput = useTiedOutput; dag srcs = (ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0, FP16InputMods:$src1_modifiers, VCSrc_f16:$src1, FP16InputMods:$src2_modifiers, VCSrc_f16:$src2); + dag dpp_srcs = + (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0, + FP16InputMods:$src1_modifiers, VCSrc_f16:$src1, + FP16InputMods:$src2_modifiers, VCSrc_f16:$src2); // FIXME: clampmod0 misbehaves with the non-default vdst_in // following it. For now workaround this by requiring clamp @@ -35,8 +49,10 @@ // We use Ins64 because that is the one which populates InOperandList // due to the logic in class VOP3_Pseudo let Ins64 = !con(srcs, mods); + let InsVOP3Base = !con(dpp_srcs, mods); let Asm64 = "$vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$op_sel$op_sel_hi$clamp"; + let AsmVOP3DPPBase = Asm64; } multiclass VOP3PInst.ret, getVOP3Pat.ret)>; + let SubtargetPredicate = isGFX11Plus in { + if P.HasExtVOP3DPP then + def _dpp : VOP3_DPP_Pseudo { + let VOP3P = 1; + let PseudoInstr = OpName #"_dpp"; + } + } // end SubtargetPredicate = isGFX11Plus } // Non-packed instructions that use the VOP3P encoding. @@ -54,36 +77,45 @@ let Constraints = !if(P.UseTiedOutput, "$vdst = $vdst_in", ""); let DisableEncoding = !if(P.UseTiedOutput, "$vdst_in", ""); } + let SubtargetPredicate = isGFX11Plus in { + if P.HasExtVOP3DPP then + def _dpp : VOP3_DPP_Pseudo { + let VOP3P = 1; + let PseudoInstr = OpName#"_dpp"; + let Constraints = !if(P.UseTiedOutput, "$vdst = $vdst_in", ""); + let DisableEncoding = !if(P.UseTiedOutput, "$vdst_in", ""); + } + } // end SubtargetPredicate = isGFX11Plus } let isCommutable = 1 in { -defm V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile>; -defm V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile>; +defm V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3P_Profile>; +defm V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3P_Profile>; let FPDPRounding = 1 in { -defm V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile, any_fma>; -defm V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile, any_fadd>; -defm V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile, any_fmul>; +defm V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3P_Profile, any_fma>; +defm V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3P_Profile, any_fadd>; +defm V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3P_Profile, any_fmul>; } // End FPDPRounding = 1 -defm V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile, fmaxnum_like>; -defm V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile, fminnum_like>; +defm V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3P_Profile, fmaxnum_like>; +defm V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3P_Profile, fminnum_like>; -defm V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile, add>; -defm V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile>; -defm V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3_Profile, mul>; +defm V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3P_Profile, add>; +defm V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3P_Profile>; +defm V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3P_Profile, mul>; -defm V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3_Profile, smin>; -defm V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3_Profile, umin>; -defm V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3_Profile, smax>; -defm V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile, umax>; +defm V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3P_Profile, smin>; +defm V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3P_Profile, umin>; +defm V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3P_Profile, smax>; +defm V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3P_Profile, umax>; } -defm V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3_Profile>; -defm V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile, sub>; +defm V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3P_Profile>; +defm V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3P_Profile, sub>; -defm V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile, clshl_rev_16>; -defm V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile, cashr_rev_16>; -defm V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile, clshr_rev_16>; +defm V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3P_Profile, clshl_rev_16>; +defm V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3P_Profile, cashr_rev_16>; +defm V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3P_Profile, clshr_rev_16>; let SubtargetPredicate = HasVOP3PInsts in { @@ -296,34 +328,63 @@ let SubtargetPredicate = HasDot2Insts in { defm V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", - VOP3_Profile, int_amdgcn_sdot2, 1>; + VOP3P_Profile, int_amdgcn_sdot2, 1>; defm V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", - VOP3_Profile, int_amdgcn_udot2, 1>; + VOP3P_Profile, int_amdgcn_udot2, 1>; } // End SubtargetPredicate = HasDot2Insts let SubtargetPredicate = HasDot7Insts in { defm V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", - VOP3_Profile, + VOP3P_Profile, AMDGPUfdot2, 1/*ExplicitClamp*/>; defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", - VOP3_Profile, int_amdgcn_udot4, 1>; + VOP3P_Profile, int_amdgcn_udot4, 1>; defm V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", - VOP3_Profile, int_amdgcn_udot8, 1>; + VOP3P_Profile, int_amdgcn_udot8, 1>; } // End SubtargetPredicate = HasDot7Insts let SubtargetPredicate = HasDot1Insts in { defm V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", - VOP3_Profile, int_amdgcn_sdot4, 1>; + VOP3P_Profile, int_amdgcn_sdot4, 1>; defm V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", - VOP3_Profile, int_amdgcn_sdot8, 1>; + VOP3P_Profile, int_amdgcn_sdot8, 1>; } // End SubtargetPredicate = HasDot1Insts + +let SubtargetPredicate = HasDot8Insts in { + +defm V_DOT2_F32_BF16 : VOP3PInst<"v_dot2_f32_bf16", + VOP3P_Profile, + null_frag, 1>; + +} // End SubtargetPredicate = HasDot8Insts + } // End let IsDOT = 1 +multiclass VOP3PDOTIUInst { + let IsDOT = 1 in + defm NAME : VOP3PInst, + null_frag, 1>; + // Dot-iu instructions consider input as signed if imod neg bits are set. Thus + // Dot-iu Intrinsics have extra operands and require separate codegen pattern. + def : GCNPat < (intrinsic_node (DotIUVOP3PMods i32:$src0_mods), i32:$src0, + (DotIUVOP3PMods i32:$src1_mods), i32:$src1, + i32:$src2, (i1 timm:$clamp)), + (!cast(NAME) $src0_mods, i32:$src0, + $src1_mods, i32:$src1, + (i32 8), i32:$src2, i1:$clamp) + >; +} + +let SubtargetPredicate = HasDot8Insts in { +defm V_DOT4_I32_IU8 : VOP3PDOTIUInst<"v_dot4_i32_iu8", null_frag>; +defm V_DOT8_I32_IU4 : VOP3PDOTIUInst<"v_dot8_i32_iu4", null_frag>; +} // End SubtargetPredicate = HasDot8Insts + def : UDot2Pat; def : SDot2Pat; @@ -364,18 +425,18 @@ def VDst_512 : VOPDstOperand; def VDst_1024 : VOPDstOperand; -def VOPProfileAccRead : VOP3_Profile { +def VOPProfileAccRead : VOP3P_Profile { let Src0RC64 = ARegSrc_32; } -def VOPProfileAccWrite : VOP3_Profile { +def VOPProfileAccWrite : VOP3P_Profile { let DstRC = ADst_32; let Src0RC64 = VCSrc_b32; } class VOPProfileMAI - : VOP3_Profile { + : VOP3P_Profile { let DstRC = _DstRC; let Src0RC64 = SrcABRC; let Src1RC64 = SrcABRC; @@ -386,7 +447,9 @@ let HasOMod = 0; let HasModifiers = 0; let Asm64 = "$vdst, $src0, $src1, $src2$cbsz$abid$blgp"; + let AsmVOP3DPPBase = Asm64; let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, Src2RC64:$src2, cbsz:$cbsz, abid:$abid, blgp:$blgp); + let InsVOP3Base = Ins64; // Dst and SrcC cannot partially overlap if SrcC/Dst is bigger than 4 VGPRs. // We then create two versions of the instruction: with tied dst and src2 // and with the earlyclobber flag on the dst. This is stricter than the @@ -601,10 +664,10 @@ } let SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1 in { - defm V_PK_FMA_F32 : VOP3PInst<"v_pk_fma_f32", VOP3_Profile, any_fma>; - defm V_PK_MUL_F32 : VOP3PInst<"v_pk_mul_f32", VOP3_Profile, any_fmul>; - defm V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3_Profile, any_fadd>; - defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3_Profile>; + defm V_PK_FMA_F32 : VOP3PInst<"v_pk_fma_f32", VOP3P_Profile, any_fma>; + defm V_PK_MUL_F32 : VOP3PInst<"v_pk_mul_f32", VOP3P_Profile, any_fmul>; + defm V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3P_Profile, any_fadd>; + defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3P_Profile>; } // End SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1 def : MnemonicAlias<"v_accvgpr_read", "v_accvgpr_read_b32">; @@ -614,6 +677,72 @@ // Begin Real Encodings //===----------------------------------------------------------------------===// +class VOP3P_DPP16 op, VOP_DPP_Pseudo ps, int subtarget, + string opName = ps.OpName> + : VOP3P_DPP, SIMCInstr { + let hasSideEffects = ps.hasSideEffects; + let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let Uses = ps.Uses; + let AssemblerPredicate = HasDPP16; + let SubtargetPredicate = HasDPP16; + let OtherPredicates = ps.OtherPredicates; +} + +class VOP3P_DPP8_Base op, VOP_Pseudo ps, string opName = ps.OpName> + : VOP3P_DPP8 { + let hasSideEffects = ps.hasSideEffects; + let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let Uses = ps.Uses; + let OtherPredicates = ps.OtherPredicates; +} + +//===----------------------------------------------------------------------===// +// GFX11. +//===----------------------------------------------------------------------===// + +let AssemblerPredicate = isGFX11Plus, + DecoderNamespace = "GFX11" in { + + multiclass VOP3P_Real_gfx11 op, string backing_ps_name = NAME, + string asmName = !cast(NAME).Mnemonic> { + def _gfx11 : VOP3P_Real(backing_ps_name), + SIEncodingFamily.GFX11, asmName>, + VOP3Pe_gfx11(backing_ps_name).Pfl>; + } + + multiclass VOP3P_Real_dpp_gfx11 op, string backing_ps_name = NAME, + string asmName = !cast(NAME).Mnemonic> { + defvar ps = !cast(backing_ps_name); + def _dpp_gfx11 + : VOP3P_DPP16(backing_ps_name #"_dpp"), + SIEncodingFamily.GFX11> { + let AsmString = asmName #ps.Pfl.AsmVOP3DPP16; + let DecoderNamespace = "DPPGFX11"; + } + } + + multiclass VOP3P_Real_dpp8_gfx11 op, string backing_ps_name = NAME, + string asmName = !cast(NAME).Mnemonic> { + defvar ps = !cast(backing_ps_name); + def _dpp8_gfx11 : VOP3P_DPP8_Base { + let AsmString = asmName #ps.Pfl.AsmVOP3DPP8; + let DecoderNamespace = "DPP8GFX11"; + } + } + + multiclass VOP3P_Realtriple_gfx11 op, string backing_ps_name = NAME, + string asmName = !cast(NAME).Mnemonic> + : VOP3P_Real_gfx11, + VOP3P_Real_dpp_gfx11, + VOP3P_Real_dpp8_gfx11; +} // End AssemblerPredicate = isGFX11Plus, DecoderNamespace = "GFX11" + +defm V_DOT4_I32_IU8 : VOP3P_Real_gfx11 <0x16>; +defm V_DOT8_I32_IU4 : VOP3P_Real_gfx11 <0x18>; +defm V_DOT2_F32_BF16 : VOP3P_Real_gfx11 <0x1a>; + //===----------------------------------------------------------------------===// // GFX8 (VI) //===----------------------------------------------------------------------===// @@ -841,35 +970,41 @@ // GFX10. //===----------------------------------------------------------------------===// -let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10", VOP3P = 1 in { +let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10", VOP3P = 1 in { multiclass VOP3P_Real_gfx10 op> { def _gfx10 : VOP3P_Real(NAME), SIEncodingFamily.GFX10>, VOP3Pe_gfx10 (NAME).Pfl>; } -} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10", VOP3P = 1 - -defm V_PK_MAD_I16 : VOP3P_Real_gfx10<0x00>; -defm V_PK_MUL_LO_U16 : VOP3P_Real_gfx10<0x01>; -defm V_PK_ADD_I16 : VOP3P_Real_gfx10<0x02>; -defm V_PK_SUB_I16 : VOP3P_Real_gfx10<0x03>; -defm V_PK_LSHLREV_B16 : VOP3P_Real_gfx10<0x04>; -defm V_PK_LSHRREV_B16 : VOP3P_Real_gfx10<0x05>; -defm V_PK_ASHRREV_I16 : VOP3P_Real_gfx10<0x06>; -defm V_PK_MAX_I16 : VOP3P_Real_gfx10<0x07>; -defm V_PK_MIN_I16 : VOP3P_Real_gfx10<0x08>; -defm V_PK_MAD_U16 : VOP3P_Real_gfx10<0x09>; -defm V_PK_ADD_U16 : VOP3P_Real_gfx10<0x0a>; -defm V_PK_SUB_U16 : VOP3P_Real_gfx10<0x0b>; -defm V_PK_MAX_U16 : VOP3P_Real_gfx10<0x0c>; -defm V_PK_MIN_U16 : VOP3P_Real_gfx10<0x0d>; -defm V_PK_FMA_F16 : VOP3P_Real_gfx10<0x0e>; -defm V_PK_ADD_F16 : VOP3P_Real_gfx10<0x0f>; -defm V_PK_MUL_F16 : VOP3P_Real_gfx10<0x10>; -defm V_PK_MIN_F16 : VOP3P_Real_gfx10<0x11>; -defm V_PK_MAX_F16 : VOP3P_Real_gfx10<0x12>; -defm V_FMA_MIX_F32 : VOP3P_Real_gfx10<0x20>; -defm V_FMA_MIXLO_F16 : VOP3P_Real_gfx10<0x21>; -defm V_FMA_MIXHI_F16 : VOP3P_Real_gfx10<0x22>; +} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10", VOP3P = 1 + +multiclass VOP3P_Real_gfx10_gfx11 op> + : VOP3P_Real_gfx10, VOP3P_Real_gfx11; + +multiclass VOP3P_Real_gfx10_gfx11_Triple op> + : VOP3P_Real_gfx10, VOP3P_Realtriple_gfx11; + +defm V_PK_MAD_I16 : VOP3P_Real_gfx10_gfx11<0x00>; +defm V_PK_MUL_LO_U16 : VOP3P_Real_gfx10_gfx11<0x01>; +defm V_PK_ADD_I16 : VOP3P_Real_gfx10_gfx11<0x02>; +defm V_PK_SUB_I16 : VOP3P_Real_gfx10_gfx11<0x03>; +defm V_PK_LSHLREV_B16 : VOP3P_Real_gfx10_gfx11<0x04>; +defm V_PK_LSHRREV_B16 : VOP3P_Real_gfx10_gfx11<0x05>; +defm V_PK_ASHRREV_I16 : VOP3P_Real_gfx10_gfx11<0x06>; +defm V_PK_MAX_I16 : VOP3P_Real_gfx10_gfx11<0x07>; +defm V_PK_MIN_I16 : VOP3P_Real_gfx10_gfx11<0x08>; +defm V_PK_MAD_U16 : VOP3P_Real_gfx10_gfx11<0x09>; +defm V_PK_ADD_U16 : VOP3P_Real_gfx10_gfx11<0x0a>; +defm V_PK_SUB_U16 : VOP3P_Real_gfx10_gfx11<0x0b>; +defm V_PK_MAX_U16 : VOP3P_Real_gfx10_gfx11<0x0c>; +defm V_PK_MIN_U16 : VOP3P_Real_gfx10_gfx11<0x0d>; +defm V_PK_FMA_F16 : VOP3P_Real_gfx10_gfx11<0x0e>; +defm V_PK_ADD_F16 : VOP3P_Real_gfx10_gfx11<0x0f>; +defm V_PK_MUL_F16 : VOP3P_Real_gfx10_gfx11<0x10>; +defm V_PK_MIN_F16 : VOP3P_Real_gfx10_gfx11<0x11>; +defm V_PK_MAX_F16 : VOP3P_Real_gfx10_gfx11<0x12>; +defm V_FMA_MIX_F32 : VOP3P_Real_gfx10_gfx11_Triple <0x20>; +defm V_FMA_MIXLO_F16 : VOP3P_Real_gfx10_gfx11_Triple <0x21>; +defm V_FMA_MIXHI_F16 : VOP3P_Real_gfx10_gfx11_Triple <0x22>; let SubtargetPredicate = HasDot2Insts in { @@ -880,9 +1015,9 @@ let SubtargetPredicate = HasDot7Insts in { -defm V_DOT2_F32_F16 : VOP3P_Real_gfx10 <0x13>; -defm V_DOT4_U32_U8 : VOP3P_Real_gfx10 <0x17>; -defm V_DOT8_U32_U4 : VOP3P_Real_gfx10 <0x19>; +defm V_DOT2_F32_F16 : VOP3P_Real_gfx10_gfx11_Triple <0x13>; +defm V_DOT4_U32_U8 : VOP3P_Real_gfx10_gfx11 <0x17>; +defm V_DOT8_U32_U4 : VOP3P_Real_gfx10_gfx11 <0x19>; } // End SubtargetPredicate = HasDot7Insts diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -316,7 +316,6 @@ class VOP3Pe op, VOPProfile P> : Enc64 { bits<8> vdst; - // neg, neg_hi, op_sel put in srcN_modifiers bits<4> src0_modifiers; bits<9> src0; bits<4> src1_modifiers; @@ -412,6 +411,8 @@ let Inst{31-23} = 0x198; //encoding } +class VOP3Pe_gfx11 op, VOPProfile P> : VOP3Pe_gfx10; + class VOP3be_gfx6_gfx7 op, VOPProfile p> : VOP3be

{ let Inst{25-17} = op; } @@ -705,6 +706,39 @@ let Inst{58-50} = !if(P.HasSrc2, src2, 0); } +class VOP3P_DPPe_Common_Base op, VOPProfile P> : Enc96 { + bits<4> src0_modifiers; + bits<4> src1_modifiers; + bits<4> src2_modifiers; + bits<1> clamp; + + let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0 + let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // neg_hi src1 + let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); // neg_hi src2 + let Inst{11} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{2}, 0); // op_sel(0) + let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1) + let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, 0); // op_sel(2) + let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, ?); // op_sel_hi(2) + let Inst{15} = !if(P.HasClamp, clamp{0}, 0); + let Inst{22-16} = op; + let Inst{31-23} = 0x198; // encoding + let Inst{59} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, ?); // op_sel_hi(0) + let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, ?); // op_sel_hi(1) + let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // neg (lo) + let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); // neg (lo) + let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo) +} + +class VOP3P_DPPe_Common op, VOPProfile P> : VOP3P_DPPe_Common_Base { + bits<8> vdst; + bits<9> src1; + bits<9> src2; + + let Inst{7-0} = vdst; + let Inst{49-41} = !if(P.HasSrc1, src1, 0); + let Inst{58-50} = !if(P.HasSrc2, src2, 0); +} + class VOP_DPP_Pseudo pattern=[], dag Ins = P.InsDPP, string asmOps = P.AsmDPP> : InstSI , @@ -847,6 +881,25 @@ let Inst{95-92} = row_mask; } +class VOP3P_DPP op, string OpName, VOPProfile P, bit IsDPP16, + dag InsDPP = !if(IsDPP16, P.InsVOP3DPP16, P.InsVOP3DPP), + string AsmDPP = !if(IsDPP16, P.AsmVOP3DPP16, P.AsmVOP3DPP)> : + VOP3_DPP_Base, VOP3P_DPPe_Common, + VOP3_DPPe_Fields { + + let VOP3P = 1; + + let Inst{40-32} = 0xfa; + let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{80-72} = dpp_ctrl; + let Inst{82} = !if(IsDPP16, fi, ?); + let Inst{83} = bound_ctrl; + + // Inst{87-84} ignored by hw + let Inst{91-88} = bank_mask; + let Inst{95-92} = row_mask; +} + class VOP_DPP8e : Enc64 { bits<8> src0; bits<24> dpp8; @@ -905,6 +958,16 @@ let Inst{95-72} = dpp8{23-0}; } +class VOP3P_DPP8 op, string OpName, VOPProfile P> : + VOP3_DPP8_Base, VOP3P_DPPe_Common, + VOP3_DPP8e_Fields { + + let VOP3P = 1; + let Inst{40-32} = fi; + let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{95-72} = dpp8{23-0}; +} + def DPP8Mode { int FI_0 = 0xE9; int FI_1 = 0xEA; diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_dpp.s b/llvm/test/MC/AMDGPU/gfx11_asm_dpp.s --- a/llvm/test/MC/AMDGPU/gfx11_asm_dpp.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_dpp.s @@ -88,6 +88,23 @@ // W32: encoding: [0x05,0x6a,0x02,0xd7,0xe9,0x04,0x02,0x00,0x01,0x92,0x44,0x92] // W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error +; VOP3P +v_fma_mix_f32 v0, v1, v2, v3 dpp8:[2,2,2,2,4,4,4,4] +// GFX11: encoding: [0x00,0x00,0x20,0xcc,0xe9,0x04,0x0e,0x04,0x01,0x92,0x44,0x92] + +v_fma_mix_f32 v0, v1, v2, v3 clamp dpp8:[2,2,2,2,4,4,4,4] fi:1 +// GFX11: encoding: [0x00,0x80,0x20,0xcc,0xea,0x04,0x0e,0x04,0x01,0x92,0x44,0x92] + +v_fma_mixlo_f16 v0, abs(v1), -v2, abs(v3) dpp8:[2,2,2,2,4,4,4,4] +// GFX11: encoding: [0x00,0x05,0x21,0xcc,0xe9,0x04,0x0e,0x44,0x01,0x92,0x44,0x92] + +; For test purpose only. OP_SEL has to be set to all 0 and OP_SEL_HI has to be set to all 1 +v_fma_mixlo_f16 v0, abs(v1), -v2, abs(v3) op_sel:[1,0,0] op_sel_hi:[1,0,0] dpp8:[2,2,2,2,4,4,4,4] +// GFX11: encoding: [0x00,0x0d,0x21,0xcc,0xe9,0x04,0x0e,0x4c,0x01,0x92,0x44,0x92] + +v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[0,1,1] neg_hi:[1,0,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0x00,0x05,0x13,0xcc,0xe9,0x04,0x0e,0xc4,0x01,0x77,0x39,0x05] + ; DPP ; VOP1->3 @@ -191,3 +208,16 @@ // W64: encoding: [0xf3,0xea,0x00,0xd7,0xfa,0x04,0x02,0x00,0xf3,0x79,0x04,0xf5] // W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error + +; VOP3P +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,0,0] neg_hi:[0,0,0] quad_perm:[2,2,3,1] bound_ctrl:0 fi:1 +// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[2,2,3,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; encoding: [0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x7a,0x0c,0xff] + +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] bank_mask:0xe +// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xe ; encoding: [0x00,0x05,0x13,0xcc,0xfa,0x04,0x0e,0x64,0x01,0x1b,0x00,0xfe] + +v_fma_mix_f32 v0, v1, v2, v3 op_sel:[0,0,0] row_ror:7 bank_mask:0x1 bound_ctrl:0 +// GFX11: v_fma_mix_f32_e64_dpp v0, v1, v2, v3 row_ror:7 row_mask:0xf bank_mask:0x1 bound_ctrl:1 ; encoding: [0x00,0x00,0x20,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x27,0x09,0xf1] + +v_fma_mixhi_f16 v0, v1, v2, v3 op_sel_hi:[1,1,1] clamp quad_perm:[0,2,3,1] row_mask:0x0 +// GFX11: v_fma_mixhi_f16_e64_dpp v0, v1, v2, v3 op_sel_hi:[1,1,1] clamp quad_perm:[0,2,3,1] row_mask:0x0 bank_mask:0xf ; encoding: [0x00,0xc0,0x22,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x78,0x00,0x0f] diff --git a/llvm/test/MC/AMDGPU/gfx11_err.s b/llvm/test/MC/AMDGPU/gfx11_err.s --- a/llvm/test/MC/AMDGPU/gfx11_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_err.s @@ -44,3 +44,20 @@ v_add3_u32_e64_dpp v5, v1, s1, v0 dpp8:[7,6,5,4,3,2,1,0] // GFX11: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +// On GFX11, v_dot8_i32_i4 is a valid SP3 alias for v_dot8_i32_iu4. +// However, we intentionally leave it unimplemented because on other +// processors v_dot8_i32_i4 denotes an instruction of a different +// behaviour, which is considered potentially dangerous. +v_dot8_i32_i4 v0, v1, v2, v3 +// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +// On GFX11, v_dot4_i32_i8 is a valid SP3 alias for v_dot4_i32_iu8. +// However, we intentionally leave it unimplemented because on other +// processors v_dot4_i32_i8 denotes an instruction of a different +// behaviour, which is considered potentially dangerous. +v_dot4_i32_i8 v0, v1, v2, v3 +// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_dot4c_i32_i8 v0, v1, v2 +// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx11_vop3p.s b/llvm/test/MC/AMDGPU/gfx11_vop3p.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx11_vop3p.s @@ -0,0 +1,213 @@ +// RUN: llvm-mc -arch=amdgcn -show-encoding -mcpu=gfx1100 %s | FileCheck --check-prefix=GFX11 %s + +v_pk_fma_f16 v8, v0, s0, v1 clamp +// GFX11: encoding: [0x08,0xc0,0x0e,0xcc,0x00,0x01,0x04,0x1c] + +v_pk_add_u16 v1, v2, v3 clamp +// GFX11: encoding: [0x01,0xc0,0x0a,0xcc,0x02,0x07,0x02,0x18] + +v_pk_min_i16 v0, v1, v2 clamp +// GFX11: encoding: [0x00,0xc0,0x08,0xcc,0x01,0x05,0x02,0x18] + +v_pk_mul_lo_u16 v0, v1, v2 +// GFX11: encoding: [0x00,0x40,0x01,0xcc,0x01,0x05,0x02,0x18] + +v_pk_add_i16 v0, v1, v2 +// GFX11: encoding: [0x00,0x40,0x02,0xcc,0x01,0x05,0x02,0x18] + +v_pk_sub_i16 v0, v1, v2 +// GFX11: encoding: [0x00,0x40,0x03,0xcc,0x01,0x05,0x02,0x18] + +v_pk_lshlrev_b16 v0, v1, v2 +// GFX11: encoding: [0x00,0x40,0x04,0xcc,0x01,0x05,0x02,0x18] + +v_pk_lshrrev_b16 v0, v1, v2 +// GFX11: encoding: [0x00,0x40,0x05,0xcc,0x01,0x05,0x02,0x18] + +v_pk_ashrrev_i16 v0, v1, v2 +// GFX11: encoding: [0x00,0x40,0x06,0xcc,0x01,0x05,0x02,0x18] + +v_pk_max_i16 v0, v1, v2 +// GFX11: encoding: [0x00,0x40,0x07,0xcc,0x01,0x05,0x02,0x18] + +v_pk_min_i16 v0, v1, v2 +// GFX11: encoding: [0x00,0x40,0x08,0xcc,0x01,0x05,0x02,0x18] + +v_pk_add_u16 v0, v1, v2 +// GFX11: encoding: [0x00,0x40,0x0a,0xcc,0x01,0x05,0x02,0x18] + +v_pk_max_u16 v0, v1, v2 +// GFX11: encoding: [0x00,0x40,0x0c,0xcc,0x01,0x05,0x02,0x18] + +v_pk_min_u16 v0, v1, v2 +// GFX11: encoding: [0x00,0x40,0x0d,0xcc,0x01,0x05,0x02,0x18] + +v_pk_fma_f16 v0, v1, v2, v3 +// GFX11: encoding: [0x00,0x40,0x0e,0xcc,0x01,0x05,0x0e,0x1c] + +v_pk_add_f16 v0, v1, v2 +// GFX11: encoding: [0x00,0x40,0x0f,0xcc,0x01,0x05,0x02,0x18] + +v_pk_mul_f16 v0, v1, v2 +// GFX11: encoding: [0x00,0x40,0x10,0xcc,0x01,0x05,0x02,0x18] + +v_pk_min_f16 v0, v1, v2 +// GFX11: encoding: [0x00,0x40,0x11,0xcc,0x01,0x05,0x02,0x18] + +v_pk_max_f16 v0, v1, v2 +// GFX11: encoding: [0x00,0x40,0x12,0xcc,0x01,0x05,0x02,0x18] + +// +// Test op_sel/op_sel_hi +// + +v_pk_add_u16 v1, v2, v3 +// GFX11: encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x18] + +v_pk_add_u16 v1, v2, v3 op_sel:[0,0] +// GFX11: v_pk_add_u16 v1, v2, v3 ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x18] + +v_pk_add_u16 v1, v2, v3 op_sel_hi:[1,1] +// GFX11: v_pk_add_u16 v1, v2, v3 ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x18] + +v_pk_add_u16 v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,1] +// GFX11: v_pk_add_u16 v1, v2, v3 ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x18] + +v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,0] +// GFX11: v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,0] ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x00] + +v_pk_add_u16 v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,0] +// GFX11: v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,0] ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x00] + +v_pk_add_u16 v1, v2, v3 op_sel:[1,0] +// GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[1,0] ; encoding: [0x01,0x48,0x0a,0xcc,0x02,0x07,0x02,0x18] + +v_pk_add_u16 v1, v2, v3 op_sel:[0,1] +// GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[0,1] ; encoding: [0x01,0x50,0x0a,0xcc,0x02,0x07,0x02,0x18] + +v_pk_add_u16 v1, v2, v3 op_sel:[1,1] +// GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[1,1] ; encoding: [0x01,0x58,0x0a,0xcc,0x02,0x07,0x02,0x18] + +v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,1] +// GFX11: v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,1] ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x10] + +v_pk_add_u16 v1, v2, v3 op_sel_hi:[1,0] +// GFX11: v_pk_add_u16 v1, v2, v3 op_sel_hi:[1,0] ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x08] + +v_pk_add_u16 v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1] +// GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[1,1] ; encoding: [0x01,0x58,0x0a,0xcc,0x02,0x07,0x02,0x18] + +v_pk_add_u16 v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0] +// GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0] ; encoding: [0x01,0x48,0x0a,0xcc,0x02,0x07,0x02,0x08] + +v_pk_add_u16 v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1] +// GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1] ; encoding: [0x01,0x50,0x0a,0xcc,0x02,0x07,0x02,0x10] + +v_pk_add_u16 v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1] +// GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x01,0x48,0x0a,0xcc,0x02,0x07,0x02,0x10] + +v_pk_add_u16 v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0] +// GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0] ; encoding: [0x01,0x50,0x0a,0xcc,0x02,0x07,0x02,0x08] + +// +// Test src2 op_sel/op_sel_hi +// +v_pk_fma_f16 v8, v0, s0, v1 +// GFX11: encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x1c] + +v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,0,0] neg_hi:[0,0,0] +// GFX11: v_pk_fma_f16 v8, v0, s0, v1 ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x1c] + +v_pk_fma_f16 v8, v0, s0, v1 op_sel:[0,0,0] op_sel_hi:[1,1,1] neg_lo:[0,0,0] neg_hi:[0,0,0] +// GFX11: v_pk_fma_f16 v8, v0, s0, v1 ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x1c] + +v_pk_fma_f16 v8, v0, s0, v1 op_sel:[0,0,0] op_sel_hi:[1,1,1] +// GFX11: v_pk_fma_f16 v8, v0, s0, v1 ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x1c] + +v_pk_fma_f16 v8, v0, s0, v1 op_sel:[0,0,0] op_sel_hi:[0,0,0] +// GFX11: v_pk_fma_f16 v8, v0, s0, v1 op_sel_hi:[0,0,0] ; encoding: [0x08,0x00,0x0e,0xcc,0x00,0x01,0x04,0x04] + +v_pk_fma_f16 v8, v0, s0, v1 op_sel:[0,0,1] op_sel_hi:[0,0,1] +// GFX11: v_pk_fma_f16 v8, v0, s0, v1 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; encoding: [0x08,0x60,0x0e,0xcc,0x00,0x01,0x04,0x04] + +// +// Test neg_lo/neg_hi +// + +v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,1,1] +// GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,1,1] ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0xfc] + +v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[1,1,1] +// GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x0e,0xcc,0x00,0x01,0x04,0x1c] + +v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,1,1] neg_hi:[1,1,1] +// GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,1,1] neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x0e,0xcc,0x00,0x01,0x04,0xfc] + +v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,0,0] +// GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x3c] + +v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,1,0] +// GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x5c] + +v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,0,1] +// GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x9c] + +v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[1,0,0] +// GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x0e,0xcc,0x00,0x01,0x04,0x1c] + +v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[0,1,0] +// GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x0e,0xcc,0x00,0x01,0x04,0x1c] + +v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[0,0,1] +// GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x0e,0xcc,0x00,0x01,0x04,0x1c] + +// +// DOT +// +v_dot4_i32_iu8 v3, v4, v5, v6 +// GFX11: v_dot4_i32_iu8 v3, v4, v5, v6 ; encoding: [0x03,0x40,0x16,0xcc,0x04,0x0b,0x1a,0x1c] + +v_dot4_i32_iu8 v3, v4, v5, 0xf neg_lo:[1,1] +// GFX11: v_dot4_i32_iu8 v3, v4, v5, 15 neg_lo:[1,1,0] ; encoding: [0x03,0x40,0x16,0xcc,0x04,0x0b,0x3e,0x7a] + +v_dot4_u32_u8 v3, v4, v5, v6 +// GFX11: v_dot4_u32_u8 v3, v4, v5, v6 ; encoding: [0x03,0x40,0x17,0xcc,0x04,0x0b,0x1a,0x1c] + +v_dot4_i32_iu8 v3, v4, v5, 0xf +// GFX11: v_dot4_i32_iu8 v3, v4, v5, 15 ; encoding: [0x03,0x40,0x16,0xcc,0x04,0x0b,0x3e,0x1a] + +v_dot8_i32_iu4 v3, v4, v5, 0xf neg_lo:[1,0] +// GFX11: v_dot8_i32_iu4 v3, v4, v5, 15 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x18,0xcc,0x04,0x0b,0x3e,0x3a] + +v_dot8_i32_iu4 v3, v4, v5, v0 neg_lo:[0,0] +// GFX11: v_dot8_i32_iu4 v3, v4, v5, v0 ; encoding: [0x03,0x40,0x18,0xcc,0x04,0x0b,0x02,0x1c] + +v_dot8_u32_u4 v0, v1, v2, v3 +// GFX11: v_dot8_u32_u4 v0, v1, v2, v3 ; encoding: [0x00,0x40,0x19,0xcc,0x01,0x05,0x0e,0x1c] + +v_dot2_f32_f16 v0, v1, v2, v3 +// GFX11: v_dot2_f32_f16 v0, v1, v2, v3 ; encoding: [0x00,0x40,0x13,0xcc,0x01,0x05,0x0e,0x1c] + +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] +// GFX11: v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] ; encoding: [0x00,0x45,0x13,0xcc,0x01,0x05,0x0e,0x7c] + +v_dot2_f32_bf16 v0, v1, v2, v3 +// GFX11: v_dot2_f32_bf16 v0, v1, v2, v3 ; encoding: [0x00,0x40,0x1a,0xcc,0x01,0x05,0x0e,0x1c] + +v_dot2_f32_bf16 v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] +// GFX11: v_dot2_f32_bf16 v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] ; encoding: [0x00,0x45,0x1a,0xcc,0x01,0x05,0x0e,0x3c] + +// +// FMA_MIX +// +v_fma_mix_f32 v0, v1, v2, v3 +// GFX11: v_fma_mix_f32 v0, v1, v2, v3 ; encoding: [0x00,0x00,0x20,0xcc,0x01,0x05,0x0e,0x04] + +v_fma_mix_f32 v0, v1, v2, v3 op_sel:[0,0,1] +// GFX11: v_fma_mix_f32 v0, v1, v2, v3 op_sel:[0,0,1] ; encoding: [0x00,0x20,0x20,0xcc,0x01,0x05,0x0e,0x04] + +v_fma_mixlo_f16 v0, abs(v1), -v2, abs(v3) +// GFX11: v_fma_mixlo_f16 v0, |v1|, -v2, |v3| ; encoding: [0x00,0x05,0x21,0xcc,0x01,0x05,0x0e,0x44] + +v_fma_mixhi_f16 v0, v1, v2, v3 op_sel_hi:[1,1,1] clamp +// GFX11: v_fma_mixhi_f16 v0, v1, v2, v3 op_sel_hi:[1,1,1] clamp ; encoding: [0x00,0xc0,0x22,0xcc,0x01,0x05,0x0e,0x1c] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_all.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_all.txt --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_all.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_all.txt @@ -14550,6 +14550,27 @@ # GFX11: v_dot2acc_f32_f16 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x04] 0x01,0x05,0x0a,0x04 +# GFX11: v_dot4_i32_iu8 v3, v4, v5, v6 ; encoding: [0x03,0x40,0x16,0xcc,0x04,0x0b,0x1a,0x1c] +0x03,0x40,0x16,0xcc,0x04,0x0b,0x1a,0x1c + +# GFX11: v_dot4_i32_iu8 v3, v4, v5, 15 neg_lo:[1,1,0] ; encoding: [0x03,0x40,0x16,0xcc,0x04,0x0b,0x3e,0x7a] +0x03,0x40,0x16,0xcc,0x04,0x0b,0x3e,0x7a + +# GFX11: v_dot4_i32_iu8 v3, v4, v5, 15 ; encoding: [0x03,0x40,0x16,0xcc,0x04,0x0b,0x3e,0x1a] +0x03,0x40,0x16,0xcc,0x04,0x0b,0x3e,0x1a + +# GFX11: v_dot4_u32_u8 v3, v4, v5, v6 ; encoding: [0x03,0x40,0x17,0xcc,0x04,0x0b,0x1a,0x1c] +0x03,0x40,0x17,0xcc,0x04,0x0b,0x1a,0x1c + +# GFX11: v_dot8_i32_iu4 v3, v4, v5, 15 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x18,0xcc,0x04,0x0b,0x3e,0x3a] +0x03,0x40,0x18,0xcc,0x04,0x0b,0x3e,0x3a + +# GFX11: v_dot8_i32_iu4 v3, v4, v5, v0 ; encoding: [0x03,0x40,0x18,0xcc,0x04,0x0b,0x02,0x1c] +0x03,0x40,0x18,0xcc,0x04,0x0b,0x02,0x1c + +# GFX11: v_dot8_u32_u4 v0, v1, v2, v3 ; encoding: [0x00,0x40,0x19,0xcc,0x01,0x05,0x0e,0x1c] +0x00,0x40,0x19,0xcc,0x01,0x05,0x0e,0x1c + # GFX11: v_exp_f32_e32 v255, v1 ; encoding: [0x01,0x4b,0xfe,0x7f] 0x01,0x4b,0xfe,0x7f @@ -23246,6 +23267,132 @@ # GFX11: v_xor3_b32 v5, vcc_lo, v2, v3 ; encoding: [0x05,0x00,0x40,0xd6,0x6a,0x04,0x0e,0x04] 0x05,0x00,0x40,0xd6,0x6a,0x04,0x0e,0x04 +# GFX11: v_pk_add_f16 v0, v1, v2 ; encoding: [0x00,0x40,0x0f,0xcc,0x01,0x05,0x02,0x18] +0x00,0x40,0x0f,0xcc,0x01,0x05,0x02,0x18 + +# GFX11: v_pk_add_i16 v0, v1, v2 ; encoding: [0x00,0x40,0x02,0xcc,0x01,0x05,0x02,0x18] +0x00,0x40,0x02,0xcc,0x01,0x05,0x02,0x18 + +# GFX11: v_pk_add_u16 v0, v1, v2 ; encoding: [0x00,0x40,0x0a,0xcc,0x01,0x05,0x02,0x18] +0x00,0x40,0x0a,0xcc,0x01,0x05,0x02,0x18 + +# GFX11: v_pk_add_u16 v1, v2, v3 clamp ; encoding: [0x01,0xc0,0x0a,0xcc,0x02,0x07,0x02,0x18] +0x01,0xc0,0x0a,0xcc,0x02,0x07,0x02,0x18 + +# GFX11: v_pk_add_u16 v1, v2, v3 ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x18] +0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x18 + +# GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[0,1] ; encoding: [0x01,0x50,0x0a,0xcc,0x02,0x07,0x02,0x18] +0x01,0x50,0x0a,0xcc,0x02,0x07,0x02,0x18 + +# GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1] ; encoding: [0x01,0x50,0x0a,0xcc,0x02,0x07,0x02,0x10] +0x01,0x50,0x0a,0xcc,0x02,0x07,0x02,0x10 + +# GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0] ; encoding: [0x01,0x50,0x0a,0xcc,0x02,0x07,0x02,0x08] +0x01,0x50,0x0a,0xcc,0x02,0x07,0x02,0x08 + +# GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[1,0] ; encoding: [0x01,0x48,0x0a,0xcc,0x02,0x07,0x02,0x18] +0x01,0x48,0x0a,0xcc,0x02,0x07,0x02,0x18 + +# GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x01,0x48,0x0a,0xcc,0x02,0x07,0x02,0x10] +0x01,0x48,0x0a,0xcc,0x02,0x07,0x02,0x10 + +# GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0] ; encoding: [0x01,0x48,0x0a,0xcc,0x02,0x07,0x02,0x08] +0x01,0x48,0x0a,0xcc,0x02,0x07,0x02,0x08 + +# GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[1,1] ; encoding: [0x01,0x58,0x0a,0xcc,0x02,0x07,0x02,0x18] +0x01,0x58,0x0a,0xcc,0x02,0x07,0x02,0x18 + +# GFX11: v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,0] ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x00] +0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x00 + +# GFX11: v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,1] ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x10] +0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x10 + +# GFX11: v_pk_add_u16 v1, v2, v3 op_sel_hi:[1,0] ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x08] +0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x08 + +# GFX11: v_pk_ashrrev_i16 v0, v1, v2 ; encoding: [0x00,0x40,0x06,0xcc,0x01,0x05,0x02,0x18] +0x00,0x40,0x06,0xcc,0x01,0x05,0x02,0x18 + +# GFX11: v_pk_fma_f16 v0, v1, v2, v3 ; encoding: [0x00,0x40,0x0e,0xcc,0x01,0x05,0x0e,0x1c] +0x00,0x40,0x0e,0xcc,0x01,0x05,0x0e,0x1c + +# GFX11: v_pk_fma_f16 v8, v0, s0, v1 clamp ; encoding: [0x08,0xc0,0x0e,0xcc,0x00,0x01,0x04,0x1c] +0x08,0xc0,0x0e,0xcc,0x00,0x01,0x04,0x1c + +# GFX11: v_pk_fma_f16 v8, v0, s0, v1 ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x1c] +0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x1c + +# GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x0e,0xcc,0x00,0x01,0x04,0x1c] +0x08,0x44,0x0e,0xcc,0x00,0x01,0x04,0x1c + +# GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x0e,0xcc,0x00,0x01,0x04,0x1c] +0x08,0x42,0x0e,0xcc,0x00,0x01,0x04,0x1c + +# GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x0e,0xcc,0x00,0x01,0x04,0x1c] +0x08,0x41,0x0e,0xcc,0x00,0x01,0x04,0x1c + +# GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x0e,0xcc,0x00,0x01,0x04,0x1c] +0x08,0x47,0x0e,0xcc,0x00,0x01,0x04,0x1c + +# GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x9c] +0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x9c + +# GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x5c] +0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x5c + +# GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x3c] +0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x3c + +# GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,1,1] ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0xfc] +0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0xfc + +# GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,1,1] neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x0e,0xcc,0x00,0x01,0x04,0xfc] +0x08,0x47,0x0e,0xcc,0x00,0x01,0x04,0xfc + +# GFX11: v_pk_fma_f16 v8, v0, s0, v1 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; encoding: [0x08,0x60,0x0e,0xcc,0x00,0x01,0x04,0x04] +0x08,0x60,0x0e,0xcc,0x00,0x01,0x04,0x04 + +# GFX11: v_pk_fma_f16 v8, v0, s0, v1 op_sel_hi:[0,0,0] ; encoding: [0x08,0x00,0x0e,0xcc,0x00,0x01,0x04,0x04] +0x08,0x00,0x0e,0xcc,0x00,0x01,0x04,0x04 + +# GFX11: v_pk_lshlrev_b16 v0, v1, v2 ; encoding: [0x00,0x40,0x04,0xcc,0x01,0x05,0x02,0x18] +0x00,0x40,0x04,0xcc,0x01,0x05,0x02,0x18 + +# GFX11: v_pk_lshrrev_b16 v0, v1, v2 ; encoding: [0x00,0x40,0x05,0xcc,0x01,0x05,0x02,0x18] +0x00,0x40,0x05,0xcc,0x01,0x05,0x02,0x18 + +# GFX11: v_pk_max_f16 v0, v1, v2 ; encoding: [0x00,0x40,0x12,0xcc,0x01,0x05,0x02,0x18] +0x00,0x40,0x12,0xcc,0x01,0x05,0x02,0x18 + +# GFX11: v_pk_max_i16 v0, v1, v2 ; encoding: [0x00,0x40,0x07,0xcc,0x01,0x05,0x02,0x18] +0x00,0x40,0x07,0xcc,0x01,0x05,0x02,0x18 + +# GFX11: v_pk_max_u16 v0, v1, v2 ; encoding: [0x00,0x40,0x0c,0xcc,0x01,0x05,0x02,0x18] +0x00,0x40,0x0c,0xcc,0x01,0x05,0x02,0x18 + +# GFX11: v_pk_min_f16 v0, v1, v2 ; encoding: [0x00,0x40,0x11,0xcc,0x01,0x05,0x02,0x18] +0x00,0x40,0x11,0xcc,0x01,0x05,0x02,0x18 + +# GFX11: v_pk_min_i16 v0, v1, v2 clamp ; encoding: [0x00,0xc0,0x08,0xcc,0x01,0x05,0x02,0x18] +0x00,0xc0,0x08,0xcc,0x01,0x05,0x02,0x18 + +# GFX11: v_pk_min_i16 v0, v1, v2 ; encoding: [0x00,0x40,0x08,0xcc,0x01,0x05,0x02,0x18] +0x00,0x40,0x08,0xcc,0x01,0x05,0x02,0x18 + +# GFX11: v_pk_min_u16 v0, v1, v2 ; encoding: [0x00,0x40,0x0d,0xcc,0x01,0x05,0x02,0x18] +0x00,0x40,0x0d,0xcc,0x01,0x05,0x02,0x18 + +# GFX11: v_pk_mul_f16 v0, v1, v2 ; encoding: [0x00,0x40,0x10,0xcc,0x01,0x05,0x02,0x18] +0x00,0x40,0x10,0xcc,0x01,0x05,0x02,0x18 + +# GFX11: v_pk_mul_lo_u16 v0, v1, v2 ; encoding: [0x00,0x40,0x01,0xcc,0x01,0x05,0x02,0x18] +0x00,0x40,0x01,0xcc,0x01,0x05,0x02,0x18 + +# GFX11: v_pk_sub_i16 v0, v1, v2 ; encoding: [0x00,0x40,0x03,0xcc,0x01,0x05,0x02,0x18] +0x00,0x40,0x03,0xcc,0x01,0x05,0x02,0x18 + # W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x1b,0x00,0x00] # W64: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x1b,0x00,0x00] 0xfa,0x04,0x0a,0x40,0x01,0x1b,0x00,0x00 @@ -24560,6 +24707,69 @@ # GFX11: v_xnor_b32_e64_dpp v8, v5, v2 quad_perm:[1,0,2,3] row_mask:0x1 bank_mask:0x0 ; encoding: [0x08,0x00,0x1e,0xd5,0xfa,0x04,0x02,0x00,0x05,0xe1,0x00,0x10] 0x08,0x00,0x1e,0xd5,0xfa,0x04,0x02,0x00,0x05,0xe1,0x00,0x10 +# GFX11: v_dot2_f32_f16 v0, v1, v2, v3 ; encoding: [0x00,0x40,0x13,0xcc,0x01,0x05,0x0e,0x1c] +0x00,0x40,0x13,0xcc,0x01,0x05,0x0e,0x1c + +# GFX11: v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] ; encoding: [0x00,0x45,0x13,0xcc,0x01,0x05,0x0e,0x7c] +0x00,0x45,0x13,0xcc,0x01,0x05,0x0e,0x7c + +# GFX11: v_dot2_f32_bf16 v0, v1, v2, v3 ; encoding: [0x00,0x40,0x1a,0xcc,0x01,0x05,0x0e,0x1c] +0x00,0x40,0x1a,0xcc,0x01,0x05,0x0e,0x1c + +# GFX11: v_dot2_f32_bf16 v0, v1, v2, v3 neg_lo:[1,0,0] neg_hi:[1,0,1] ; encoding: [0x00,0x45,0x1a,0xcc,0x01,0x05,0x0e,0x3c] +0x00,0x45,0x1a,0xcc,0x01,0x05,0x0e,0x3c + +# GFX11: v_fma_mix_f32 v0, v1, v2, v3 ; encoding: [0x00,0x00,0x20,0xcc,0x01,0x05,0x0e,0x04] +0x00,0x00,0x20,0xcc,0x01,0x05,0x0e,0x04 + +# GFX11: v_fma_mix_f32 v0, v1, v2, v3 op_sel:[0,0,1] ; encoding: [0x00,0x20,0x20,0xcc,0x01,0x05,0x0e,0x04] +0x00,0x20,0x20,0xcc,0x01,0x05,0x0e,0x04 + +# GFX11: v_fma_mixhi_f16 v0, v1, v2, v3 op_sel_hi:[1,1,1] clamp ; encoding: [0x00,0xc0,0x22,0xcc,0x01,0x05,0x0e,0x1c] +0x00,0xc0,0x22,0xcc,0x01,0x05,0x0e,0x1c + +# GFX11: v_fma_mixlo_f16 v0, |v1|, -v2, |v3| ; encoding: [0x00,0x05,0x21,0xcc,0x01,0x05,0x0e,0x44] +0x00,0x05,0x21,0xcc,0x01,0x05,0x0e,0x44 + +# GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[0,1,1] neg_hi:[1,0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x05,0x13,0xcc,0xe9,0x04,0x0e,0xc4,0x01,0x77,0x39,0x05] +0x00,0x05,0x13,0xcc,0xe9,0x04,0x0e,0xc4,0x01,0x77,0x39,0x05 + +# GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xe ; encoding: [0x00,0x05,0x13,0xcc,0xfa,0x04,0x0e,0x64,0x01,0x1b,0x00,0xfe] +0x00,0x05,0x13,0xcc,0xfa,0x04,0x0e,0x64,0x01,0x1b,0x00,0xfe + +# GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[2,2,3,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; encoding: [0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x7a,0x0c,0xff] +0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x7a,0x0c,0xff + +# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x04,0x01,0xe4,0x00,0x00] +0xfa,0x04,0x0a,0x04,0x01,0xe4,0x00,0x00 + +# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x04,0x0a,0x04,0x01,0xe4,0x04,0x00] +0xfa,0x04,0x0a,0x04,0x01,0xe4,0x04,0x00 + +# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x04,0x01,0x77,0x39,0x05] +0xe9,0x04,0x0a,0x04,0x01,0x77,0x39,0x05 + +# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0a,0x04,0x01,0x77,0x39,0x05] +0xea,0x04,0x0a,0x04,0x01,0x77,0x39,0x05 + +# GFX11: v_fma_mix_f32_e64_dpp v0, v1, v2, v3 clamp dpp8:[2,2,2,2,4,4,4,4] fi:1 ; encoding: [0x00,0x80,0x20,0xcc,0xea,0x04,0x0e,0x04,0x01,0x92,0x44,0x92] +0x00,0x80,0x20,0xcc,0xea,0x04,0x0e,0x04,0x01,0x92,0x44,0x92 + +# GFX11: v_fma_mix_f32_e64_dpp v0, v1, v2, v3 dpp8:[2,2,2,2,4,4,4,4] ; encoding: [0x00,0x00,0x20,0xcc,0xe9,0x04,0x0e,0x04,0x01,0x92,0x44,0x92] +0x00,0x00,0x20,0xcc,0xe9,0x04,0x0e,0x04,0x01,0x92,0x44,0x92 + +# GFX11: v_fma_mix_f32_e64_dpp v0, v1, v2, v3 row_ror:7 row_mask:0xf bank_mask:0x1 bound_ctrl:1 ; encoding: [0x00,0x00,0x20,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x27,0x09,0xf1] +0x00,0x00,0x20,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x27,0x09,0xf1 + +# GFX11: v_fma_mixhi_f16_e64_dpp v0, v1, v2, v3 op_sel_hi:[1,1,1] clamp quad_perm:[0,2,3,1] row_mask:0x0 bank_mask:0xf ; encoding: [0x00,0xc0,0x22,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x78,0x00,0x0f] +0x00,0xc0,0x22,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x78,0x00,0x0f + +# GFX11: v_fma_mixlo_f16_e64_dpp v0, |v1|, -v2, |v3| dpp8:[2,2,2,2,4,4,4,4] ; encoding: [0x00,0x05,0x21,0xcc,0xe9,0x04,0x0e,0x44,0x01,0x92,0x44,0x92] +0x00,0x05,0x21,0xcc,0xe9,0x04,0x0e,0x44,0x01,0x92,0x44,0x92 + +# GFX11: v_fma_mixlo_f16_e64_dpp v0, |v1|, -v2, |v3| op_sel:[1,0,0] op_sel_hi:[1,0,0] dpp8:[2,2,2,2,4,4,4,4] ; encoding: [0x00,0x0d,0x21,0xcc,0xe9,0x04,0x0e,0x4c,0x01,0x92,0x44,0x92] +0x00,0x0d,0x21,0xcc,0xe9,0x04,0x0e,0x4c,0x01,0x92,0x44,0x92 + # GFX11: v_permlane64_b32 v5, v1 ; encoding: [0x01,0xcf,0x0a,0x7e] 0x01,0xcf,0x0a,0x7e