diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1629,6 +1629,8 @@ void cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands); void cvtVOP3(MCInst &Inst, const OperandVector &Operands); void cvtVOP3P(MCInst &Inst, const OperandVector &Operands); + void cvtVOP3P(MCInst &Inst, const OperandVector &Operands, + OptionalImmIndexMap &OptionalIdx); void cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands); @@ -7491,16 +7493,13 @@ cvtVOP3(Inst, Operands, OptionalIdx); } -void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, - const OperandVector &Operands) { - OptionalImmIndexMap OptIdx; +void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, + OptionalImmIndexMap &OptIdx) { const int Opc = Inst.getOpcode(); const MCInstrDesc &Desc = MII.get(Opc); const bool IsPacked = (Desc.TSFlags & SIInstrFlags::IsPacked) != 0; - cvtVOP3(Inst, Operands, OptIdx); - if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in) != -1) { assert(!IsPacked); Inst.addOperand(Inst.getOperand(0)); @@ -7509,7 +7508,10 @@ // FIXME: This is messy. Parse the modifiers as if it was a normal VOP3 // instruction, and then figure out where to actually put the modifiers - addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyOpSel); + int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel); + if (OpSelIdx != -1) { + addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyOpSel); + } int OpSelHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi); if (OpSelHiIdx != -1) { @@ -7520,7 +7522,6 @@ int NegLoIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_lo); if (NegLoIdx != -1) { - assert(IsPacked); addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegLo); addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegHi); } @@ -7532,16 +7533,16 @@ AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers }; - int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel); - - unsigned OpSel = Inst.getOperand(OpSelIdx).getImm(); + unsigned OpSel = 0; unsigned OpSelHi = 0; unsigned NegLo = 0; unsigned NegHi = 0; - if (OpSelHiIdx != -1) { + if (OpSelIdx != -1) + OpSel = Inst.getOperand(OpSelIdx).getImm(); + + if (OpSelHiIdx != -1) OpSelHi = Inst.getOperand(OpSelHiIdx).getImm(); - } if (NegLoIdx != -1) { int NegHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi); @@ -7574,6 +7575,12 @@ } } +void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands) { + OptionalImmIndexMap OptIdx; + cvtVOP3(Inst, Operands, OptIdx); + cvtVOP3P(Inst, Operands, OptIdx); +} + //===----------------------------------------------------------------------===// // dpp //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -10,71 +10,82 @@ // VOP3P Classes //===----------------------------------------------------------------------===// -class VOP3PInst : - VOP3P_Pseudo.ret, getVOP3Pat.ret) ->; +// Used for FMA_MIX* and MAD_MIX* insts +// Their operands are only sort of f16 operands. Depending on +// op_sel_hi, these may be interpreted as f32. The inline immediate +// values are really f16 converted to f32, so we treat these as f16 +// operands. +class VOP3P_Mix_Profile : VOP3_Profile { + bit UseTiedOutput = useTiedOutput; + + dag srcs = + (ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0, + FP16InputMods:$src1_modifiers, VCSrc_f16:$src1, + FP16InputMods:$src2_modifiers, VCSrc_f16:$src2); + + // FIXME: clampmod0 misbehaves with the non-default vdst_in + // following it. For now workaround this by requiring clamp + // in tied patterns. This should use undef_tied_input, but it + // seems underdeveloped and doesn't apply the right register + // class constraints. + dag mods = !con(!if(UseTiedOutput, (ins clampmod:$clamp, VGPR_32:$vdst_in), + (ins clampmod0:$clamp)), + (ins op_sel0:$op_sel, op_sel_hi0:$op_sel_hi)); + // We use Ins64 because that is the one which populates InOperandList + // due to the logic in class VOP3_Pseudo + let Ins64 = !con(srcs, mods); + let Asm64 = + "$vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$op_sel$op_sel_hi$clamp"; +} + +multiclass VOP3PInst { + def NAME : VOP3P_Pseudo.ret, + getVOP3Pat.ret)>; +} + // Non-packed instructions that use the VOP3P encoding. // VOP3 neg/abs and VOP3P opsel/opsel_hi modifiers are allowed. -class VOP3_VOP3PInst : - VOP3P_Pseudo { - // These operands are only sort of f16 operands. Depending on - // op_sel_hi, these may be interpreted as f32. The inline immediate - // values are really f16 converted to f32, so we treat these as f16 - // operands. - let InOperandList = - !con( - !con( - (ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0, - FP16InputMods:$src1_modifiers, VCSrc_f16:$src1, - FP16InputMods:$src2_modifiers, VCSrc_f16:$src2), - // FIXME: clampmod0 misbehaves with the non-default vdst_in - // following it. For now workaround this by requiring clamp - // in tied patterns. This should use undef_tied_input, but it - // seems underdeveloped and doesn't apply the right register - // class constraints. - !if(UseTiedOutput, (ins clampmod:$clamp, VGPR_32:$vdst_in), - (ins clampmod0:$clamp))), - (ins op_sel0:$op_sel, op_sel_hi0:$op_sel_hi)); - - let Constraints = !if(UseTiedOutput, "$vdst = $vdst_in", ""); - let DisableEncoding = !if(UseTiedOutput, "$vdst_in", ""); - let AsmOperands = - "$vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$op_sel$op_sel_hi$clamp"; +multiclass VOP3_VOP3PInst { + def NAME : VOP3P_Pseudo { + let Constraints = !if(P.UseTiedOutput, "$vdst = $vdst_in", ""); + let DisableEncoding = !if(P.UseTiedOutput, "$vdst_in", ""); + } } let isCommutable = 1 in { -def V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile>; -def V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile>; +defm V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile>; +defm V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile>; let FPDPRounding = 1 in { -def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile, any_fma>; -def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile, any_fadd>; -def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile, any_fmul>; +defm V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile, any_fma>; +defm V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile, any_fadd>; +defm V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile, any_fmul>; } // End FPDPRounding = 1 -def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile, fmaxnum_like>; -def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile, fminnum_like>; +defm V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile, fmaxnum_like>; +defm V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile, fminnum_like>; -def V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile, add>; -def V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile>; -def V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3_Profile, mul>; +defm V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile, add>; +defm V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile>; +defm V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3_Profile, mul>; -def V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3_Profile, smin>; -def V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3_Profile, umin>; -def V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3_Profile, smax>; -def V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile, umax>; +defm V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3_Profile, smin>; +defm V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3_Profile, umin>; +defm V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3_Profile, smax>; +defm V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile, umax>; } -def V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3_Profile>; -def V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile, sub>; +defm V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3_Profile>; +defm V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile, sub>; -def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile, lshl_rev>; -def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile, ashr_rev>; -def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile, lshr_rev>; +defm V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile, lshl_rev>; +defm V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile, ashr_rev>; +defm V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile, lshr_rev>; let SubtargetPredicate = HasVOP3PInsts in { @@ -169,14 +180,14 @@ // Size of src arguments (16/32) is controlled by op_sel. // For 16-bit src arguments their location (hi/lo) are controlled by op_sel_hi. let isCommutable = 1, mayRaiseFPException = 0 in { -def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile>; +defm V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3P_Mix_Profile>; let FPDPRounding = 1 in { // Clamp modifier is applied after conversion to f16. -def V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3_Profile, 1>; +defm V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3P_Mix_Profile>; let ClampLo = 0, ClampHi = 1 in { -def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile, 1>; +defm V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3P_Mix_Profile>; } } // End FPDPRounding = 1 } @@ -188,14 +199,14 @@ // Essentially the same as the mad_mix versions let SubtargetPredicate = HasFmaMixInsts in { let isCommutable = 1 in { -def V_FMA_MIX_F32 : VOP3_VOP3PInst<"v_fma_mix_f32", VOP3_Profile>; +defm V_FMA_MIX_F32 : VOP3_VOP3PInst<"v_fma_mix_f32", VOP3P_Mix_Profile>; let FPDPRounding = 1 in { // Clamp modifier is applied after conversion to f16. -def V_FMA_MIXLO_F16 : VOP3_VOP3PInst<"v_fma_mixlo_f16", VOP3_Profile, 1>; +defm V_FMA_MIXLO_F16 : VOP3_VOP3PInst<"v_fma_mixlo_f16", VOP3P_Mix_Profile>; let ClampLo = 0, ClampHi = 1 in { -def V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3_Profile, 1>; +defm V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3P_Mix_Profile>; } } // End FPDPRounding = 1 } @@ -287,30 +298,30 @@ let IsDOT = 1 in { let SubtargetPredicate = HasDot2Insts in { -def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", +defm V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile, int_amdgcn_sdot2, 1>; -def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", +defm V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile, int_amdgcn_udot2, 1>; } // End SubtargetPredicate = HasDot2Insts let SubtargetPredicate = HasDot7Insts in { -def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", +defm V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile, AMDGPUfdot2, 1/*ExplicitClamp*/>; -def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", +defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile, int_amdgcn_udot4, 1>; -def V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", +defm V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", VOP3_Profile, int_amdgcn_udot8, 1>; } // End SubtargetPredicate = HasDot7Insts let SubtargetPredicate = HasDot1Insts in { -def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", +defm V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile, int_amdgcn_sdot4, 1>; -def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", +defm V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile, int_amdgcn_sdot8, 1>; } // End SubtargetPredicate = HasDot1Insts @@ -324,7 +335,7 @@ def : GCNPat < !cast(!foldl((i32 i32:$src2), [0, 1, 2, 3], lhs, y, (add_oneuse lhs, (!cast("Mul"#Type#"_Elt"#y) i32:$src0, i32:$src1)))), - (!cast("V_DOT4_"#Type#"32_"#Type#8) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>; + (!cast("V_DOT4_"#Type#"32_"#Type#8) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>; foreach Type = ["U", "I"] in let SubtargetPredicate = !cast("V_DOT8_"#Type#"32_"#Type#4).SubtargetPredicate in @@ -332,7 +343,7 @@ !cast(!foldl((add_oneuse i32:$src2, (!cast("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)), [1, 2, 3, 4, 5, 6, 7], lhs, y, (NonACAdd_oneuse lhs, (!cast("Mul"#Type#y#"_4bit") i32:$src0, i32:$src1)))), - (!cast("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>; + (!cast("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>; // Different variants of dot8 code-gen dag patterns are not generated through table-gen due to a huge increase // in the compile time. Directly handle the pattern generated by the FE here. @@ -342,7 +353,7 @@ !cast(!foldl((add_oneuse i32:$src2, (!cast("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)), [7, 1, 2, 3, 4, 5, 6], lhs, y, (NonACAdd_oneuse lhs, (!cast("Mul"#Type#y#"_4bit") i32:$src0, i32:$src1)))), - (!cast("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>; + (!cast("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>; def ADst_32 : VOPDstOperand; def ADst_64 : VOPDstOperand; @@ -471,10 +482,10 @@ } // End Predicates = [isGFX90APlus] let SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1 in { - def V_PK_FMA_F32 : VOP3PInst<"v_pk_fma_f32", VOP3_Profile, any_fma>; - def V_PK_MUL_F32 : VOP3PInst<"v_pk_mul_f32", VOP3_Profile, any_fmul>; - def V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3_Profile, any_fadd>; - def V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3_Profile>; + defm V_PK_FMA_F32 : VOP3PInst<"v_pk_fma_f32", VOP3_Profile, any_fma>; + defm V_PK_MUL_F32 : VOP3PInst<"v_pk_mul_f32", VOP3_Profile, any_fmul>; + defm V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3_Profile, any_fadd>; + defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3_Profile>; } // End SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1 def : MnemonicAlias<"v_accvgpr_read", "v_accvgpr_read_b32">;