AMDGPU: add a "mad-intra-fwd-bug" subtarget feature.

Summary of what this patch does:
  * AMDGPU.td: defines FeatureMADIntraFwdBug (sets HasMADIntraFwdBug), appends
    it to the feature list that immediately precedes FeatureISAVersion11_0_0,
    and adds the HasMADIntraFwdBug / HasNotMADIntraFwdBug predicates.
  * GCNSubtarget.h: adds the HasMADIntraFwdBug field and its accessor.
  * VOP3Instructions.td: renames the "@earlyclobber $vdst" V_MAD_[UI]64_[UI]32
    pseudos from *_gfx11 to *_strict and guards both variants with the new
    predicates; the GFX11 real encodings now attach to the plain pseudos.
  * AMDGPUISelDAGToDAG.cpp / AMDGPUInstructionSelector.cpp: choose the *_strict
    opcode from hasMADIntraFwdBug() instead of comparing the generation to GFX11.
  * inst-select-mad_64_32.mir: updates the GFX11 check lines for the rename.

NOTE(review): no VOP3be_Real_gfx11 entry for the *_strict pseudos is visible in
this diff. Confirm they still map to an encodable instruction.

NOTE(review): the line structure of this patch was rebuilt from a copy whose
newlines had been collapsed. Blank context lines were placed so that every
hunk matches its "@@" line counts; exact indentation widths and column
alignment are a best effort. Template arguments that had been stripped
(IMAD32_Pats<...>, VOP3_Profile<VOPProfile <...>, ...>, <int Value>) were
restored; verify them against the tree before applying.

Index: llvm/lib/Target/AMDGPU/AMDGPU.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPU.td
+++ llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -279,6 +279,12 @@
   "Image Gather4 D16 hardware bug"
 >;
 
+def FeatureMADIntraFwdBug : SubtargetFeature<"mad-intra-fwd-bug",
+  "HasMADIntraFwdBug",
+  "true",
+  "MAD_U64/I64 intra instruction forwarding bug"
+>;
+
 class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <
   "ldsbankcount"#Value,
   "LDSBankCount",
@@ -1299,7 +1305,8 @@
    FeatureImageInsts,
    FeaturePackedTID,
    FeatureVcmpxPermlaneHazard,
-   FeatureBackOffBarrier]>;
+   FeatureBackOffBarrier,
+   FeatureMADIntraFwdBug]>;
 
 def FeatureISAVersion11_0_0 : FeatureSet<
   !listconcat(FeatureISAVersion11_Common.Features,
@@ -1782,6 +1789,10 @@
 def HasUnalignedAccessMode : Predicate<"Subtarget->hasUnalignedAccessMode()">,
   AssemblerPredicate<(all_of FeatureUnalignedAccessMode)>;
 
+def HasMADIntraFwdBug : Predicate<"Subtarget->hasMADIntraFwdBug()">;
+
+def HasNotMADIntraFwdBug : Predicate<"!Subtarget->hasMADIntraFwdBug()">;
+
 // Include AMDGPU TD files
 include "SISchedule.td"
 include "GCNProcessors.td"
Index: llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1008,9 +1008,9 @@
   SDLoc SL(N);
   bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
   unsigned Opc;
-  if (Subtarget->getGeneration() == AMDGPUSubtarget::GFX11)
-    Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
-                 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
+  if (Subtarget->hasMADIntraFwdBug())
+    Opc = Signed ? AMDGPU::V_MAD_I64_I32_strict_e64
+                 : AMDGPU::V_MAD_U64_U32_strict_e64;
   else
     Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
 
@@ -1026,9 +1026,9 @@
   SDLoc SL(N);
   bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
   unsigned Opc;
-  if (Subtarget->getGeneration() == AMDGPUSubtarget::GFX11)
-    Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
-                 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
+  if (Subtarget->hasMADIntraFwdBug())
+    Opc = Signed ? AMDGPU::V_MAD_I64_I32_strict_e64
+                 : AMDGPU::V_MAD_U64_U32_strict_e64;
   else
     Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
 
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -465,9 +465,9 @@
   const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
 
   unsigned Opc;
-  if (Subtarget->getGeneration() == AMDGPUSubtarget::GFX11)
-    Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
-                     : AMDGPU::V_MAD_I64_I32_gfx11_e64;
+  if (Subtarget->hasMADIntraFwdBug())
+    Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_strict_e64
+                     : AMDGPU::V_MAD_I64_I32_strict_e64;
   else
     Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
   I.setDesc(TII.get(Opc));
Index: llvm/lib/Target/AMDGPU/GCNSubtarget.h
===================================================================
--- llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -193,6 +193,7 @@
   bool HasImageStoreD16Bug = false;
   bool HasImageGather4D16Bug = false;
   bool HasGFX11FullVGPRs = false;
+  bool HasMADIntraFwdBug = false;
   bool HasVOPDInsts = false;
 
   // Dummy feature to use for assembler in tablegen.
@@ -910,6 +911,8 @@
 
   bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }
 
+  bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; }
+
   bool hasNSAEncoding() const { return HasNSAEncoding; }
 
   unsigned getNSAMaxSize() const { return NSAMaxSize; }
Index: llvm/lib/Target/AMDGPU/VOP3Instructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -289,18 +289,17 @@
 } // End Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32]
 } // End SubtargetPredicate = isGFX7Plus
 
-let isCommutable = 1 in {
-let SchedRW = [WriteIntMul, WriteSALU] in {
-let SubtargetPredicate = isGFX7GFX8GFX9GFX10 in {
-defm V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>;
-defm V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
-}
-let SubtargetPredicate = isGFX11Only, Constraints = "@earlyclobber $vdst" in {
-defm V_MAD_U64_U32_gfx11 : VOP3Inst <"v_mad_u64_u32_gfx11", VOP3b_I64_I1_I32_I32_I64>;
-defm V_MAD_I64_I32_gfx11 : VOP3Inst <"v_mad_i64_i32_gfx11", VOP3b_I64_I1_I32_I32_I64>;
-} // End SubtargetPredicate = isGFX11Only, Constraints = "@earlyclobber $vdst"
-} // End SchedRW = [WriteIntMul, WriteSALU]
-} // End isCommutable = 1
+let isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU] in {
+  let SubtargetPredicate = isGFX7Plus, OtherPredicates = [HasNotMADIntraFwdBug] in {
+    defm V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>;
+    defm V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
+  }
+  let SubtargetPredicate = isGFX11Only, OtherPredicates = [HasMADIntraFwdBug],
+      Constraints = "@earlyclobber $vdst" in {
+    defm V_MAD_U64_U32_strict : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>;
+    defm V_MAD_I64_I32_strict : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
+  }
+} // End isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU]
 
 let FPDPRounding = 1 in {
 
@@ -658,10 +657,11 @@
   >;
 }
 
-let SubtargetPredicate = isGFX9GFX10 in // exclude pre-GFX9 where it was slow
-defm : IMAD32_Pats<V_MAD_U64_U32_e64>;
-let SubtargetPredicate = isGFX11Only in
-defm : IMAD32_Pats<V_MAD_U64_U32_gfx11_e64>;
+// exclude pre-GFX9 where it was slow
+let OtherPredicates = [HasNotMADIntraFwdBug], SubtargetPredicate = isGFX9Plus in
+  defm : IMAD32_Pats<V_MAD_U64_U32_e64>;
+let OtherPredicates = [HasMADIntraFwdBug], SubtargetPredicate = isGFX11Only in
+  defm : IMAD32_Pats<V_MAD_U64_U32_strict_e64>;
 
 def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3_OPSEL> {
   let InsVOP3OpSel = (ins IntOpSelMods:$src0_modifiers, VRegSrc_32:$src0,
@@ -919,8 +919,8 @@
 defm V_DOT2_BF16_BF16 : VOP3Dot_Realtriple_gfx11<0x267>;
 defm V_DIV_SCALE_F32 : VOP3be_Real_gfx11<0x2fc, "V_DIV_SCALE_F32", "v_div_scale_f32">;
 defm V_DIV_SCALE_F64 : VOP3be_Real_gfx11<0x2fd, "V_DIV_SCALE_F64", "v_div_scale_f64">;
-defm V_MAD_U64_U32_gfx11 : VOP3be_Real_gfx11<0x2fe, "V_MAD_U64_U32_gfx11", "v_mad_u64_u32">;
-defm V_MAD_I64_I32_gfx11 : VOP3be_Real_gfx11<0x2ff, "V_MAD_I64_I32_gfx11", "v_mad_i64_i32">;
+defm V_MAD_U64_U32 : VOP3be_Real_gfx11<0x2fe, "V_MAD_U64_U32", "v_mad_u64_u32">;
+defm V_MAD_I64_I32 : VOP3be_Real_gfx11<0x2ff, "V_MAD_I64_I32", "v_mad_i64_i32">;
 defm V_ADD_NC_U16 : VOP3Only_Realtriple_gfx11<0x303>;
 defm V_SUB_NC_U16 : VOP3Only_Realtriple_gfx11<0x304>;
 defm V_MUL_LO_U16_t16 : VOP3Only_Realtriple_t16_gfx11<0x305, "v_mul_lo_u16">;
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir
@@ -24,8 +24,8 @@
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3
-    ; GFX11-NEXT: [[V_MAD_U64_U32_gfx11_e64_:%[0-9]+]]:vreg_64, [[V_MAD_U64_U32_gfx11_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_U64_U32_gfx11_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec
-    ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MAD_U64_U32_gfx11_e64_]], implicit [[V_MAD_U64_U32_gfx11_e64_1]]
+    ; GFX11-NEXT: [[V_MAD_U64_U32_strict_e64_:%[0-9]+]]:vreg_64, [[V_MAD_U64_U32_strict_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_U64_U32_strict_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MAD_U64_U32_strict_e64_]], implicit [[V_MAD_U64_U32_strict_e64_1]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
     %2:vgpr(s32) = COPY $vgpr2
@@ -57,8 +57,8 @@
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3
-    ; GFX11-NEXT: [[V_MAD_I64_I32_gfx11_e64_:%[0-9]+]]:vreg_64, [[V_MAD_I64_I32_gfx11_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_I64_I32_gfx11_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec
-    ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MAD_I64_I32_gfx11_e64_]], implicit [[V_MAD_I64_I32_gfx11_e64_1]]
+    ; GFX11-NEXT: [[V_MAD_I64_I32_strict_e64_:%[0-9]+]]:vreg_64, [[V_MAD_I64_I32_strict_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_I64_I32_strict_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MAD_I64_I32_strict_e64_]], implicit [[V_MAD_I64_I32_strict_e64_1]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
     %2:vgpr(s32) = COPY $vgpr2