Index: llvm/lib/Target/AMDGPU/AMDGPU.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPU.td +++ llvm/lib/Target/AMDGPU/AMDGPU.td @@ -279,6 +279,12 @@ "Image Gather4 D16 hardware bug" >; +def FeatureMADIntraFwdBug : SubtargetFeature<"mad-intra-fwd-bug", + "HasMADIntraFwdBug", + "true", + "MAD_U64/I64 intra instruction forwarding bug" +>; + class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature < "ldsbankcount"#Value, "LDSBankCount", @@ -922,7 +928,8 @@ FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts, FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureGFX10A16, FeatureFastDenormalF32, FeatureG16, - FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess + FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, + FeatureMADIntraFwdBug ] >; @@ -1765,6 +1772,10 @@ def HasUnalignedAccessMode : Predicate<"Subtarget->hasUnalignedAccessMode()">, AssemblerPredicate<(all_of FeatureUnalignedAccessMode)>; +def HasMADIntraFwdBug : Predicate<"Subtarget->hasMADIntraFwdBug()">; + +def HasNotMADIntraFwdBug : Predicate<"!Subtarget->hasMADIntraFwdBug()">; + // Include AMDGPU TD files include "SISchedule.td" include "GCNProcessors.td" Index: llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1009,9 +1009,9 @@ SDLoc SL(N); bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32; unsigned Opc; - if (Subtarget->getGeneration() == AMDGPUSubtarget::GFX11) - Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64 - : AMDGPU::V_MAD_U64_U32_gfx11_e64; + if (Subtarget->hasMADIntraFwdBug()) + Opc = Signed ? AMDGPU::V_MAD_I64_I32_strict_e64 + : AMDGPU::V_MAD_U64_U32_strict_e64; else Opc = Signed ? 
AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64; @@ -1027,9 +1027,9 @@ SDLoc SL(N); bool Signed = N->getOpcode() == ISD::SMUL_LOHI; unsigned Opc; - if (Subtarget->getGeneration() == AMDGPUSubtarget::GFX11) - Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64 - : AMDGPU::V_MAD_U64_U32_gfx11_e64; + if (Subtarget->hasMADIntraFwdBug()) + Opc = Signed ? AMDGPU::V_MAD_I64_I32_strict_e64 + : AMDGPU::V_MAD_U64_U32_strict_e64; else Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64; Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -465,9 +465,9 @@ const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32; unsigned Opc; - if (Subtarget->getGeneration() == AMDGPUSubtarget::GFX11) - Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64 - : AMDGPU::V_MAD_I64_I32_gfx11_e64; + if (Subtarget->hasMADIntraFwdBug()) + Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_strict_e64 + : AMDGPU::V_MAD_I64_I32_strict_e64; else Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64; I.setDesc(TII.get(Opc)); Index: llvm/lib/Target/AMDGPU/GCNSubtarget.h =================================================================== --- llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -191,6 +191,7 @@ bool HasFlatSegmentOffsetBug = false; bool HasImageStoreD16Bug = false; bool HasImageGather4D16Bug = false; + bool HasMADIntraFwdBug = false; bool HasVOPDInsts = false; // Dummy feature to use for assembler in tablegen. 
@@ -906,6 +907,8 @@ bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; } + bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; } + bool hasNSAEncoding() const { return HasNSAEncoding; } unsigned getNSAMaxSize() const { return NSAMaxSize; } Index: llvm/lib/Target/AMDGPU/VOP3Instructions.td =================================================================== --- llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -285,18 +285,17 @@ } // End Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] } // End SubtargetPredicate = isGFX7Plus -let isCommutable = 1 in { -let SchedRW = [WriteIntMul, WriteSALU] in { -let SubtargetPredicate = isGFX7GFX8GFX9GFX10 in { -defm V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>; -defm V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>; -} -let SubtargetPredicate = isGFX11Only, Constraints = "@earlyclobber $vdst" in { -defm V_MAD_U64_U32_gfx11 : VOP3Inst <"v_mad_u64_u32_gfx11", VOP3b_I64_I1_I32_I32_I64>; -defm V_MAD_I64_I32_gfx11 : VOP3Inst <"v_mad_i64_i32_gfx11", VOP3b_I64_I1_I32_I32_I64>; -} // End SubtargetPredicate = isGFX11Only, Constraints = "@earlyclobber $vdst" -} // End SchedRW = [WriteIntMul, WriteSALU] -} // End isCommutable = 1 +let isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU] in { + let SubtargetPredicate = isGFX7Plus, OtherPredicates = [HasNotMADIntraFwdBug] in { + defm V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>; + defm V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>; + } + let SubtargetPredicate = isGFX11Only, OtherPredicates = [HasMADIntraFwdBug], + Constraints = "@earlyclobber $vdst" in { + defm V_MAD_U64_U32_strict : VOP3Inst <"v_mad_u64_u32_strict", VOP3b_I64_I1_I32_I32_I64>; + defm V_MAD_I64_I32_strict : VOP3Inst <"v_mad_i64_i32_strict", VOP3b_I64_I1_I32_I32_I64>; + } +} // End isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU] let FPDPRounding = 1 in { @@ 
-654,10 +653,11 @@ >; } -let SubtargetPredicate = isGFX9GFX10 in // exclude pre-GFX9 where it was slow -defm : IMAD32_Pats<V_MAD_U64_U32_e64>; -let SubtargetPredicate = isGFX11Only in -defm : IMAD32_Pats<V_MAD_U64_U32_gfx11_e64>; +// exclude pre-GFX9 where it was slow +let OtherPredicates = [HasNotMADIntraFwdBug], SubtargetPredicate = isGFX9Plus in + defm : IMAD32_Pats<V_MAD_U64_U32_e64>; +let OtherPredicates = [HasMADIntraFwdBug], SubtargetPredicate = isGFX11Only in + defm : IMAD32_Pats<V_MAD_U64_U32_strict_e64>; def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3_OPSEL> { let Src0RC64 = VRegSrc_32; @@ -918,8 +918,14 @@ defm V_DOT2_BF16_BF16 : VOP3Dot_Realtriple_gfx11<0x267>; defm V_DIV_SCALE_F32 : VOP3be_Real_gfx11<0x2fc, "V_DIV_SCALE_F32", "v_div_scale_f32">; defm V_DIV_SCALE_F64 : VOP3be_Real_gfx11<0x2fd, "V_DIV_SCALE_F64", "v_div_scale_f64">; -defm V_MAD_U64_U32_gfx11 : VOP3be_Real_gfx11<0x2fe, "V_MAD_U64_U32_gfx11", "v_mad_u64_u32">; -defm V_MAD_I64_I32_gfx11 : VOP3be_Real_gfx11<0x2ff, "V_MAD_I64_I32_gfx11", "v_mad_i64_i32">; +let OtherPredicates = [HasMADIntraFwdBug] in { +defm V_MAD_U64_U32_strict : VOP3be_Real_gfx11<0x2fe, "V_MAD_U64_U32_strict", "v_mad_u64_u32">; +defm V_MAD_I64_I32_strict : VOP3be_Real_gfx11<0x2ff, "V_MAD_I64_I32_strict", "v_mad_i64_i32">; +} +let OtherPredicates = [HasNotMADIntraFwdBug], DecoderNamespace = "" in { +defm V_MAD_U64_U32 : VOP3be_Real_gfx11<0x2fe, "V_MAD_U64_U32", "v_mad_u64_u32">; +defm V_MAD_I64_I32 : VOP3be_Real_gfx11<0x2ff, "V_MAD_I64_I32", "v_mad_i64_i32">; +} defm V_ADD_NC_U16 : VOP3Only_Realtriple_gfx11<0x303>; defm V_SUB_NC_U16 : VOP3Only_Realtriple_gfx11<0x304>; defm V_MUL_LO_U16 : VOP3Only_Realtriple_gfx11<0x305>; Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir @@ -24,8 +24,8 @@ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; 
GFX11-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3 - ; GFX11-NEXT: [[V_MAD_U64_U32_gfx11_e64_:%[0-9]+]]:vreg_64, [[V_MAD_U64_U32_gfx11_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_U64_U32_gfx11_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MAD_U64_U32_gfx11_e64_]], implicit [[V_MAD_U64_U32_gfx11_e64_1]] + ; GFX11-NEXT: [[V_MAD_U64_U32_strict_e64_:%[0-9]+]]:vreg_64, [[V_MAD_U64_U32_strict_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_U64_U32_strict_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MAD_U64_U32_strict_e64_]], implicit [[V_MAD_U64_U32_strict_e64_1]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2 @@ -57,8 +57,8 @@ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3 - ; GFX11-NEXT: [[V_MAD_I64_I32_gfx11_e64_:%[0-9]+]]:vreg_64, [[V_MAD_I64_I32_gfx11_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_I64_I32_gfx11_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MAD_I64_I32_gfx11_e64_]], implicit [[V_MAD_I64_I32_gfx11_e64_1]] + ; GFX11-NEXT: [[V_MAD_I64_I32_strict_e64_:%[0-9]+]]:vreg_64, [[V_MAD_I64_I32_strict_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_I64_I32_strict_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MAD_I64_I32_strict_e64_]], implicit [[V_MAD_I64_I32_strict_e64_1]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2