diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -566,11 +566,19 @@ "Has v_pk_fmac_f16 instruction" >; -def FeatureAtomicFaddInsts : SubtargetFeature<"atomic-fadd-insts", - "HasAtomicFaddInsts", +def FeatureAtomicFaddNoRtnInsts : SubtargetFeature<"atomic-fadd-no-rtn-insts", + "HasAtomicFaddNoRtnInsts", "true", "Has buffer_atomic_add_f32, buffer_atomic_pk_add_f16, global_atomic_add_f32, " - "global_atomic_pk_add_f16 instructions", + "global_atomic_pk_add_f16 instructions that don't return original value", + [FeatureFlatGlobalInsts] +>; + +def FeatureAtomicFaddRtnInsts : SubtargetFeature<"atomic-fadd-rtn-insts", + "HasAtomicFaddRtnInsts", + "true", + "Has buffer_atomic_add_f32, buffer_atomic_pk_add_f16, global_atomic_add_f32, " + "global_atomic_pk_add_f16 instructions that return original value", [FeatureFlatGlobalInsts] >; @@ -994,7 +1002,7 @@ FeatureDot7Insts, FeatureMAIInsts, FeaturePkFmacF16Inst, - FeatureAtomicFaddInsts, + FeatureAtomicFaddNoRtnInsts, FeatureSupportsSRAMECC, FeatureMFMAInlineLiteralBug, FeatureImageGather4D16Bug]>; @@ -1026,7 +1034,8 @@ FeaturePackedFP32Ops, FeatureMAIInsts, FeaturePkFmacF16Inst, - FeatureAtomicFaddInsts, + FeatureAtomicFaddNoRtnInsts, + FeatureAtomicFaddRtnInsts, FeatureImageInsts, FeatureMadMacF32Insts, FeatureSupportsSRAMECC, @@ -1062,7 +1071,8 @@ FeaturePackedFP32Ops, FeatureMAIInsts, FeaturePkFmacF16Inst, - FeatureAtomicFaddInsts, + FeatureAtomicFaddNoRtnInsts, + FeatureAtomicFaddRtnInsts, FeatureSupportsSRAMECC, FeaturePackedTID, FeatureArchitectedFlatScratch, @@ -1555,15 +1565,11 @@ def HasFmaLegacy32 : Predicate<"Subtarget->hasGFX10_3Insts()">, AssemblerPredicate<(any_of FeatureGFX10_3Insts)>; -def HasAtomicFaddInsts : Predicate<"Subtarget->hasAtomicFaddInsts()">, - AssemblerPredicate<(all_of FeatureAtomicFaddInsts)>; +def HasAtomicFaddNoRtnInsts : Predicate<"Subtarget->hasAtomicFaddNoRtnInsts()">, + AssemblerPredicate<(all_of FeatureAtomicFaddNoRtnInsts)>; -// Differentiate between two functionally equivalent, but incompatible -// encoding-wise FP atomics between gfx90* and gfx940 -def HasAtomicFaddInstsGFX90X : Predicate<"Subtarget->hasAtomicFaddInsts()">, - AssemblerPredicate<(all_of FeatureAtomicFaddInsts, (not FeatureGFX940Insts))>; -def HasAtomicFaddInstsGFX940 : Predicate<"Subtarget->hasAtomicFaddInsts()">, - AssemblerPredicate<(all_of FeatureAtomicFaddInsts, FeatureGFX940Insts)>; +def HasAtomicFaddRtnInsts : Predicate<"Subtarget->hasAtomicFaddRtnInsts()">, + AssemblerPredicate<(all_of FeatureAtomicFaddRtnInsts)>; def HasDsSrc2Insts : Predicate<"!Subtarget->hasDsSrc2Insts()">, AssemblerPredicate<(all_of FeatureDsSrc2Insts)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1300,7 +1300,7 @@ if (ST.hasGFX940Insts()) Atomic.legalFor({{V2S16, LocalPtr}}); } - if (ST.hasAtomicFaddInsts()) + if (ST.hasAtomicFaddNoRtnInsts()) Atomic.legalFor({{S32, GlobalPtr}}); if (ST.hasGFX90AInsts()) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -259,7 +259,8 @@ HasDot7Insts(false), HasMAIInsts(false), HasPkFmacF16Inst(false), - HasAtomicFaddInsts(false), + HasAtomicFaddNoRtnInsts(false), + HasAtomicFaddRtnInsts(false), SupportsSRAMECC(false), EnableSRAMECC(false), HasNoSdstCMPX(false), diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1109,7 +1109,7 @@ def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1", int_amdgcn_buffer_wbinvl1>; -let SubtargetPredicate = HasAtomicFaddInsts in { +let OtherPredicates = [HasAtomicFaddNoRtnInsts] in { defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN < "buffer_atomic_add_f32", VGPR_32, f32 >; @@ -1117,7 +1117,7 @@ "buffer_atomic_pk_add_f16", VGPR_32, v2f16 >; -let OtherPredicates = [isGFX90APlus] in { +let OtherPredicates = [HasAtomicFaddRtnInsts] in { defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_RTN < "buffer_atomic_add_f32", VGPR_32, f32, atomic_load_fadd_global_32 >; @@ -1533,15 +1533,17 @@ >; } -let SubtargetPredicate = HasAtomicFaddInsts in { +let SubtargetPredicate = HasAtomicFaddNoRtnInsts in { defm : BufferAtomicPatterns_NO_RTN; defm : BufferAtomicPatterns_NO_RTN; } -let SubtargetPredicate = isGFX90APlus in { +let SubtargetPredicate = HasAtomicFaddRtnInsts in { defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32">; defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16">; +} +let SubtargetPredicate = isGFX90APlus in { defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f64, "BUFFER_ATOMIC_ADD_F64">; defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, "BUFFER_ATOMIC_MIN_F64">; defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f64, "BUFFER_ATOMIC_MAX_F64">; @@ -2544,13 +2546,9 @@ def BUFFER_WBINVL1_VOL_vi : MUBUF_Real_vi <0x3f, BUFFER_WBINVL1_VOL>; } // End AssemblerPredicate = isGFX8GFX9 -let SubtargetPredicate = HasAtomicFaddInsts in { - defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Real_Atomic_vi <0x4d>; defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_Atomic_vi <0x4e>; -} // End SubtargetPredicate = HasAtomicFaddInsts - let SubtargetPredicate = isGFX90APlus in { defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Real_Atomic_vi<0x4f>; defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Real_Atomic_vi<0x50>; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -800,23 +800,23 @@ } // End SubtargetPredicate = isGFX10Plus, is_flat_global = 1 let is_flat_global = 1 in { -let OtherPredicates = [HasAtomicFaddInsts] in { +let OtherPredicates = [HasAtomicFaddNoRtnInsts] in { defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN < "global_atomic_add_f32", VGPR_32, f32 >; defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN < "global_atomic_pk_add_f16", VGPR_32, v2f16 >; -} // End OtherPredicates = [HasAtomicFaddInsts] +} // End OtherPredicates = [HasAtomicFaddNoRtnInsts] -let OtherPredicates = [isGFX90APlus] in { +let OtherPredicates = [HasAtomicFaddRtnInsts] in { defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_RTN < "global_atomic_add_f32", VGPR_32, f32, int_amdgcn_global_atomic_fadd >; defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_RTN < "global_atomic_pk_add_f16", VGPR_32, v2f16, int_amdgcn_global_atomic_fadd >; -} // End OtherPredicates = [isGFX90APlus] +} // End OtherPredicates = [HasAtomicFaddRtnInsts] } // End is_flat_global = 1 //===----------------------------------------------------------------------===// @@ -1268,14 +1268,17 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX_X2", "atomic_load_fmax_global", f64>; } -let OtherPredicates = [HasAtomicFaddInsts] in { +let OtherPredicates = [HasAtomicFaddNoRtnInsts] in { defm : GlobalFLATNoRtnAtomicPats ; defm : GlobalFLATNoRtnAtomicPats ; } -let OtherPredicates = [isGFX90APlus] in { +let OtherPredicates = [HasAtomicFaddRtnInsts] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_v2f16_global", v2f16>; +} + +let OtherPredicates = [isGFX90APlus] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", "atomic_load_fmin_global", f64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", "atomic_load_fmax_global", f64>; @@ -1596,7 +1599,8 @@ defm SCRATCH_STORE_DWORDX3 : FLAT_Real_AllAddr_vi <0x1e>; defm SCRATCH_STORE_DWORDX4 : FLAT_Real_AllAddr_vi <0x1f>; -let SubtargetPredicate = HasAtomicFaddInstsGFX90X in { +let SubtargetPredicate = isGFX8GFX9NotGFX940 in { +// These instructions are encoded differently on gfx90* and gfx940. defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Real_Atomics_vi <0x04d, 0>; defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Real_Atomics_vi <0x04e, 0>; } @@ -1626,12 +1630,11 @@ def _SADDR_RTN_gfx940 : FLAT_Real_gfx940 (NAME#"_SADDR_RTN")>; } -let SubtargetPredicate = HasAtomicFaddInstsGFX940 in { +let SubtargetPredicate = isGFX940Plus in { + // These instructions are encoded differently on gfx90* and gfx940. defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Real_Atomics_gfx940 <0x04d>; defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Real_Atomics_gfx940 <0x04e>; -} -let SubtargetPredicate = isGFX940Plus in { defm FLAT_ATOMIC_ADD_F64 : FLAT_Real_Atomics_gfx940<0x4f, FLAT_ATOMIC_ADD_F64>; defm FLAT_ATOMIC_MIN_F64 : FLAT_Real_Atomics_gfx940<0x50, FLAT_ATOMIC_MIN_F64>; defm FLAT_ATOMIC_MAX_F64 : FLAT_Real_Atomics_gfx940<0x51, FLAT_ATOMIC_MAX_F64>; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -144,7 +144,8 @@ bool HasDot7Insts; bool HasMAIInsts; bool HasPkFmacF16Inst; - bool HasAtomicFaddInsts; + bool HasAtomicFaddNoRtnInsts; + bool HasAtomicFaddRtnInsts; bool SupportsSRAMECC; // This should not be used directly. 'TargetID' tracks the dynamic settings @@ -713,9 +714,9 @@ return HasPkFmacF16Inst; } - bool hasAtomicFaddInsts() const { - return HasAtomicFaddInsts; - } + bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; } + + bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; } bool hasNoSdstCMPX() const { return HasNoSdstCMPX; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -12451,7 +12451,7 @@ unsigned AS = RMW->getPointerAddressSpace(); if ((AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) && - Subtarget->hasAtomicFaddInsts()) { + Subtarget->hasAtomicFaddNoRtnInsts()) { if (Subtarget->hasGFX940Insts()) return AtomicExpansionKind::None; diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll --- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll @@ -23,4 +23,4 @@ ret void } -attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "target-features"="+atomic-fadd-insts" "amdgpu-unsafe-fp-atomics"="true" } +attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "target-features"="+atomic-fadd-no-rtn-insts,+atomic-fadd-rtn-insts" "amdgpu-unsafe-fp-atomics"="true" } diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll --- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll @@ -800,5 +800,5 @@ } attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } -attributes #1 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "target-cpu"="gfx803" "target-features"="+atomic-fadd-insts" "amdgpu-unsafe-fp-atomics"="true" } +attributes #1 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "target-cpu"="gfx803" "target-features"="+atomic-fadd-no-rtn-insts,+atomic-fadd-rtn-insts" "amdgpu-unsafe-fp-atomics"="true" } attributes #2 = { "amdgpu-unsafe-fp-atomics"="true" } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll @@ -99,4 +99,4 @@ ret void } -attributes #0 = { "target-cpu"="gfx803" "target-features"="+atomic-fadd-insts" } +attributes #0 = { "target-cpu"="gfx803" "target-features"="+atomic-fadd-no-rtn-insts,+atomic-fadd-rtn-insts" }