diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -215,7 +215,7 @@ TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_f64, "dd*1d", "t", "gfx90a-insts") TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_f32, "ff*1f", "t", "gfx90a-insts") -TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2f16, "V2hV2h*1V2h", "t", "gfx90a-insts") +TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2f16, "V2hV2h*1V2h", "t", "atomic-buffer-global-pk-add-f16-insts") TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fmin_f64, "dd*1d", "t", "gfx90a-insts") TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fmax_f64, "dd*1d", "t", "gfx90a-insts") @@ -227,10 +227,10 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_f32, "ff*3f", "t", "gfx8-insts") TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_f32, "ff*0f", "t", "gfx940-insts") -TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2f16, "V2hV2h*0V2h", "t", "gfx940-insts") -TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2bf16, "V2sV2s*0V2s", "t", "gfx940-insts") -TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2bf16, "V2sV2s*1V2s", "t", "gfx940-insts") -TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", "gfx940-insts") +TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2f16, "V2hV2h*0V2h", "t", "atomic-flat-pk-add-16-insts") +TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2bf16, "V2sV2s*0V2s", "t", "atomic-flat-pk-add-16-insts") +TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2bf16, "V2sV2s*1V2s", "t", "atomic-global-pk-add-bf16-inst") +TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", "atomic-ds-pk-add-16-insts") //===----------------------------------------------------------------------===// // Deep learning builtins. diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp --- a/clang/lib/Basic/Targets/AMDGPU.cpp +++ b/clang/lib/Basic/Targets/AMDGPU.cpp @@ -257,9 +257,13 @@ case GK_GFX940: Features["gfx940-insts"] = true; Features["fp8-insts"] = true; + Features["atomic-ds-pk-add-16-insts"] = true; + Features["atomic-flat-pk-add-16-insts"] = true; + Features["atomic-global-pk-add-bf16-inst"] = true; [[fallthrough]]; case GK_GFX90A: Features["gfx90a-insts"] = true; + Features["atomic-buffer-global-pk-add-f16-insts"] = true; [[fallthrough]]; case GK_GFX908: Features["dot3-insts"] = true; diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -72,9 +72,9 @@ // GFX906: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" // GFX908: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" // GFX909: "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" -// GFX90A: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" +// GFX90A: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" // GFX90C: "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" -// GFX940: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" +// GFX940: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" // GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" // GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" // GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx90a-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx11-err.cl copy from clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx90a-err.cl copy to clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx11-err.cl --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx90a-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx11-err.cl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \ +// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1100 \ // RUN: -verify -S -o - %s // REQUIRES: amdgpu-registered-target @@ -6,12 +6,13 @@ typedef half __attribute__((ext_vector_type(2))) half2; typedef short __attribute__((ext_vector_type(2))) short2; -void test_atomic_fadd(__global half2 *addrh2, half2 xh2, + +void test_atomic_fadd(__global half2 *addrh2, __local half2 *addrh2l, half2 xh2, __global short2 *addrs2, __local short2 *addrs2l, short2 xs2, __global float *addrf, float xf) { - __builtin_amdgcn_flat_atomic_fadd_f32(addrf, xf); // expected-error{{'__builtin_amdgcn_flat_atomic_fadd_f32' needs target feature gfx940-insts}} - __builtin_amdgcn_flat_atomic_fadd_v2f16(addrh2, xh2); // expected-error{{'__builtin_amdgcn_flat_atomic_fadd_v2f16' needs target feature gfx940-insts}} - __builtin_amdgcn_flat_atomic_fadd_v2bf16(addrs2, xs2); // expected-error{{'__builtin_amdgcn_flat_atomic_fadd_v2bf16' needs target feature gfx940-insts}} - __builtin_amdgcn_global_atomic_fadd_v2bf16(addrs2, xs2); // expected-error{{'__builtin_amdgcn_global_atomic_fadd_v2bf16' needs target feature gfx940-insts}} - __builtin_amdgcn_ds_atomic_fadd_v2bf16(addrs2l, xs2); // expected-error{{'__builtin_amdgcn_ds_atomic_fadd_v2bf16' needs target feature gfx940-insts}} + __builtin_amdgcn_flat_atomic_fadd_v2f16(addrh2, xh2); // expected-error{{'__builtin_amdgcn_flat_atomic_fadd_v2f16' needs target feature atomic-flat-pk-add-16-insts}} + __builtin_amdgcn_flat_atomic_fadd_v2bf16(addrs2, xs2); // expected-error{{'__builtin_amdgcn_flat_atomic_fadd_v2bf16' needs target feature atomic-flat-pk-add-16-insts}} + __builtin_amdgcn_global_atomic_fadd_v2bf16(addrs2, xs2); // expected-error{{'__builtin_amdgcn_global_atomic_fadd_v2bf16' needs target feature atomic-global-pk-add-bf16-inst}} + __builtin_amdgcn_global_atomic_fadd_v2f16(addrh2, xh2); // expected-error{{'__builtin_amdgcn_global_atomic_fadd_v2f16' needs target feature atomic-buffer-global-pk-add-f16-insts}} + __builtin_amdgcn_ds_atomic_fadd_v2bf16(addrs2l, xs2); // expected-error{{'__builtin_amdgcn_ds_atomic_fadd_v2bf16' needs target feature atomic-ds-pk-add-16-insts}} } diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx908-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx908-err.cl --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx908-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx908-err.cl @@ -10,7 +10,7 @@ half2 *half_rtn; float *fp_rtn; double *rtn; - *half_rtn = __builtin_amdgcn_global_atomic_fadd_v2f16(addrh2, xh2); // expected-error{{'__builtin_amdgcn_global_atomic_fadd_v2f16' needs target feature gfx90a-insts}} + *half_rtn = __builtin_amdgcn_global_atomic_fadd_v2f16(addrh2, xh2); // expected-error{{'__builtin_amdgcn_global_atomic_fadd_v2f16' needs target feature atomic-buffer-global-pk-add-f16-insts}} *fp_rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x); // expected-error{{'__builtin_amdgcn_global_atomic_fadd_f32' needs target feature gfx90a-insts}} *rtn = __builtin_amdgcn_global_atomic_fadd_f64(addr, x); // expected-error{{'__builtin_amdgcn_global_atomic_fadd_f64' needs target feature gfx90a-insts}} *rtn = __builtin_amdgcn_global_atomic_fmax_f64(addr, x); // expected-error{{'__builtin_amdgcn_global_atomic_fmax_f64' needs target feature gfx90a-insts}} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx90a-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx90a-err.cl --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx90a-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx90a-err.cl @@ -10,8 +10,8 @@ __global short2 *addrs2, __local short2 *addrs2l, short2 xs2, __global float *addrf, float xf) { __builtin_amdgcn_flat_atomic_fadd_f32(addrf, xf); // expected-error{{'__builtin_amdgcn_flat_atomic_fadd_f32' needs target feature gfx940-insts}} - __builtin_amdgcn_flat_atomic_fadd_v2f16(addrh2, xh2); // expected-error{{'__builtin_amdgcn_flat_atomic_fadd_v2f16' needs target feature gfx940-insts}} - __builtin_amdgcn_flat_atomic_fadd_v2bf16(addrs2, xs2); // expected-error{{'__builtin_amdgcn_flat_atomic_fadd_v2bf16' needs target feature gfx940-insts}} - __builtin_amdgcn_global_atomic_fadd_v2bf16(addrs2, xs2); // expected-error{{'__builtin_amdgcn_global_atomic_fadd_v2bf16' needs target feature gfx940-insts}} - __builtin_amdgcn_ds_atomic_fadd_v2bf16(addrs2l, xs2); // expected-error{{'__builtin_amdgcn_ds_atomic_fadd_v2bf16' needs target feature gfx940-insts}} + __builtin_amdgcn_flat_atomic_fadd_v2f16(addrh2, xh2); // expected-error{{'__builtin_amdgcn_flat_atomic_fadd_v2f16' needs target feature atomic-flat-pk-add-16-insts}} + __builtin_amdgcn_flat_atomic_fadd_v2bf16(addrs2, xs2); // expected-error{{'__builtin_amdgcn_flat_atomic_fadd_v2bf16' needs target feature atomic-flat-pk-add-16-insts}} + __builtin_amdgcn_global_atomic_fadd_v2bf16(addrs2, xs2); // expected-error{{'__builtin_amdgcn_global_atomic_fadd_v2bf16' needs target feature atomic-global-pk-add-bf16-inst}} + __builtin_amdgcn_ds_atomic_fadd_v2bf16(addrs2l, xs2); // expected-error{{'__builtin_amdgcn_ds_atomic_fadd_v2bf16' needs target feature atomic-ds-pk-add-16-insts}} } diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -622,6 +622,19 @@ "Has v_pk_fmac_f16 instruction" >; +def FeatureAtomicDsPkAdd16Insts : SubtargetFeature<"atomic-ds-pk-add-16-insts", + "HasAtomicDsPkAdd16Insts", + "true", + "Has ds_pk_add_bf16, ds_pk_add_f16, ds_pk_add_rtn_bf16, " + "ds_pk_add_rtn_f16 instructions" +>; + +def FeatureAtomicFlatPkAdd16Insts : SubtargetFeature<"atomic-flat-pk-add-16-insts", + "HasAtomicFlatPkAdd16Insts", + "true", + "Has flat_atomic_pk_add_f16 and flat_atomic_pk_add_bf16 instructions" +>; + def FeatureAtomicFaddRtnInsts : SubtargetFeature<"atomic-fadd-rtn-insts", "HasAtomicFaddRtnInsts", "true", @@ -638,15 +651,30 @@ [FeatureFlatGlobalInsts] >; -def FeatureAtomicPkFaddNoRtnInsts - : SubtargetFeature<"atomic-pk-fadd-no-rtn-insts", - "HasAtomicPkFaddNoRtnInsts", +def FeatureAtomicBufferGlobalPkAddF16NoRtnInsts + : SubtargetFeature<"atomic-buffer-global-pk-add-f16-no-rtn-insts", + "HasAtomicBufferGlobalPkAddF16NoRtnInsts", "true", "Has buffer_atomic_pk_add_f16 and global_atomic_pk_add_f16 instructions that " "don't return original value", [FeatureFlatGlobalInsts] >; +def FeatureAtomicBufferGlobalPkAddF16Insts : SubtargetFeature<"atomic-buffer-global-pk-add-f16-insts", + "HasAtomicBufferGlobalPkAddF16Insts", + "true", + "Has buffer_atomic_pk_add_f16 and global_atomic_pk_add_f16 instructions that " + "can return original value", + [FeatureFlatGlobalInsts] +>; + +def FeatureAtomicGlobalPkAddBF16Inst : SubtargetFeature<"atomic-global-pk-add-bf16-inst", + "HasAtomicGlobalPkAddBF16Inst", + "true", + "Has global_atomic_pk_add_bf16 instruction", + [FeatureFlatGlobalInsts] +>; + def FeatureFlatAtomicFaddF32Inst : SubtargetFeature<"flat-atomic-fadd-f32-inst", "HasFlatAtomicFaddF32Inst", @@ -1111,7 +1139,7 @@ FeatureMAIInsts, FeaturePkFmacF16Inst, FeatureAtomicFaddNoRtnInsts, - FeatureAtomicPkFaddNoRtnInsts, + FeatureAtomicBufferGlobalPkAddF16NoRtnInsts, FeatureSupportsSRAMECC, FeatureMFMAInlineLiteralBug, FeatureImageGather4D16Bug]>; @@ -1147,7 +1175,7 @@ FeaturePkFmacF16Inst, FeatureAtomicFaddRtnInsts, FeatureAtomicFaddNoRtnInsts, - FeatureAtomicPkFaddNoRtnInsts, + FeatureAtomicBufferGlobalPkAddF16Insts, FeatureImageInsts, FeatureMadMacF32Insts, FeatureSupportsSRAMECC, @@ -1181,6 +1209,8 @@ FeatureDot6Insts, FeatureDot7Insts, FeatureDot10Insts, + FeatureAtomicDsPkAdd16Insts, + FeatureAtomicFlatPkAdd16Insts, Feature64BitDPP, FeaturePackedFP32Ops, FeatureMAIInsts, @@ -1188,7 +1218,8 @@ FeaturePkFmacF16Inst, FeatureAtomicFaddRtnInsts, FeatureAtomicFaddNoRtnInsts, - FeatureAtomicPkFaddNoRtnInsts, + FeatureAtomicBufferGlobalPkAddF16Insts, + FeatureAtomicGlobalPkAddBF16Inst, FeatureFlatAtomicFaddF32Inst, FeatureSupportsSRAMECC, FeaturePackedTID, @@ -1804,13 +1835,25 @@ def HasFmaLegacy32 : Predicate<"Subtarget->hasGFX10_3Insts()">, AssemblerPredicate<(any_of FeatureGFX10_3Insts)>; +def HasAtomicDsPkAdd16Insts : Predicate<"Subtarget->hasAtomicDsPkAdd16Insts()">, + AssemblerPredicate<(any_of FeatureAtomicDsPkAdd16Insts)>; + +def HasAtomicFlatPkAdd16Insts : Predicate<"Subtarget->hasAtomicFlatPkAdd16Insts()">, + AssemblerPredicate<(any_of FeatureAtomicFlatPkAdd16Insts)>; + def HasAtomicFaddRtnInsts : Predicate<"Subtarget->hasAtomicFaddRtnInsts()">, AssemblerPredicate<(all_of FeatureAtomicFaddRtnInsts)>; def HasAtomicFaddNoRtnInsts : Predicate<"Subtarget->hasAtomicFaddNoRtnInsts()">, AssemblerPredicate<(all_of FeatureAtomicFaddNoRtnInsts)>; -def HasAtomicPkFaddNoRtnInsts - : Predicate<"Subtarget->hasAtomicPkFaddNoRtnInsts()">, - AssemblerPredicate<(all_of FeatureAtomicPkFaddNoRtnInsts)>; +def HasAtomicBufferGlobalPkAddF16NoRtnInsts + : Predicate<"Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() || Subtarget->hasAtomicBufferGlobalPkAddF16Insts()">, + AssemblerPredicate<(any_of FeatureAtomicBufferGlobalPkAddF16NoRtnInsts, FeatureAtomicBufferGlobalPkAddF16Insts)>; +def HasAtomicBufferGlobalPkAddF16Insts + : Predicate<"Subtarget->hasAtomicBufferGlobalPkAddF16Insts()">, + AssemblerPredicate<(all_of FeatureAtomicBufferGlobalPkAddF16Insts)>; +def HasAtomicGlobalPkAddBF16Inst + : Predicate<"Subtarget->hasAtomicGlobalPkAddBF16Inst()">, + AssemblerPredicate<(all_of FeatureAtomicGlobalPkAddBF16Inst)>; def HasFlatAtomicFaddF32Inst : Predicate<"Subtarget->hasFlatAtomicFaddF32Inst()">, AssemblerPredicate<(all_of FeatureFlatAtomicFaddF32Inst)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1353,7 +1353,7 @@ Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); if (ST.hasGFX90AInsts()) Atomic.legalFor({{S64, LocalPtr}}); - if (ST.hasGFX940Insts()) + if (ST.hasAtomicDsPkAdd16Insts()) Atomic.legalFor({{V2S16, LocalPtr}}); } if (ST.hasAtomicFaddInsts()) diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1111,7 +1111,7 @@ "buffer_atomic_add_f32", VGPR_32, f32 >; -let SubtargetPredicate = HasAtomicPkFaddNoRtnInsts in +let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16NoRtnInsts in defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN < "buffer_atomic_pk_add_f16", VGPR_32, v2f16 >; @@ -1121,7 +1121,7 @@ "buffer_atomic_add_f32", VGPR_32, f32, null_frag >; -let OtherPredicates = [isGFX90APlus] in +let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN < "buffer_atomic_pk_add_f16", VGPR_32, v2f16, null_frag >; @@ -1604,15 +1604,16 @@ let SubtargetPredicate = HasAtomicFaddNoRtnInsts in defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["noret"]>; -let SubtargetPredicate = HasAtomicPkFaddNoRtnInsts in +let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16NoRtnInsts in defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["noret"]>; let SubtargetPredicate = HasAtomicFaddRtnInsts in defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["ret"]>; -let SubtargetPredicate = isGFX90APlus in { - defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["ret"]>; +let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts in +defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["ret"]>; +let SubtargetPredicate = isGFX90APlus in { defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f64, "BUFFER_ATOMIC_ADD_F64">; defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, "BUFFER_ATOMIC_MIN_F64">; defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f64, "BUFFER_ATOMIC_MAX_F64">; @@ -2884,11 +2885,11 @@ def BUFFER_WBINVL1_VOL_vi : MUBUF_Real_vi <0x3f, BUFFER_WBINVL1_VOL>; } // End AssemblerPredicate = isGFX8GFX9 -let SubtargetPredicate = HasAtomicFaddNoRtnInsts in { -defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Real_Atomic_vi <0x4d>; defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_Atomic_vi <0x4e>; +let SubtargetPredicate = HasAtomicFaddNoRtnInsts in { +defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Real_Atomic_vi <0x4d>; } // End SubtargetPredicate = HasAtomicFaddNoRtnInsts let SubtargetPredicate = isGFX90APlus in { diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -494,12 +494,12 @@ defm DS_ADD_RTN_F64 : DS_1A1D_RET_mc_gfx9<"ds_add_rtn_f64", VReg_64, "ds_add_f64">; } // End SubtargetPredicate = isGFX90APlus -let SubtargetPredicate = isGFX940Plus in { +let SubtargetPredicate = HasAtomicDsPkAdd16Insts in { defm DS_PK_ADD_F16 : DS_1A1D_NORET_mc_gfx9<"ds_pk_add_f16">; defm DS_PK_ADD_RTN_F16 : DS_1A1D_RET_mc_gfx9<"ds_pk_add_rtn_f16", VGPR_32, "ds_pk_add_f16">; defm DS_PK_ADD_BF16 : DS_1A1D_NORET_mc_gfx9<"ds_pk_add_bf16">; defm DS_PK_ADD_RTN_BF16 : DS_1A1D_RET_mc_gfx9<"ds_pk_add_rtn_bf16", VGPR_32, "ds_pk_add_bf16">; -} // End SubtargetPredicate = isGFX940Plus +} // End SubtargetPredicate = HasAtomicDsPkAdd16Insts defm DS_CMPSTORE_B32 : DS_1A2D_NORET_mc<"ds_cmpstore_b32">; defm DS_CMPSTORE_F32 : DS_1A2D_NORET_mc<"ds_cmpstore_f32">; @@ -1133,7 +1133,7 @@ def : DSAtomicRetPatIntrinsic; } -let SubtargetPredicate = isGFX940Plus in { +let SubtargetPredicate = HasAtomicDsPkAdd16Insts in { def : DSAtomicRetPat; let AddedComplexity = 1 in def : DSAtomicRetPat; @@ -1146,7 +1146,7 @@ (v2i16 (int_amdgcn_ds_fadd_v2bf16_noret i32:$ptr, v2i16:$src)), (DS_PK_ADD_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0) >; -} +} // End SubtargetPredicate = HasAtomicDsPkAdd16Insts def : Pat < (SIds_ordered_count i32:$value, i16:$offset), diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -730,11 +730,13 @@ defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64>; } // End SubtargetPredicate = isGFX90APlus -let SubtargetPredicate = isGFX940Plus in { +let SubtargetPredicate = HasAtomicFlatPkAdd16Insts in { defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_f16", VGPR_32, v2f16>; defm FLAT_ATOMIC_PK_ADD_BF16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_bf16", VGPR_32, v2f16>; +} // End SubtargetPredicate = HasAtomicFlatPkAdd16Insts + +let SubtargetPredicate = HasAtomicGlobalPkAddBF16Inst in defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Atomic_Pseudo<"global_atomic_pk_add_bf16", VGPR_32, v2f16>; -} // End SubtargetPredicate = isGFX940Plus // GFX7-, GFX10-, GFX11-only flat instructions. let SubtargetPredicate = isGFX7GFX10GFX11 in { @@ -938,7 +940,7 @@ defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN < "global_atomic_add_f32", VGPR_32, f32 >; -let OtherPredicates = [HasAtomicPkFaddNoRtnInsts] in +let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN < "global_atomic_pk_add_f16", VGPR_32, v2f16 >; @@ -946,7 +948,7 @@ defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_RTN < "global_atomic_add_f32", VGPR_32, f32 >; -let OtherPredicates = [isGFX90APlus] in +let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_RTN < "global_atomic_pk_add_f16", VGPR_32, v2f16 >; @@ -1505,7 +1507,7 @@ defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", "global_addrspace", f32>; } -let OtherPredicates = [HasAtomicPkFaddNoRtnInsts] in { +let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in { defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>; defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>; } @@ -1516,14 +1518,17 @@ defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", "global_addrspace", f32>; } +let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in { +defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>; +defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>; +} + let OtherPredicates = [isGFX90APlus] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", "atomic_load_fmin_global", f64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", "atomic_load_fmax_global", f64>; defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f64>; defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_global_atomic_fadd", "global_addrspace", f64>; -defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>; -defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>; defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MIN_F64", "int_amdgcn_global_atomic_fmin", f64>; defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MAX_F64", "int_amdgcn_global_atomic_fmax", f64>; defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>; @@ -1539,12 +1544,14 @@ defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", "flat_addrspace", f32>; } -let OtherPredicates = [isGFX940Plus] in { +let OtherPredicates = [HasAtomicFlatPkAdd16Insts] in { defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "flat_addrspace", v2f16>; defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_BF16", "int_amdgcn_flat_atomic_fadd_v2bf16", v2i16>; -defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "int_amdgcn_global_atomic_fadd_v2bf16", v2i16>; } +let OtherPredicates = [HasAtomicGlobalPkAddBF16Inst] in +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "int_amdgcn_global_atomic_fadd_v2bf16", v2i16>; + } // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 let OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch] in { diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -152,9 +152,13 @@ bool HasMAIInsts = false; bool HasFP8Insts = false; bool HasPkFmacF16Inst = false; + bool HasAtomicDsPkAdd16Insts = false; + bool HasAtomicFlatPkAdd16Insts = false; bool HasAtomicFaddRtnInsts = false; bool HasAtomicFaddNoRtnInsts = false; - bool HasAtomicPkFaddNoRtnInsts = false; + bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false; + bool HasAtomicBufferGlobalPkAddF16Insts = false; + bool HasAtomicGlobalPkAddBF16Inst = false; bool HasFlatAtomicFaddF32Inst = false; bool SupportsSRAMECC = false; @@ -758,6 +762,10 @@ return HasPkFmacF16Inst; } + bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; } + + bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; } + bool hasAtomicFaddInsts() const { return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts; } @@ -766,7 +774,17 @@ bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; } - bool hasAtomicPkFaddNoRtnInsts() const { return HasAtomicPkFaddNoRtnInsts; } + bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const { + return HasAtomicBufferGlobalPkAddF16NoRtnInsts; + } + + bool hasAtomicBufferGlobalPkAddF16Insts() const { + return HasAtomicBufferGlobalPkAddF16Insts; + } + + bool hasAtomicGlobalPkAddBF16Inst() const { + return HasAtomicGlobalPkAddBF16Inst; + } bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }