Index: clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
===================================================================
--- clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
+++ clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
@@ -1,6 +1,9 @@
 // RUN: %clang_cc1 -no-opaque-pointers -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \
 // RUN:   %s -S -emit-llvm -o - | FileCheck %s -check-prefix=CHECK
 
+// RUN: %clang_cc1 -no-opaque-pointers -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \
+// RUN:   -S -o - %s | FileCheck -check-prefix=GFX90A %s
+
 // REQUIRES: amdgpu-registered-target
 
 typedef half __attribute__((ext_vector_type(2))) half2;
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -580,6 +580,22 @@
     }
 }
+multiclass local_addr_space_atomic_op {
+  def "_noret_local_addrspace" :
+    PatFrag<(ops node:$ptr, node:$data),
+            (!cast<SDPatternOperator>(NAME) node:$ptr, node:$data)>{
+      let HasNoUse = true;
+      let AddressSpaces = LoadAddress_local.AddrSpaces;
+      let IsAtomic = 1;
+    }
+  def "_local_addrspace" :
+    PatFrag<(ops node:$ptr, node:$data),
+            (!cast<SDPatternOperator>(NAME) node:$ptr, node:$data)>{
+      let AddressSpaces = LoadAddress_local.AddrSpaces;
+      let IsAtomic = 1;
+    }
+}
+
 defm int_amdgcn_flat_atomic_fadd : flat_addr_space_atomic_op;
 defm int_amdgcn_flat_atomic_fadd_v2bf16 : noret_op;
 defm int_amdgcn_flat_atomic_fmin : noret_op;
 defm int_amdgcn_flat_atomic_fmax : noret_op;
@@ -589,6 +605,7 @@
 defm int_amdgcn_global_atomic_fadd_v2bf16 : noret_op;
 defm int_amdgcn_global_atomic_fmin : noret_op;
 defm int_amdgcn_global_atomic_fmax : noret_op;
+defm int_amdgcn_flat_atomic_fadd : local_addr_space_atomic_op;
 defm int_amdgcn_ds_fadd_v2bf16 : noret_op;
 
 multiclass noret_binary_atomic_op {
Index: llvm/lib/Target/AMDGPU/DSInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/DSInstructions.td
+++ llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1121,6 +1121,16 @@
 def : DSAtomicRetPat;
 let AddedComplexity = 1 in
 def : DSAtomicRetPat;
+
+class DSAtomicRetPatIntrinsic<DS_Pseudo inst, ValueType vt, PatFrag frag,
+                              bit gds=0> : GCNPat <
+  (vt (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$value)),
+  (inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds))> {
+}
+
+def : DSAtomicRetPatIntrinsic<DS_ADD_RTN_F64, f64, int_amdgcn_flat_atomic_fadd_local_addrspace>;
+let AddedComplexity = 1 in
+def : DSAtomicRetPatIntrinsic<DS_ADD_F64, f64, int_amdgcn_flat_atomic_fadd_noret_local_addrspace>;
 }
 
 let SubtargetPredicate = isGFX940Plus in {
Index: llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -12,6 +12,7 @@
 declare double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
 declare double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
 declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* %ptr, double %data)
+declare double @llvm.amdgcn.flat.atomic.fadd.f64.p3f64.f64(double addrspace(3)* %ptr, double %data)
 declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0f64.f64(double* %ptr, double %data)
 declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0f64.f64(double* %ptr, double %data)
 declare double @llvm.amdgcn.ds.fadd.f64(double addrspace(3)* nocapture, double, i32, i32, i1)
@@ -952,6 +953,35 @@
   ret double %ret
 }
 
+define amdgpu_kernel void @local_atomic_fadd_f64_noret_from_flat_intrinsic(double addrspace(3)* %ptr, double %data) {
+; GFX90A-LABEL: local_atomic_fadd_f64_noret_from_flat_intrinsic:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    ds_add_f64 v2, v[0:1]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p3f64.f64(double addrspace(3)* %ptr, double %data)
+  ret void
+}
+
+define double @local_atomic_fadd_f64_rtn_from_flat_intrinsic(double addrspace(3)* %ptr, double %data) {
+; GFX90A-LABEL: local_atomic_fadd_f64_rtn_from_flat_intrinsic:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
+; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
+; GFX90A-NEXT:    ds_add_rtn_f64 v[0:1], v0, v[2:3]
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+main_body:
+  %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p3f64.f64(double addrspace(3)* %ptr, double %data)
+  ret double %ret
+}
+
 define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(double addrspace(3)* %ptr) #1 {
 ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat:
 ; GFX90A:       ; %bb.0: ; %main_body