[AMDGPU] Select FLAT atomic fmin/fmax; restrict f64 fmin/fmax patterns to GFX10

Summary of this patch:
  * FLATInstructions.td: under isGFX10Plus, add FLAT_ATOMIC_FMIN/FMAX f32
    patterns for the atomic_load_fmin_flat/atomic_load_fmax_flat nodes and for
    the int_amdgcn_flat_atomic_fmin/fmax intrinsics.
  * FLATInstructions.td: move the f64 (_X2) GLOBAL patterns out of the
    isGFX10Plus block into a new isGFX10Only block and add the matching
    FLAT_ATOMIC_FMIN_X2/FMAX_X2 patterns there.
  * Tests: replace fp-min-max-global-atomics-gfx10.ll with separate f32
    (gfx1010 + gfx1100) and f64 (gfx1010 only) tests for the global and flat
    intrinsics, each run with both SelectionDAG and GlobalISel.

Review note: the flat tests pass a plain `ptr` (address space 0), so the
overloaded intrinsic names carry the `.p0.` pointer suffix. The global tests
take `ptr addrspace(1)` and therefore keep `.p1.`.

diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1480,12 +1480,23 @@
 let OtherPredicates = [isGFX10Plus] in {
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>;
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN_X2", "atomic_load_fmin_global", f64>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX_X2", "atomic_load_fmax_global", f64>;
 defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin", f32>;
 defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax", f32>;
+defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMIN", "atomic_load_fmin_flat", f32>;
+defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMAX", "atomic_load_fmax_flat", f32>;
+defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin", f32>;
+defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax", f32>;
+}
+
+let OtherPredicates = [isGFX10Only] in {
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN_X2", "atomic_load_fmin_global", f64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX_X2", "atomic_load_fmax_global", f64>;
 defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN_X2", "int_amdgcn_global_atomic_fmin", f64>;
 defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX_X2", "int_amdgcn_global_atomic_fmax", f64>;
+defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMIN_X2", "atomic_load_fmin_flat", f64>;
+defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMAX_X2", "atomic_load_fmax_flat", f64>;
+defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN_X2", "int_amdgcn_flat_atomic_fmin", f64>;
+defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX_X2", "int_amdgcn_flat_atomic_fmax", f64>;
 }
 
 let OtherPredicates = [HasAtomicFaddNoRtnInsts] in {
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-flat-atomics-f64.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-flat-atomics-f64.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-flat-atomics-f64.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -global-isel=0 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10,GFX10-SDAG
+; RUN: llc < %s -global-isel=1 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10,GFX10-GISEL
+
+declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data)
+declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data)
+
+define amdgpu_cs void @flat_atomic_fmin_f64_noret(ptr %ptr, double %data) {
+; GFX10-LABEL: flat_atomic_fmin_f64_noret:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    flat_atomic_fmin_x2 v[0:1], v[2:3]
+; GFX10-NEXT:    s_endpgm
+  %ret = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data)
+  ret void
+}
+
+define amdgpu_cs void @flat_atomic_fmax_f64_noret(ptr %ptr, double %data) {
+; GFX10-LABEL: flat_atomic_fmax_f64_noret:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    flat_atomic_fmax_x2 v[0:1], v[2:3]
+; GFX10-NEXT:    s_endpgm
+  %ret = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data)
+  ret void
+}
+
+define amdgpu_cs void @flat_atomic_fmin_f64_rtn(ptr %ptr, double %data, ptr %out) {
+; GFX10-LABEL: flat_atomic_fmin_f64_rtn:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
+; GFX10-NEXT:    s_endpgm
+  %ret = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data)
+  store double %ret, ptr %out
+  ret void
+}
+
+define amdgpu_cs void @flat_atomic_fmax_f64_rtn(ptr %ptr, double %data, ptr %out) {
+; GFX10-LABEL: flat_atomic_fmax_f64_rtn:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
+; GFX10-NEXT:    s_endpgm
+  %ret = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data)
+  store double %ret, ptr %out
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10-GISEL: {{.*}}
+; GFX10-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-flat-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-flat-atomics.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-flat-atomics.ll
@@ -0,0 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -global-isel=0 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10,GFX10-SDAG
+; RUN: llc < %s -global-isel=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG
+; RUN: llc < %s -global-isel=1 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10,GFX10-GISEL
+; RUN: llc < %s -global-isel=1 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
+
+declare float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %ptr, float %data)
+declare float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %ptr, float %data)
+
+define amdgpu_cs void @flat_atomic_fmin_f32_noret(ptr %ptr, float %data) {
+; GFX10-LABEL: flat_atomic_fmin_f32_noret:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    flat_atomic_fmin v[0:1], v2
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: flat_atomic_fmin_f32_noret:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    flat_atomic_min_f32 v[0:1], v2
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+  %ret = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %ptr, float %data)
+  ret void
+}
+
+define amdgpu_cs void @flat_atomic_fmax_f32_noret(ptr %ptr, float %data) {
+; GFX10-LABEL: flat_atomic_fmax_f32_noret:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    flat_atomic_fmax v[0:1], v2
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: flat_atomic_fmax_f32_noret:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    flat_atomic_max_f32 v[0:1], v2
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+  %ret = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %ptr, float %data)
+  ret void
+}
+
+define amdgpu_cs float @flat_atomic_fmin_f32_rtn(ptr %ptr, float %data, ptr %out) {
+; GFX10-LABEL: flat_atomic_fmin_f32_rtn:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    flat_atomic_fmin v0, v[0:1], v2 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_store_dword v[3:4], v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: flat_atomic_fmin_f32_rtn:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    flat_atomic_min_f32 v0, v[0:1], v2 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_store_b32 v[3:4], v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    ; return to shader part epilog
+  %ret = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %ptr, float %data)
+  store float %ret, ptr %out
+  ret float %ret
+}
+
+define amdgpu_cs float @flat_atomic_fmax_f32_rtn(ptr %ptr, float %data, ptr %out) {
+; GFX10-LABEL: flat_atomic_fmax_f32_rtn:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    flat_atomic_fmax v0, v[0:1], v2 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_store_dword v[3:4], v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: flat_atomic_fmax_f32_rtn:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    flat_atomic_max_f32 v0, v[0:1], v2 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_store_b32 v[3:4], v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    ; return to shader part epilog
+  %ret = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %ptr, float %data)
+  store float %ret, ptr %out
+  ret float %ret
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10-GISEL: {{.*}}
+; GFX10-SDAG: {{.*}}
+; GFX11-GISEL: {{.*}}
+; GFX11-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics-f64.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics-f64.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics-f64.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -global-isel=0 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10,GFX10-SDAG
+; RUN: llc < %s -global-isel=1 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10,GFX10-GISEL
+
+declare double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
+declare double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
+
+define amdgpu_cs void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, double %data) {
+; GFX10-LABEL: global_atomic_fmin_f64_noret:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v[2:3], off
+; GFX10-NEXT:    s_endpgm
+  %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
+  ret void
+}
+
+define amdgpu_cs void @global_atomic_fmax_f64_noret(ptr addrspace(1) %ptr, double %data) {
+; GFX10-LABEL: global_atomic_fmax_f64_noret:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[2:3], off
+; GFX10-NEXT:    s_endpgm
+  %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
+  ret void
+}
+
+define amdgpu_cs void @global_atomic_fmin_f64_rtn(ptr addrspace(1) %ptr, double %data, ptr addrspace(1) %out) {
+; GFX10-LABEL: global_atomic_fmin_f64_rtn:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dwordx2 v[4:5], v[0:1], off
+; GFX10-NEXT:    s_endpgm
+  %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
+  store double %ret, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @global_atomic_fmax_f64_rtn(ptr addrspace(1) %ptr, double %data, ptr addrspace(1) %out) {
+; GFX10-LABEL: global_atomic_fmax_f64_rtn:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dwordx2 v[4:5], v[0:1], off
+; GFX10-NEXT:    s_endpgm
+  %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
+  store double %ret, ptr addrspace(1) %out
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10-GISEL: {{.*}}
+; GFX10-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics-gfx10.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics-gfx10.ll
deleted file mode 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics-gfx10.ll
+++ /dev/null
@@ -1,197 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefix=GFX10
-
-; RUN: llc < %s -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX10
-
-declare double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
-declare double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
-declare float @llvm.amdgcn.global.atomic.fmin.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
-declare float @llvm.amdgcn.global.atomic.fmax.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
-
-define amdgpu_kernel void @global_atomic_fmin_f32_noret(ptr addrspace(1) %ptr, float %data) {
-; GFX10-LABEL: global_atomic_fmin_f32_noret:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, s4
-; GFX10-NEXT:    global_atomic_fmin v0, v1, s[2:3]
-; GFX10-NEXT:    s_endpgm
-;
-; G_GFX10-LABEL: global_atomic_fmin_f32_noret:
-; G_GFX10:       ; %bb.0: ; %main_body
-; G_GFX10-NEXT:    s_clause 0x1
-; G_GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; G_GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; G_GFX10-NEXT:    global_atomic_fmin v1, v0, s[2:3]
-; G_GFX10-NEXT:    s_endpgm
-main_body:
-  %ret = call float @llvm.amdgcn.global.atomic.fmin.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
-  ret void
-}
-
-define amdgpu_kernel void @global_atomic_fmax_f32_noret(ptr addrspace(1) %ptr, float %data) {
-; GFX10-LABEL: global_atomic_fmax_f32_noret:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, s4
-; GFX10-NEXT:    global_atomic_fmax v0, v1, s[2:3]
-; GFX10-NEXT:    s_endpgm
-;
-; G_GFX10-LABEL: global_atomic_fmax_f32_noret:
-; G_GFX10:       ; %bb.0: ; %main_body
-; G_GFX10-NEXT:    s_clause 0x1
-; G_GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; G_GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; G_GFX10-NEXT:    global_atomic_fmax v1, v0, s[2:3]
-; G_GFX10-NEXT:    s_endpgm
-main_body:
-  %ret = call float @llvm.amdgcn.global.atomic.fmax.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
-  ret void
-}
-
-define float @global_atomic_fmax_f32_rtn(ptr addrspace(1) %ptr, float %data) {
-; GFX10-LABEL: global_atomic_fmax_f32_rtn:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmax v0, v[0:1], v2, off glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; G_GFX10-LABEL: global_atomic_fmax_f32_rtn:
-; G_GFX10:       ; %bb.0: ; %main_body
-; G_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; G_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; G_GFX10-NEXT:    global_atomic_fmax v0, v[0:1], v2, off glc
-; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX10-NEXT:    s_setpc_b64 s[30:31]
-main_body:
-  %ret = call float @llvm.amdgcn.global.atomic.fmax.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
-  ret float %ret
-}
-
-define float @global_atomic_fmin_f32_rtn(ptr addrspace(1) %ptr, float %data) {
-; GFX10-LABEL: global_atomic_fmin_f32_rtn:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmin v0, v[0:1], v2, off glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; G_GFX10-LABEL: global_atomic_fmin_f32_rtn:
-; G_GFX10:       ; %bb.0: ; %main_body
-; G_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; G_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; G_GFX10-NEXT:    global_atomic_fmin v0, v[0:1], v2, off glc
-; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX10-NEXT:    s_setpc_b64 s[30:31]
-main_body:
-  %ret = call float @llvm.amdgcn.global.atomic.fmin.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
-  ret float %ret
-}
-
-define amdgpu_kernel void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, double %data) {
-; GFX10-LABEL: global_atomic_fmin_f64_noret:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-NEXT:    global_atomic_fmin_x2 v2, v[0:1], s[0:1]
-; GFX10-NEXT:    s_endpgm
-;
-; G_GFX10-LABEL: global_atomic_fmin_f64_noret:
-; G_GFX10:       ; %bb.0: ; %main_body
-; G_GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; G_GFX10-NEXT:    v_mov_b32_e32 v2, 0
-; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, s3
-; G_GFX10-NEXT:    global_atomic_fmin_x2 v2, v[0:1], s[0:1]
-; G_GFX10-NEXT:    s_endpgm
-main_body:
-  %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
-  ret void
-}
-
-define amdgpu_kernel void @global_atomic_fmax_f64_noret(ptr addrspace(1) %ptr, double %data) {
-; GFX10-LABEL: global_atomic_fmax_f64_noret:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-NEXT:    global_atomic_fmax_x2 v2, v[0:1], s[0:1]
-; GFX10-NEXT:    s_endpgm
-;
-; G_GFX10-LABEL: global_atomic_fmax_f64_noret:
-; G_GFX10:       ; %bb.0: ; %main_body
-; G_GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; G_GFX10-NEXT:    v_mov_b32_e32 v2, 0
-; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, s3
-; G_GFX10-NEXT:    global_atomic_fmax_x2 v2, v[0:1], s[0:1]
-; G_GFX10-NEXT:    s_endpgm
-main_body:
-  %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
-  ret void
-}
-
-define double @global_atomic_fmax_f64_rtn(ptr addrspace(1) %ptr, double %data) {
-; GFX10-LABEL: global_atomic_fmax_f64_rtn:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; G_GFX10-LABEL: global_atomic_fmax_f64_rtn:
-; G_GFX10:       ; %bb.0: ; %main_body
-; G_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; G_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; G_GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc
-; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX10-NEXT:    s_setpc_b64 s[30:31]
-main_body:
-  %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
-  ret double %ret
-}
-
-define double @global_atomic_fmin_f64_rtn(ptr addrspace(1) %ptr, double %data) {
-; GFX10-LABEL: global_atomic_fmin_f64_rtn:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; G_GFX10-LABEL: global_atomic_fmin_f64_rtn:
-; G_GFX10:       ; %bb.0: ; %main_body
-; G_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; G_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; G_GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc
-; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX10-NEXT:    s_setpc_b64 s[30:31]
-main_body:
-  %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
-  ret double %ret
-}
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics.ll
@@ -0,0 +1,83 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -global-isel=0 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10,GFX10-SDAG
+; RUN: llc < %s -global-isel=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG
+; RUN: llc < %s -global-isel=1 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10,GFX10-GISEL
+; RUN: llc < %s -global-isel=1 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
+
+declare float @llvm.amdgcn.global.atomic.fmin.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
+declare float @llvm.amdgcn.global.atomic.fmax.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
+
+define amdgpu_cs void @global_atomic_fmin_f32_noret(ptr addrspace(1) %ptr, float %data) {
+; GFX10-LABEL: global_atomic_fmin_f32_noret:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    global_atomic_fmin v[0:1], v2, off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: global_atomic_fmin_f32_noret:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    global_atomic_min_f32 v[0:1], v2, off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+  %ret = call float @llvm.amdgcn.global.atomic.fmin.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
+  ret void
+}
+
+define amdgpu_cs void @global_atomic_fmax_f32_noret(ptr addrspace(1) %ptr, float %data) {
+; GFX10-LABEL: global_atomic_fmax_f32_noret:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    global_atomic_fmax v[0:1], v2, off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: global_atomic_fmax_f32_noret:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    global_atomic_max_f32 v[0:1], v2, off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+  %ret = call float @llvm.amdgcn.global.atomic.fmax.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
+  ret void
+}
+
+define amdgpu_cs void @global_atomic_fmax_f32_rtn(ptr addrspace(1) %ptr, float %data, ptr addrspace(1) %out) {
+; GFX10-LABEL: global_atomic_fmax_f32_rtn:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    global_atomic_fmax v0, v[0:1], v2, off glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dword v[3:4], v0, off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: global_atomic_fmax_f32_rtn:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    global_atomic_max_f32 v0, v[0:1], v2, off glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+  %ret = call float @llvm.amdgcn.global.atomic.fmax.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
+  store float %ret, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @global_atomic_fmin_f32_rtn(ptr addrspace(1) %ptr, float %data, ptr addrspace(1) %out) {
+; GFX10-LABEL: global_atomic_fmin_f32_rtn:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    global_atomic_fmin v0, v[0:1], v2, off glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dword v[3:4], v0, off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: global_atomic_fmin_f32_rtn:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    global_atomic_min_f32 v0, v[0:1], v2, off glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+  %ret = call float @llvm.amdgcn.global.atomic.fmin.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
+  store float %ret, ptr addrspace(1) %out
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10-GISEL: {{.*}}
+; GFX10-SDAG: {{.*}}
+; GFX11-GISEL: {{.*}}
+; GFX11-SDAG: {{.*}}