diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -11949,9 +11949,15 @@ .getValueAsString() != "true") return AtomicExpansionKind::CmpXChg; - if (Subtarget->hasGFX90AInsts()) + if (Subtarget->hasGFX90AInsts()) { + auto SSID = RMW->getSyncScopeID(); + if (SSID == SyncScope::System || + SSID == RMW->getContext().getOrInsertSyncScopeID("one-as")) + return AtomicExpansionKind::CmpXChg; + return (Ty->isFloatTy() && AS == AMDGPUAS::FLAT_ADDRESS) ? AtomicExpansionKind::CmpXChg : AtomicExpansionKind::None; + } if (!Subtarget->hasGFX90AInsts() && AS != AMDGPUAS::GLOBAL_ADDRESS) return AtomicExpansionKind::CmpXChg; diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -415,19 +415,80 @@ ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: BB24_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc scc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NEXT: s_cbranch_execnz BB24_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_endpgm +main_body: + %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 seq_cst + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(double addrspace(1)* %ptr) #1 { +; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: s_endpgm +main_body: + %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 syncscope("agent") seq_cst + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(double addrspace(1)* %ptr) #1 { +; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_system: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: BB26_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc scc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NEXT: s_cbranch_execnz BB26_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm main_body: - %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 seq_cst + %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 syncscope("one-as") seq_cst ret void } @@ -440,7 +501,7 @@ ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: BB25_1: ; %atomicrmw.start +; GFX90A-NEXT: BB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 @@ -455,11 +516,11 @@ ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz BB25_1 +; GFX90A-NEXT: s_cbranch_execnz BB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm main_body: - %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 seq_cst + %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 syncscope("agent") seq_cst ret void } @@ -479,18 +540,79 @@ ; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: BB29_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc scc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz BB29_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +main_body: + %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 seq_cst + ret double %ret +} + +define double @global_atomic_fadd_f64_rtn_pat_agent(double addrspace(1)* %ptr, double %data) #1 { +; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat_agent: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: s_setpc_b64 s[30:31] +main_body: + %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 syncscope("agent") seq_cst + ret double %ret +} + +define double @global_atomic_fadd_f64_rtn_pat_system(double addrspace(1)* %ptr, double %data) #1 { +; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat_system: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: BB31_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc scc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc scc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz BB31_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] main_body: - %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 seq_cst + %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 syncscope("one-as") seq_cst ret double %ret } @@ -522,20 +644,81 @@ ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX90A-NEXT: BB34_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc scc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NEXT: s_cbranch_execnz BB34_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_endpgm +main_body: + %ret = atomicrmw fadd double* %ptr, double 4.0 seq_cst + ret void +} + +define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(double* %ptr) #1 { +; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: s_endpgm +main_body: + %ret = atomicrmw fadd double* %ptr, double 4.0 syncscope("agent") seq_cst + ret void +} + +define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(double* %ptr) #1 { +; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_system: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX90A-NEXT: BB36_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] scc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc scc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NEXT: s_cbranch_execnz BB36_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm main_body: - %ret = atomicrmw fadd double* %ptr, double 4.0 seq_cst + %ret = atomicrmw fadd double* %ptr, double 4.0 syncscope("one-as") seq_cst ret void } @@ -543,18 +726,80 @@ ; GFX90A-LABEL: flat_atomic_fadd_f64_rtn_pat: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: BB37_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc scc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz BB37_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +main_body: + %ret = atomicrmw fadd double* %ptr, double 4.0 seq_cst + ret double %ret +} + +define double @flat_atomic_fadd_f64_rtn_pat_agent(double* %ptr) #1 { +; GFX90A-LABEL: flat_atomic_fadd_f64_rtn_pat_agent: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +main_body: + %ret = atomicrmw fadd double* %ptr, double 4.0 syncscope("agent") seq_cst + ret double %ret +} + +define double @flat_atomic_fadd_f64_rtn_pat_system(double* %ptr) #1 { +; GFX90A-LABEL: flat_atomic_fadd_f64_rtn_pat_system: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: BB39_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc scc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc scc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz BB39_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] main_body: - %ret = atomicrmw fadd double* %ptr, double 4.0 seq_cst + %ret = atomicrmw fadd double* %ptr, double 4.0 syncscope("one-as") seq_cst ret double %ret } @@ -696,7 +941,7 @@ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NEXT: ds_read_b64 v[0:1], v0 -; GFX90A-NEXT: BB41_1: ; %atomicrmw.start +; GFX90A-NEXT: BB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], 4.0 @@ -708,7 +953,7 @@ ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz BB41_1 +; GFX90A-NEXT: s_cbranch_execnz BB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm main_body: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll --- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll @@ -59,15 +59,29 @@ ; GFX90A-LABEL: global_atomic_fadd_ret_f32: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX90A-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NEXT: BB0_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v0, v0, v1, s[0:1] glc scc +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc scc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NEXT: s_cbranch_execnz BB0_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX90A-NEXT: global_store_dword v[0:1], v0, off ; GFX90A-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst @@ -156,7 +170,7 @@ ; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX90A-NEXT: global_store_dword v[0:1], v0, off ; GFX90A-NEXT: s_endpgm - %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst + %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst store float %result, float addrspace(1)* undef ret void } @@ -202,15 +216,12 @@ ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0 -; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v0, v1, s[0:1] scc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: global_atomic_add_f32 v0, v1, s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: s_endpgm - %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst + %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst ret void } @@ -289,7 +300,162 @@ ; GFX90A-NEXT: s_cbranch_execnz BB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm - %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst + %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(float addrspace(1)* %ptr) #0 { +; GFX900-LABEL: global_atomic_fadd_ret_f32_agent: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX900-NEXT: s_mov_b64 s[2:3], 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, s4 +; GFX900-NEXT: BB4_1: ; %atomicrmw.start +; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_wbinvl1_vol +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX900-NEXT: s_cbranch_execnz BB4_1 +; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX900-NEXT: global_store_dword v[0:1], v0, off +; GFX900-NEXT: s_endpgm +; +; GFX908-LABEL: global_atomic_fadd_ret_f32_agent: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX908-NEXT: s_mov_b64 s[2:3], 0 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v0, s4 +; GFX908-NEXT: BB4_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, 0 +; GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX908-NEXT: s_cbranch_execnz BB4_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX908-NEXT: global_store_dword v[0:1], v0, off +; GFX908-NEXT: s_endpgm +; +; GFX90A-LABEL: global_atomic_fadd_ret_f32_agent: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_add_f32 v0, v0, v1, s[0:1] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: global_store_dword v[0:1], v0, off +; GFX90A-NEXT: s_endpgm + %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst + store float %result, float addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_ret_f32_system(float addrspace(1)* %ptr) #0 { +; GFX900-LABEL: global_atomic_fadd_ret_f32_system: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX900-NEXT: s_mov_b64 s[2:3], 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, s4 +; GFX900-NEXT: BB5_1: ; %atomicrmw.start +; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_wbinvl1_vol +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX900-NEXT: s_cbranch_execnz BB5_1 +; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX900-NEXT: global_store_dword v[0:1], v0, off +; GFX900-NEXT: s_endpgm +; +; GFX908-LABEL: global_atomic_fadd_ret_f32_system: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX908-NEXT: s_mov_b64 s[2:3], 0 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v0, s4 +; GFX908-NEXT: BB5_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, 0 +; GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX908-NEXT: s_cbranch_execnz BB5_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX908-NEXT: global_store_dword v[0:1], v0, off +; GFX908-NEXT: s_endpgm +; +; GFX90A-LABEL: global_atomic_fadd_ret_f32_system: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NEXT: BB5_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc scc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NEXT: s_cbranch_execnz BB5_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-NEXT: global_store_dword v[0:1], v0, off +; GFX90A-NEXT: s_endpgm + %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("one-as") seq_cst + store float %result, float addrspace(1)* undef ret void } @@ -302,7 +468,7 @@ ; GCN-NEXT: s_load_dword s4, s[0:1], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: BB4_1: ; %atomicrmw.start +; GCN-NEXT: BB6_1: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 @@ -314,12 +480,12 @@ ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GCN-NEXT: s_cbranch_execnz BB4_1 +; GCN-NEXT: s_cbranch_execnz BB6_1 ; GCN-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN-NEXT: s_or_b64 exec, exec, s[2:3] ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_endpgm - %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst + %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst store float %result, float addrspace(1)* undef ret void } @@ -335,7 +501,7 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_wbinvl1_vol ; GCN-NEXT: s_endpgm - %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst + %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst ret void }