Index: llvm/include/llvm/CodeGen/AtomicExpandUtils.h =================================================================== --- llvm/include/llvm/CodeGen/AtomicExpandUtils.h +++ llvm/include/llvm/CodeGen/AtomicExpandUtils.h @@ -23,7 +23,7 @@ /// /* OUT */ %success, /* OUT */ %new_loaded) using CreateCmpXchgInstFun = function_ref &, Value *, Value *, Value *, Align, - AtomicOrdering, Value *&, Value *&)>; + AtomicOrdering, SyncScope::ID, Value *&, Value *&)>; /// Expand an atomic RMW instruction into a loop utilizing /// cmpxchg. You'll want to make sure your target machine likes cmpxchg Index: llvm/lib/CodeGen/AtomicExpandPass.cpp =================================================================== --- llvm/lib/CodeGen/AtomicExpandPass.cpp +++ llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -97,7 +97,7 @@ AtomicCmpXchgInst *convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI); static Value *insertRMWCmpXchgLoop( IRBuilder<> &Builder, Type *ResultType, Value *Addr, Align AddrAlign, - AtomicOrdering MemOpOrder, + AtomicOrdering MemOpOrder, SyncScope::ID SSID, function_ref &, Value *)> PerformOp, CreateCmpXchgInstFun CreateCmpXchg); bool tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI); @@ -467,8 +467,8 @@ static void createCmpXchgInstFun(IRBuilder<> &Builder, Value *Addr, Value *Loaded, Value *NewVal, Align AddrAlign, - AtomicOrdering MemOpOrder, Value *&Success, - Value *&NewLoaded) { + AtomicOrdering MemOpOrder, SyncScope::ID SSID, + Value *&Success, Value *&NewLoaded) { Type *OrigTy = NewVal->getType(); // This code can go away when cmpxchg supports FP types. @@ -483,7 +483,7 @@ Value *Pair = Builder.CreateAtomicCmpXchg( Addr, Loaded, NewVal, AddrAlign, MemOpOrder, - AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder)); + AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder), SSID); Success = Builder.CreateExtractValue(Pair, 1, "success"); NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded"); @@ -768,6 +768,7 @@ void AtomicExpand::expandPartwordAtomicRMW( AtomicRMWInst *AI, TargetLoweringBase::AtomicExpansionKind ExpansionKind) { AtomicOrdering MemOpOrder = AI->getOrdering(); + SyncScope::ID SSID = AI->getSyncScopeID(); IRBuilder<> Builder(AI); @@ -788,7 +789,8 @@ if (ExpansionKind == TargetLoweringBase::AtomicExpansionKind::CmpXChg) { OldResult = insertRMWCmpXchgLoop(Builder, PMV.WordType, PMV.AlignedAddr, PMV.AlignedAddrAlignment, MemOpOrder, - PerformPartwordOp, createCmpXchgInstFun); + SSID, PerformPartwordOp, + createCmpXchgInstFun); } else { assert(ExpansionKind == TargetLoweringBase::AtomicExpansionKind::LLSC); OldResult = insertRMWLLSCLoop(Builder, PMV.WordType, PMV.AlignedAddr, @@ -1392,7 +1394,7 @@ Value *AtomicExpand::insertRMWCmpXchgLoop( IRBuilder<> &Builder, Type *ResultTy, Value *Addr, Align AddrAlign, - AtomicOrdering MemOpOrder, + AtomicOrdering MemOpOrder, SyncScope::ID SSID, function_ref &, Value *)> PerformOp, CreateCmpXchgInstFun CreateCmpXchg) { LLVMContext &Ctx = Builder.getContext(); @@ -1440,7 +1442,7 @@ MemOpOrder == AtomicOrdering::Unordered ? AtomicOrdering::Monotonic : MemOpOrder, - Success, NewLoaded); + SSID, Success, NewLoaded); assert(Success && NewLoaded); Loaded->addIncoming(NewLoaded, LoopBB); @@ -1477,7 +1479,7 @@ IRBuilder<> Builder(AI); Value *Loaded = AtomicExpand::insertRMWCmpXchgLoop( Builder, AI->getType(), AI->getPointerOperand(), AI->getAlign(), - AI->getOrdering(), + AI->getOrdering(), AI->getSyncScopeID(), [&](IRBuilder<> &Builder, Value *Loaded) { return performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand()); @@ -1628,11 +1630,11 @@ expandAtomicRMWToCmpXchg( I, [this](IRBuilder<> &Builder, Value *Addr, Value *Loaded, Value *NewVal, Align Alignment, AtomicOrdering MemOpOrder, - Value *&Success, Value *&NewLoaded) { + SyncScope::ID SSID, Value *&Success, Value *&NewLoaded) { // Create the CAS instruction normally... AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg( Addr, Loaded, NewVal, Alignment, MemOpOrder, - AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder)); + AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder), SSID); Success = Builder.CreateExtractValue(Pair, 1, "success"); NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded"); Index: llvm/test/CodeGen/AMDGPU/expand-atomicrmw-syncscope.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/expand-atomicrmw-syncscope.ll @@ -0,0 +1,76 @@ +; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; Check that syncscope it copied from atomicrmw to cmpxchg during expansion. +; There should be no scc unless we have system scope. + +; GCN-LABEL: {{^}}expand_atomicrmw_agent: +; GCN: global_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9:]+}}], v[{{[0-9:]+}}], off glc{{$}} +define void @expand_atomicrmw_agent(float addrspace(1)* nocapture %arg) { +entry: + %ret = atomicrmw fadd float addrspace(1)* %arg, float 1.000000e+00 syncscope("agent") monotonic, align 4 + ret void +} + +; GCN-LABEL: {{^}}expand_atomicrmw_workgroup: +; GCN: global_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9:]+}}], v[{{[0-9:]+}}], off glc{{$}} +define void @expand_atomicrmw_workgroup(float addrspace(1)* nocapture %arg) { +entry: + %ret = atomicrmw fadd float addrspace(1)* %arg, float 1.000000e+00 syncscope("workgroup") monotonic, align 4 + ret void +} + +; GCN-LABEL: {{^}}expand_atomicrmw_wavefront: +; GCN: global_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9:]+}}], v[{{[0-9:]+}}], off glc{{$}} +define void @expand_atomicrmw_wavefront(float addrspace(1)* nocapture %arg) { +entry: + %ret = atomicrmw fadd float addrspace(1)* %arg, float 1.000000e+00 syncscope("wavefront") monotonic, align 4 + ret void +} + +; GCN-LABEL: {{^}}expand_atomicrmw_agent_one_as: +; GCN: global_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9:]+}}], v[{{[0-9:]+}}], off glc{{$}} +define void @expand_atomicrmw_agent_one_as(float addrspace(1)* nocapture %arg) { +entry: + %ret = atomicrmw fadd float addrspace(1)* %arg, float 1.000000e+00 syncscope("agent-one-as") monotonic, align 4 + ret void +} + +; GCN-LABEL: {{^}}expand_atomicrmw_workgroup_one_as: +; GCN: global_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9:]+}}], v[{{[0-9:]+}}], off glc{{$}} +define void @expand_atomicrmw_workgroup_one_as(float addrspace(1)* nocapture %arg) { +entry: + %ret = atomicrmw fadd float addrspace(1)* %arg, float 1.000000e+00 syncscope("workgroup-one-as") monotonic, align 4 + ret void +} + +; GCN-LABEL: {{^}}expand_atomicrmw_wavefront_one_as: +; GCN: global_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9:]+}}], v[{{[0-9:]+}}], off glc{{$}} +define void @expand_atomicrmw_wavefront_one_as(float addrspace(1)* nocapture %arg) { +entry: + %ret = atomicrmw fadd float addrspace(1)* %arg, float 1.000000e+00 syncscope("wavefront-one-as") monotonic, align 4 + ret void +} + +; GCN-LABEL: {{^}}expand_atomicrmw_singlethread_one_as: +; GCN: global_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9:]+}}], v[{{[0-9:]+}}], off glc{{$}} +define void @expand_atomicrmw_singlethread_one_as(float addrspace(1)* nocapture %arg) { +entry: + %ret = atomicrmw fadd float addrspace(1)* %arg, float 1.000000e+00 syncscope("singlethread-one-as") monotonic, align 4 + ret void +} + +; GCN-LABEL: {{^}}expand_atomicrmw_one_as: +; GCN: global_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9:]+}}], v[{{[0-9:]+}}], off glc scc{{$}} +define void @expand_atomicrmw_one_as(float addrspace(1)* nocapture %arg) { +entry: + %ret = atomicrmw fadd float addrspace(1)* %arg, float 1.000000e+00 syncscope("one-as") monotonic, align 4 + ret void +} + +; GCN-LABEL: {{^}}expand_atomicrmw_system: +; GCN: global_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9:]+}}], v[{{[0-9:]+}}], off glc scc{{$}} +define void @expand_atomicrmw_system(float addrspace(1)* nocapture %arg) { +entry: + %ret = atomicrmw fadd float addrspace(1)* %arg, float 1.000000e+00 monotonic, align 4 + ret void +} Index: llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -474,7 +474,7 @@ ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc scc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -505,11 +505,8 @@ ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 -; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc scc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -596,7 +593,7 @@ ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc scc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -704,12 +701,13 @@ ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc scc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] @@ -783,12 +781,13 @@ ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc scc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] Index: llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll +++ llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll @@ -184,11 +184,8 @@ ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc scc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -369,11 +366,8 @@ ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc scc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -524,7 +518,7 @@ ; GFX900-NEXT: v_mov_b32_e32 v1, v0 ; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_wbinvl1_vol @@ -550,7 +544,7 @@ ; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v2, 0 ; GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1_vol @@ -577,7 +571,7 @@ ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc scc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -605,7 +599,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) Index: llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll =================================================================== --- llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll +++ llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll @@ -13,7 +13,7 @@ ; CI-NEXT: [[TMP2:%.*]] = bitcast float* [[PTR]] to i32* ; CI-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 ; CI-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; CI-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst +; CI-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 ; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; CI-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -30,7 +30,7 @@ ; GFX9-NEXT: [[TMP2:%.*]] = bitcast float* [[PTR]] to i32* ; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 ; GFX9-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX9-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst +; GFX9-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 ; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX9-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -47,7 +47,7 @@ ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float* [[PTR]] to i32* ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -69,7 +69,7 @@ ; CI-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)* ; CI-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 ; CI-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; CI-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst +; CI-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 ; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; CI-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -86,7 +86,7 @@ ; GFX9-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)* ; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 ; GFX9-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX9-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst +; GFX9-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 ; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX9-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -103,7 +103,7 @@ ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)* ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -125,7 +125,7 @@ ; CI-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)* ; CI-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 ; CI-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; CI-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst +; CI-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 ; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; CI-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -142,7 +142,7 @@ ; GFX9-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)* ; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 ; GFX9-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX9-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst +; GFX9-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 ; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX9-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -159,7 +159,7 @@ ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)* ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -181,7 +181,7 @@ ; CI-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)* ; CI-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 ; CI-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; CI-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst +; CI-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 ; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; CI-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -198,7 +198,7 @@ ; GFX9-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)* ; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 ; GFX9-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX9-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst +; GFX9-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 ; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX9-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -207,7 +207,7 @@ ; GFX9-NEXT: ret void ; ; GFX908-LABEL: @test_atomicrmw_fadd_f32_global_no_use_denorm_flush( -; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd float addrspace(1)* [[PTR:%.*]], float [[VALUE:%.*]] seq_cst +; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd float addrspace(1)* [[PTR:%.*]], float [[VALUE:%.*]] seq_cst, align 4 ; GFX908-NEXT: ret void ; %res = atomicrmw fadd float addrspace(1)* %ptr, float %value seq_cst @@ -224,7 +224,7 @@ ; CI-NEXT: [[TMP2:%.*]] = bitcast float addrspace(3)* [[PTR]] to i32 addrspace(3)* ; CI-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 ; CI-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; CI-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(3)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst +; CI-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(3)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 ; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; CI-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -233,11 +233,11 @@ ; CI-NEXT: ret float [[TMP6]] ; ; GFX9-LABEL: @test_atomicrmw_fadd_f32_local( -; GFX9-NEXT: [[RES:%.*]] = atomicrmw fadd float addrspace(3)* [[PTR:%.*]], float [[VALUE:%.*]] seq_cst +; GFX9-NEXT: [[RES:%.*]] = atomicrmw fadd float addrspace(3)* [[PTR:%.*]], float [[VALUE:%.*]] seq_cst, align 4 ; GFX9-NEXT: ret float [[RES]] ; ; GFX908-LABEL: @test_atomicrmw_fadd_f32_local( -; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd float addrspace(3)* [[PTR:%.*]], float [[VALUE:%.*]] seq_cst +; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd float addrspace(3)* [[PTR:%.*]], float [[VALUE:%.*]] seq_cst, align 4 ; GFX908-NEXT: ret float [[RES]] ; %res = atomicrmw fadd float addrspace(3)* %ptr, float %value seq_cst @@ -246,15 +246,15 @@ define half @test_atomicrmw_fadd_f16_flat(half* %ptr, half %value) { ; CI-LABEL: @test_atomicrmw_fadd_f16_flat( -; CI-NEXT: [[RES:%.*]] = atomicrmw fadd half* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst +; CI-NEXT: [[RES:%.*]] = atomicrmw fadd half* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 2 ; CI-NEXT: ret half [[RES]] ; ; GFX9-LABEL: @test_atomicrmw_fadd_f16_flat( -; GFX9-NEXT: [[RES:%.*]] = atomicrmw fadd half* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst +; GFX9-NEXT: [[RES:%.*]] = atomicrmw fadd half* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 2 ; GFX9-NEXT: ret half [[RES]] ; ; GFX908-LABEL: @test_atomicrmw_fadd_f16_flat( -; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd half* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst +; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd half* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 2 ; GFX908-NEXT: ret half [[RES]] ; %res = atomicrmw fadd half* %ptr, half %value seq_cst @@ -263,15 +263,15 @@ define half @test_atomicrmw_fadd_f16_global(half addrspace(1)* %ptr, half %value) { ; CI-LABEL: @test_atomicrmw_fadd_f16_global( -; CI-NEXT: [[RES:%.*]] = atomicrmw fadd half addrspace(1)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst +; CI-NEXT: [[RES:%.*]] = atomicrmw fadd half addrspace(1)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 2 ; CI-NEXT: ret half [[RES]] ; ; GFX9-LABEL: @test_atomicrmw_fadd_f16_global( -; GFX9-NEXT: [[RES:%.*]] = atomicrmw fadd half addrspace(1)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst +; GFX9-NEXT: [[RES:%.*]] = atomicrmw fadd half addrspace(1)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 2 ; GFX9-NEXT: ret half [[RES]] ; ; GFX908-LABEL: @test_atomicrmw_fadd_f16_global( -; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd half addrspace(1)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst +; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd half addrspace(1)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 2 ; GFX908-NEXT: ret half [[RES]] ; %res = atomicrmw fadd half addrspace(1)* %ptr, half %value seq_cst @@ -280,15 +280,15 @@ define half @test_atomicrmw_fadd_f16_local(half addrspace(3)* %ptr, half %value) { ; CI-LABEL: @test_atomicrmw_fadd_f16_local( -; CI-NEXT: [[RES:%.*]] = atomicrmw fadd half addrspace(3)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst +; CI-NEXT: [[RES:%.*]] = atomicrmw fadd half addrspace(3)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 2 ; CI-NEXT: ret half [[RES]] ; ; GFX9-LABEL: @test_atomicrmw_fadd_f16_local( -; GFX9-NEXT: [[RES:%.*]] = atomicrmw fadd half addrspace(3)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst +; GFX9-NEXT: [[RES:%.*]] = atomicrmw fadd half addrspace(3)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 2 ; GFX9-NEXT: ret half [[RES]] ; ; GFX908-LABEL: @test_atomicrmw_fadd_f16_local( -; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd half addrspace(3)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst +; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd half addrspace(3)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 2 ; GFX908-NEXT: ret half [[RES]] ; %res = atomicrmw fadd half addrspace(3)* %ptr, half %value seq_cst @@ -305,7 +305,7 @@ ; CI-NEXT: [[TMP2:%.*]] = bitcast double* [[PTR]] to i64* ; CI-NEXT: [[TMP3:%.*]] = bitcast double [[NEW]] to i64 ; CI-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; CI-NEXT: [[TMP5:%.*]] = cmpxchg i64* [[TMP2]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst +; CI-NEXT: [[TMP5:%.*]] = cmpxchg i64* [[TMP2]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 ; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; CI-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -322,7 +322,7 @@ ; GFX9-NEXT: [[TMP2:%.*]] = bitcast double* [[PTR]] to i64* ; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[NEW]] to i64 ; GFX9-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX9-NEXT: [[TMP5:%.*]] = cmpxchg i64* [[TMP2]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst +; GFX9-NEXT: [[TMP5:%.*]] = cmpxchg i64* [[TMP2]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 ; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX9-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -339,7 +339,7 @@ ; GFX908-NEXT: [[TMP2:%.*]] = bitcast double* [[PTR]] to i64* ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[NEW]] to i64 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg i64* [[TMP2]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg i64* [[TMP2]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -361,7 +361,7 @@ ; CI-NEXT: [[TMP2:%.*]] = bitcast double addrspace(1)* [[PTR]] to i64 addrspace(1)* ; CI-NEXT: [[TMP3:%.*]] = bitcast double [[NEW]] to i64 ; CI-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; CI-NEXT: [[TMP5:%.*]] = cmpxchg i64 addrspace(1)* [[TMP2]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst +; CI-NEXT: [[TMP5:%.*]] = cmpxchg i64 addrspace(1)* [[TMP2]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 ; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; CI-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -378,7 +378,7 @@ ; GFX9-NEXT: [[TMP2:%.*]] = bitcast double addrspace(1)* [[PTR]] to i64 addrspace(1)* ; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[NEW]] to i64 ; GFX9-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX9-NEXT: [[TMP5:%.*]] = cmpxchg i64 addrspace(1)* [[TMP2]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst +; GFX9-NEXT: [[TMP5:%.*]] = cmpxchg i64 addrspace(1)* [[TMP2]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 ; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX9-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -395,7 +395,7 @@ ; GFX908-NEXT: [[TMP2:%.*]] = bitcast double addrspace(1)* [[PTR]] to i64 addrspace(1)* ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[NEW]] to i64 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg i64 addrspace(1)* [[TMP2]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg i64 addrspace(1)* [[TMP2]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -417,7 +417,7 @@ ; CI-NEXT: [[TMP2:%.*]] = bitcast double addrspace(3)* [[PTR]] to i64 addrspace(3)* ; CI-NEXT: [[TMP3:%.*]] = bitcast double [[NEW]] to i64 ; CI-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; CI-NEXT: [[TMP5:%.*]] = cmpxchg i64 addrspace(3)* [[TMP2]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst +; CI-NEXT: [[TMP5:%.*]] = cmpxchg i64 addrspace(3)* [[TMP2]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 ; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; CI-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -434,7 +434,7 @@ ; GFX9-NEXT: [[TMP2:%.*]] = bitcast double addrspace(3)* [[PTR]] to i64 addrspace(3)* ; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[NEW]] to i64 ; GFX9-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX9-NEXT: [[TMP5:%.*]] = cmpxchg i64 addrspace(3)* [[TMP2]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst +; GFX9-NEXT: [[TMP5:%.*]] = cmpxchg i64 addrspace(3)* [[TMP2]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 ; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX9-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -451,7 +451,7 @@ ; GFX908-NEXT: [[TMP2:%.*]] = bitcast double addrspace(3)* [[PTR]] to i64 addrspace(3)* ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[NEW]] to i64 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg i64 addrspace(3)* [[TMP2]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg i64 addrspace(3)* [[TMP2]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -463,4 +463,116 @@ ret double %res } +define float @test_atomicrmw_fadd_f32_global_agent(float addrspace(1)* %ptr, float %value) { +; CI-LABEL: @test_atomicrmw_fadd_f32_global_agent( +; CI-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)* +; CI-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; CI-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; CI-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") monotonic monotonic, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; CI-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret float [[TMP6]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_f32_global_agent( +; GFX9-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)* +; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX9-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") monotonic monotonic, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX9-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret float [[TMP6]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_f32_global_agent( +; GFX908-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)* +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") monotonic monotonic, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP6]] +; + %res = atomicrmw fadd float addrspace(1)* %ptr, float %value syncscope("agent") monotonic + ret float %res +} + +define float @test_atomicrmw_fadd_f32_global_one_as(float addrspace(1)* %ptr, float %value) { +; CI-LABEL: @test_atomicrmw_fadd_f32_global_one_as( +; CI-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)* +; CI-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; CI-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; CI-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] syncscope("one-as") monotonic monotonic, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; CI-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret float [[TMP6]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_f32_global_one_as( +; GFX9-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)* +; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX9-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] syncscope("one-as") monotonic monotonic, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX9-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret float [[TMP6]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_f32_global_one_as( +; GFX908-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)* +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] syncscope("one-as") monotonic monotonic, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP6]] +; + %res = atomicrmw fadd float addrspace(1)* %ptr, float %value syncscope("one-as") monotonic + ret float %res +} + attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }