diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2000,6 +2000,14 @@
     llvm_unreachable("Masked atomicrmw expansion unimplemented on this target");
   }
 
+  /// Perform an atomicrmw expansion in a target-specific way. This is
+  /// expected to be called when masked atomicrmw and bit test atomicrmw don't
+  /// work, and the target supports another way to lower atomicrmw.
+  virtual void emitExpandAtomicRMW(AtomicRMWInst *AI) const {
+    llvm_unreachable(
+        "Generic atomicrmw expansion unimplemented on this target");
+  }
+
   /// Perform a bit test atomicrmw using a target-specific intrinsic. This
   /// represents the combined bit test intrinsic which will be lowered at a late
   /// stage by the backend.
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -605,6 +605,9 @@
   }
   case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
     return lowerAtomicRMWInst(AI);
+  case TargetLoweringBase::AtomicExpansionKind::Expand:
+    TLI->emitExpandAtomicRMW(AI);
+    return true;
   default:
     llvm_unreachable("Unhandled case in tryExpandAtomicRMW");
   }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -493,6 +493,7 @@
   AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
   AtomicExpansionKind
   shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override;
+  void emitExpandAtomicRMW(AtomicRMWInst *AI) const override;
 
   const TargetRegisterClass *getRegClassFor(MVT VT,
                                             bool isDivergent) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -30,6 +30,7 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
@@ -12864,6 +12865,17 @@
       if (Ty->isDoubleTy() && Subtarget->hasGFX90AInsts())
         return ReportUnsafeHWInst(AtomicExpansionKind::None);
 
+      // If it is in flat address space, and the type is float, we will try to
+      // expand it, if the target supports global atomic fadd. The reason we
+      // need that is, in the expansion, we emit the check of address space. If
+      // it is in global address space, we emit the global atomic fadd.
+      if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
+        if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
+          return AtomicExpansionKind::Expand;
+        if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
+          return AtomicExpansionKind::Expand;
+      }
+
       return AtomicExpansionKind::CmpXChg;
     }
 
@@ -13064,3 +13076,151 @@
   }
   return false;
 }
+
+/// Expand an FP32 `atomicrmw fadd` on a flat pointer into a runtime dispatch
+/// on the pointer's actual address space. Shared (LOCAL_ADDRESS) and global
+/// pointers get an address-space-specific atomicrmw; private pointers get a
+/// plain load/fadd/store sequence. \p AI is replaced by a phi of the three
+/// results and then erased.
+void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
+  assert(Subtarget->hasAtomicFaddInsts() &&
+         "target should have atomic fadd instructions");
+  assert(AI->getType()->isFloatTy() &&
+         AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
+         "generic atomicrmw expansion only supports FP32 operand in flat "
+         "address space");
+  assert(AI->getOperation() == AtomicRMWInst::FAdd &&
+         "only fadd is supported for now");
+
+  // Given: atomicrmw fadd float* %addr, float %val ordering
+  //
+  // With this expansion we produce the following code:
+  //   [...]
+  //   %int8ptr = bitcast float* %addr to i8*
+  //   br label %atomicrmw.check.shared
+  //
+  // atomicrmw.check.shared:
+  //   %is.shared = call i1 @llvm.amdgcn.is.shared(i8* %int8ptr)
+  //   br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
+  //
+  // atomicrmw.shared:
+  //   %cast.shared = addrspacecast float* %addr to float addrspace(3)*
+  //   %loaded.shared = atomicrmw fadd float addrspace(3)* %cast.shared,
+  //                                   float %val ordering
+  //   br label %atomicrmw.phi
+  //
+  // atomicrmw.check.private:
+  //   %is.private = call i1 @llvm.amdgcn.is.private(i8* %int8ptr)
+  //   br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
+  //
+  // atomicrmw.private:
+  //   %cast.private = addrspacecast float* %addr to float addrspace(5)*
+  //   %loaded.private = load float, float addrspace(5)* %cast.private
+  //   %val.new = fadd float %loaded.private, %val
+  //   store float %val.new, float addrspace(5)* %cast.private
+  //   br label %atomicrmw.phi
+  //
+  // atomicrmw.global:
+  //   %cast.global = addrspacecast float* %addr to float addrspace(1)*
+  //   %loaded.global = atomicrmw fadd float addrspace(1)* %cast.global,
+  //                                   float %val ordering
+  //   br label %atomicrmw.phi
+  //
+  // atomicrmw.phi:
+  //   %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
+  //                           [ %loaded.private, %atomicrmw.private ],
+  //                           [ %loaded.global, %atomicrmw.global ]
+  //   br label %atomicrmw.end
+  //
+  // atomicrmw.end:
+  //    [...]
+
+  IRBuilder<> Builder(AI);
+  LLVMContext &Ctx = Builder.getContext();
+
+  BasicBlock *BB = Builder.GetInsertBlock();
+  Function *F = BB->getParent();
+  BasicBlock *ExitBB =
+      BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
+  BasicBlock *CheckSharedBB =
+      BasicBlock::Create(Ctx, "atomicrmw.check.shared", F, ExitBB);
+  BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
+  BasicBlock *CheckPrivateBB =
+      BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
+  BasicBlock *PrivateBB =
+      BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
+  BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
+  BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
+
+  Value *Val = AI->getValOperand();
+  Type *ValTy = Val->getType();
+  Value *Addr = AI->getPointerOperand();
+  PointerType *PtrTy = cast<PointerType>(Addr->getType());
+
+  // Clone AI onto a new pointer, preserving operation, alignment, ordering,
+  // sync scope and all attached metadata.
+  auto CreateNewAtomicRMW = [AI](IRBuilder<> &Builder, Value *Addr,
+                                 Value *Val) -> Value * {
+    AtomicRMWInst *OldVal =
+        Builder.CreateAtomicRMW(AI->getOperation(), Addr, Val, AI->getAlign(),
+                                AI->getOrdering(), AI->getSyncScopeID());
+    SmallVector<std::pair<unsigned, MDNode *>> MDs;
+    AI->getAllMetadata(MDs);
+    for (auto &P : MDs)
+      OldVal->setMetadata(P.first, P.second);
+    return OldVal;
+  };
+
+  // splitBasicBlock left an unconditional branch to ExitBB at the end of BB;
+  // drop it so BB can branch into the address-space checks instead.
+  std::prev(BB->end())->eraseFromParent();
+  Builder.SetInsertPoint(BB);
+  Value *Int8Ptr = Builder.CreateBitCast(Addr, Builder.getInt8PtrTy());
+  Builder.CreateBr(CheckSharedBB);
+
+  Builder.SetInsertPoint(CheckSharedBB);
+  CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
+                                               {Int8Ptr}, nullptr, "is.shared");
+  Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
+
+  Builder.SetInsertPoint(SharedBB);
+  Value *CastToLocal = Builder.CreateAddrSpaceCast(
+      Addr,
+      PointerType::getWithSamePointeeType(PtrTy, AMDGPUAS::LOCAL_ADDRESS));
+  Value *LoadedShared = CreateNewAtomicRMW(Builder, CastToLocal, Val);
+  Builder.CreateBr(PhiBB);
+
+  Builder.SetInsertPoint(CheckPrivateBB);
+  CallInst *IsPrivate = Builder.CreateIntrinsic(
+      Intrinsic::amdgcn_is_private, {}, {Int8Ptr}, nullptr, "is.private");
+  Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
+
+  // The private path is a plain, non-atomic load/fadd/store.
+  Builder.SetInsertPoint(PrivateBB);
+  Value *CastToPrivate = Builder.CreateAddrSpaceCast(
+      Addr,
+      PointerType::getWithSamePointeeType(PtrTy, AMDGPUAS::PRIVATE_ADDRESS));
+  Value *LoadedPrivate =
+      Builder.CreateLoad(ValTy, CastToPrivate, "loaded.private");
+  Value *NewVal = Builder.CreateFAdd(LoadedPrivate, Val, "val.new");
+  Builder.CreateStore(NewVal, CastToPrivate);
+  Builder.CreateBr(PhiBB);
+
+  Builder.SetInsertPoint(GlobalBB);
+  Value *CastToGlobal = Builder.CreateAddrSpaceCast(
+      Addr,
+      PointerType::getWithSamePointeeType(PtrTy, AMDGPUAS::GLOBAL_ADDRESS));
+  Value *LoadedGlobal = CreateNewAtomicRMW(Builder, CastToGlobal, Val);
+  Builder.CreateBr(PhiBB);
+
+  // Merge the pre-operation value produced by each path.
+  Builder.SetInsertPoint(PhiBB);
+  PHINode *Loaded = Builder.CreatePHI(ValTy, 3, "loaded.phi");
+  Loaded->addIncoming(LoadedShared, SharedBB);
+  Loaded->addIncoming(LoadedPrivate, PrivateBB);
+  Loaded->addIncoming(LoadedGlobal, GlobalBB);
+  Builder.CreateBr(ExitBB);
+
+  AI->replaceAllUsesWith(Loaded);
+  AI->eraseFromParent();
+}
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
@@ -0,0 +1,45 @@
+; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX908 %s
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX90A %s
+
+; CHECK-LABEL: syncscope_system:
+; GFX908: s_getreg_b32 {{.+}}, hwreg(HW_REG_SH_MEM_BASES, 16, 16)
+; GFX908: s_cbranch_execnz {{.+}}
+; GFX908: s_cbranch_execnz [[IS_SHARED:.+]]
+; GFX908: s_getreg_b32 {{.+}}, hwreg(HW_REG_SH_MEM_BASES, 0, 16)
+; GFX908: s_cbranch_execz [[IS_PRIVATE:.+]]
+; GFX908: global_atomic_add_f32
+; GFX908: [[IS_PRIVATE]]:
+; GFX908: buffer_load_dword
+; 
GFX908: v_add_f32_e32 +; GFX908: buffer_store_dword +; GFX908: [[IS_SHARED]]: +; GFX908: ds_add_f32 +; GFX908-NOT: flat_atomic_cmpswap +; GFX90A: flat_atomic_cmpswap +define void @syncscope_system(float* %addr, float noundef %val) #0 { +entry: + %0 = atomicrmw fadd float* %addr, float %val monotonic + ret void +} + +; CHECK-LABEL: syncscope_workgroup: +; CHECK: s_getreg_b32 {{.+}}, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; CHECK: s_cbranch_execnz {{.+}} +; CHECK: s_cbranch_execnz [[IS_SHARED:.+]] +; CHECK: s_getreg_b32 {{.+}}, hwreg(HW_REG_SH_MEM_BASES, 0, 16) +; CHECK: s_cbranch_execz [[IS_PRIVATE:.+]] +; CHECK: global_atomic_add_f32 +; CHECK: [[IS_PRIVATE]]: +; CHECK: buffer_load_dword +; CHECK: v_add_f32_e32 +; CHECK: buffer_store_dword +; CHECK: [[IS_SHARED]]: +; CHECK: ds_add_f32 +; CHECK-NOT: flat_atomic_cmpswap +define void @syncscope_workgroup(float* %addr, float noundef %val) #0 { +entry: + %0 = atomicrmw fadd float* %addr, float %val syncscope("workgroup") seq_cst + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" } diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll @@ -0,0 +1,347 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -atomic-expand %s | FileCheck -check-prefix=GFX908 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -atomic-expand %s | FileCheck -check-prefix=GFX90A %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -atomic-expand %s | FileCheck -check-prefix=GFX940 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -atomic-expand %s | FileCheck -check-prefix=GFX1100 %s + +define float @syncscope_system(float* %addr, float %val) #0 { +; GFX908-LABEL: 
@syncscope_system( +; GFX908-NEXT: [[TMP1:%.*]] = load float, float* [[ADDR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float* [[ADDR]] to i32* +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP6]] +; +; GFX90A-LABEL: @syncscope_system( +; GFX90A-NEXT: [[TMP1:%.*]] = load float, float* [[ADDR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float* [[ADDR]] to i32* +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float 
[[TMP6]] +; +; GFX940-LABEL: @syncscope_system( +; GFX940-NEXT: [[TMP1:%.*]] = load float, float* [[ADDR:%.*]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]] +; GFX940-NEXT: [[TMP2:%.*]] = bitcast float* [[ADDR]] to i32* +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] +; +; GFX1100-LABEL: @syncscope_system( +; GFX1100-NEXT: [[TMP1:%.*]] = load float, float* [[ADDR:%.*]], align 4 +; GFX1100-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX1100: atomicrmw.start: +; GFX1100-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX1100-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]] +; GFX1100-NEXT: [[TMP2:%.*]] = bitcast float* [[ADDR]] to i32* +; GFX1100-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; GFX1100-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX1100-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX1100-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX1100-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX1100-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX1100-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX1100: 
atomicrmw.end: +; GFX1100-NEXT: ret float [[TMP6]] +; +; GFX11-LABEL: @syncscope_system( +; GFX11-NEXT: [[TMP1:%.*]] = load float, float* [[ADDR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float* [[ADDR]] to i32* +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP6]] + %res = atomicrmw fadd float* %addr, float %val seq_cst + ret float %res +} + +define float @syncscope_workgroup_rtn(float* %addr, float %val) #0 { +; GFX908-LABEL: @syncscope_workgroup_rtn( +; GFX908-NEXT: [[TMP1:%.*]] = load float, float* [[ADDR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float* [[ADDR]] to i32* +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] syncscope("workgroup") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { 
i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP6]] +; +; GFX90A-LABEL: @syncscope_workgroup_rtn( +; GFX90A-NEXT: [[TMP1:%.*]] = bitcast float* [[ADDR:%.*]] to i8* +; GFX90A-NEXT: br label [[ATOMICRMW_CHECK_SHARED:%.*]] +; GFX90A: atomicrmw.check.shared: +; GFX90A-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(i8* [[TMP1]]) +; GFX90A-NEXT: br i1 [[IS_SHARED]], label [[ATOMICRMW_SHARED:%.*]], label [[ATOMICRMW_CHECK_PRIVATE:%.*]] +; GFX90A: atomicrmw.shared: +; GFX90A-NEXT: [[TMP2:%.*]] = addrspacecast float* [[ADDR]] to float addrspace(3)* +; GFX90A-NEXT: [[TMP3:%.*]] = atomicrmw fadd float addrspace(3)* [[TMP2]], float [[VAL:%.*]] syncscope("workgroup") seq_cst, align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GFX90A: atomicrmw.check.private: +; GFX90A-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(i8* [[TMP1]]) +; GFX90A-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; GFX90A: atomicrmw.private: +; GFX90A-NEXT: [[TMP4:%.*]] = addrspacecast float* [[ADDR]] to float addrspace(5)* +; GFX90A-NEXT: [[LOADED_PRIVATE:%.*]] = load float, float addrspace(5)* [[TMP4]], align 4 +; GFX90A-NEXT: [[VAL_NEW:%.*]] = fadd float [[LOADED_PRIVATE]], [[VAL]] +; GFX90A-NEXT: store float [[VAL_NEW]], float addrspace(5)* [[TMP4]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] +; GFX90A: atomicrmw.global: +; GFX90A-NEXT: [[TMP5:%.*]] = addrspacecast float* [[ADDR]] to float addrspace(1)* +; GFX90A-NEXT: [[TMP6:%.*]] = atomicrmw fadd float addrspace(1)* [[TMP5]], float [[VAL]] syncscope("workgroup") seq_cst, align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] +; GFX90A: atomicrmw.phi: +; GFX90A-NEXT: [[LOADED_PHI:%.*]] = phi float [ [[TMP3]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], 
[[ATOMICRMW_GLOBAL]] ] +; GFX90A-NEXT: br label [[ATOMICRMW_END:%.*]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[LOADED_PHI]] +; +; GFX940-LABEL: @syncscope_workgroup_rtn( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd float* [[ADDR:%.*]], float [[VAL:%.*]] syncscope("workgroup") seq_cst, align 4 +; GFX940-NEXT: ret float [[RES]] +; +; GFX1100-LABEL: @syncscope_workgroup_rtn( +; GFX1100-NEXT: [[RES:%.*]] = atomicrmw fadd float* [[ADDR:%.*]], float [[VAL:%.*]] syncscope("workgroup") seq_cst, align 4 +; GFX1100-NEXT: ret float [[RES]] +; +; GFX11-LABEL: @syncscope_workgroup_rtn( +; GFX11-NEXT: [[TMP1:%.*]] = load float, float* [[ADDR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float* [[ADDR]] to i32* +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] syncscope("workgroup") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP6]] + %res = atomicrmw fadd float* %addr, float %val syncscope("workgroup") seq_cst + ret float %res +} + +define void @syncscope_workgroup_nortn(float* %addr, float %val) #0 { +; GFX908-LABEL: @syncscope_workgroup_nortn( +; GFX908-NEXT: [[TMP1:%.*]] = bitcast float* [[ADDR:%.*]] to i8* +; GFX908-NEXT: br label [[ATOMICRMW_CHECK_SHARED:%.*]] +; GFX908: atomicrmw.check.shared: +; GFX908-NEXT: [[IS_SHARED:%.*]] = call i1 
@llvm.amdgcn.is.shared(i8* [[TMP1]]) +; GFX908-NEXT: br i1 [[IS_SHARED]], label [[ATOMICRMW_SHARED:%.*]], label [[ATOMICRMW_CHECK_PRIVATE:%.*]] +; GFX908: atomicrmw.shared: +; GFX908-NEXT: [[TMP2:%.*]] = addrspacecast float* [[ADDR]] to float addrspace(3)* +; GFX908-NEXT: [[TMP3:%.*]] = atomicrmw fadd float addrspace(3)* [[TMP2]], float [[VAL:%.*]] syncscope("workgroup") seq_cst, align 4 +; GFX908-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GFX908: atomicrmw.check.private: +; GFX908-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(i8* [[TMP1]]) +; GFX908-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; GFX908: atomicrmw.private: +; GFX908-NEXT: [[TMP4:%.*]] = addrspacecast float* [[ADDR]] to float addrspace(5)* +; GFX908-NEXT: [[LOADED_PRIVATE:%.*]] = load float, float addrspace(5)* [[TMP4]], align 4 +; GFX908-NEXT: [[VAL_NEW:%.*]] = fadd float [[LOADED_PRIVATE]], [[VAL]] +; GFX908-NEXT: store float [[VAL_NEW]], float addrspace(5)* [[TMP4]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_PHI]] +; GFX908: atomicrmw.global: +; GFX908-NEXT: [[TMP5:%.*]] = addrspacecast float* [[ADDR]] to float addrspace(1)* +; GFX908-NEXT: [[TMP6:%.*]] = atomicrmw fadd float addrspace(1)* [[TMP5]], float [[VAL]] syncscope("workgroup") seq_cst, align 4 +; GFX908-NEXT: br label [[ATOMICRMW_PHI]] +; GFX908: atomicrmw.phi: +; GFX908-NEXT: [[LOADED_PHI:%.*]] = phi float [ [[TMP3]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], [[ATOMICRMW_GLOBAL]] ] +; GFX908-NEXT: br label [[ATOMICRMW_END:%.*]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @syncscope_workgroup_nortn( +; GFX90A-NEXT: [[TMP1:%.*]] = bitcast float* [[ADDR:%.*]] to i8* +; GFX90A-NEXT: br label [[ATOMICRMW_CHECK_SHARED:%.*]] +; GFX90A: atomicrmw.check.shared: +; GFX90A-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(i8* [[TMP1]]) +; GFX90A-NEXT: br i1 [[IS_SHARED]], label [[ATOMICRMW_SHARED:%.*]], label 
[[ATOMICRMW_CHECK_PRIVATE:%.*]] +; GFX90A: atomicrmw.shared: +; GFX90A-NEXT: [[TMP2:%.*]] = addrspacecast float* [[ADDR]] to float addrspace(3)* +; GFX90A-NEXT: [[TMP3:%.*]] = atomicrmw fadd float addrspace(3)* [[TMP2]], float [[VAL:%.*]] syncscope("workgroup") seq_cst, align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GFX90A: atomicrmw.check.private: +; GFX90A-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(i8* [[TMP1]]) +; GFX90A-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; GFX90A: atomicrmw.private: +; GFX90A-NEXT: [[TMP4:%.*]] = addrspacecast float* [[ADDR]] to float addrspace(5)* +; GFX90A-NEXT: [[LOADED_PRIVATE:%.*]] = load float, float addrspace(5)* [[TMP4]], align 4 +; GFX90A-NEXT: [[VAL_NEW:%.*]] = fadd float [[LOADED_PRIVATE]], [[VAL]] +; GFX90A-NEXT: store float [[VAL_NEW]], float addrspace(5)* [[TMP4]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] +; GFX90A: atomicrmw.global: +; GFX90A-NEXT: [[TMP5:%.*]] = addrspacecast float* [[ADDR]] to float addrspace(1)* +; GFX90A-NEXT: [[TMP6:%.*]] = atomicrmw fadd float addrspace(1)* [[TMP5]], float [[VAL]] syncscope("workgroup") seq_cst, align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] +; GFX90A: atomicrmw.phi: +; GFX90A-NEXT: [[LOADED_PHI:%.*]] = phi float [ [[TMP3]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], [[ATOMICRMW_GLOBAL]] ] +; GFX90A-NEXT: br label [[ATOMICRMW_END:%.*]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @syncscope_workgroup_nortn( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd float* [[ADDR:%.*]], float [[VAL:%.*]] syncscope("workgroup") seq_cst, align 4 +; GFX940-NEXT: ret void +; +; GFX1100-LABEL: @syncscope_workgroup_nortn( +; GFX1100-NEXT: [[RES:%.*]] = atomicrmw fadd float* [[ADDR:%.*]], float [[VAL:%.*]] syncscope("workgroup") seq_cst, align 4 +; GFX1100-NEXT: ret void +; +; GFX11-LABEL: @syncscope_workgroup_nortn( +; GFX11-NEXT: [[TMP1:%.*]] = 
load float, float* [[ADDR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float* [[ADDR]] to i32* +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] syncscope("workgroup") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void + %res = atomicrmw fadd float* %addr, float %val syncscope("workgroup") seq_cst + ret void +} + +define float @no_unsafe(float* %addr, float %val) { +; GFX908-LABEL: @no_unsafe( +; GFX908-NEXT: [[TMP1:%.*]] = load float, float* [[ADDR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float* [[ADDR]] to i32* +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] syncscope("workgroup") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], 
label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP6]] +; +; GFX90A-LABEL: @no_unsafe( +; GFX90A-NEXT: [[TMP1:%.*]] = load float, float* [[ADDR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float* [[ADDR]] to i32* +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] syncscope("workgroup") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP6]] +; +; GFX940-LABEL: @no_unsafe( +; GFX940-NEXT: [[TMP1:%.*]] = load float, float* [[ADDR:%.*]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]] +; GFX940-NEXT: [[TMP2:%.*]] = bitcast float* [[ADDR]] to i32* +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] syncscope("workgroup") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 
[[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] +; +; GFX1100-LABEL: @no_unsafe( +; GFX1100-NEXT: [[TMP1:%.*]] = load float, float* [[ADDR:%.*]], align 4 +; GFX1100-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX1100: atomicrmw.start: +; GFX1100-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX1100-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]] +; GFX1100-NEXT: [[TMP2:%.*]] = bitcast float* [[ADDR]] to i32* +; GFX1100-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; GFX1100-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX1100-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] syncscope("workgroup") seq_cst seq_cst, align 4 +; GFX1100-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX1100-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX1100-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX1100-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX1100: atomicrmw.end: +; GFX1100-NEXT: ret float [[TMP6]] +; +; GFX11-LABEL: @no_unsafe( +; GFX11-NEXT: [[TMP1:%.*]] = load float, float* [[ADDR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float* [[ADDR]] to i32* +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] syncscope("workgroup") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { 
i32, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP6]] + %res = atomicrmw fadd float* %addr, float %val syncscope("workgroup") seq_cst + ret float %res +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" }