diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -1969,6 +1969,14 @@ llvm_unreachable("Masked atomicrmw expansion unimplemented on this target"); } + /// Perform a atomicrmw expansion using a target-specific way. This is + /// expected to be called when masked atomicrmw and bit test atomicrmw don't + /// work, and the target supports another way to lower atomicrmw. + virtual void emitExpandAtomicRMW(AtomicRMWInst *AI) const { + llvm_unreachable( + "Generic atomicrmw expansion unimplemented on this target"); + } + /// Perform a bit test atomicrmw using a target-specific intrinsic. This /// represents the combined bit test intrinsic which will be lowered at a late /// stage by the backend. diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -610,6 +610,9 @@ } case TargetLoweringBase::AtomicExpansionKind::NotAtomic: return lowerAtomicRMWInst(AI); + case TargetLoweringBase::AtomicExpansionKind::Expand: + TLI->emitExpandAtomicRMW(AI); + return true; default: llvm_unreachable("Unhandled case in tryExpandAtomicRMW"); } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -487,6 +487,8 @@ AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override; + void emitExpandAtomicRMW(AtomicRMWInst *AI) const override; + virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override; virtual bool requiresUniformRegister(MachineFunction &MF, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -30,6 +30,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsR600.h" @@ -12763,19 +12764,19 @@ return AtomicExpansionKind::CmpXChg; if (Subtarget->hasGFX90AInsts()) { - if (Ty->isFloatTy() && AS == AMDGPUAS::FLAT_ADDRESS) - return AtomicExpansionKind::CmpXChg; - auto SSID = RMW->getSyncScopeID(); if (SSID == SyncScope::System || SSID == RMW->getContext().getOrInsertSyncScopeID("one-as")) return AtomicExpansionKind::CmpXChg; + if (Ty->isFloatTy() && AS == AMDGPUAS::FLAT_ADDRESS) + return AtomicExpansionKind::Expand; + return ReportUnsafeHWInst(AtomicExpansionKind::None); } if (AS == AMDGPUAS::FLAT_ADDRESS) - return AtomicExpansionKind::CmpXChg; + return AtomicExpansionKind::Expand; return RMW->use_empty() ? ReportUnsafeHWInst(AtomicExpansionKind::None) : AtomicExpansionKind::CmpXChg; @@ -12969,3 +12970,129 @@ return MONoClobber; return MachineMemOperand::MONone; } + +void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { + assert(Subtarget->hasAtomicFaddInsts() && + "target should have atomic fadd instructions"); + assert(AI->getType()->isFloatTy() && + AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS && + "generic atomicrmw expansion only supports FP32 operand in flat " + "address space"); + assert(AI->getOperation() == AtomicRMWInst::FAdd && + "only fadd is supported for now"); + + // Given: atomicrmw fadd float* %addr, float %val ordering + // + // With this expansion we produce the following code: + // [...] + // atomicrmw.start: + // %is.shared = call i1 __builtin_amdgcn_is_shared(float* %addr) + // br i1 %is.shared, label %atomicrmw.shared, + // label %atomicrmw.check.private + // atomicrmw.shared: + // %cast.local = addrspacecast float* %addr, float addrspace(3)* + // %loaded.shared = atomicrmw fadd float* float addrspace(3)* %cast.local, + // float %val + // br label %atomicrmw.phi + // atomicrmw.check.private: + // %is.private = __builtin_amdgcn_is_private(float *%addr) + // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global + // atomicrmw.private: + // %cast.private = addrspacecast float* %addr, float addrspace(5)* + // %loaded.private = load float* %cast.private + // %new.val = fadd %loaded_private, %val + // store float %new_val, float* %cast.private + // br label %atomicrmw.phi + // atomicrmw.global: + // %cast.global = addrspacecast float* %addr, float addrspace(1)* + // %loaded.global = atomicrmw fadd float* float addrspace(1)* %cast.global, + // float %val + // br label %atomicrmw.phi + // atomicrmw.phi: + // %loaded = phi float [ %loaded.shared, %atomicrmw.shared ], + // [ %loaded.private, %atomicrmw.private, + // [ %loaded.global, %atomicrmw.global] ] + // br label %atomicrmw.end + // atomicrmw.end: + // [...] + + IRBuilder<> Builder(AI); + LLVMContext &Ctx = Builder.getContext(); + + BasicBlock *BB = Builder.GetInsertBlock(); + Function *F = BB->getParent(); + BasicBlock *ExitBB = + BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end"); + BasicBlock *StartBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); + BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB); + BasicBlock *CheckPrivateBB = + BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB); + BasicBlock *PrivateBB = + BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB); + BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB); + BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB); + + Value *Val = AI->getValOperand(); + Type *ValTy = Val->getType(); + Value *Addr = AI->getPointerOperand(); + PointerType *PtrTy = cast(Addr->getType()); + if (!PtrTy->isOpaquePointerTy()) { + Addr = Builder.CreatePointerCast(Addr, Builder.getInt8PtrTy()); + PtrTy = Builder.getInt8PtrTy(); + } + + auto CreateNewAtomicRMW = [AI](IRBuilder<> &Builder, Value *Addr, + Value *Val) -> Value * { + return Builder.CreateAtomicRMW(AI->getOperation(), Addr, Val, + AI->getAlign(), AI->getOrdering(), + AI->getSyncScopeID()); + }; + + std::prev(BB->end())->eraseFromParent(); + Builder.SetInsertPoint(BB); + Builder.CreateBr(StartBB); + + Builder.SetInsertPoint(StartBB); + CallInst *IsShared = Builder.CreateIntrinsic( + Intrinsic::amdgcn_is_shared, {PtrTy}, {Addr}, nullptr, "is.shared"); + Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB); + + Builder.SetInsertPoint(SharedBB); + Value *CastToLocal = Builder.CreateAddrSpaceCast( + Addr, + PointerType::getWithSamePointeeType(PtrTy, AMDGPUAS::LOCAL_ADDRESS)); + Value *LoadedShared = CreateNewAtomicRMW(Builder, CastToLocal, Val); + Builder.CreateBr(PhiBB); + + Builder.SetInsertPoint(CheckPrivateBB); + CallInst *IsPrivate = Builder.CreateIntrinsic( + Intrinsic::amdgcn_is_private, {PtrTy}, {Addr}, nullptr, "is.private"); + Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB); + + Builder.SetInsertPoint(PrivateBB); + Value *CastToPrivate = Builder.CreateAddrSpaceCast( + Addr, + PointerType::getWithSamePointeeType(PtrTy, AMDGPUAS::PRIVATE_ADDRESS)); + Value *LoadedPrivate = + Builder.CreateLoad(ValTy, CastToPrivate, "loaded.private"); + Value *NewVal = Builder.CreateFAdd(LoadedPrivate, Val, "val.new"); + Builder.CreateStore(NewVal, CastToPrivate); + Builder.CreateBr(PhiBB); + + Builder.SetInsertPoint(GlobalBB); + Value *CastToGlobal = Builder.CreateAddrSpaceCast( + Addr, + PointerType::getWithSamePointeeType(PtrTy, AMDGPUAS::GLOBAL_ADDRESS)); + Value *LoadedGlobal = CreateNewAtomicRMW(Builder, CastToGlobal, Val); + Builder.CreateBr(PhiBB); + + Builder.SetInsertPoint(PhiBB); + PHINode *Loaded = Builder.CreatePHI(ValTy, 3, "loaded.phi"); + Loaded->addIncoming(LoadedShared, SharedBB); + Loaded->addIncoming(LoadedPrivate, PrivateBB); + Loaded->addIncoming(LoadedGlobal, GlobalBB); + Builder.CreateBr(ExitBB); + + AI->replaceAllUsesWith(Loaded); + AI->eraseFromParent(); +} diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll @@ -0,0 +1,16 @@ +; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck %s + + ; CHECK-LABEL: foo: + ; CHECK: global_atomic_add_f32 + ; CHECK: flat_load_dword + ; CHECK: v_add_f32_e32 + ; CHECK: flat_store_dword + ; CHECK: ds_add_f32 + ; CHECK-NOT: flat_atomic_cmpswap + define protected void @foo(float* %addr, float noundef %val) local_unnamed_addr #0 { + entry: + %0 = atomicrmw fadd float* %addr, float %val monotonic, align 4 + ret void + } + + attributes #0 = { alwaysinline mustprogress nofree norecurse nounwind "amdgpu-unsafe-fp-atomics"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx908" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+mai-insts,+s-memrealtime,+s-memtime-inst" }