diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1969,6 +1969,14 @@
     llvm_unreachable("Masked atomicrmw expansion unimplemented on this target");
   }
 
+  /// Perform an atomicrmw expansion in a target-specific way. This is
+  /// expected to be called when the masked and bit test atomicrmw expansions
+  /// don't apply, and the target has another way to lower the atomicrmw.
+  virtual void emitExpandAtomicRMW(AtomicRMWInst *AI) const {
+    llvm_unreachable(
+        "Generic atomicrmw expansion unimplemented on this target");
+  }
+
   /// Perform a bit test atomicrmw using a target-specific intrinsic. This
   /// represents the combined bit test intrinsic which will be lowered at a late
   /// stage by the backend.
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -610,6 +610,9 @@
   }
   case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
     return lowerAtomicRMWInst(AI);
+  case TargetLoweringBase::AtomicExpansionKind::Expand:
+    TLI->emitExpandAtomicRMW(AI);
+    return true;
   default:
     llvm_unreachable("Unhandled case in tryExpandAtomicRMW");
   }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -486,6 +486,8 @@
   AtomicExpansionKind
   shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override;
 
+  void emitExpandAtomicRMW(AtomicRMWInst *AI) const override;
+
   virtual const TargetRegisterClass *
   getRegClassFor(MVT VT, bool isDivergent) const override;
   virtual bool requiresUniformRegister(MachineFunction &MF,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -30,6 +30,7 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
@@ -12730,7 +12731,7 @@
 
       if (Subtarget->hasGFX90AInsts()) {
         if (Ty->isFloatTy() && AS == AMDGPUAS::FLAT_ADDRESS)
-          return AtomicExpansionKind::CmpXChg;
+          return AtomicExpansionKind::Expand;
 
         auto SSID = RMW->getSyncScopeID();
         if (SSID == SyncScope::System ||
@@ -12741,7 +12742,7 @@
       }
 
       if (AS == AMDGPUAS::FLAT_ADDRESS)
-        return AtomicExpansionKind::CmpXChg;
+        return AtomicExpansionKind::Expand;
 
       return RMW->use_empty() ? ReportUnsafeHWInst(AtomicExpansionKind::None)
                               : AtomicExpansionKind::CmpXChg;
@@ -12935,3 +12936,126 @@
     return MONoClobber;
   return MachineMemOperand::MONone;
 }
+
+void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
+  assert(Subtarget->hasAtomicFaddInsts() &&
+         "target should have atomic fadd instructions");
+  assert(AI->getType()->isFloatTy() &&
+         AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
+         "generic atomicrmw expansion only supports FP32 operand in flat "
+         "address space");
+  assert(AI->getOperation() == AtomicRMWInst::FAdd &&
+         "only fadd is supported for now");
+
+  // Given: atomicrmw fadd float* %addr, float %val ordering
+  //
+  // With this expansion we produce the following code:
+  //   [...]
+  //   atomicrmw.start:
+  //     %is_shared = call i1 __builtin_amdgcn_is_shared(float* %addr)
+  //     br i1 %is_shared, label %atomicrmw.shared,
+  //                       label %atomicrmw.check.private
+  //   atomicrmw.shared:
+  //     %cast = addrspacecast float* %addr to float addrspace(3)*
+  //     %loaded.shared = __builtin_amdgcn_ds_atomic_fadd_f32(
+  //                          float addrspace(3)* %cast, float %val)
+  //     br label %atomicrmw.phi
+  //   atomicrmw.check.private:
+  //     %is_private = __builtin_amdgcn_is_private(float* %addr)
+  //     br i1 %is_private, label %atomicrmw.private, label %atomicrmw.global
+  //   atomicrmw.private:
+  //     %loaded.private = load float, float* %addr
+  //     %val.new = fadd float %loaded.private, %val
+  //     store float %val.new, float* %addr
+  //     br label %atomicrmw.phi
+  //   atomicrmw.global:
+  //     %cast1 = addrspacecast float* %addr to float addrspace(1)*
+  //     %loaded.global = __builtin_amdgcn_global_atomic_fadd_f32(
+  //                          float addrspace(1)* %cast1, float %val)
+  //     br label %atomicrmw.phi
+  //   atomicrmw.phi:
+  //     %loaded = phi float [ %loaded.shared, %atomicrmw.shared ],
+  //                         [ %loaded.private, %atomicrmw.private ],
+  //                         [ %loaded.global, %atomicrmw.global ]
+  //     br label %atomicrmw.end
+  //   atomicrmw.end:
+  //     [...]
+
+  IRBuilder<> Builder(AI);
+  LLVMContext &Ctx = Builder.getContext();
+
+  BasicBlock *BB = Builder.GetInsertBlock();
+  Function *F = BB->getParent();
+  Module *M = BB->getModule();
+  BasicBlock *ExitBB =
+      BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
+  BasicBlock *StartBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);
+  BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
+  BasicBlock *CheckPrivateBB =
+      BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
+  BasicBlock *PrivateBB =
+      BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
+  BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
+  BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
+
+  Value *Addr = AI->getPointerOperand();
+  Value *Val = AI->getValOperand();
+  assert(Addr->getType()->isPointerTy());
+  PointerType *PtrTy = cast<PointerType>(Addr->getType());
+  Type *ValTy = Val->getType();
+
+  std::prev(BB->end())->eraseFromParent();
+  Builder.SetInsertPoint(BB);
+  Value *AddrInt8Ptr = Builder.CreatePointerCast(Addr, Builder.getInt8PtrTy());
+  Builder.CreateBr(StartBB);
+
+  Builder.SetInsertPoint(StartBB);
+  FunctionCallee AddressShared = M->getOrInsertFunction(
+      "llvm.amdgcn.is.shared", Builder.getInt1Ty(), Builder.getInt8PtrTy());
+  Value *IsShared = Builder.CreateCall(AddressShared, {AddrInt8Ptr});
+  Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
+
+  Builder.SetInsertPoint(SharedBB);
+  Value *Cast = Builder.CreateAddrSpaceCast(
+      Addr,
+      PointerType::getWithSamePointeeType(PtrTy, AMDGPUAS::LOCAL_ADDRESS));
+  Constant *ZeroI32 =
+      ConstantInt::getIntegerValue(Type::getInt32Ty(Ctx), APInt(32, 0, true));
+  Constant *ZeroI1 =
+      ConstantInt::getIntegerValue(Type::getInt1Ty(Ctx), APInt(1, 0));
+  Value *LoadedShared = Builder.CreateIntrinsic(
+      Intrinsic::amdgcn_ds_fadd, {ValTy}, {Cast, Val, ZeroI32, ZeroI32, ZeroI1},
+      nullptr, "loaded.shared");
+  Builder.CreateBr(PhiBB);
+
+  Builder.SetInsertPoint(CheckPrivateBB);
+  FunctionCallee AddressPrivate = M->getOrInsertFunction(
+      "llvm.amdgcn.is.private", Builder.getInt1Ty(), Builder.getInt8PtrTy());
+  Value *IsPrivate = Builder.CreateCall(AddressPrivate, {AddrInt8Ptr});
+  Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
+
+  Builder.SetInsertPoint(PrivateBB);
+  Value *LoadedPrivate = Builder.CreateLoad(ValTy, Addr, "loaded.private");
+  Value *NewVal = Builder.CreateFAdd(LoadedPrivate, Val, "val.new");
+  Builder.CreateStore(NewVal, Addr);
+  Builder.CreateBr(PhiBB);
+
+  Builder.SetInsertPoint(GlobalBB);
+  Value *Cast1 = Builder.CreateAddrSpaceCast(
+      Addr,
+      PointerType::getWithSamePointeeType(PtrTy, AMDGPUAS::GLOBAL_ADDRESS));
+  Value *LoadedGlobal = Builder.CreateIntrinsic(
+      Intrinsic::amdgcn_global_atomic_fadd, {ValTy, Cast1->getType(), ValTy},
+      {Cast1, Val}, nullptr, "loaded.global");
+  Builder.CreateBr(PhiBB);
+
+  Builder.SetInsertPoint(PhiBB);
+  PHINode *Loaded = Builder.CreatePHI(ValTy, 3, "loaded.phi");
+  Loaded->addIncoming(LoadedShared, SharedBB);
+  Loaded->addIncoming(LoadedPrivate, PrivateBB);
+  Loaded->addIncoming(LoadedGlobal, GlobalBB);
+  Builder.CreateBr(ExitBB);
+
+  AI->replaceAllUsesWith(Loaded);
+  AI->eraseFromParent();
+}
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
@@ -0,0 +1,16 @@
+; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-LABEL: foo:
+; CHECK: global_atomic_add_f32
+; CHECK: flat_load_dword
+; CHECK: v_add_f32_e32
+; CHECK: flat_store_dword
+; CHECK: ds_add_f32
+; CHECK-NOT: flat_atomic_cmpswap
+define protected void @foo(float* %addr, float noundef %val) local_unnamed_addr #0 {
+entry:
+  %0 = atomicrmw fadd float* %addr, float %val monotonic, align 4
+  ret void
+}
+
+attributes #0 = { alwaysinline mustprogress nofree norecurse nounwind "amdgpu-unsafe-fp-atomics"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx908" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+mai-insts,+s-memrealtime,+s-memtime-inst" }
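
Note (reviewer sketch, not part of the patch): applying the expansion above to @foo from the new test should yield IR along the following lines going into instruction selection. The value names, intrinsic declarations, and name mangling are approximations inferred from the code and its comment, not output copied from the compiler.

  declare i1 @llvm.amdgcn.is.shared(i8* nocapture)
  declare i1 @llvm.amdgcn.is.private(i8* nocapture)
  declare float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)*, float, i32, i32, i1)
  declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)*, float)

  define protected void @foo(float* %addr, float noundef %val) {
  entry:
    %addr.i8 = bitcast float* %addr to i8*
    br label %atomicrmw.start

  atomicrmw.start:
    %is.shared = call i1 @llvm.amdgcn.is.shared(i8* %addr.i8)
    br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private

  atomicrmw.shared:
    %cast.lds = addrspacecast float* %addr to float addrspace(3)*
    %loaded.shared = call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %cast.lds, float %val, i32 0, i32 0, i1 false)
    br label %atomicrmw.phi

  atomicrmw.check.private:
    %is.private = call i1 @llvm.amdgcn.is.private(i8* %addr.i8)
    br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global

  atomicrmw.private:
    %loaded.private = load float, float* %addr
    %val.new = fadd float %loaded.private, %val
    store float %val.new, float* %addr
    br label %atomicrmw.phi

  atomicrmw.global:
    %cast.global = addrspacecast float* %addr to float addrspace(1)*
    %loaded.global = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %cast.global, float %val)
    br label %atomicrmw.phi

  atomicrmw.phi:
    %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
                            [ %loaded.private, %atomicrmw.private ],
                            [ %loaded.global, %atomicrmw.global ]
    br label %atomicrmw.end

  atomicrmw.end:
    ret void
  }

This is what the CHECK lines key on: ds_add_f32 from the shared path, flat_load_dword / v_add_f32_e32 / flat_store_dword from the private path, global_atomic_add_f32 from the global path, and no flat_atomic_cmpswap loop from a CmpXChg expansion.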