diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
@@ -16,6 +16,8 @@
 namespace llvm {
 
 class MachineRegisterInfo;
+class GCNSubtarget;
+class LLT;
 
 namespace AMDGPU {
@@ -24,7 +26,7 @@
 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg);
 
 bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask);
-
+bool hasAtomicFaddRtnForTy(const GCNSubtarget &Subtarget, const LLT &Ty);
 }
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
@@ -7,8 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPUGlobalISelUtils.h"
+#include "GCNSubtarget.h"
 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/Support/LowLevelTypeImpl.h"
 
 using namespace llvm;
 using namespace MIPatternMatch;
@@ -66,3 +68,12 @@
     return true;
   return (Mask[0] & 2) == (Mask[1] & 2);
 }
+
+bool AMDGPU::hasAtomicFaddRtnForTy(const GCNSubtarget &Subtarget,
+                                   const LLT &Ty) {
+  if (Ty == LLT::scalar(32))
+    return Subtarget.hasAtomicFaddRtnInsts();
+  if (Ty == LLT::fixed_vector(2, 16) || Ty == LLT::scalar(64))
+    return Subtarget.hasGFX90AInsts();
+  return false;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2994,13 +2994,15 @@
 bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD(
     MachineInstr &MI) const {
-  if (STI.hasGFX90AInsts())
+  const Register DefReg = MI.getOperand(0).getReg();
+  LLT DefTy = MRI->getType(DefReg);
+  if (AMDGPU::hasAtomicFaddRtnForTy(STI, DefTy))
     return selectImpl(MI, *CoverageInfo);
 
   MachineBasicBlock *MBB = MI.getParent();
   const DebugLoc &DL = MI.getDebugLoc();
 
-  if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) {
+  if (!MRI->use_nodbg_empty(DefReg)) {
     Function &F = MBB->getParent()->getFunction();
     DiagnosticInfoUnsupported
         NoFpRet(F, "return versions of fp atomics not supported",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5738,7 +5738,8 @@
   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
   case Intrinsic::amdgcn_struct_buffer_atomic_fadd: {
     Register DstReg = MI.getOperand(0).getReg();
-    if (!MRI.use_empty(DstReg) && !ST.hasGFX90AInsts()) {
+    if (!MRI.use_empty(DstReg) &&
+        !AMDGPU::hasAtomicFaddRtnForTy(ST, MRI.getType(DstReg))) {
       Function &F = B.getMF().getFunction();
       DiagnosticInfoUnsupported NoFpRet(
           F, "return versions of fp atomics not supported", B.getDebugLoc(),
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -394,6 +394,7 @@
                               MachineBasicBlock *BB) const override;
 
   bool hasBitPreservingFPLogic(EVT VT) const override;
+  bool hasAtomicFaddRtnForTy(SDValue &Op) const;
   bool enableAggressiveFMAFusion(EVT VT) const override;
   bool enableAggressiveFMAFusion(LLT Ty) const override;
   EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4362,6 +4362,18 @@
   return isTypeLegal(VT.getScalarType());
 }
 
+bool SITargetLowering::hasAtomicFaddRtnForTy(SDValue &Op) const {
+  switch (Op.getValue(0).getSimpleValueType().SimpleTy) {
+  case MVT::f32:
+    return Subtarget->hasAtomicFaddRtnInsts();
+  case MVT::v2f16:
+  case MVT::f64:
+    return Subtarget->hasGFX90AInsts();
+  default:
+    return false;
+  }
+}
+
 bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
   // This currently forces unfolding various combinations of fsub into fma with
   // free fneg'd operands. As long as we have fast FMA (controlled by
@@ -7399,7 +7411,7 @@
     Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
     break;
   case Intrinsic::amdgcn_buffer_atomic_fadd:
-    if (!Op.getValue(0).use_empty() && !Subtarget->hasGFX90AInsts()) {
+    if (!Op.getValue(0).use_empty() && !hasAtomicFaddRtnForTy(Op)) {
       DiagnosticInfoUnsupported
           NoFpRet(DAG.getMachineFunction().getFunction(),
                   "return versions of fp atomics not supported",
@@ -12623,7 +12635,7 @@
     return AtomicExpansionKind::CmpXChg;
 
   if ((AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) &&
-      Subtarget->hasAtomicFaddInsts()) {
+      Subtarget->hasAtomicFaddNoRtnInsts()) {
     if (Subtarget->hasGFX940Insts())
       return AtomicExpansionKind::None;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.rtn_no-rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.rtn_no-rtn.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.rtn_no-rtn.ll
@@ -0,0 +1,99 @@
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+; no-rtn
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_OFFEN
+define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+  %voffset.add = add i32 %voffset, 4095
+  %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
+  ret void
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_OFFEN
+define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+  %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2)
+  ret void
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_OFFSET
+define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
+  %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0)
+  ret void
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_IDXEN
+define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__4095_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
+  %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 4095, i32 %soffset, i32 0)
+  ret void
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_IDXEN
+define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
+  %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 2)
+  ret void
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_BOTHEN
+define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+  %voffset.add = add i32 %voffset, 4095
+  %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+  ret void
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_BOTHEN
+define amdgpu_ps void @xstruct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+  %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
+  ret void
+}
+
+
+; rtn
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_OFFEN
+define amdgpu_ps float @raw_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+  %voffset.add = add i32 %voffset, 4095
+  %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
+  ret float %ret
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_OFFEN
+define amdgpu_ps float @raw_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+  %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2)
+  ret float %ret
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_OFFSET
+define amdgpu_ps float @raw_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
+  %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0)
+  ret float %ret
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_IDXEN
+define amdgpu_ps float @struct_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__4095_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
+  %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 4095, i32 %soffset, i32 0)
+  ret float %ret
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_IDXEN
+define amdgpu_ps float @struct_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
+  %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 2)
+  ret float %ret
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_BOTHEN
+define amdgpu_ps float @struct_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+  %voffset.add = add i32 %voffset, 4095
+  %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+  ret float %ret
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_BOTHEN
+define amdgpu_ps float @xstruct_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+  %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
+  ret float %ret
+}
+
+declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg) #0
+declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #0
+attributes #0 = { nounwind }
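
Note (commentary, not part of the patch): the GlobalISel helper and the SelectionDAG helper added above encode the same per-type gating, once over LLTs and once over MVTs. The standalone sketch below restates that shared mapping; the FaddResultTy enum and the two boolean parameters are placeholders standing in for the intrinsic's result type and for GCNSubtarget::hasAtomicFaddRtnInsts() / GCNSubtarget::hasGFX90AInsts(), and are not LLVM API.

// Sketch only: mirrors AMDGPU::hasAtomicFaddRtnForTy (LLT form) and
// SITargetLowering::hasAtomicFaddRtnForTy (MVT form) from the patch above.
enum class FaddResultTy { F32, V2F16, F64, Other };

// HasRtnF32 stands in for Subtarget.hasAtomicFaddRtnInsts();
// HasGFX90A stands in for Subtarget.hasGFX90AInsts().
bool hasAtomicFaddRtnForTy(FaddResultTy Ty, bool HasRtnF32, bool HasGFX90A) {
  switch (Ty) {
  case FaddResultTy::F32:
    return HasRtnF32;  // f32 returning buffer atomic fadd (the case exercised by the gfx1100 test)
  case FaddResultTy::V2F16:
  case FaddResultTy::F64:
    return HasGFX90A;  // v2f16/f64 returning forms remain gated on gfx90a
  default:
    return false;      // no returning form for other types; the no-return path or a diagnostic is used
  }
}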