Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -139,9 +139,6 @@
   bool selectG_EXTRACT_VECTOR_ELT(MachineInstr &I) const;
   bool selectG_INSERT_VECTOR_ELT(MachineInstr &I) const;
   bool selectG_SHUFFLE_VECTOR(MachineInstr &I) const;
-  bool selectAMDGPU_BUFFER_ATOMIC_FADD(MachineInstr &I) const;
-  bool selectGlobalAtomicFadd(MachineInstr &I, MachineOperand &AddrOp,
-                              MachineOperand &DataOp) const;
   bool selectBufferLoadLds(MachineInstr &MI) const;
   bool selectGlobalLoadLds(MachineInstr &MI) const;
   bool selectBVHIntrinsic(MachineInstr &I) const;
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1825,8 +1825,6 @@
     return selectDSAppendConsume(I, false);
   case Intrinsic::amdgcn_s_barrier:
     return selectSBarrier(I);
-  case Intrinsic::amdgcn_global_atomic_fadd:
-    return selectGlobalAtomicFadd(I, I.getOperand(2), I.getOperand(3));
   case Intrinsic::amdgcn_raw_buffer_load_lds:
   case Intrinsic::amdgcn_struct_buffer_load_lds:
     return selectBufferLoadLds(I);
@@ -2442,13 +2440,6 @@
 bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
   MachineInstr &I) const {
-  if (I.getOpcode() == TargetOpcode::G_ATOMICRMW_FADD) {
-    const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
-    unsigned AS = PtrTy.getAddressSpace();
-    if (AS == AMDGPUAS::GLOBAL_ADDRESS)
-      return selectGlobalAtomicFadd(I, I.getOperand(1), I.getOperand(2));
-  }
-
   initM0(I);
   return selectImpl(I, *CoverageInfo);
 }
@@ -3015,133 +3006,6 @@
   return true;
 }
 
-bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD(
-  MachineInstr &MI) const {
-  const Register DefReg = MI.getOperand(0).getReg();
-  LLT DefTy = MRI->getType(DefReg);
-  if (AMDGPU::hasAtomicFaddRtnForTy(STI, DefTy))
-    return selectImpl(MI, *CoverageInfo);
-
-  MachineBasicBlock *MBB = MI.getParent();
-  const DebugLoc &DL = MI.getDebugLoc();
-
-  if (!MRI->use_nodbg_empty(DefReg)) {
-    Function &F = MBB->getParent()->getFunction();
-    DiagnosticInfoUnsupported
-      NoFpRet(F, "return versions of fp atomics not supported",
-              MI.getDebugLoc(), DS_Error);
-    F.getContext().diagnose(NoFpRet);
-    return false;
-  }
-
-  // FIXME: This is only needed because tablegen requires number of dst operands
-  // in match and replace pattern to be the same. Otherwise patterns can be
-  // exported from SDag path.
-  MachineOperand &VDataIn = MI.getOperand(1);
-  MachineOperand &VIndex = MI.getOperand(3);
-  MachineOperand &VOffset = MI.getOperand(4);
-  MachineOperand &SOffset = MI.getOperand(5);
-  int16_t Offset = MI.getOperand(6).getImm();
-
-  bool HasVOffset = !isOperandImmEqual(VOffset, 0, *MRI);
-  bool HasVIndex = !isOperandImmEqual(VIndex, 0, *MRI);
-
-  unsigned Opcode;
-  if (HasVOffset) {
-    Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN
-                       : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN;
-  } else {
-    Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN
-                       : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET;
-  }
-
-  if (MRI->getType(VDataIn.getReg()).isVector()) {
-    switch (Opcode) {
-    case AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN:
-      Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN;
-      break;
-    case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN:
-      Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFEN;
-      break;
-    case AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN:
-      Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_IDXEN;
-      break;
-    case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET:
-      Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFSET;
-      break;
-    }
-  }
-
-  auto I = BuildMI(*MBB, MI, DL, TII.get(Opcode));
-  I.add(VDataIn);
-
-  if (Opcode == AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN ||
-      Opcode == AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN) {
-    Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
-    BuildMI(*MBB, &*I, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
-      .addReg(VIndex.getReg())
-      .addImm(AMDGPU::sub0)
-      .addReg(VOffset.getReg())
-      .addImm(AMDGPU::sub1);
-
-    I.addReg(IdxReg);
-  } else if (HasVIndex) {
-    I.add(VIndex);
-  } else if (HasVOffset) {
-    I.add(VOffset);
-  }
-
-  I.add(MI.getOperand(2)); // rsrc
-  I.add(SOffset);
-  I.addImm(Offset);
-  I.addImm(MI.getOperand(7).getImm()); // cpol
-  I.cloneMemRefs(MI);
-
-  MI.eraseFromParent();
-
-  return true;
-}
-
-bool AMDGPUInstructionSelector::selectGlobalAtomicFadd(
-  MachineInstr &MI, MachineOperand &AddrOp, MachineOperand &DataOp) const {
-
-  if (STI.hasGFX90AInsts()) {
-    // gfx90a adds return versions of the global atomic fadd instructions so no
-    // special handling is required.
-    return selectImpl(MI, *CoverageInfo);
-  }
-
-  MachineBasicBlock *MBB = MI.getParent();
-  const DebugLoc &DL = MI.getDebugLoc();
-
-  if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) {
-    Function &F = MBB->getParent()->getFunction();
-    DiagnosticInfoUnsupported
-      NoFpRet(F, "return versions of fp atomics not supported",
-              MI.getDebugLoc(), DS_Error);
-    F.getContext().diagnose(NoFpRet);
-    return false;
-  }
-
-  // FIXME: This is only needed because tablegen requires number of dst operands
-  // in match and replace pattern to be the same. Otherwise patterns can be
-  // exported from SDag path.
-  auto Addr = selectFlatOffsetImpl(AddrOp, SIInstrFlags::FlatGlobal);
-
-  Register Data = DataOp.getReg();
-  const unsigned Opc = MRI->getType(Data).isVector() ?
-    AMDGPU::GLOBAL_ATOMIC_PK_ADD_F16 : AMDGPU::GLOBAL_ATOMIC_ADD_F32;
-  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
-    .addReg(Addr.first)
-    .addReg(Data)
-    .addImm(Addr.second)
-    .addImm(0) // cpol
-    .cloneMemRefs(MI);
-
-  MI.eraseFromParent();
-  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
-}
-
 bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
   unsigned Opc;
   unsigned Size = MI.getOperand(3).getImm();
@@ -3553,8 +3417,6 @@
   }
   case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
     return selectBVHIntrinsic(I);
-  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
-    return selectAMDGPU_BUFFER_ATOMIC_FADD(I);
   case AMDGPU::G_SBFX:
   case AMDGPU::G_UBFX:
     return selectG_SBFX_UBFX(I);
Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5766,24 +5766,9 @@
   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
-    return legalizeBufferAtomic(MI, B, IntrID);
   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
-  case Intrinsic::amdgcn_struct_buffer_atomic_fadd: {
-    Register DstReg = MI.getOperand(0).getReg();
-    if (!MRI.use_empty(DstReg) &&
-        !AMDGPU::hasAtomicFaddRtnForTy(ST, MRI.getType(DstReg))) {
-      Function &F = B.getMF().getFunction();
-      DiagnosticInfoUnsupported NoFpRet(
-          F, "return versions of fp atomics not supported", B.getDebugLoc(),
-          DS_Error);
-      F.getContext().diagnose(NoFpRet);
-      B.buildUndef(DstReg);
-      MI.eraseFromParent();
-      return true;
-    }
-
+  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
     return legalizeBufferAtomic(MI, B, IntrID);
-  }
   case Intrinsic::amdgcn_atomic_inc:
     return legalizeAtomicIncDec(MI, B, true);
   case Intrinsic::amdgcn_atomic_dec:
Index: llvm/lib/Target/AMDGPU/BUFInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1590,6 +1590,7 @@
   let GISelPredicateCode = [{
     return MRI.use_nodbg_empty(MI.getOperand(0).getReg());
   }];
+  let HasNoUse = true;
 }
 
 multiclass BufferAtomicPatterns_NO_RTN(opcode # _OFFSET) getVregSrcForVT.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset,
-      (as_i16timm $offset), $cachepolicy)
+      (as_i16timm $offset), timm:$cachepolicy)
   >;
 
   def : GCNPat<
@@ -1607,7 +1608,7 @@
                             0, i32:$soffset, timm:$offset, timm:$cachepolicy, timm),
     (!cast(opcode # _IDXEN) getVregSrcForVT.ret:$vdata_in, VGPR_32:$vindex,
       SReg_128:$rsrc, SCSrc_b32:$soffset,
-      (as_i16timm $offset), $cachepolicy)
+      (as_i16timm $offset), timm:$cachepolicy)
   >;
 
   def : GCNPat<
@@ -1615,7 +1616,7 @@
                             i32:$voffset, i32:$soffset, timm:$offset, timm:$cachepolicy, 0),
     (!cast(opcode # _OFFEN) getVregSrcForVT.ret:$vdata_in, VGPR_32:$voffset,
      SReg_128:$rsrc, SCSrc_b32:$soffset,
-      (as_i16timm $offset), $cachepolicy)
+      (as_i16timm $offset), timm:$cachepolicy)
   >;
 
   def : GCNPat<
@@ -1625,7 +1626,7 @@
     (!cast(opcode # _BOTHEN) getVregSrcForVT.ret:$vdata_in,
      (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
-      SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), $cachepolicy)
+      SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), timm:$cachepolicy)
   >;
 }
Index: llvm/lib/Target/AMDGPU/FLATInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1026,21 +1032,@@
     (!cast(inst) VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset)>;
 }
 
-multiclass FlatSignedAtomicPat {
-  defvar rtnNode = !cast(node # !if(isIntr, "", "_" # vt.Size));
+multiclass FlatSignedAtomicPatImplNoRtn {
   defvar noRtnNode = !cast(node # "_noret" # !if(isIntr, "", "_" # vt.Size));
 
-  let AddedComplexity = complexity in
-  def : GCNPat <(vt (rtnNode (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)),
-    (!cast(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset)>;
-
   let AddedComplexity = !add(complexity, 1) in
   def : GCNPat <(vt (noRtnNode (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)),
     (!cast(inst) VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset)>;
 }
 
+multiclass FlatSignedAtomicPatImplRtn {
+  defvar rtnNode = !cast(node # !if(isIntr, "", "_" # vt.Size));
+
+  let AddedComplexity = complexity in
+  def : GCNPat <(vt (rtnNode (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)),
+    (!cast(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset)>;
+}
+
+multiclass FlatSignedAtomicPat :
+  FlatSignedAtomicPatImplNoRtn ,
+  FlatSignedAtomicPatImplRtn ;
+
 multiclass FlatSignedAtomicIntrPat {
   defm : FlatSignedAtomicPat;
@@ -1262,25 +1273,46 @@
 }
 }
 
-multiclass GlobalFLATAtomicPats {
-  defvar rtnNode = !cast(node # !if(isIntr, "", "_" # vt.Size));
+multiclass GlobalFLATAtomicPatsImplNoRtn {
   defvar noRtnNode = !cast(node # "_noret" # !if(isIntr, "", "_" # vt.Size));
 
-  defm : FlatSignedAtomicPat ;
+  defm : FlatSignedAtomicPatImplNoRtn ;
 
   let AddedComplexity = 13 in
   def : GlobalAtomicSaddrPat(inst#"_SADDR"), noRtnNode, vt, data_vt>;
+}
+
+multiclass GlobalFLATAtomicPatsImplRtn {
+  defvar rtnNode = !cast(node # !if(isIntr, "", "_" # vt.Size));
+
+  defm : FlatSignedAtomicPatImplRtn ;
 
   let AddedComplexity = 12 in
   def : GlobalAtomicSaddrPat(inst#"_SADDR_RTN"), rtnNode, vt, data_vt>;
 }
 
+multiclass GlobalFLATAtomicPats :
+  GlobalFLATAtomicPatsImplNoRtn,
+  GlobalFLATAtomicPatsImplRtn;
+
 multiclass GlobalFLATAtomicIntrPats {
   defm : GlobalFLATAtomicPats;
 }
 
+multiclass GlobalFLATAtomicIntrPatsNoRet {
+  defm : GlobalFLATAtomicPatsImplNoRtn;
+}
+
+multiclass GlobalFLATAtomicIntrPatsRet {
+  defm : GlobalFLATAtomicPatsImplRtn;
+}
+
 multiclass GlobalFLATNoRtnAtomicPats {
   def : FlatSignedAtomicPatNoRtn {
@@ -1452,10 +1484,17 @@
 defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX_X2", "int_amdgcn_global_atomic_fmax", f64>;
 }
 
-let OtherPredicates = [HasAtomicFaddNoRtnInsts] in
+let OtherPredicates = [HasAtomicFaddNoRtnInsts] in {
 defm : GlobalFLATNoRtnAtomicPats ;
-let OtherPredicates = [HasAtomicPkFaddNoRtnInsts] in
+defm : GlobalFLATAtomicIntrPatsNoRet <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", f32>;
+}
+let OtherPredicates = [HasAtomicPkFaddNoRtnInsts] in {
 defm : GlobalFLATNoRtnAtomicPats ;
+defm : GlobalFLATAtomicIntrPatsNoRet <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", v2f16>;
+}
+
+let OtherPredicates = [HasAtomicFaddRtnInsts] in
+defm : GlobalFLATAtomicIntrPatsRet <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", f32>;
 
 let OtherPredicates = [isGFX90APlus] in {
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>;
@@ -1463,9 +1502,8 @@
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>;
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", "atomic_load_fmin_global", f64>;
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", "atomic_load_fmax_global", f64>;
-defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", f32>;
 defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_global_atomic_fadd", f64>;
-defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", v2f16>;
+defm : GlobalFLATAtomicIntrPatsRet <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", v2f16>;
 defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MIN_F64", "int_amdgcn_global_atomic_fmin", f64>;
 defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MAX_F64", "int_amdgcn_global_atomic_fmax", f64>;
 defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>;
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7799,19 +7799,8 @@
     DAG.setNodeMemRefs(NewNode, {MemRef});
     return SDValue(NewNode, 0);
   }
-  case Intrinsic::amdgcn_global_atomic_fadd:
-    if (!Op.getValue(0).use_empty() && !Subtarget->hasGFX90AInsts()) {
-      DiagnosticInfoUnsupported
-        NoFpRet(DAG.getMachineFunction().getFunction(),
-                "return versions of fp atomics not supported",
-                DL.getDebugLoc(), DS_Error);
-      DAG.getContext()->diagnose(NoFpRet);
-      return SDValue();
-    }
-    LLVM_FALLTHROUGH;
   case Intrinsic::amdgcn_global_atomic_fmin:
   case Intrinsic::amdgcn_global_atomic_fmax:
-  case Intrinsic::amdgcn_flat_atomic_fadd:
   case Intrinsic::amdgcn_flat_atomic_fmin:
   case Intrinsic::amdgcn_flat_atomic_fmax: {
     MemSDNode *M = cast(Op);
@@ -7822,13 +7811,6 @@
     };
     unsigned Opcode = 0;
     switch (IntrID) {
-    case Intrinsic::amdgcn_global_atomic_fadd:
-    case Intrinsic::amdgcn_flat_atomic_fadd: {
-      EVT VT = Op.getOperand(3).getValueType();
-      return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT,
-                           DAG.getVTList(VT, MVT::Other), Ops,
-                           M->getMemOperand());
-    }
     case Intrinsic::amdgcn_global_atomic_fmin:
     case Intrinsic::amdgcn_flat_atomic_fmin: {
       Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN;
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll
@@ -4,7 +4,7 @@
 declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* nocapture, float)
 declare <2 x half> @llvm.amdgcn.global.atomic.fadd.f32.p1v2f16.v2f16(<2 x half> addrspace(1)* nocapture, <2 x half>)
 
-; GFX908: error: {{.*}} return versions of fp atomics not supported
+; GFX908: LLVM ERROR: cannot select: %4:vgpr_32(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.global.atomic.fadd), %0:vgpr(p1), %1:vgpr(s32) :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) (in function: global_atomic_fadd_f32_rtn)
 
 ; GFX90A-LABEL: {{^}}global_atomic_fadd_f32_rtn:
 ; GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off glc
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll
@@ -63,11 +63,10 @@
 ; GFX908: ; %bb.0:
 ; GFX908-NEXT: s_load_dword s2, s[4:5], 0x8
 ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX908-NEXT: v_mov_b32_e32 v1, 0
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, s2
-; GFX908-NEXT: v_mov_b32_e32 v0, s0
-; GFX908-NEXT: v_mov_b32_e32 v1, s1
-; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2048
+; GFX908-NEXT: v_mov_b32_e32 v0, s2
+; GFX908-NEXT: global_atomic_add_f32 v1, v0, s[0:1] offset:2048
 ; GFX908-NEXT: s_endpgm
 ;
 ; GFX90A-LABEL: global_atomic_fadd_f32_off_ss:
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd-with-ret.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd-with-ret.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd-with-ret.ll
@@ -1,11 +1,10 @@
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s
-; RUN: not llc -global-isel < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908
+; RUN: not --crash llc -global-isel < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908
 
 declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg)
 declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg)
 
-; GFX908: error: {{.*}} return versions of fp atomics not supported
-; GFX908: error: {{.*}} return versions of fp atomics not supported
+; GFX908: LLVM ERROR: cannot select: %24:vgpr_32(s32) = G_AMDGPU_BUFFER_ATOMIC_FADD %28:vgpr, %14:sgpr(<4 x s32>), %29:vgpr(s32), %30:vgpr, %27:sgpr, 0, 0, 0 :: (volatile dereferenceable load store (s32), align 1, addrspace 4) (in function: buffer_atomic_add_f32_rtn)
 
 ; GFX90A-LABEL: {{^}}buffer_atomic_add_f32_rtn:
 ; GFX90A: buffer_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9:]+}}], s{{[0-9]+}} offen glc
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd-with-ret.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd-with-ret.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd-with-ret.ll
@@ -1,8 +1,7 @@
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s
-; RUN: not llc -global-isel < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908
+; RUN: not --crash llc -global-isel < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908
 
-; GFX908: error: {{.*}} return versions of fp atomics not supported
-; GFX908: error: {{.*}} return versions of fp atomics not supported
+; GFX908: LLVM ERROR: cannot select: %29:vgpr_32(s32) = G_AMDGPU_BUFFER_ATOMIC_FADD %40:vgpr, %15:sgpr(<4 x s32>), %41:vgpr(s32), %42:vgpr, %33:sgpr, 0, 0, -1 :: (volatile dereferenceable load store (s32), align 1, addrspace 4) (in function: buffer_atomic_add_f32_rtn)
 
 declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg)
 declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg)
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.rtn_no-rtn.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.rtn_no-rtn.ll
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.rtn_no-rtn.ll
@@ -1,8 +1,9 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
 
 ; no-rtn
 
+; GFX11-LABEL: {{^}}name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset
 ; GFX11: BUFFER_ATOMIC_ADD_F32_OFFEN
 define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
   %voffset.add = add i32 %voffset, 4095
@@ -10,30 +11,35 @@
   ret void
 }
 
+; GFX11-LABEL: {{^}}name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc
 ; GFX11: BUFFER_ATOMIC_ADD_F32_OFFEN
 define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
   %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2)
   ret void
 }
 
+; GFX11-LABEL: {{^}}name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_4095__sgpr_soffset
 ; GFX11: BUFFER_ATOMIC_ADD_F32_OFFSET
 define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
   %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0)
   ret void
 }
 
+; GFX11-LABEL: {{^}}name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__4095_voffset__sgpr_soffset
 ; GFX11: BUFFER_ATOMIC_ADD_F32_IDXEN
 define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__4095_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
   %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 4095, i32 %soffset, i32 0)
   ret void
 }
 
+; GFX11-LABEL: {{^}}name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset_slc
 ; GFX11: BUFFER_ATOMIC_ADD_F32_IDXEN
 define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
   %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 2)
   ret void
 }
 
+; GFX11-LABEL: {{^}}name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset
 ; GFX11: BUFFER_ATOMIC_ADD_F32_BOTHEN
 define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
   %voffset.add = add i32 %voffset, 4095
@@ -41,6 +47,7 @@
   ret void
 }
 
+; GFX11-LABEL: {{^}}name: xstruct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc
 ; GFX11: BUFFER_ATOMIC_ADD_F32_BOTHEN
 define amdgpu_ps void @xstruct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
   %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
@@ -50,45 +57,52 @@
   ret void
 
 ; rtn
 
-; GFX11: BUFFER_ATOMIC_ADD_F32_OFFEN
+; GFX11-LABEL: {{^}}name: raw_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset
+; GFX11: BUFFER_ATOMIC_ADD_F32_OFFEN_RTN
 define amdgpu_ps float @raw_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
   %voffset.add = add i32 %voffset, 4095
   %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
   ret float %ret
 }
 
-; GFX11: BUFFER_ATOMIC_ADD_F32_OFFEN
+; GFX11-LABEL: {{^}}name: raw_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc
+; GFX11: BUFFER_ATOMIC_ADD_F32_OFFEN_RTN
 define amdgpu_ps float @raw_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
   %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2)
   ret float %ret
 }
 
-; GFX11: BUFFER_ATOMIC_ADD_F32_OFFSET
+; GFX11-LABEL: {{^}}name: raw_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_4095__sgpr_soffset
+; GFX11: BUFFER_ATOMIC_ADD_F32_OFFSET_RTN
 define amdgpu_ps float @raw_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
   %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0)
   ret float %ret
 }
 
-; GFX11: BUFFER_ATOMIC_ADD_F32_IDXEN
+; GFX11-LABEL: {{^}}name: struct_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__4095_voffset__sgpr_soffset
+; GFX11: BUFFER_ATOMIC_ADD_F32_IDXEN_RTN
 define amdgpu_ps float @struct_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__4095_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
   %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 4095, i32 %soffset, i32 0)
   ret float %ret
 }
 
-; GFX11: BUFFER_ATOMIC_ADD_F32_IDXEN
+; GFX11-LABEL: {{^}}name: struct_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset_slc
+; GFX11: BUFFER_ATOMIC_ADD_F32_IDXEN_RTN
 define amdgpu_ps float @struct_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
   %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 2)
   ret float %ret
 }
 
-; GFX11: BUFFER_ATOMIC_ADD_F32_BOTHEN
+; GFX11-LABEL: {{^}}name: struct_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset
+; GFX11: BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN
 define amdgpu_ps float @struct_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
   %voffset.add = add i32 %voffset, 4095
   %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
   ret float %ret
 }
 
-; GFX11: BUFFER_ATOMIC_ADD_F32_BOTHEN
+; GFX11-LABEL: {{^}}name: xstruct_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc
+; GFX11: BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN
 define amdgpu_ps float @xstruct_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
   %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
   ret float %ret