Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -139,9 +139,6 @@ bool selectG_EXTRACT_VECTOR_ELT(MachineInstr &I) const; bool selectG_INSERT_VECTOR_ELT(MachineInstr &I) const; bool selectG_SHUFFLE_VECTOR(MachineInstr &I) const; - bool selectAMDGPU_BUFFER_ATOMIC_FADD(MachineInstr &I) const; - bool selectGlobalAtomicFadd(MachineInstr &I, MachineOperand &AddrOp, - MachineOperand &DataOp) const; bool selectBufferLoadLds(MachineInstr &MI) const; bool selectGlobalLoadLds(MachineInstr &MI) const; bool selectBVHIntrinsic(MachineInstr &I) const; Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1825,8 +1825,6 @@ return selectDSAppendConsume(I, false); case Intrinsic::amdgcn_s_barrier: return selectSBarrier(I); - case Intrinsic::amdgcn_global_atomic_fadd: - return selectGlobalAtomicFadd(I, I.getOperand(2), I.getOperand(3)); case Intrinsic::amdgcn_raw_buffer_load_lds: case Intrinsic::amdgcn_struct_buffer_load_lds: return selectBufferLoadLds(I); @@ -2442,13 +2440,6 @@ bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW( MachineInstr &I) const { - if (I.getOpcode() == TargetOpcode::G_ATOMICRMW_FADD) { - const LLT PtrTy = MRI->getType(I.getOperand(1).getReg()); - unsigned AS = PtrTy.getAddressSpace(); - if (AS == AMDGPUAS::GLOBAL_ADDRESS) - return selectGlobalAtomicFadd(I, I.getOperand(1), I.getOperand(2)); - } - initM0(I); return selectImpl(I, *CoverageInfo); } @@ -3015,133 +3006,6 @@ return true; } -bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD( - MachineInstr &MI) const { - const Register DefReg = MI.getOperand(0).getReg(); - LLT DefTy = MRI->getType(DefReg); - if (AMDGPU::hasAtomicFaddRtnForTy(STI, DefTy)) - return selectImpl(MI, *CoverageInfo); - - MachineBasicBlock *MBB = MI.getParent(); - const DebugLoc &DL = MI.getDebugLoc(); - - if (!MRI->use_nodbg_empty(DefReg)) { - Function &F = MBB->getParent()->getFunction(); - DiagnosticInfoUnsupported - NoFpRet(F, "return versions of fp atomics not supported", - MI.getDebugLoc(), DS_Error); - F.getContext().diagnose(NoFpRet); - return false; - } - - // FIXME: This is only needed because tablegen requires number of dst operands - // in match and replace pattern to be the same. Otherwise patterns can be - // exported from SDag path. - MachineOperand &VDataIn = MI.getOperand(1); - MachineOperand &VIndex = MI.getOperand(3); - MachineOperand &VOffset = MI.getOperand(4); - MachineOperand &SOffset = MI.getOperand(5); - int16_t Offset = MI.getOperand(6).getImm(); - - bool HasVOffset = !isOperandImmEqual(VOffset, 0, *MRI); - bool HasVIndex = !isOperandImmEqual(VIndex, 0, *MRI); - - unsigned Opcode; - if (HasVOffset) { - Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN - : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN; - } else { - Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN - : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET; - } - - if (MRI->getType(VDataIn.getReg()).isVector()) { - switch (Opcode) { - case AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN: - Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN; - break; - case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN: - Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFEN; - break; - case AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN: - Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_IDXEN; - break; - case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET: - Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFSET; - break; - } - } - - auto I = BuildMI(*MBB, MI, DL, TII.get(Opcode)); - I.add(VDataIn); - - if (Opcode == AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN || - Opcode == AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN) { - Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class()); - BuildMI(*MBB, &*I, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg) - .addReg(VIndex.getReg()) - .addImm(AMDGPU::sub0) - .addReg(VOffset.getReg()) - .addImm(AMDGPU::sub1); - - I.addReg(IdxReg); - } else if (HasVIndex) { - I.add(VIndex); - } else if (HasVOffset) { - I.add(VOffset); - } - - I.add(MI.getOperand(2)); // rsrc - I.add(SOffset); - I.addImm(Offset); - I.addImm(MI.getOperand(7).getImm()); // cpol - I.cloneMemRefs(MI); - - MI.eraseFromParent(); - - return true; -} - -bool AMDGPUInstructionSelector::selectGlobalAtomicFadd( - MachineInstr &MI, MachineOperand &AddrOp, MachineOperand &DataOp) const { - - if (STI.hasGFX90AInsts()) { - // gfx90a adds return versions of the global atomic fadd instructions so no - // special handling is required. - return selectImpl(MI, *CoverageInfo); - } - - MachineBasicBlock *MBB = MI.getParent(); - const DebugLoc &DL = MI.getDebugLoc(); - - if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) { - Function &F = MBB->getParent()->getFunction(); - DiagnosticInfoUnsupported - NoFpRet(F, "return versions of fp atomics not supported", - MI.getDebugLoc(), DS_Error); - F.getContext().diagnose(NoFpRet); - return false; - } - - // FIXME: This is only needed because tablegen requires number of dst operands - // in match and replace pattern to be the same. Otherwise patterns can be - // exported from SDag path. - auto Addr = selectFlatOffsetImpl(AddrOp, SIInstrFlags::FlatGlobal); - - Register Data = DataOp.getReg(); - const unsigned Opc = MRI->getType(Data).isVector() ? - AMDGPU::GLOBAL_ATOMIC_PK_ADD_F16 : AMDGPU::GLOBAL_ATOMIC_ADD_F32; - auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc)) - .addReg(Addr.first) - .addReg(Data) - .addImm(Addr.second) - .addImm(0) // cpol - .cloneMemRefs(MI); - - MI.eraseFromParent(); - return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); -} - bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const { unsigned Opc; unsigned Size = MI.getOperand(3).getImm(); @@ -3553,8 +3417,6 @@ } case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: return selectBVHIntrinsic(I); - case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: - return selectAMDGPU_BUFFER_ATOMIC_FADD(I); case AMDGPU::G_SBFX: case AMDGPU::G_UBFX: return selectG_SBFX_UBFX(I); Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -5769,24 +5769,9 @@ case Intrinsic::amdgcn_struct_buffer_atomic_fmin: case Intrinsic::amdgcn_raw_buffer_atomic_fmax: case Intrinsic::amdgcn_struct_buffer_atomic_fmax: - return legalizeBufferAtomic(MI, B, IntrID); case Intrinsic::amdgcn_raw_buffer_atomic_fadd: - case Intrinsic::amdgcn_struct_buffer_atomic_fadd: { - Register DstReg = MI.getOperand(0).getReg(); - if (!MRI.use_empty(DstReg) && - !AMDGPU::hasAtomicFaddRtnForTy(ST, MRI.getType(DstReg))) { - Function &F = B.getMF().getFunction(); - DiagnosticInfoUnsupported NoFpRet( - F, "return versions of fp atomics not supported", B.getDebugLoc(), - DS_Error); - F.getContext().diagnose(NoFpRet); - B.buildUndef(DstReg); - MI.eraseFromParent(); - return true; - } - + case Intrinsic::amdgcn_struct_buffer_atomic_fadd: return legalizeBufferAtomic(MI, B, IntrID); - } case Intrinsic::amdgcn_atomic_inc: return legalizeAtomicIncDec(MI, B, true); case Intrinsic::amdgcn_atomic_dec: Index: llvm/lib/Target/AMDGPU/BUFInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/BUFInstructions.td +++ llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1151,12 +1151,12 @@ let OtherPredicates = [HasAtomicFaddRtnInsts] in defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_RTN< - "buffer_atomic_add_f32", VGPR_32, f32, atomic_load_fadd_global_32 + "buffer_atomic_add_f32", VGPR_32, f32, null_frag >; let OtherPredicates = [isGFX90APlus] in defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN < - "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_load_fadd_v2f16_global_32 + "buffer_atomic_pk_add_f16", VGPR_32, v2f16, null_frag >; //===----------------------------------------------------------------------===// @@ -1584,12 +1584,8 @@ class NoUseBufferAtomic : PatFrag < (ops node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5, node:$src6, node:$src7), - (vt (Op $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7)), - [{ return SDValue(N, 0).use_empty(); }]> { - - let GISelPredicateCode = [{ - return MRI.use_nodbg_empty(MI.getOperand(0).getReg()); - }]; + (vt (Op $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7))> { + let HasNoUse = true; } multiclass BufferAtomicPatterns_NO_RTN(opcode # _OFFSET) getVregSrcForVT.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), $cachepolicy) + (as_i16timm $offset), timm:$cachepolicy) >; def : GCNPat< @@ -1607,7 +1603,7 @@ 0, i32:$soffset, timm:$offset, timm:$cachepolicy, timm), (!cast(opcode # _IDXEN) getVregSrcForVT.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), $cachepolicy) + (as_i16timm $offset), timm:$cachepolicy) >; def : GCNPat< @@ -1615,7 +1611,7 @@ i32:$voffset, i32:$soffset, timm:$offset, timm:$cachepolicy, 0), (!cast(opcode # _OFFEN) getVregSrcForVT.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), $cachepolicy) + (as_i16timm $offset), timm:$cachepolicy) >; def : GCNPat< @@ -1625,24 +1621,21 @@ (!cast(opcode # _BOTHEN) getVregSrcForVT.ret:$vdata_in, (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), - SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), $cachepolicy) + SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), timm:$cachepolicy) >; } let SubtargetPredicate = HasAtomicFaddNoRtnInsts in -defm : BufferAtomicPatterns_NO_RTN; +defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["noret"]>; let SubtargetPredicate = HasAtomicPkFaddNoRtnInsts in -defm : BufferAtomicPatterns_NO_RTN; +defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["noret"]>; let SubtargetPredicate = HasAtomicFaddRtnInsts in - defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["ret"]>; let SubtargetPredicate = isGFX90APlus in { - defm : BufferAtomicIntrPat<"int_amdgcn_global_atomic_fadd", f64, "BUFFER_ATOMIC_ADD_F64">; - defm : BufferAtomicIntrPat<"int_amdgcn_global_atomic_fmin", f64, "BUFFER_ATOMIC_MIN_F64">; - defm : BufferAtomicIntrPat<"int_amdgcn_global_atomic_fmax", f64, "BUFFER_ATOMIC_MAX_F64">; - defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16">; + defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["ret"]>; defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f64, "BUFFER_ATOMIC_ADD_F64">; defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, "BUFFER_ATOMIC_MIN_F64">; Index: llvm/lib/Target/AMDGPU/FLATInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/FLATInstructions.td +++ llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1026,6 +1026,12 @@ (!cast(inst) VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset)>; } +class FlatSignedAtomicPatBase : GCNPat < + (vt (node (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)), + (inst VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset) +>; + multiclass FlatSignedAtomicPat { @@ -1033,12 +1039,10 @@ defvar noRtnNode = !cast(node # "_noret" # !if(isIntr, "", "_" # vt.Size)); let AddedComplexity = complexity in - def : GCNPat <(vt (rtnNode (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)), - (!cast(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset)>; + def : FlatSignedAtomicPatBase(inst#"_RTN"), rtnNode, vt, data_vt>; let AddedComplexity = !add(complexity, 1) in - def : GCNPat <(vt (noRtnNode (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)), - (!cast(inst) VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset)>; + def : FlatSignedAtomicPatBase(inst), noRtnNode, vt, data_vt>; } multiclass FlatSignedAtomicIntrPat ; } -class FlatSignedAtomicPatNoRtn : GCNPat < - (node (GlobalOffset i64:$vaddr, i16:$offset), vt:$data), - (inst VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset) ->; - -class FlatSignedAtomicPatRtn : GCNPat < - (vt (node (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)), - (inst VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset) ->; - class ScratchLoadSignedPat : GCNPat < (vt (node (ScratchOffset (i32 VGPR_32:$vaddr), i16:$offset))), (inst $vaddr, $offset) @@ -1251,45 +1244,46 @@ } } -multiclass GlobalFLATAtomicPatsRtn { - def : FlatSignedAtomicPatRtn (nortn_inst_name#"_RTN"), node, vt, data_vt> { - let AddedComplexity = 10; - } - - def : GlobalAtomicSaddrPat(nortn_inst_name#"_SADDR_RTN"), node, vt, data_vt> { - let AddedComplexity = 11; - } -} - -multiclass GlobalFLATAtomicPats { - defvar rtnNode = !cast(node # !if(isIntr, "", "_" # vt.Size)); +multiclass GlobalFLATAtomicPatsNoRtn { defvar noRtnNode = !cast(node # "_noret" # !if(isIntr, "", "_" # vt.Size)); - defm : FlatSignedAtomicPat ; + let AddedComplexity = 11 in + def : FlatSignedAtomicPatBase(inst), noRtnNode, vt, data_vt>; let AddedComplexity = 13 in def : GlobalAtomicSaddrPat(inst#"_SADDR"), noRtnNode, vt, data_vt>; +} + +multiclass GlobalFLATAtomicPatsRtn { + defvar rtnNode = !cast(node # !if(isIntr, "", "_" # vt.Size)); + + let AddedComplexity = 10 in + def : FlatSignedAtomicPatBase(inst#"_RTN"), rtnNode, vt, data_vt>; let AddedComplexity = 12 in def : GlobalAtomicSaddrPat(inst#"_SADDR_RTN"), rtnNode, vt, data_vt>; } +multiclass GlobalFLATAtomicPats : + GlobalFLATAtomicPatsNoRtn, + GlobalFLATAtomicPatsRtn; + multiclass GlobalFLATAtomicIntrPats { defm : GlobalFLATAtomicPats; } -multiclass GlobalFLATNoRtnAtomicPats { - def : FlatSignedAtomicPatNoRtn { - let AddedComplexity = 10; - } +multiclass GlobalFLATAtomicIntrPatsNoRet { + defm : GlobalFLATAtomicPatsNoRtn; +} - def : GlobalAtomicNoRtnSaddrPat(!cast(inst)#"_SADDR"), node, vt> { - let AddedComplexity = 11; - } +multiclass GlobalFLATAtomicIntrPatsRet { + defm : GlobalFLATAtomicPatsRtn; } multiclass ScratchFLATLoadPats { @@ -1425,7 +1419,7 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP", "atomic_swap_global", i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP", "AMDGPUatomic_cmp_swap_global", i32, v2i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR", "atomic_load_xor_global", i32>; -defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", int_amdgcn_global_atomic_csub, i32>; +defm : GlobalFLATAtomicIntrPatsRet <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_X2", "atomic_load_add_global", i64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB_X2", "atomic_load_sub_global", i64>; @@ -1452,20 +1446,26 @@ defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX_X2", "int_amdgcn_global_atomic_fmax", f64>; } -let OtherPredicates = [HasAtomicFaddNoRtnInsts] in -defm : GlobalFLATNoRtnAtomicPats ; -let OtherPredicates = [HasAtomicPkFaddNoRtnInsts] in -defm : GlobalFLATNoRtnAtomicPats ; +let OtherPredicates = [HasAtomicFaddNoRtnInsts] in { +defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>; +defm : GlobalFLATAtomicIntrPatsNoRet <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", f32>; +} + +let OtherPredicates = [HasAtomicPkFaddNoRtnInsts] in { +defm : GlobalFLATAtomicIntrPatsNoRet <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", v2f16>; +} + +let OtherPredicates = [HasAtomicFaddRtnInsts] in { +defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>; +defm : GlobalFLATAtomicIntrPatsRet <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", f32>; +} let OtherPredicates = [isGFX90APlus] in { -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_v2f16_global", v2f16>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", "atomic_load_fmin_global", f64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", "atomic_load_fmax_global", f64>; -defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", f32>; defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_global_atomic_fadd", f64>; -defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", v2f16>; +defm : GlobalFLATAtomicIntrPatsRet <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", v2f16>; defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MIN_F64", "int_amdgcn_global_atomic_fmin", f64>; defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MAX_F64", "int_amdgcn_global_atomic_fmax", f64>; defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>; @@ -1476,10 +1476,12 @@ defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_MAX_F64", "int_amdgcn_flat_atomic_fmax", f64>; } +let OtherPredicates = [isGFX940GFX11Plus] in { +defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>; +defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", f32>; +} + let OtherPredicates = [isGFX940Plus] in { -defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>; -defm : FlatSignedAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_v2f16_flat", v2f16>; -defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", f32>; defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", v2f16>; defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_BF16", "int_amdgcn_flat_atomic_fadd_v2bf16", v2i16>; defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "int_amdgcn_global_atomic_fadd_v2bf16", v2i16>; Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7520,14 +7520,6 @@ Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR; break; case Intrinsic::amdgcn_buffer_atomic_fadd: - if (!Op.getValue(0).use_empty() && !hasAtomicFaddRtnForTy(Op)) { - DiagnosticInfoUnsupported - NoFpRet(DAG.getMachineFunction().getFunction(), - "return versions of fp atomics not supported", - DL.getDebugLoc(), DS_Error); - DAG.getContext()->diagnose(NoFpRet); - return SDValue(); - } Opcode = AMDGPUISD::BUFFER_ATOMIC_FADD; break; default: @@ -7798,19 +7790,13 @@ DAG.setNodeMemRefs(NewNode, {MemRef}); return SDValue(NewNode, 0); } - case Intrinsic::amdgcn_global_atomic_fadd: - if (!Op.getValue(0).use_empty() && !Subtarget->hasGFX90AInsts()) { - DiagnosticInfoUnsupported - NoFpRet(DAG.getMachineFunction().getFunction(), - "return versions of fp atomics not supported", - DL.getDebugLoc(), DS_Error); - DAG.getContext()->diagnose(NoFpRet); - return SDValue(); - } - [[fallthrough]]; + case Intrinsic::amdgcn_global_atomic_fadd: { + if (!Subtarget->hasAtomicFaddNoRtnInsts()) + return makeV_ILLEGAL(Op, DAG); + return SDValue(); + } case Intrinsic::amdgcn_global_atomic_fmin: case Intrinsic::amdgcn_global_atomic_fmax: - case Intrinsic::amdgcn_flat_atomic_fadd: case Intrinsic::amdgcn_flat_atomic_fmin: case Intrinsic::amdgcn_flat_atomic_fmax: { MemSDNode *M = cast(Op); @@ -7821,16 +7807,6 @@ }; unsigned Opcode = 0; switch (IntrID) { - case Intrinsic::amdgcn_global_atomic_fadd: - if (!Subtarget->hasAtomicFaddNoRtnInsts()) - return makeV_ILLEGAL(Op, DAG); - [[fallthrough]]; - case Intrinsic::amdgcn_flat_atomic_fadd: { - EVT VT = Op.getOperand(3).getValueType(); - return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT, - DAG.getVTList(VT, MVT::Other), Ops, - M->getMemOperand()); - } case Intrinsic::amdgcn_global_atomic_fmin: case Intrinsic::amdgcn_flat_atomic_fmin: { Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN; Index: llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd-f32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd-f32.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd-f32.ll @@ -1,8 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=MI300 %s -; RUN: not --crash llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select < %s 2>&1 | FileCheck -check-prefix=GFX11 %s - -; GFX11: LLVM ERROR: cannot select: %4:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.flat.atomic.fadd), %0:vgpr(p0), %1:vgpr(s32) :: (volatile dereferenceable load store (s32) on %ir.ptr) (in function: flat_atomic_fadd_f32_no_rtn_intrinsic) +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX11 %s define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(float* %ptr, float %data) { ; MI300-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic @@ -15,6 +13,16 @@ ; MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; MI300-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) ; MI300-NEXT: S_ENDPGM 0 + ; GFX11-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic + ; GFX11: bb.1 (%ir-block.0): + ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) + ; GFX11-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1f32.f32(float* %ptr, float %data) ret void } @@ -31,6 +39,17 @@ ; MI300-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) ; MI300-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] ; MI300-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX11-LABEL: name: flat_atomic_fadd_f32_rtn_intrinsic + ; GFX11: bb.1 (%ir-block.0): + ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) + ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] + ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1f32.f32(float* %ptr, float %data) ret float %ret } @@ -46,6 +65,16 @@ ; MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; MI300-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) ; MI300-NEXT: S_ENDPGM 0 + ; GFX11-LABEL: name: flat_atomic_fadd_f32_no_rtn_atomicrmw + ; GFX11: bb.1 (%ir-block.0): + ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) + ; GFX11-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd float* %ptr, float %data syncscope("wavefront") monotonic ret void } @@ -62,6 +91,17 @@ ; MI300-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) ; MI300-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] ; MI300-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX11-LABEL: name: flat_atomic_fadd_f32_rtn_atomicrmw + ; GFX11: bb.1 (%ir-block.0): + ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) + ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] + ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = atomicrmw fadd float* %ptr, float %data syncscope("wavefront") monotonic ret float %ret } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd-f32-no-rtn.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd-f32-no-rtn.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd-f32-no-rtn.ll @@ -38,8 +38,8 @@ ; MI100_GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; MI100_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; MI100_GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; MI100_GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; MI100_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32 [[COPY3]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; MI100_GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; MI100_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) ; MI100_GFX11-NEXT: S_ENDPGM 0 ; MI200_MI300-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_intrinsic ; MI200_MI300: bb.1 (%ir-block.0): @@ -90,8 +90,8 @@ ; MI100_GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; MI100_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; MI100_GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; MI100_GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; MI100_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32 [[COPY3]], [[COPY2]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; MI100_GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; MI100_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) ; MI100_GFX11-NEXT: S_ENDPGM 0 ; MI200_MI300-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw ; MI200_MI300: bb.1 (%ir-block.0): Index: llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd-f32-rtn.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd-f32-rtn.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd-f32-rtn.ll @@ -1,9 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=MI200_MI300 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=MI200_MI300 %s -; RUN: not --crash llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select < %s 2>&1 | FileCheck -check-prefix=GFX11 %s - -; GFX11: error: {{.*}} return versions of fp atomics not supported +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX11 %s define amdgpu_ps float @global_atomic_fadd_f32_rtn_intrinsic(float addrspace(1)* %ptr, float %data) { ; MI200_MI300-LABEL: name: global_atomic_fadd_f32_rtn_intrinsic @@ -17,6 +15,17 @@ ; MI200_MI300-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) ; MI200_MI300-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] ; MI200_MI300-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX11-LABEL: name: global_atomic_fadd_f32_rtn_intrinsic + ; GFX11: bb.1 (%ir-block.0): + ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] + ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data) ret float %ret } @@ -34,6 +43,18 @@ ; MI200_MI300-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) ; MI200_MI300-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] ; MI200_MI300-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_rtn_intrinsic + ; GFX11: bb.1 (%ir-block.0): + ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] + ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* inreg %ptr, float %data) ret float %ret } @@ -50,6 +71,17 @@ ; MI200_MI300-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) ; MI200_MI300-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] ; MI200_MI300-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX11-LABEL: name: global_atomic_fadd_f32_rtn_atomicrmw + ; GFX11: bb.1 (%ir-block.0): + ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] + ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = atomicrmw fadd float addrspace(1)* %ptr, float %data syncscope("wavefront") monotonic ret float %ret } @@ -67,6 +99,18 @@ ; MI200_MI300-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) ; MI200_MI300-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] ; MI200_MI300-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_rtn_atomicrmw + ; GFX11: bb.1 (%ir-block.0): + ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] + ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = atomicrmw fadd float addrspace(1)* %ptr, float %data syncscope("wavefront") monotonic ret float %ret } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd-v2f16-no-rtn.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd-v2f16-no-rtn.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd-v2f16-no-rtn.ll @@ -37,8 +37,8 @@ ; MI100-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; MI100-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; MI100-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; MI100-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; MI100-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 [[COPY3]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) + ; MI100-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; MI100-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) ; MI100-NEXT: S_ENDPGM 0 ; MI200_MI300-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic ; MI200_MI300: bb.1 (%ir-block.0): Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll @@ -63,11 +63,10 @@ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX908-NEXT: v_mov_b32_e32 v1, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, s2 -; GFX908-NEXT: v_mov_b32_e32 v0, s0 -; GFX908-NEXT: v_mov_b32_e32 v1, s1 -; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2048 +; GFX908-NEXT: v_mov_b32_e32 v0, s2 +; GFX908-NEXT: global_atomic_add_f32 v1, v0, s[0:1] offset:2048 ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: global_atomic_fadd_f32_off_ss: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd-with-ret.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd-with-ret.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd-with-ret.ll @@ -1,11 +1,10 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s -; RUN: not llc -global-isel < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908 +; RUN: not --crash llc -global-isel < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908 declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg) declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg) -; GFX908: error: {{.*}} return versions of fp atomics not supported -; GFX908: error: {{.*}} return versions of fp atomics not supported +; GFX908: LLVM ERROR: cannot select: %24:vgpr_32(s32) = G_AMDGPU_BUFFER_ATOMIC_FADD %28:vgpr, %14:sgpr(<4 x s32>), %29:vgpr(s32), %30:vgpr, %27:sgpr, 0, 0, 0 :: (volatile dereferenceable load store (s32), align 1, addrspace 4) (in function: buffer_atomic_add_f32_rtn) ; GFX90A-LABEL: {{^}}buffer_atomic_add_f32_rtn: ; GFX90A: buffer_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9:]+}}], s{{[0-9]+}} offen glc Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd-with-ret.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd-with-ret.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd-with-ret.ll @@ -1,8 +1,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s -; RUN: not llc -global-isel < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908 +; RUN: not --crash llc -global-isel < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908 -; GFX908: error: {{.*}} return versions of fp atomics not supported -; GFX908: error: {{.*}} return versions of fp atomics not supported +; GFX908: LLVM ERROR: cannot select: %29:vgpr_32(s32) = G_AMDGPU_BUFFER_ATOMIC_FADD %40:vgpr, %15:sgpr(<4 x s32>), %41:vgpr(s32), %42:vgpr, %33:sgpr, 0, 0, -1 :: (volatile dereferenceable load store (s32), align 1, addrspace 4) (in function: buffer_atomic_add_f32_rtn) declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) Index: llvm/test/CodeGen/AMDGPU/flat-atomic-fadd-f32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/flat-atomic-fadd-f32.ll +++ llvm/test/CodeGen/AMDGPU/flat-atomic-fadd-f32.ll @@ -1,8 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=MI300 %s -; RUN: not --crash llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel < %s 2>&1 | FileCheck -check-prefix=GFX11 %s - -; GFX11: LLVM ERROR: Cannot select: t11: f32,ch = AtomicLoadFAdd<(volatile dereferenceable load store (s32) on %ir.ptr)> t0, t7, t6 +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX11 %s define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(float* %ptr, float %data) { ; MI300-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic @@ -16,6 +14,17 @@ ; MI300-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] ; MI300-NEXT: FLAT_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) ; MI300-NEXT: S_ENDPGM 0 + ; GFX11-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic + ; GFX11: bb.0 (%ir-block.0): + ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) + ; GFX11-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1f32.f32(float* %ptr, float %data) ret void } @@ -33,6 +42,18 @@ ; MI300-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) ; MI300-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] ; MI300-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX11-LABEL: name: flat_atomic_fadd_f32_rtn_intrinsic + ; GFX11: bb.0 (%ir-block.0): + ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) + ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] + ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1f32.f32(float* %ptr, float %data) ret float %ret } @@ -49,6 +70,17 @@ ; MI300-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] ; MI300-NEXT: FLAT_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) ; MI300-NEXT: S_ENDPGM 0 + ; GFX11-LABEL: name: flat_atomic_fadd_f32_no_rtn_atomicrmw + ; GFX11: bb.0 (%ir-block.0): + ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) + ; GFX11-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd float* %ptr, float %data syncscope("wavefront") monotonic ret void } @@ -66,6 +98,18 @@ ; MI300-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) ; MI300-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] ; MI300-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX11-LABEL: name: flat_atomic_fadd_f32_rtn_atomicrmw + ; GFX11: bb.0 (%ir-block.0): + ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) + ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] + ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = atomicrmw fadd float* %ptr, float %data syncscope("wavefront") monotonic ret float %ret } Index: llvm/test/CodeGen/AMDGPU/global-atomic-fadd-f32-rtn.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global-atomic-fadd-f32-rtn.ll +++ llvm/test/CodeGen/AMDGPU/global-atomic-fadd-f32-rtn.ll @@ -1,9 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=MI200_MI300 %s ; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=MI200_MI300 %s -; RUN: not --crash llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel < %s 2>&1 | FileCheck -check-prefix=GFX11 %s - -; GFX11: error: {{.*}} return versions of fp atomics not supported +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX11 %s define amdgpu_ps float @global_atomic_fadd_f32_rtn_intrinsic(float addrspace(1)* %ptr, float %data) { ; MI200_MI300-LABEL: name: global_atomic_fadd_f32_rtn_intrinsic @@ -18,6 +16,18 @@ ; MI200_MI300-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) ; MI200_MI300-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] ; MI200_MI300-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX11-LABEL: name: global_atomic_fadd_f32_rtn_intrinsic + ; GFX11: bb.0 (%ir-block.0): + ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] + ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data) ret float %ret } @@ -35,6 +45,18 @@ ; MI200_MI300-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) ; MI200_MI300-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] ; MI200_MI300-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_rtn_intrinsic + ; GFX11: bb.0 (%ir-block.0): + ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] + ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* inreg %ptr, float %data) ret float %ret } @@ -52,6 +74,18 @@ ; MI200_MI300-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) ; MI200_MI300-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] ; MI200_MI300-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX11-LABEL: name: global_atomic_fadd_f32_rtn_atomicrmw + ; GFX11: bb.0 (%ir-block.0): + ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] + ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = atomicrmw fadd float addrspace(1)* %ptr, float %data syncscope("wavefront") monotonic ret float %ret } @@ -69,6 +103,18 @@ ; MI200_MI300-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) ; MI200_MI300-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] ; MI200_MI300-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_rtn_atomicrmw + ; GFX11: bb.0 (%ir-block.0): + ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] + ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = atomicrmw fadd float addrspace(1)* %ptr, float %data syncscope("wavefront") monotonic ret float %ret } Index: llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll +++ llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll @@ -3,9 +3,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s -; RUN: not --crash llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s 2>&1 | FileCheck -enable-var-scope -check-prefixes=GFX11 %s - -; GFX11: LLVM ERROR: Cannot select: t15: f32,ch = AtomicLoadFAdd<(load store seq_cst (s32) on %ir.ptr.load, addrspace 1)> t0, t21, ConstantFP:f32<4.000000e+00> +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s define amdgpu_kernel void @global_atomic_fadd_ret_f32(float addrspace(1)* %ptr) #0 { ; GFX900-LABEL: global_atomic_fadd_ret_f32: @@ -115,6 +113,20 @@ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_fadd_ret_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_add_f32 v0, v0, v1, s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst store float %result, float addrspace(1)* undef ret void @@ -212,6 +224,20 @@ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_fadd_ret_f32_ieee: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_add_f32 v0, v0, v1, s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst store float %result, float addrspace(1)* undef ret void @@ -289,6 +315,18 @@ ; GFX10-NEXT: s_cbranch_execnz .LBB2_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_fadd_noret_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst ret void } @@ -365,6 +403,18 @@ ; GFX10-NEXT: s_cbranch_execnz .LBB3_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_fadd_noret_f32_ieee: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst ret void } @@ -461,6 +511,20 @@ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_fadd_ret_f32_agent: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_add_f32 v0, v0, v1, s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst store float %result, float addrspace(1)* undef ret void @@ -574,6 +638,20 @@ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_fadd_ret_f32_system: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_add_f32 v0, v0, v1, s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("one-as") seq_cst store float %result, float addrspace(1)* undef ret void @@ -605,6 +683,32 @@ ; GCN-NEXT: s_or_b64 exec, exec, s[2:3] ; GCN-NEXT: global_store_dword v[0:1], v1, off ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_fadd_ret_f32_wrong_subtarget: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b64 s[2:3], 0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_wbinvl1_vol +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX11-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX11-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11-NEXT: global_store_dword v[0:1], v1, off +; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst store float %result, float addrspace(1)* undef ret void @@ -621,6 +725,17 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_wbinvl1_vol ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_fadd_noret_f32_wrong_subtarget: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_wbinvl1_vol +; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst ret void } @@ -710,6 +825,18 @@ ; GFX10-NEXT: s_cbranch_execnz .LBB8_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_fadd_noret_f32_safe: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst ret void } @@ -784,6 +911,17 @@ ; GFX10-NEXT: s_cbranch_execnz .LBB9_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: infer_as_before_atomic: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1.0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %load = load float*, float* addrspace(4)* %arg %v = atomicrmw fadd float* %load, float 1.0 syncscope("agent-one-as") monotonic, align 4 ret void Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.gfx90a.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.gfx90a.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.gfx90a.ll @@ -6,7 +6,7 @@ declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)*, float) declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)*, <2 x half>) -; GFX908: error: {{.*}} return versions of fp atomics not supported +; GFX908: LLVM ERROR: Cannot select: t23: f32,ch = BUFFER_ATOMIC_FADD<(volatile dereferenceable load store (s32), align 1, addrspace 4)> t0, t10, t13, t12, Constant:i32<0>, Constant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<-1> ; GFX90A-LABEL: {{^}}buffer_atomic_add_f32: ; GFX90A: buffer_atomic_add_f32 v0, v1, s[0:3], 0 idxen glc