Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -140,9 +140,6 @@ bool selectG_EXTRACT_VECTOR_ELT(MachineInstr &I) const; bool selectG_INSERT_VECTOR_ELT(MachineInstr &I) const; bool selectG_SHUFFLE_VECTOR(MachineInstr &I) const; - bool selectAMDGPU_BUFFER_ATOMIC_FADD(MachineInstr &I) const; - bool selectGlobalAtomicFadd(MachineInstr &I, MachineOperand &AddrOp, - MachineOperand &DataOp) const; bool selectBufferLoadLds(MachineInstr &MI) const; bool selectGlobalLoadLds(MachineInstr &MI) const; bool selectBVHIntrinsic(MachineInstr &I) const; Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1852,8 +1852,6 @@ return selectDSAppendConsume(I, false); case Intrinsic::amdgcn_s_barrier: return selectSBarrier(I); - case Intrinsic::amdgcn_global_atomic_fadd: - return selectGlobalAtomicFadd(I, I.getOperand(2), I.getOperand(3)); case Intrinsic::amdgcn_raw_buffer_load_lds: case Intrinsic::amdgcn_struct_buffer_load_lds: return selectBufferLoadLds(I); @@ -2471,13 +2469,6 @@ bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW( MachineInstr &I) const { - if (I.getOpcode() == TargetOpcode::G_ATOMICRMW_FADD) { - const LLT PtrTy = MRI->getType(I.getOperand(1).getReg()); - unsigned AS = PtrTy.getAddressSpace(); - if (AS == AMDGPUAS::GLOBAL_ADDRESS) - return selectGlobalAtomicFadd(I, I.getOperand(1), I.getOperand(2)); - } - initM0(I); return selectImpl(I, *CoverageInfo); } @@ -3043,133 +3034,6 @@ return true; } -bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD( - MachineInstr &MI) const { - const Register DefReg = MI.getOperand(0).getReg(); - LLT DefTy = MRI->getType(DefReg); - if (AMDGPU::hasAtomicFaddRtnForTy(STI, DefTy)) - return selectImpl(MI, *CoverageInfo); - - MachineBasicBlock *MBB = MI.getParent(); - const DebugLoc &DL = MI.getDebugLoc(); - - if (!MRI->use_nodbg_empty(DefReg)) { - Function &F = MBB->getParent()->getFunction(); - DiagnosticInfoUnsupported - NoFpRet(F, "return versions of fp atomics not supported", - MI.getDebugLoc(), DS_Error); - F.getContext().diagnose(NoFpRet); - return false; - } - - // FIXME: This is only needed because tablegen requires number of dst operands - // in match and replace pattern to be the same. Otherwise patterns can be - // exported from SDag path. - MachineOperand &VDataIn = MI.getOperand(1); - MachineOperand &VIndex = MI.getOperand(3); - MachineOperand &VOffset = MI.getOperand(4); - MachineOperand &SOffset = MI.getOperand(5); - int16_t Offset = MI.getOperand(6).getImm(); - - bool HasVOffset = !isOperandImmEqual(VOffset, 0, *MRI); - bool HasVIndex = !isOperandImmEqual(VIndex, 0, *MRI); - - unsigned Opcode; - if (HasVOffset) { - Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN - : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN; - } else { - Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN - : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET; - } - - if (MRI->getType(VDataIn.getReg()).isVector()) { - switch (Opcode) { - case AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN: - Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN; - break; - case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN: - Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFEN; - break; - case AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN: - Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_IDXEN; - break; - case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET: - Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFSET; - break; - } - } - - auto I = BuildMI(*MBB, MI, DL, TII.get(Opcode)); - I.add(VDataIn); - - if (Opcode == AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN || - Opcode == AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN) { - Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class()); - BuildMI(*MBB, &*I, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg) - .addReg(VIndex.getReg()) - .addImm(AMDGPU::sub0) - .addReg(VOffset.getReg()) - .addImm(AMDGPU::sub1); - - I.addReg(IdxReg); - } else if (HasVIndex) { - I.add(VIndex); - } else if (HasVOffset) { - I.add(VOffset); - } - - I.add(MI.getOperand(2)); // rsrc - I.add(SOffset); - I.addImm(Offset); - I.addImm(MI.getOperand(7).getImm()); // cpol - I.cloneMemRefs(MI); - - MI.eraseFromParent(); - - return true; -} - -bool AMDGPUInstructionSelector::selectGlobalAtomicFadd( - MachineInstr &MI, MachineOperand &AddrOp, MachineOperand &DataOp) const { - - if (STI.hasGFX90AInsts()) { - // gfx90a adds return versions of the global atomic fadd instructions so no - // special handling is required. - return selectImpl(MI, *CoverageInfo); - } - - MachineBasicBlock *MBB = MI.getParent(); - const DebugLoc &DL = MI.getDebugLoc(); - - if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) { - Function &F = MBB->getParent()->getFunction(); - DiagnosticInfoUnsupported - NoFpRet(F, "return versions of fp atomics not supported", - MI.getDebugLoc(), DS_Error); - F.getContext().diagnose(NoFpRet); - return false; - } - - // FIXME: This is only needed because tablegen requires number of dst operands - // in match and replace pattern to be the same. Otherwise patterns can be - // exported from SDag path. - auto Addr = selectFlatOffsetImpl(AddrOp, SIInstrFlags::FlatGlobal); - - Register Data = DataOp.getReg(); - const unsigned Opc = MRI->getType(Data).isVector() ? - AMDGPU::GLOBAL_ATOMIC_PK_ADD_F16 : AMDGPU::GLOBAL_ATOMIC_ADD_F32; - auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc)) - .addReg(Addr.first) - .addReg(Data) - .addImm(Addr.second) - .addImm(0) // cpol - .cloneMemRefs(MI); - - MI.eraseFromParent(); - return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); -} - bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const { unsigned Opc; unsigned Size = MI.getOperand(3).getImm(); @@ -3583,8 +3447,6 @@ } case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: return selectBVHIntrinsic(I); - case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: - return selectAMDGPU_BUFFER_ATOMIC_FADD(I); case AMDGPU::G_SBFX: case AMDGPU::G_UBFX: return selectG_SBFX_UBFX(I); Index: llvm/lib/Target/AMDGPU/AMDGPUInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -548,11 +548,44 @@ (!cast(NAME) node:$ptr, node:$data)>; } -defm int_amdgcn_flat_atomic_fadd : noret_op; +multiclass global_addr_space_atomic_op { + def "_noret_global_addrspace" : + PatFrag<(ops node:$ptr, node:$data), + (!cast(NAME) node:$ptr, node:$data)>{ + let HasNoUse = true; + let AddressSpaces = LoadAddress_global.AddrSpaces; + let IsAtomic = 1; + } + def "_global_addrspace" : + PatFrag<(ops node:$ptr, node:$data), + (!cast(NAME) node:$ptr, node:$data)>{ + let AddressSpaces = LoadAddress_global.AddrSpaces; + let IsAtomic = 1; + } +} + +multiclass flat_addr_space_atomic_op { + def "_noret_flat_addrspace" : + PatFrag<(ops node:$ptr, node:$data), + (!cast(NAME) node:$ptr, node:$data)>{ + let HasNoUse = true; + let AddressSpaces = LoadAddress_flat.AddrSpaces; + let IsAtomic = 1; + } + def "_flat_addrspace" : + PatFrag<(ops node:$ptr, node:$data), + (!cast(NAME) node:$ptr, node:$data)>{ + let AddressSpaces = LoadAddress_flat.AddrSpaces; + let IsAtomic = 1; + } +} + +defm int_amdgcn_flat_atomic_fadd : flat_addr_space_atomic_op; defm int_amdgcn_flat_atomic_fadd_v2bf16 : noret_op; defm int_amdgcn_flat_atomic_fmin : noret_op; defm int_amdgcn_flat_atomic_fmax : noret_op; -defm int_amdgcn_global_atomic_fadd : noret_op; +defm int_amdgcn_global_atomic_fadd : global_addr_space_atomic_op; +defm int_amdgcn_flat_atomic_fadd : global_addr_space_atomic_op; defm int_amdgcn_global_atomic_fadd_v2bf16 : noret_op; defm int_amdgcn_global_atomic_fmin : noret_op; defm int_amdgcn_global_atomic_fmax : noret_op; Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -5767,24 +5767,9 @@ case Intrinsic::amdgcn_struct_buffer_atomic_fmin: case Intrinsic::amdgcn_raw_buffer_atomic_fmax: case Intrinsic::amdgcn_struct_buffer_atomic_fmax: - return legalizeBufferAtomic(MI, B, IntrID); case Intrinsic::amdgcn_raw_buffer_atomic_fadd: - case Intrinsic::amdgcn_struct_buffer_atomic_fadd: { - Register DstReg = MI.getOperand(0).getReg(); - if (!MRI.use_empty(DstReg) && - !AMDGPU::hasAtomicFaddRtnForTy(ST, MRI.getType(DstReg))) { - Function &F = B.getMF().getFunction(); - DiagnosticInfoUnsupported NoFpRet( - F, "return versions of fp atomics not supported", B.getDebugLoc(), - DS_Error); - F.getContext().diagnose(NoFpRet); - B.buildUndef(DstReg); - MI.eraseFromParent(); - return true; - } - + case Intrinsic::amdgcn_struct_buffer_atomic_fadd: return legalizeBufferAtomic(MI, B, IntrID); - } case Intrinsic::amdgcn_atomic_inc: return legalizeAtomicIncDec(MI, B, true); case Intrinsic::amdgcn_atomic_dec: Index: llvm/lib/Target/AMDGPU/BUFInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/BUFInstructions.td +++ llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1151,12 +1151,12 @@ let OtherPredicates = [HasAtomicFaddRtnInsts] in defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_RTN< - "buffer_atomic_add_f32", VGPR_32, f32, atomic_load_fadd_global_32 + "buffer_atomic_add_f32", VGPR_32, f32, null_frag >; let OtherPredicates = [isGFX90APlus] in defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN < - "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_load_fadd_v2f16_global_32 + "buffer_atomic_pk_add_f16", VGPR_32, v2f16, null_frag >; //===----------------------------------------------------------------------===// @@ -1584,12 +1584,8 @@ class NoUseBufferAtomic : PatFrag < (ops node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5, node:$src6, node:$src7), - (vt (Op $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7)), - [{ return SDValue(N, 0).use_empty(); }]> { - - let GISelPredicateCode = [{ - return MRI.use_nodbg_empty(MI.getOperand(0).getReg()); - }]; + (vt (Op $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7))> { + let HasNoUse = true; } multiclass BufferAtomicPatterns_NO_RTN(opcode # _OFFSET) getVregSrcForVT.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), $cachepolicy) + (as_i16timm $offset), timm:$cachepolicy) >; def : GCNPat< @@ -1607,7 +1603,7 @@ 0, i32:$soffset, timm:$offset, timm:$cachepolicy, timm), (!cast(opcode # _IDXEN) getVregSrcForVT.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), $cachepolicy) + (as_i16timm $offset), timm:$cachepolicy) >; def : GCNPat< @@ -1615,7 +1611,7 @@ i32:$voffset, i32:$soffset, timm:$offset, timm:$cachepolicy, 0), (!cast(opcode # _OFFEN) getVregSrcForVT.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), $cachepolicy) + (as_i16timm $offset), timm:$cachepolicy) >; def : GCNPat< @@ -1625,24 +1621,21 @@ (!cast(opcode # _BOTHEN) getVregSrcForVT.ret:$vdata_in, (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), - SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), $cachepolicy) + SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), timm:$cachepolicy) >; } let SubtargetPredicate = HasAtomicFaddNoRtnInsts in -defm : BufferAtomicPatterns_NO_RTN; +defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["noret"]>; let SubtargetPredicate = HasAtomicPkFaddNoRtnInsts in -defm : BufferAtomicPatterns_NO_RTN; +defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["noret"]>; let SubtargetPredicate = HasAtomicFaddRtnInsts in - defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["ret"]>; let SubtargetPredicate = isGFX90APlus in { - defm : BufferAtomicIntrPat<"int_amdgcn_global_atomic_fadd", f64, "BUFFER_ATOMIC_ADD_F64">; - defm : BufferAtomicIntrPat<"int_amdgcn_global_atomic_fmin", f64, "BUFFER_ATOMIC_MIN_F64">; - defm : BufferAtomicIntrPat<"int_amdgcn_global_atomic_fmax", f64, "BUFFER_ATOMIC_MAX_F64">; - defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16">; + defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["ret"]>; defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f64, "BUFFER_ATOMIC_ADD_F64">; defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, "BUFFER_ATOMIC_MIN_F64">; Index: llvm/lib/Target/AMDGPU/FLATInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/FLATInstructions.td +++ llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1008,11 +1008,6 @@ (inst $vaddr, getVregSrcForVT.ret:$data, $offset) >; -class FlatAtomicPatNoRtn : GCNPat < - (node (FlatOffset i64:$vaddr, i16:$offset), vt:$data), - (inst VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset) ->; - multiclass FlatAtomicPat { defvar rtnNode = !cast(node#"_"#vt.Size); @@ -1026,6 +1021,12 @@ (!cast(inst) VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset)>; } +class FlatSignedAtomicPatBase : GCNPat < + (vt (node (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)), + (inst VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset) +>; + multiclass FlatSignedAtomicPat { @@ -1033,12 +1034,10 @@ defvar noRtnNode = !cast(node # "_noret" # !if(isIntr, "", "_" # vt.Size)); let AddedComplexity = complexity in - def : GCNPat <(vt (rtnNode (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)), - (!cast(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset)>; + def : FlatSignedAtomicPatBase(inst#"_RTN"), rtnNode, vt, data_vt>; let AddedComplexity = !add(complexity, 1) in - def : GCNPat <(vt (noRtnNode (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)), - (!cast(inst) VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset)>; + def : FlatSignedAtomicPatBase(inst), noRtnNode, vt, data_vt>; } multiclass FlatSignedAtomicIntrPat ; } -class FlatSignedAtomicPatNoRtn : GCNPat < - (node (GlobalOffset i64:$vaddr, i16:$offset), vt:$data), - (inst VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset) ->; +multiclass FlatSignedAtomicPatWithAddrSpace { + defvar noRtnNode = !cast(intr # "_noret_" # addrSpaceSuffix); + defvar rtnNode = !cast(intr # "_" # addrSpaceSuffix); -class FlatSignedAtomicPatRtn : GCNPat < - (vt (node (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)), - (inst VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset) ->; + let AddedComplexity = 1 in + def : FlatSignedAtomicPatBase(inst), noRtnNode, vt, data_vt>; + def : FlatSignedAtomicPatBase(inst#"_RTN"), rtnNode, vt, data_vt>; +} class ScratchLoadSignedPat : GCNPat < (vt (node (ScratchOffset (i32 VGPR_32:$vaddr), i16:$offset))), @@ -1251,47 +1249,57 @@ } } -multiclass GlobalFLATAtomicPatsRtn { - def : FlatSignedAtomicPatRtn (nortn_inst_name#"_RTN"), node, vt, data_vt> { - let AddedComplexity = 10; - } +multiclass GlobalFLATAtomicPatsNoRtnBase { + let AddedComplexity = 11 in + def : FlatSignedAtomicPatBase(inst), !cast(node), vt, data_vt>; - def : GlobalAtomicSaddrPat(nortn_inst_name#"_SADDR_RTN"), node, vt, data_vt> { - let AddedComplexity = 11; - } + let AddedComplexity = 13 in + def : GlobalAtomicSaddrPat(inst#"_SADDR"), !cast(node), vt, data_vt>; } -multiclass GlobalFLATAtomicPats { - defvar rtnNode = !cast(node # !if(isIntr, "", "_" # vt.Size)); - defvar noRtnNode = !cast(node # "_noret" # !if(isIntr, "", "_" # vt.Size)); - - defm : FlatSignedAtomicPat ; +multiclass GlobalFLATAtomicPatsRtnBase { + defvar rtnNode = !if(isPatFrags, !cast(node), !cast(node)); - let AddedComplexity = 13 in - def : GlobalAtomicSaddrPat(inst#"_SADDR"), noRtnNode, vt, data_vt>; + let AddedComplexity = 10 in + def : FlatSignedAtomicPatBase(inst#"_RTN"), rtnNode, vt, data_vt>; let AddedComplexity = 12 in def : GlobalAtomicSaddrPat(inst#"_SADDR_RTN"), rtnNode, vt, data_vt>; } +multiclass GlobalFLATAtomicPatsNoRtn : + GlobalFLATAtomicPatsNoRtnBase; + +multiclass GlobalFLATAtomicPatsRtn : + GlobalFLATAtomicPatsRtnBase; + +multiclass GlobalFLATAtomicPats : + GlobalFLATAtomicPatsNoRtn, + GlobalFLATAtomicPatsRtn; + +multiclass GlobalFLATAtomicPatsNoRtnWithAddrSpace : + GlobalFLATAtomicPatsNoRtnBase; + +multiclass GlobalFLATAtomicPatsRtnWithAddrSpace : + GlobalFLATAtomicPatsRtnBase; + +multiclass GlobalFLATAtomicPatsWithAddrSpace : + GlobalFLATAtomicPatsNoRtnWithAddrSpace, + GlobalFLATAtomicPatsRtnWithAddrSpace; + multiclass GlobalFLATAtomicIntrPats { defm : GlobalFLATAtomicPats; } -multiclass GlobalFLATNoRtnAtomicPats { - def : FlatSignedAtomicPatNoRtn { - let AddedComplexity = 10; - } - - def : GlobalAtomicNoRtnSaddrPat(!cast(inst)#"_SADDR"), node, vt> { - let AddedComplexity = 11; - } -} - multiclass ScratchFLATLoadPats { def : ScratchLoadSignedPat { let AddedComplexity = 25; @@ -1425,7 +1433,7 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP", "atomic_swap_global", i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP", "AMDGPUatomic_cmp_swap_global", i32, v2i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR", "atomic_load_xor_global", i32>; -defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", int_amdgcn_global_atomic_csub, i32>; +defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_X2", "atomic_load_add_global", i64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB_X2", "atomic_load_sub_global", i64>; @@ -1452,35 +1460,48 @@ defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX_X2", "int_amdgcn_global_atomic_fmax", f64>; } -let OtherPredicates = [HasAtomicFaddNoRtnInsts] in -defm : GlobalFLATNoRtnAtomicPats ; -let OtherPredicates = [HasAtomicPkFaddNoRtnInsts] in -defm : GlobalFLATNoRtnAtomicPats ; +let OtherPredicates = [HasAtomicFaddNoRtnInsts] in { +defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>; +defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f32>; +defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", "global_addrspace", f32>; +} + +let OtherPredicates = [HasAtomicPkFaddNoRtnInsts] in { +defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>; +defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>; +} + +let OtherPredicates = [HasAtomicFaddRtnInsts] in { +defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>; +defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f32>; +defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", "global_addrspace", f32>; +} let OtherPredicates = [isGFX90APlus] in { -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_v2f16_global", v2f16>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", "atomic_load_fmin_global", f64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", "atomic_load_fmax_global", f64>; -defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", f32>; -defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_global_atomic_fadd", f64>; -defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", v2f16>; +defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f64>; +defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_global_atomic_fadd", "global_addrspace", f64>; +defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>; +defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>; defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MIN_F64", "int_amdgcn_global_atomic_fmin", f64>; defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MAX_F64", "int_amdgcn_global_atomic_fmax", f64>; defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>; defm : FlatSignedAtomicPat <"FLAT_ATOMIC_MIN_F64", "atomic_load_fmin_flat", f64>; defm : FlatSignedAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_flat", f64>; -defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", f64>; +defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", "flat_addrspace", f64>; defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_MIN_F64", "int_amdgcn_flat_atomic_fmin", f64>; defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_MAX_F64", "int_amdgcn_flat_atomic_fmax", f64>; } +let OtherPredicates = [isGFX940GFX11Plus] in { +defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>; +defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", "flat_addrspace", f32>; +} + let OtherPredicates = [isGFX940Plus] in { -defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>; -defm : FlatSignedAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_v2f16_flat", v2f16>; -defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", f32>; -defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", v2f16>; +defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "flat_addrspace", v2f16>; defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_BF16", "int_amdgcn_flat_atomic_fadd_v2bf16", v2i16>; defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "int_amdgcn_global_atomic_fadd_v2bf16", v2i16>; } Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7530,14 +7530,6 @@ Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR; break; case Intrinsic::amdgcn_buffer_atomic_fadd: - if (!Op.getValue(0).use_empty() && !hasAtomicFaddRtnForTy(Op)) { - DiagnosticInfoUnsupported - NoFpRet(DAG.getMachineFunction().getFunction(), - "return versions of fp atomics not supported", - DL.getDebugLoc(), DS_Error); - DAG.getContext()->diagnose(NoFpRet); - return SDValue(); - } Opcode = AMDGPUISD::BUFFER_ATOMIC_FADD; break; default: @@ -7808,19 +7800,13 @@ DAG.setNodeMemRefs(NewNode, {MemRef}); return SDValue(NewNode, 0); } - case Intrinsic::amdgcn_global_atomic_fadd: - if (!Op.getValue(0).use_empty() && !Subtarget->hasGFX90AInsts()) { - DiagnosticInfoUnsupported - NoFpRet(DAG.getMachineFunction().getFunction(), - "return versions of fp atomics not supported", - DL.getDebugLoc(), DS_Error); - DAG.getContext()->diagnose(NoFpRet); - return SDValue(); - } - [[fallthrough]]; + case Intrinsic::amdgcn_global_atomic_fadd: { + if (!Subtarget->hasAtomicFaddNoRtnInsts()) + return makeV_ILLEGAL(Op, DAG); + return SDValue(); + } case Intrinsic::amdgcn_global_atomic_fmin: case Intrinsic::amdgcn_global_atomic_fmax: - case Intrinsic::amdgcn_flat_atomic_fadd: case Intrinsic::amdgcn_flat_atomic_fmin: case Intrinsic::amdgcn_flat_atomic_fmax: { MemSDNode *M = cast(Op); @@ -7831,16 +7817,6 @@ }; unsigned Opcode = 0; switch (IntrID) { - case Intrinsic::amdgcn_global_atomic_fadd: - if (!Subtarget->hasAtomicFaddNoRtnInsts()) - return makeV_ILLEGAL(Op, DAG); - [[fallthrough]]; - case Intrinsic::amdgcn_flat_atomic_fadd: { - EVT VT = Op.getOperand(3).getValueType(); - return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT, - DAG.getVTList(VT, MVT::Other), Ops, - M->getMemOperand()); - } case Intrinsic::amdgcn_global_atomic_fmin: case Intrinsic::amdgcn_flat_atomic_fmin: { Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN; Index: llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll @@ -1,8 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=MI300 %s -; RUN: not --crash llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select < %s 2>&1 | FileCheck -check-prefix=GFX11 %s - -; GFX11: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.flat.atomic.fadd) +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX11 %s define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(float* %ptr, float %data) { ; MI300-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic @@ -15,6 +13,16 @@ ; MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; MI300-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) ; MI300-NEXT: S_ENDPGM 0 + ; GFX11-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic + ; GFX11: bb.1 (%ir-block.0): + ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) + ; GFX11-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1f32.f32(float* %ptr, float %data) ret void } @@ -31,6 +39,17 @@ ; MI300-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) ; MI300-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] ; MI300-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX11-LABEL: name: flat_atomic_fadd_f32_rtn_intrinsic + ; GFX11: bb.1 (%ir-block.0): + ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) + ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] + ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1f32.f32(float* %ptr, float %data) ret float %ret } @@ -46,6 +65,16 @@ ; MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; MI300-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) ; MI300-NEXT: S_ENDPGM 0 + ; GFX11-LABEL: name: flat_atomic_fadd_f32_no_rtn_atomicrmw + ; GFX11: bb.1 (%ir-block.0): + ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) + ; GFX11-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd float* %ptr, float %data syncscope("wavefront") monotonic ret void } @@ -62,6 +91,17 @@ ; MI300-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) ; MI300-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] ; MI300-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX11-LABEL: name: flat_atomic_fadd_f32_rtn_atomicrmw + ; GFX11: bb.1 (%ir-block.0): + ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) + ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] + ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = atomicrmw fadd float* %ptr, float %data syncscope("wavefront") monotonic ret float %ret } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll @@ -1,100 +1,161 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: not --crash llc -global-isel -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -stop-after=instruction-select < %s 2>&1 | FileCheck -check-prefix=MI100_MI200_GFX11 %s -; RUN: not --crash llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s 2>&1 | FileCheck -check-prefix=MI100_MI200_GFX11 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=MI300 %s -; RUN: not --crash llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select < %s 2>&1 | FileCheck -check-prefix=MI100_MI200_GFX11 %s - -; MI100_MI200_GFX11: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.flat.atomic.fadd) +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=MI100_GFX11 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=MI200_MI300 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=MI200_MI300 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=MI100_GFX11 %s define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_intrinsic(float addrspace(1)* %ptr, float %data) { - ; MI300-LABEL: name: global_atomic_fadd_f32_no_rtn_intrinsic - ; MI300: bb.1 (%ir-block.0): - ; MI300-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; MI300-NEXT: {{ $}} - ; MI300-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; MI300-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; MI300-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; MI300-NEXT: S_ENDPGM 0 + ; MI100_GFX11-LABEL: name: global_atomic_fadd_f32_no_rtn_intrinsic + ; MI100_GFX11: bb.1 (%ir-block.0): + ; MI100_GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; MI100_GFX11-NEXT: {{ $}} + ; MI100_GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI100_GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; MI100_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI100_GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; MI100_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; MI100_GFX11-NEXT: S_ENDPGM 0 + ; MI200_MI300-LABEL: name: global_atomic_fadd_f32_no_rtn_intrinsic + ; MI200_MI300: bb.1 (%ir-block.0): + ; MI200_MI300-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; MI200_MI300-NEXT: {{ $}} + ; MI200_MI300-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI200_MI300-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; MI200_MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI200_MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; MI200_MI300-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; MI200_MI300-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data) ret void } define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_intrinsic(float addrspace(1)* inreg %ptr, float %data) { - ; MI300-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_intrinsic - ; MI300: bb.1 (%ir-block.0): - ; MI300-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; MI300-NEXT: {{ $}} - ; MI300-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; MI300-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; MI300-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; MI300-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; MI300-NEXT: S_ENDPGM 0 + ; MI100_GFX11-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_intrinsic + ; MI100_GFX11: bb.1 (%ir-block.0): + ; MI100_GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; MI100_GFX11-NEXT: {{ $}} + ; MI100_GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; MI100_GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; MI100_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI100_GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI100_GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; MI100_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; MI100_GFX11-NEXT: S_ENDPGM 0 + ; MI200_MI300-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_intrinsic + ; MI200_MI300: bb.1 (%ir-block.0): + ; MI200_MI300-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; MI200_MI300-NEXT: {{ $}} + ; MI200_MI300-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; MI200_MI300-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; MI200_MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI200_MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI200_MI300-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; MI200_MI300-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; MI200_MI300-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* inreg %ptr, float %data) ret void } define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_flat_intrinsic(float addrspace(1)* %ptr, float %data) { - ; MI300-LABEL: name: global_atomic_fadd_f32_no_rtn_flat_intrinsic - ; MI300: bb.1 (%ir-block.0): - ; MI300-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; MI300-NEXT: {{ $}} - ; MI300-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; MI300-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; MI300-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; MI300-NEXT: S_ENDPGM 0 + ; MI100_GFX11-LABEL: name: global_atomic_fadd_f32_no_rtn_flat_intrinsic + ; MI100_GFX11: bb.1 (%ir-block.0): + ; MI100_GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; MI100_GFX11-NEXT: {{ $}} + ; MI100_GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI100_GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; MI100_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI100_GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; MI100_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; MI100_GFX11-NEXT: S_ENDPGM 0 + ; MI200_MI300-LABEL: name: global_atomic_fadd_f32_no_rtn_flat_intrinsic + ; MI200_MI300: bb.1 (%ir-block.0): + ; MI200_MI300-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; MI200_MI300-NEXT: {{ $}} + ; MI200_MI300-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI200_MI300-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; MI200_MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI200_MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; MI200_MI300-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; MI200_MI300-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data) ret void } define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic(float addrspace(1)* inreg %ptr, float %data) { - ; MI300-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic - ; MI300: bb.1 (%ir-block.0): - ; MI300-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; MI300-NEXT: {{ $}} - ; MI300-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; MI300-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; MI300-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; MI300-NEXT: FLAT_ATOMIC_ADD_F32 [[COPY3]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; MI300-NEXT: S_ENDPGM 0 + ; MI100_GFX11-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic + ; MI100_GFX11: bb.1 (%ir-block.0): + ; MI100_GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; MI100_GFX11-NEXT: {{ $}} + ; MI100_GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; MI100_GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; MI100_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI100_GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI100_GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; MI100_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; MI100_GFX11-NEXT: S_ENDPGM 0 + ; MI200_MI300-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic + ; MI200_MI300: bb.1 (%ir-block.0): + ; MI200_MI300-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; MI200_MI300-NEXT: {{ $}} + ; MI200_MI300-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; MI200_MI300-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; MI200_MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI200_MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI200_MI300-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; MI200_MI300-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; MI200_MI300-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1f32.f32(float addrspace(1)* inreg %ptr, float %data) ret void } define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_atomicrmw(float addrspace(1)* %ptr, float %data) #0 { - ; MI300-LABEL: name: global_atomic_fadd_f32_no_rtn_atomicrmw - ; MI300: bb.1 (%ir-block.0): - ; MI300-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; MI300-NEXT: {{ $}} - ; MI300-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; MI300-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; MI300-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) - ; MI300-NEXT: S_ENDPGM 0 + ; MI100_GFX11-LABEL: name: global_atomic_fadd_f32_no_rtn_atomicrmw + ; MI100_GFX11: bb.1 (%ir-block.0): + ; MI100_GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; MI100_GFX11-NEXT: {{ $}} + ; MI100_GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI100_GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; MI100_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI100_GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; MI100_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; MI100_GFX11-NEXT: S_ENDPGM 0 + ; MI200_MI300-LABEL: name: global_atomic_fadd_f32_no_rtn_atomicrmw + ; MI200_MI300: bb.1 (%ir-block.0): + ; MI200_MI300-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; MI200_MI300-NEXT: {{ $}} + ; MI200_MI300-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI200_MI300-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; MI200_MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI200_MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; MI200_MI300-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; MI200_MI300-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd float addrspace(1)* %ptr, float %data syncscope("wavefront") monotonic ret void } define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(float addrspace(1)* inreg %ptr, float %data) #0 { - ; MI300-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw - ; MI300: bb.1 (%ir-block.0): - ; MI300-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; MI300-NEXT: {{ $}} - ; MI300-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; MI300-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; MI300-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; MI300-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) - ; MI300-NEXT: S_ENDPGM 0 + ; MI100_GFX11-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw + ; MI100_GFX11: bb.1 (%ir-block.0): + ; MI100_GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; MI100_GFX11-NEXT: {{ $}} + ; MI100_GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; MI100_GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; MI100_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI100_GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI100_GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; MI100_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; MI100_GFX11-NEXT: S_ENDPGM 0 + ; MI200_MI300-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw + ; MI200_MI300: bb.1 (%ir-block.0): + ; MI200_MI300-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; MI200_MI300-NEXT: {{ $}} + ; MI200_MI300-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; MI200_MI300-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; MI200_MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI200_MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI200_MI300-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; MI200_MI300-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; MI200_MI300-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd float addrspace(1)* %ptr, float %data syncscope("wavefront") monotonic ret void } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll @@ -1,106 +1,172 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: not --crash llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s 2>&1 | FileCheck -check-prefix=MI200 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=MI300 %s -; RUN: not --crash llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select < %s 2>&1 | FileCheck -check-prefix=GFX11 %s - -; MI200: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.flat.atomic.fadd) -; GFX11: error: {{.*}} return versions of fp atomics not supported +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=MI200_MI300 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=MI200_MI300 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX11 %s define amdgpu_ps float @global_atomic_fadd_f32_rtn_intrinsic(float addrspace(1)* %ptr, float %data) { - ; MI300-LABEL: name: global_atomic_fadd_f32_rtn_intrinsic - ; MI300: bb.1 (%ir-block.0): - ; MI300-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; MI300-NEXT: {{ $}} - ; MI300-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; MI300-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; MI300-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; MI300-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] - ; MI300-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; MI200_MI300-LABEL: name: global_atomic_fadd_f32_rtn_intrinsic + ; MI200_MI300: bb.1 (%ir-block.0): + ; MI200_MI300-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; MI200_MI300-NEXT: {{ $}} + ; MI200_MI300-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI200_MI300-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; MI200_MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI200_MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; MI200_MI300-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; MI200_MI300-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] + ; MI200_MI300-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX11-LABEL: name: global_atomic_fadd_f32_rtn_intrinsic + ; GFX11: bb.1 (%ir-block.0): + ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] + ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data) ret float %ret } define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_intrinsic(float addrspace(1)* inreg %ptr, float %data) { - ; MI300-LABEL: name: global_atomic_fadd_f32_saddr_rtn_intrinsic - ; MI300: bb.1 (%ir-block.0): - ; MI300-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; MI300-NEXT: {{ $}} - ; MI300-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; MI300-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; MI300-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; MI300-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; MI300-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] - ; MI300-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; MI200_MI300-LABEL: name: global_atomic_fadd_f32_saddr_rtn_intrinsic + ; MI200_MI300: bb.1 (%ir-block.0): + ; MI200_MI300-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; MI200_MI300-NEXT: {{ $}} + ; MI200_MI300-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; MI200_MI300-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; MI200_MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI200_MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI200_MI300-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; MI200_MI300-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; MI200_MI300-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] + ; MI200_MI300-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_rtn_intrinsic + ; GFX11: bb.1 (%ir-block.0): + ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] + ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* inreg %ptr, float %data) ret float %ret } define amdgpu_ps float @global_atomic_fadd_f32_rtn_flat_intrinsic(float addrspace(1)* %ptr, float %data) { - ; MI300-LABEL: name: global_atomic_fadd_f32_rtn_flat_intrinsic - ; MI300: bb.1 (%ir-block.0): - ; MI300-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; MI300-NEXT: {{ $}} - ; MI300-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; MI300-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; MI300-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; MI300-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] - ; MI300-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; MI200_MI300-LABEL: name: global_atomic_fadd_f32_rtn_flat_intrinsic + ; MI200_MI300: bb.1 (%ir-block.0): + ; MI200_MI300-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; MI200_MI300-NEXT: {{ $}} + ; MI200_MI300-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI200_MI300-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; MI200_MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI200_MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; MI200_MI300-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; MI200_MI300-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] + ; MI200_MI300-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX11-LABEL: name: global_atomic_fadd_f32_rtn_flat_intrinsic + ; GFX11: bb.1 (%ir-block.0): + ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] + ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data) ret float %ret } define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_flat_intrinsic(float addrspace(1)* inreg %ptr, float %data) { - ; MI300-LABEL: name: global_atomic_fadd_f32_saddr_rtn_flat_intrinsic - ; MI300: bb.1 (%ir-block.0): - ; MI300-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; MI300-NEXT: {{ $}} - ; MI300-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; MI300-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; MI300-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; MI300-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[COPY3]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; MI300-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] - ; MI300-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; MI200_MI300-LABEL: name: global_atomic_fadd_f32_saddr_rtn_flat_intrinsic + ; MI200_MI300: bb.1 (%ir-block.0): + ; MI200_MI300-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; MI200_MI300-NEXT: {{ $}} + ; MI200_MI300-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; MI200_MI300-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; MI200_MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI200_MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI200_MI300-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; MI200_MI300-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; MI200_MI300-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] + ; MI200_MI300-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_rtn_flat_intrinsic + ; GFX11: bb.1 (%ir-block.0): + ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] + ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1f32.f32(float addrspace(1)* inreg %ptr, float %data) ret float %ret } define amdgpu_ps float @global_atomic_fadd_f32_rtn_atomicrmw(float addrspace(1)* %ptr, float %data) #0 { - ; MI300-LABEL: name: global_atomic_fadd_f32_rtn_atomicrmw - ; MI300: bb.1 (%ir-block.0): - ; MI300-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; MI300-NEXT: {{ $}} - ; MI300-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; MI300-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; MI300-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) - ; MI300-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] - ; MI300-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; MI200_MI300-LABEL: name: global_atomic_fadd_f32_rtn_atomicrmw + ; MI200_MI300: bb.1 (%ir-block.0): + ; MI200_MI300-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; MI200_MI300-NEXT: {{ $}} + ; MI200_MI300-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI200_MI300-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; MI200_MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI200_MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; MI200_MI300-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; MI200_MI300-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] + ; MI200_MI300-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX11-LABEL: name: global_atomic_fadd_f32_rtn_atomicrmw + ; GFX11: bb.1 (%ir-block.0): + ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] + ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = atomicrmw fadd float addrspace(1)* %ptr, float %data syncscope("wavefront") monotonic ret float %ret } define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(float addrspace(1)* inreg %ptr, float %data) #0 { - ; MI300-LABEL: name: global_atomic_fadd_f32_saddr_rtn_atomicrmw - ; MI300: bb.1 (%ir-block.0): - ; MI300-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; MI300-NEXT: {{ $}} - ; MI300-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; MI300-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; MI300-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; MI300-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) - ; MI300-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] - ; MI300-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; MI200_MI300-LABEL: name: global_atomic_fadd_f32_saddr_rtn_atomicrmw + ; MI200_MI300: bb.1 (%ir-block.0): + ; MI200_MI300-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; MI200_MI300-NEXT: {{ $}} + ; MI200_MI300-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; MI200_MI300-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; MI200_MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI200_MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI200_MI300-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; MI200_MI300-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; MI200_MI300-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] + ; MI200_MI300-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_rtn_atomicrmw + ; GFX11: bb.1 (%ir-block.0): + ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] + ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = atomicrmw fadd float addrspace(1)* %ptr, float %data syncscope("wavefront") monotonic ret float %ret } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll @@ -95,7 +95,7 @@ ; MI200_MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; MI200_MI300-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; MI200_MI300-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; MI200_MI300-NEXT: FLAT_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; MI200_MI300-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) ; MI200_MI300-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %ptr, double %data) ret void @@ -112,9 +112,9 @@ ; MI200_MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; MI200_MI300-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; MI200_MI300-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; MI200_MI300-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; MI200_MI300-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub0 - ; MI200_MI300-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub1 + ; MI200_MI300-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; MI200_MI300-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; MI200_MI300-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 ; MI200_MI300-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; MI200_MI300-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; MI200_MI300-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec @@ -135,8 +135,8 @@ ; MI200_MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; MI200_MI300-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; MI200_MI300-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; MI200_MI300-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; MI200_MI300-NEXT: FLAT_ATOMIC_ADD_F64 [[COPY4]], [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; MI200_MI300-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; MI200_MI300-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) ; MI200_MI300-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %ptr, double %data) ret void @@ -153,13 +153,13 @@ ; MI200_MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; MI200_MI300-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; MI200_MI300-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; MI200_MI300-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; MI200_MI300-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN [[COPY4]], [[REG_SEQUENCE1]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; MI200_MI300-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub0 - ; MI200_MI300-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub1 - ; MI200_MI300-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; MI200_MI300-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; MI200_MI300-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; MI200_MI300-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 + ; MI200_MI300-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 + ; MI200_MI300-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; MI200_MI300-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; MI200_MI300-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; MI200_MI300-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; MI200_MI300-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; MI200_MI300-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %ptr, double %data) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-no-rtn.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-no-rtn.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-no-rtn.ll @@ -1,68 +1,108 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: not --crash llc -global-isel -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -stop-after=instruction-select < %s 2>&1 | FileCheck -check-prefix=MI100_MI200 %s -; RUN: not --crash llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s 2>&1 | FileCheck -check-prefix=MI100_MI200 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=MI300 %s - -; MI100_MI200: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr(<2 x s16>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.flat.atomic.fadd) +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=MI100 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=MI200_MI300 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=MI200_MI300 %s define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_intrinsic(<2 x half> addrspace(1)* %ptr, <2 x half> %data) { - ; MI300-LABEL: name: global_atomic_fadd_v2f16_no_rtn_intrinsic - ; MI300: bb.1 (%ir-block.0): - ; MI300-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; MI300-NEXT: {{ $}} - ; MI300-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; MI300-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; MI300-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) - ; MI300-NEXT: S_ENDPGM 0 + ; MI100-LABEL: name: global_atomic_fadd_v2f16_no_rtn_intrinsic + ; MI100: bb.1 (%ir-block.0): + ; MI100-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; MI100-NEXT: {{ $}} + ; MI100-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI100-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; MI100-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI100-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; MI100-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) + ; MI100-NEXT: S_ENDPGM 0 + ; MI200_MI300-LABEL: name: global_atomic_fadd_v2f16_no_rtn_intrinsic + ; MI200_MI300: bb.1 (%ir-block.0): + ; MI200_MI300-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; MI200_MI300-NEXT: {{ $}} + ; MI200_MI300-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI200_MI300-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; MI200_MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI200_MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; MI200_MI300-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) + ; MI200_MI300-NEXT: S_ENDPGM 0 %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) ret void } define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic(<2 x half> addrspace(1)* inreg %ptr, <2 x half> %data) { - ; MI300-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic - ; MI300: bb.1 (%ir-block.0): - ; MI300-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; MI300-NEXT: {{ $}} - ; MI300-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; MI300-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; MI300-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; MI300-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) - ; MI300-NEXT: S_ENDPGM 0 + ; MI100-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic + ; MI100: bb.1 (%ir-block.0): + ; MI100-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; MI100-NEXT: {{ $}} + ; MI100-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; MI100-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; MI100-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI100-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI100-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; MI100-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) + ; MI100-NEXT: S_ENDPGM 0 + ; MI200_MI300-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic + ; MI200_MI300: bb.1 (%ir-block.0): + ; MI200_MI300-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; MI200_MI300-NEXT: {{ $}} + ; MI200_MI300-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; MI200_MI300-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; MI200_MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI200_MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI200_MI300-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; MI200_MI300-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) + ; MI200_MI300-NEXT: S_ENDPGM 0 %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) ret void } define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_flat_intrinsic(<2 x half> addrspace(1)* %ptr, <2 x half> %data) { - ; MI300-LABEL: name: global_atomic_fadd_v2f16_no_rtn_flat_intrinsic - ; MI300: bb.1 (%ir-block.0): - ; MI300-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; MI300-NEXT: {{ $}} - ; MI300-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; MI300-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; MI300-NEXT: FLAT_ATOMIC_PK_ADD_F16 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) - ; MI300-NEXT: S_ENDPGM 0 + ; MI100-LABEL: name: global_atomic_fadd_v2f16_no_rtn_flat_intrinsic + ; MI100: bb.1 (%ir-block.0): + ; MI100-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; MI100-NEXT: {{ $}} + ; MI100-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI100-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; MI100-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI100-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; MI100-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) + ; MI100-NEXT: S_ENDPGM 0 + ; MI200_MI300-LABEL: name: global_atomic_fadd_v2f16_no_rtn_flat_intrinsic + ; MI200_MI300: bb.1 (%ir-block.0): + ; MI200_MI300-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; MI200_MI300-NEXT: {{ $}} + ; MI200_MI300-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI200_MI300-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; MI200_MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI200_MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; MI200_MI300-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) + ; MI200_MI300-NEXT: S_ENDPGM 0 %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) ret void } define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic(<2 x half> addrspace(1)* inreg %ptr, <2 x half> %data) { - ; MI300-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic - ; MI300: bb.1 (%ir-block.0): - ; MI300-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; MI300-NEXT: {{ $}} - ; MI300-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; MI300-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; MI300-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; MI300-NEXT: FLAT_ATOMIC_PK_ADD_F16 [[COPY3]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) - ; MI300-NEXT: S_ENDPGM 0 + ; MI100-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic + ; MI100: bb.1 (%ir-block.0): + ; MI100-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; MI100-NEXT: {{ $}} + ; MI100-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; MI100-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; MI100-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI100-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI100-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; MI100-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) + ; MI100-NEXT: S_ENDPGM 0 + ; MI200_MI300-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic + ; MI200_MI300: bb.1 (%ir-block.0): + ; MI200_MI300-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; MI200_MI300-NEXT: {{ $}} + ; MI200_MI300-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; MI200_MI300-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; MI200_MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI200_MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI200_MI300-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; MI200_MI300-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) + ; MI200_MI300-NEXT: S_ENDPGM 0 %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) ret void } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-rtn.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-rtn.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-rtn.ll @@ -1,71 +1,69 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: not --crash llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s 2>&1 | FileCheck -check-prefix=MI200 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=MI300 %s - -; MI200: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(<2 x s16>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.flat.atomic.fadd) +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=MI200_MI300 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=MI200_MI300 %s define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn_intrinsic(<2 x half> addrspace(1)* %ptr, <2 x half> %data) { - ; MI300-LABEL: name: global_atomic_fadd_v2f16_rtn_intrinsic - ; MI300: bb.1 (%ir-block.0): - ; MI300-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; MI300-NEXT: {{ $}} - ; MI300-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; MI300-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; MI300-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) - ; MI300-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_RTN]] - ; MI300-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; MI200_MI300-LABEL: name: global_atomic_fadd_v2f16_rtn_intrinsic + ; MI200_MI300: bb.1 (%ir-block.0): + ; MI200_MI300-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; MI200_MI300-NEXT: {{ $}} + ; MI200_MI300-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI200_MI300-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; MI200_MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI200_MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; MI200_MI300-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) + ; MI200_MI300-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_RTN]] + ; MI200_MI300-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) ret <2 x half> %ret } define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn_intrinsic(<2 x half> addrspace(1)* inreg %ptr, <2 x half> %data) { - ; MI300-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn_intrinsic - ; MI300: bb.1 (%ir-block.0): - ; MI300-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; MI300-NEXT: {{ $}} - ; MI300-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; MI300-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; MI300-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; MI300-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) - ; MI300-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN]] - ; MI300-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; MI200_MI300-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn_intrinsic + ; MI200_MI300: bb.1 (%ir-block.0): + ; MI200_MI300-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; MI200_MI300-NEXT: {{ $}} + ; MI200_MI300-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; MI200_MI300-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; MI200_MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI200_MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI200_MI300-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; MI200_MI300-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) + ; MI200_MI300-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN]] + ; MI200_MI300-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) ret <2 x half> %ret } define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn_flat_intrinsic(<2 x half> addrspace(1)* %ptr, <2 x half> %data) { - ; MI300-LABEL: name: global_atomic_fadd_v2f16_rtn_flat_intrinsic - ; MI300: bb.1 (%ir-block.0): - ; MI300-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; MI300-NEXT: {{ $}} - ; MI300-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; MI300-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; MI300-NEXT: [[FLAT_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_PK_ADD_F16_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) - ; MI300-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_PK_ADD_F16_RTN]] - ; MI300-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; MI200_MI300-LABEL: name: global_atomic_fadd_v2f16_rtn_flat_intrinsic + ; MI200_MI300: bb.1 (%ir-block.0): + ; MI200_MI300-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; MI200_MI300-NEXT: {{ $}} + ; MI200_MI300-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI200_MI300-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; MI200_MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI200_MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; MI200_MI300-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) + ; MI200_MI300-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_RTN]] + ; MI200_MI300-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) ret <2 x half> %ret } define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn_flat_intrinsic(<2 x half> addrspace(1)* inreg %ptr, <2 x half> %data) { - ; MI300-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn_flat_intrinsic - ; MI300: bb.1 (%ir-block.0): - ; MI300-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; MI300-NEXT: {{ $}} - ; MI300-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; MI300-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; MI300-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; MI300-NEXT: [[FLAT_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_PK_ADD_F16_RTN [[COPY3]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) - ; MI300-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_PK_ADD_F16_RTN]] - ; MI300-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; MI200_MI300-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn_flat_intrinsic + ; MI200_MI300: bb.1 (%ir-block.0): + ; MI200_MI300-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; MI200_MI300-NEXT: {{ $}} + ; MI200_MI300-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; MI200_MI300-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; MI200_MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI200_MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI200_MI300-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; MI200_MI300-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) + ; MI200_MI300-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN]] + ; MI200_MI300-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) ret <2 x half> %ret } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll @@ -63,11 +63,10 @@ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX908-NEXT: v_mov_b32_e32 v1, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, s2 -; GFX908-NEXT: v_mov_b32_e32 v0, s0 -; GFX908-NEXT: v_mov_b32_e32 v1, s1 -; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2048 +; GFX908-NEXT: v_mov_b32_e32 v0, s2 +; GFX908-NEXT: global_atomic_add_f32 v1, v0, s[0:1] offset:2048 ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: global_atomic_fadd_f32_off_ss: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd-with-ret.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd-with-ret.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd-with-ret.ll @@ -1,11 +1,10 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s -; RUN: not llc -global-isel < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908 +; RUN: not --crash llc -global-isel < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908 declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg) declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg) -; GFX908: error: {{.*}} return versions of fp atomics not supported -; GFX908: error: {{.*}} return versions of fp atomics not supported +; GFX908: LLVM ERROR: cannot select: %24:vgpr_32(s32) = G_AMDGPU_BUFFER_ATOMIC_FADD %28:vgpr, %14:sgpr(<4 x s32>), %29:vgpr(s32), %30:vgpr, %27:sgpr, 0, 0, 0 :: (volatile dereferenceable load store (s32), align 1, addrspace 4) (in function: buffer_atomic_add_f32_rtn) ; GFX90A-LABEL: {{^}}buffer_atomic_add_f32_rtn: ; GFX90A: buffer_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9:]+}}], s{{[0-9]+}} offen glc Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd-with-ret.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd-with-ret.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd-with-ret.ll @@ -1,8 +1,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s -; RUN: not llc -global-isel < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908 +; RUN: not --crash llc -global-isel < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908 -; GFX908: error: {{.*}} return versions of fp atomics not supported -; GFX908: error: {{.*}} return versions of fp atomics not supported +; GFX908: LLVM ERROR: cannot select: %29:vgpr_32(s32) = G_AMDGPU_BUFFER_ATOMIC_FADD %40:vgpr, %15:sgpr(<4 x s32>), %41:vgpr(s32), %42:vgpr, %33:sgpr, 0, 0, -1 :: (volatile dereferenceable load store (s32), align 1, addrspace 4) (in function: buffer_atomic_add_f32_rtn) declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) Index: llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll +++ llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll @@ -1,8 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=MI300 %s -; RUN: not --crash llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel < %s 2>&1 | FileCheck -check-prefix=GFX11 %s - -; GFX11: LLVM ERROR: Cannot select: {{.+}}: f32,ch = AtomicLoadFAdd +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX11 %s define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(float* %ptr, float %data) { ; MI300-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic @@ -16,6 +14,17 @@ ; MI300-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] ; MI300-NEXT: FLAT_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) ; MI300-NEXT: S_ENDPGM 0 + ; GFX11-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic + ; GFX11: bb.0 (%ir-block.0): + ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) + ; GFX11-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1f32.f32(float* %ptr, float %data) ret void } @@ -33,6 +42,18 @@ ; MI300-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) ; MI300-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] ; MI300-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX11-LABEL: name: flat_atomic_fadd_f32_rtn_intrinsic + ; GFX11: bb.0 (%ir-block.0): + ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) + ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] + ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1f32.f32(float* %ptr, float %data) ret float %ret } @@ -49,6 +70,17 @@ ; MI300-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] ; MI300-NEXT: FLAT_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) ; MI300-NEXT: S_ENDPGM 0 + ; GFX11-LABEL: name: flat_atomic_fadd_f32_no_rtn_atomicrmw + ; GFX11: bb.0 (%ir-block.0): + ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) + ; GFX11-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd float* %ptr, float %data syncscope("wavefront") monotonic ret void } @@ -66,6 +98,18 @@ ; MI300-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) ; MI300-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] ; MI300-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX11-LABEL: name: flat_atomic_fadd_f32_rtn_atomicrmw + ; GFX11: bb.0 (%ir-block.0): + ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) + ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] + ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = atomicrmw fadd float* %ptr, float %data syncscope("wavefront") monotonic ret float %ret } Index: llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll +++ llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll @@ -1,9 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=MI200_MI300 %s ; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=MI200_MI300 %s -; RUN: not --crash llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel < %s 2>&1 | FileCheck -check-prefix=GFX11 %s - -; GFX11: error: {{.*}} return versions of fp atomics not supported +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX11 %s define amdgpu_ps float @global_atomic_fadd_f32_rtn_intrinsic(float addrspace(1)* %ptr, float %data) { ; MI200_MI300-LABEL: name: global_atomic_fadd_f32_rtn_intrinsic @@ -18,6 +16,18 @@ ; MI200_MI300-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) ; MI200_MI300-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] ; MI200_MI300-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX11-LABEL: name: global_atomic_fadd_f32_rtn_intrinsic + ; GFX11: bb.0 (%ir-block.0): + ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] + ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data) ret float %ret } @@ -35,6 +45,18 @@ ; MI200_MI300-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) ; MI200_MI300-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] ; MI200_MI300-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_rtn_intrinsic + ; GFX11: bb.0 (%ir-block.0): + ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] + ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* inreg %ptr, float %data) ret float %ret } @@ -52,6 +74,18 @@ ; MI200_MI300-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) ; MI200_MI300-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] ; MI200_MI300-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX11-LABEL: name: global_atomic_fadd_f32_rtn_flat_intrinsic + ; GFX11: bb.0 (%ir-block.0): + ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] + ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data) ret float %ret } @@ -69,6 +103,18 @@ ; MI200_MI300-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) ; MI200_MI300-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] ; MI200_MI300-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_rtn_flat_intrinsic + ; GFX11: bb.0 (%ir-block.0): + ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] + ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1f32.f32(float addrspace(1)* inreg %ptr, float %data) ret float %ret } @@ -86,6 +132,18 @@ ; MI200_MI300-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) ; MI200_MI300-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] ; MI200_MI300-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX11-LABEL: name: global_atomic_fadd_f32_rtn_atomicrmw + ; GFX11: bb.0 (%ir-block.0): + ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] + ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = atomicrmw fadd float addrspace(1)* %ptr, float %data syncscope("wavefront") monotonic ret float %ret } @@ -103,6 +161,18 @@ ; MI200_MI300-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) ; MI200_MI300-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] ; MI200_MI300-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_rtn_atomicrmw + ; GFX11: bb.0 (%ir-block.0): + ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] + ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = atomicrmw fadd float addrspace(1)* %ptr, float %data syncscope("wavefront") monotonic ret float %ret } Index: llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll +++ llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll @@ -3,9 +3,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s -; RUN: not --crash llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s 2>&1 | FileCheck -enable-var-scope -check-prefixes=GFX11 %s - -; GFX11: LLVM ERROR: Cannot select: {{.+}}: f32,ch = AtomicLoadFAdd<(load store syncscope("agent") seq_cst +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s define amdgpu_kernel void @global_atomic_fadd_ret_f32(float addrspace(1)* %ptr) #0 { ; GFX900-LABEL: global_atomic_fadd_ret_f32: @@ -115,6 +113,35 @@ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_fadd_ret_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_execnz .LBB0_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: global_store_b32 v[0:1], v1, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst store float %result, float addrspace(1)* undef ret void @@ -212,6 +239,20 @@ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_fadd_ret_f32_ieee: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_add_f32 v0, v0, v1, s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst store float %result, float addrspace(1)* undef ret void @@ -289,6 +330,18 @@ ; GFX10-NEXT: s_cbranch_execnz .LBB2_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_fadd_noret_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst ret void } @@ -365,6 +418,18 @@ ; GFX10-NEXT: s_cbranch_execnz .LBB3_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_fadd_noret_f32_ieee: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst ret void } @@ -461,6 +526,20 @@ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_fadd_ret_f32_agent: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_add_f32 v0, v0, v1, s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst store float %result, float addrspace(1)* undef ret void @@ -574,6 +653,35 @@ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_fadd_ret_f32_system: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_execnz .LBB5_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: global_store_b32 v[0:1], v1, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("one-as") seq_cst store float %result, float addrspace(1)* undef ret void @@ -605,6 +713,32 @@ ; GCN-NEXT: s_or_b64 exec, exec, s[2:3] ; GCN-NEXT: global_store_dword v[0:1], v1, off ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_fadd_ret_f32_wrong_subtarget: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b64 s[2:3], 0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_wbinvl1_vol +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX11-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX11-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11-NEXT: global_store_dword v[0:1], v1, off +; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst store float %result, float addrspace(1)* undef ret void @@ -621,6 +755,17 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_wbinvl1_vol ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_fadd_noret_f32_wrong_subtarget: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_wbinvl1_vol +; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst ret void } @@ -723,6 +868,32 @@ ; GFX10-NEXT: s_cbranch_execnz .LBB8_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_fadd_noret_f32_safe: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst ret void } @@ -797,6 +968,17 @@ ; GFX10-NEXT: s_cbranch_execnz .LBB9_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: infer_as_before_atomic: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1.0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %load = load float*, float* addrspace(4)* %arg %v = atomicrmw fadd float* %load, float 1.0 syncscope("agent-one-as") monotonic, align 4 ret void Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.gfx90a.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.gfx90a.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.gfx90a.ll @@ -6,7 +6,7 @@ declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)*, float) declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)*, <2 x half>) -; GFX908: error: {{.*}} return versions of fp atomics not supported +; GFX908: LLVM ERROR: Cannot select: {{.+}}: f32,ch = BUFFER_ATOMIC_FADD ; GFX90A-LABEL: {{^}}buffer_atomic_add_f32: ; GFX90A: buffer_atomic_add_f32 v0, v1, s[0:3], 0 idxen glc