Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -139,9 +139,6 @@ bool selectG_EXTRACT_VECTOR_ELT(MachineInstr &I) const; bool selectG_INSERT_VECTOR_ELT(MachineInstr &I) const; bool selectG_SHUFFLE_VECTOR(MachineInstr &I) const; - bool selectAMDGPU_BUFFER_ATOMIC_FADD(MachineInstr &I) const; - bool selectGlobalAtomicFadd(MachineInstr &I, MachineOperand &AddrOp, - MachineOperand &DataOp) const; bool selectBufferLoadLds(MachineInstr &MI) const; bool selectGlobalLoadLds(MachineInstr &MI) const; bool selectBVHIntrinsic(MachineInstr &I) const; Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1825,8 +1825,6 @@ return selectDSAppendConsume(I, false); case Intrinsic::amdgcn_s_barrier: return selectSBarrier(I); - case Intrinsic::amdgcn_global_atomic_fadd: - return selectGlobalAtomicFadd(I, I.getOperand(2), I.getOperand(3)); case Intrinsic::amdgcn_raw_buffer_load_lds: case Intrinsic::amdgcn_struct_buffer_load_lds: return selectBufferLoadLds(I); @@ -2442,13 +2440,6 @@ bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW( MachineInstr &I) const { - if (I.getOpcode() == TargetOpcode::G_ATOMICRMW_FADD) { - const LLT PtrTy = MRI->getType(I.getOperand(1).getReg()); - unsigned AS = PtrTy.getAddressSpace(); - if (AS == AMDGPUAS::GLOBAL_ADDRESS) - return selectGlobalAtomicFadd(I, I.getOperand(1), I.getOperand(2)); - } - initM0(I); return selectImpl(I, *CoverageInfo); } @@ -3015,133 +3006,6 @@ return true; } -bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD( - MachineInstr &MI) const { - const Register DefReg = MI.getOperand(0).getReg(); - LLT DefTy = MRI->getType(DefReg); - if 
(AMDGPU::hasAtomicFaddRtnForTy(STI, DefTy)) - return selectImpl(MI, *CoverageInfo); - - MachineBasicBlock *MBB = MI.getParent(); - const DebugLoc &DL = MI.getDebugLoc(); - - if (!MRI->use_nodbg_empty(DefReg)) { - Function &F = MBB->getParent()->getFunction(); - DiagnosticInfoUnsupported - NoFpRet(F, "return versions of fp atomics not supported", - MI.getDebugLoc(), DS_Error); - F.getContext().diagnose(NoFpRet); - return false; - } - - // FIXME: This is only needed because tablegen requires number of dst operands - // in match and replace pattern to be the same. Otherwise patterns can be - // exported from SDag path. - MachineOperand &VDataIn = MI.getOperand(1); - MachineOperand &VIndex = MI.getOperand(3); - MachineOperand &VOffset = MI.getOperand(4); - MachineOperand &SOffset = MI.getOperand(5); - int16_t Offset = MI.getOperand(6).getImm(); - - bool HasVOffset = !isOperandImmEqual(VOffset, 0, *MRI); - bool HasVIndex = !isOperandImmEqual(VIndex, 0, *MRI); - - unsigned Opcode; - if (HasVOffset) { - Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN - : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN; - } else { - Opcode = HasVIndex ? 
AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN - : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET; - } - - if (MRI->getType(VDataIn.getReg()).isVector()) { - switch (Opcode) { - case AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN: - Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN; - break; - case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN: - Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFEN; - break; - case AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN: - Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_IDXEN; - break; - case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET: - Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFSET; - break; - } - } - - auto I = BuildMI(*MBB, MI, DL, TII.get(Opcode)); - I.add(VDataIn); - - if (Opcode == AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN || - Opcode == AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN) { - Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class()); - BuildMI(*MBB, &*I, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg) - .addReg(VIndex.getReg()) - .addImm(AMDGPU::sub0) - .addReg(VOffset.getReg()) - .addImm(AMDGPU::sub1); - - I.addReg(IdxReg); - } else if (HasVIndex) { - I.add(VIndex); - } else if (HasVOffset) { - I.add(VOffset); - } - - I.add(MI.getOperand(2)); // rsrc - I.add(SOffset); - I.addImm(Offset); - I.addImm(MI.getOperand(7).getImm()); // cpol - I.cloneMemRefs(MI); - - MI.eraseFromParent(); - - return true; -} - -bool AMDGPUInstructionSelector::selectGlobalAtomicFadd( - MachineInstr &MI, MachineOperand &AddrOp, MachineOperand &DataOp) const { - - if (STI.hasGFX90AInsts()) { - // gfx90a adds return versions of the global atomic fadd instructions so no - // special handling is required. 
- return selectImpl(MI, *CoverageInfo); - } - - MachineBasicBlock *MBB = MI.getParent(); - const DebugLoc &DL = MI.getDebugLoc(); - - if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) { - Function &F = MBB->getParent()->getFunction(); - DiagnosticInfoUnsupported - NoFpRet(F, "return versions of fp atomics not supported", - MI.getDebugLoc(), DS_Error); - F.getContext().diagnose(NoFpRet); - return false; - } - - // FIXME: This is only needed because tablegen requires number of dst operands - // in match and replace pattern to be the same. Otherwise patterns can be - // exported from SDag path. - auto Addr = selectFlatOffsetImpl(AddrOp, SIInstrFlags::FlatGlobal); - - Register Data = DataOp.getReg(); - const unsigned Opc = MRI->getType(Data).isVector() ? - AMDGPU::GLOBAL_ATOMIC_PK_ADD_F16 : AMDGPU::GLOBAL_ATOMIC_ADD_F32; - auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc)) - .addReg(Addr.first) - .addReg(Data) - .addImm(Addr.second) - .addImm(0) // cpol - .cloneMemRefs(MI); - - MI.eraseFromParent(); - return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); -} - bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const { unsigned Opc; unsigned Size = MI.getOperand(3).getImm(); @@ -3553,8 +3417,6 @@ } case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: return selectBVHIntrinsic(I); - case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: - return selectAMDGPU_BUFFER_ATOMIC_FADD(I); case AMDGPU::G_SBFX: case AMDGPU::G_UBFX: return selectG_SBFX_UBFX(I); Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1341,6 +1341,10 @@ } if (ST.hasAtomicFaddInsts()) Atomic.legalFor({{S32, GlobalPtr}}); + if (ST.hasGFX940Insts()) + Atomic.legalFor({{S32, FlatPtr}, {S64, FlatPtr}, {V2S16, FlatPtr}}); + if (AMDGPU::isGFX11Plus(ST)) + Atomic.legalFor({{S32, FlatPtr}}); if (ST.hasGFX90AInsts()) { // These are 
legal with some caveats, and should have undergone expansion in @@ -5766,24 +5770,9 @@ case Intrinsic::amdgcn_struct_buffer_atomic_fmin: case Intrinsic::amdgcn_raw_buffer_atomic_fmax: case Intrinsic::amdgcn_struct_buffer_atomic_fmax: - return legalizeBufferAtomic(MI, B, IntrID); case Intrinsic::amdgcn_raw_buffer_atomic_fadd: - case Intrinsic::amdgcn_struct_buffer_atomic_fadd: { - Register DstReg = MI.getOperand(0).getReg(); - if (!MRI.use_empty(DstReg) && - !AMDGPU::hasAtomicFaddRtnForTy(ST, MRI.getType(DstReg))) { - Function &F = B.getMF().getFunction(); - DiagnosticInfoUnsupported NoFpRet( - F, "return versions of fp atomics not supported", B.getDebugLoc(), - DS_Error); - F.getContext().diagnose(NoFpRet); - B.buildUndef(DstReg); - MI.eraseFromParent(); - return true; - } - + case Intrinsic::amdgcn_struct_buffer_atomic_fadd: return legalizeBufferAtomic(MI, B, IntrID); - } case Intrinsic::amdgcn_atomic_inc: return legalizeAtomicIncDec(MI, B, true); case Intrinsic::amdgcn_atomic_dec: Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4572,7 +4572,8 @@ case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: return getDefaultMappingAllVGPR(MI); case Intrinsic::amdgcn_ds_ordered_add: - case Intrinsic::amdgcn_ds_ordered_swap: { + case Intrinsic::amdgcn_ds_ordered_swap: + case Intrinsic::amdgcn_ds_fadd_v2bf16: { unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, Index: llvm/lib/Target/AMDGPU/BUFInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/BUFInstructions.td +++ llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1584,12 +1584,8 @@ class NoUseBufferAtomic : PatFrag < (ops 
node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5, node:$src6, node:$src7), - (vt (Op $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7)), - [{ return SDValue(N, 0).use_empty(); }]> { - - let GISelPredicateCode = [{ - return MRI.use_nodbg_empty(MI.getOperand(0).getReg()); - }]; + (vt (Op $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7))> { + let HasNoUse = true; } multiclass BufferAtomicPatterns_NO_RTN(opcode # _OFFSET) getVregSrcForVT.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), $cachepolicy) + (as_i16timm $offset), timm:$cachepolicy) >; def : GCNPat< @@ -1607,7 +1603,7 @@ 0, i32:$soffset, timm:$offset, timm:$cachepolicy, timm), (!cast(opcode # _IDXEN) getVregSrcForVT.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), $cachepolicy) + (as_i16timm $offset), timm:$cachepolicy) >; def : GCNPat< @@ -1615,7 +1611,7 @@ i32:$voffset, i32:$soffset, timm:$offset, timm:$cachepolicy, 0), (!cast(opcode # _OFFEN) getVregSrcForVT.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), $cachepolicy) + (as_i16timm $offset), timm:$cachepolicy) >; def : GCNPat< @@ -1625,7 +1621,7 @@ (!cast(opcode # _BOTHEN) getVregSrcForVT.ret:$vdata_in, (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), - SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), $cachepolicy) + SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), timm:$cachepolicy) >; } Index: llvm/lib/Target/AMDGPU/FLATInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/FLATInstructions.td +++ llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1026,6 +1026,12 @@ (!cast(inst) VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset)>; } +class FlatSignedAtomicPatBase : GCNPat < + (vt (node (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)), + (inst VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset) +>; + multiclass 
FlatSignedAtomicPat { @@ -1033,12 +1039,10 @@ defvar noRtnNode = !cast(node # "_noret" # !if(isIntr, "", "_" # vt.Size)); let AddedComplexity = complexity in - def : GCNPat <(vt (rtnNode (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)), - (!cast(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset)>; + def : FlatSignedAtomicPatBase(inst#"_RTN"), rtnNode, vt, data_vt>; let AddedComplexity = !add(complexity, 1) in - def : GCNPat <(vt (noRtnNode (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)), - (!cast(inst) VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset)>; + def : FlatSignedAtomicPatBase(inst), noRtnNode, vt, data_vt>; } multiclass FlatSignedAtomicIntrPat ; } -class FlatSignedAtomicPatNoRtn : GCNPat < - (node (GlobalOffset i64:$vaddr, i16:$offset), vt:$data), - (inst VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset) ->; - -class FlatSignedAtomicPatRtn : GCNPat < - (vt (node (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)), - (inst VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset) ->; - class ScratchLoadSignedPat : GCNPat < (vt (node (ScratchOffset (i32 VGPR_32:$vaddr), i16:$offset))), (inst $vaddr, $offset) @@ -1251,45 +1244,50 @@ } } -multiclass GlobalFLATAtomicPatsRtn { - def : FlatSignedAtomicPatRtn (nortn_inst_name#"_RTN"), node, vt, data_vt> { - let AddedComplexity = 10; - } +multiclass GlobalFLATAtomicPatsNoRtnBase { + let AddedComplexity = 11 in + def : FlatSignedAtomicPatBase(inst), node, vt, data_vt>; - def : GlobalAtomicSaddrPat(nortn_inst_name#"_SADDR_RTN"), node, vt, data_vt> { - let AddedComplexity = 11; - } + let AddedComplexity = 13 in + def : GlobalAtomicSaddrPat(inst#"_SADDR"), node, vt, data_vt>; } -multiclass GlobalFLATAtomicPats { - defvar rtnNode = !cast(node # !if(isIntr, "", "_" # vt.Size)); +multiclass GlobalFLATAtomicPatsNoRtn { defvar noRtnNode = !cast(node # "_noret" # !if(isIntr, "", "_" # vt.Size)); + defm : GlobalFLATAtomicPatsNoRtnBase; +} - defm : FlatSignedAtomicPat ; +multiclass 
GlobalFLATAtomicPatsRtn { + defvar rtnNode = !cast(node # !if(isIntr, "", "_" # vt.Size)); - let AddedComplexity = 13 in - def : GlobalAtomicSaddrPat(inst#"_SADDR"), noRtnNode, vt, data_vt>; + let AddedComplexity = 10 in + def : FlatSignedAtomicPatBase(inst#"_RTN"), rtnNode, vt, data_vt>; let AddedComplexity = 12 in def : GlobalAtomicSaddrPat(inst#"_SADDR_RTN"), rtnNode, vt, data_vt>; } +multiclass GlobalFLATAtomicPats : + GlobalFLATAtomicPatsNoRtn, + GlobalFLATAtomicPatsRtn; + multiclass GlobalFLATAtomicIntrPats { defm : GlobalFLATAtomicPats; } -multiclass GlobalFLATNoRtnAtomicPats { - def : FlatSignedAtomicPatNoRtn { - let AddedComplexity = 10; - } +multiclass GlobalFLATAtomicIntrPatsNoRet { + defm : GlobalFLATAtomicPatsNoRtn; +} - def : GlobalAtomicNoRtnSaddrPat(!cast(inst)#"_SADDR"), node, vt> { - let AddedComplexity = 11; - } +multiclass GlobalFLATAtomicIntrPatsRet { + defm : GlobalFLATAtomicPatsRtn; } multiclass ScratchFLATLoadPats { @@ -1425,7 +1423,7 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP", "atomic_swap_global", i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP", "AMDGPUatomic_cmp_swap_global", i32, v2i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR", "atomic_load_xor_global", i32>; -defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", int_amdgcn_global_atomic_csub, i32>; +defm : GlobalFLATAtomicIntrPatsRet <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_X2", "atomic_load_add_global", i64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB_X2", "atomic_load_sub_global", i64>; @@ -1452,20 +1450,28 @@ defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX_X2", "int_amdgcn_global_atomic_fmax", f64>; } -let OtherPredicates = [HasAtomicFaddNoRtnInsts] in -defm : GlobalFLATNoRtnAtomicPats ; -let OtherPredicates = [HasAtomicPkFaddNoRtnInsts] in -defm : GlobalFLATNoRtnAtomicPats ; +let OtherPredicates = [HasAtomicFaddNoRtnInsts] in { +defm : GlobalFLATAtomicPatsNoRtn 
<"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>; +defm : GlobalFLATAtomicIntrPatsNoRet <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", f32>; +} + +let OtherPredicates = [HasAtomicPkFaddNoRtnInsts] in { +defm : GlobalFLATAtomicPatsNoRtnBase <"GLOBAL_ATOMIC_PK_ADD_F16", atomic_load_fadd_v2f16_global_noret_32, v2f16>; +defm : GlobalFLATAtomicIntrPatsNoRet <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", v2f16>; +} + +let OtherPredicates = [HasAtomicFaddRtnInsts] in { +defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>; +defm : GlobalFLATAtomicIntrPatsRet <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", f32>; +} let OtherPredicates = [isGFX90APlus] in { -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_v2f16_global", v2f16>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", "atomic_load_fmin_global", f64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", "atomic_load_fmax_global", f64>; -defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", f32>; defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_global_atomic_fadd", f64>; -defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", v2f16>; +defm : GlobalFLATAtomicIntrPatsRet <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", v2f16>; defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MIN_F64", "int_amdgcn_global_atomic_fmin", f64>; defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MAX_F64", "int_amdgcn_global_atomic_fmax", f64>; defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>; @@ -1476,10 +1482,13 @@ defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_MAX_F64", "int_amdgcn_flat_atomic_fmax", f64>; } +let 
OtherPredicates = [isGFX940GFX11Plus] in { +defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>; +defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", f32>; +} + let OtherPredicates = [isGFX940Plus] in { -defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>; defm : FlatSignedAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_v2f16_flat", v2f16>; -defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", f32>; defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", v2f16>; defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_BF16", "int_amdgcn_flat_atomic_fadd_v2bf16", v2i16>; defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "int_amdgcn_global_atomic_fadd_v2bf16", v2i16>; Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7799,19 +7799,8 @@ DAG.setNodeMemRefs(NewNode, {MemRef}); return SDValue(NewNode, 0); } - case Intrinsic::amdgcn_global_atomic_fadd: - if (!Op.getValue(0).use_empty() && !Subtarget->hasGFX90AInsts()) { - DiagnosticInfoUnsupported - NoFpRet(DAG.getMachineFunction().getFunction(), - "return versions of fp atomics not supported", - DL.getDebugLoc(), DS_Error); - DAG.getContext()->diagnose(NoFpRet); - return SDValue(); - } - LLVM_FALLTHROUGH; case Intrinsic::amdgcn_global_atomic_fmin: case Intrinsic::amdgcn_global_atomic_fmax: - case Intrinsic::amdgcn_flat_atomic_fadd: case Intrinsic::amdgcn_flat_atomic_fmin: case Intrinsic::amdgcn_flat_atomic_fmax: { MemSDNode *M = cast<MemSDNode>(Op); @@ -7822,13 +7811,6 @@ }; unsigned Opcode = 0; switch (IntrID) { - case Intrinsic::amdgcn_global_atomic_fadd: - case Intrinsic::amdgcn_flat_atomic_fadd: { - EVT VT = Op.getOperand(3).getValueType(); - return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT, - 
DAG.getVTList(VT, MVT::Other), Ops, - M->getMemOperand()); - } case Intrinsic::amdgcn_global_atomic_fmin: case Intrinsic::amdgcn_flat_atomic_fmin: { Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN; @@ -12752,7 +12734,7 @@ if ((AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) && Subtarget->hasAtomicFaddNoRtnInsts()) { - if (Subtarget->hasGFX940Insts()) + if (Subtarget->hasGFX940Insts() || AMDGPU::isGFX11(*Subtarget)) return AtomicExpansionKind::None; // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -march=amdgcn -mcpu=gfx940 -verify-machineinstrs | FileCheck %s -check-prefix=GFX940 +; RUN: llc < %s -march=amdgcn -mcpu=gfx940 -global-isel -verify-machineinstrs | FileCheck %s -check-prefix=GFX940 declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0f32.f32(float* %ptr, float %data) declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0v2f16.v2f16(<2 x half>* %ptr, <2 x half> %data) @@ -24,6 +24,40 @@ ret void } +define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(float* %ptr) { +; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: s_endpgm + %ret = atomicrmw fadd float* %ptr, float 4.0 seq_cst + ret void +} + +define amdgpu_kernel void 
@flat_atomic_fadd_f32_noret_pat_ieee(float* %ptr) #0 { +; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: s_endpgm + %ret = atomicrmw fadd float* %ptr, float 4.0 seq_cst + ret void +} + define float @flat_atomic_fadd_f32_rtn(float* %ptr, float %data) { ; GFX940-LABEL: flat_atomic_fadd_f32_rtn: ; GFX940: ; %bb.0: @@ -35,6 +69,21 @@ ret float %ret } +define float @flat_atomic_fadd_f32_rtn_pat(float* %ptr, float %data) { +; GFX940-LABEL: flat_atomic_fadd_f32_rtn_pat: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] + %ret = atomicrmw fadd float* %ptr, float 4.0 seq_cst + ret float %ret +} + define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(<2 x half>* %ptr, <2 x half> %data) { ; GFX940-LABEL: flat_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: @@ -60,6 +109,56 @@ ret <2 x half> %ret } +define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(<2 x i16>* %ptr, <2 x i16> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2bf16_noret: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 +; GFX940-NEXT: s_endpgm 
+ %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0v2i16(<2 x i16>* %ptr, <2 x i16> %data) + ret void +} + +define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(<2 x i16>* %ptr, <2 x i16> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2bf16_rtn: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0v2i16(<2 x i16>* %ptr, <2 x i16> %data) + ret <2 x i16> %ret +} + +define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(<2 x i16> addrspace(1)* %ptr, <2 x i16> %data) { +; GFX940-LABEL: global_atomic_fadd_v2bf16_noret: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NEXT: global_atomic_pk_add_bf16 v1, v0, s[2:3] +; GFX940-NEXT: s_endpgm + %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1v2i16(<2 x i16> addrspace(1)* %ptr, <2 x i16> %data) + ret void +} + +define <2 x i16> @global_atomic_fadd_v2bf16_rtn(<2 x i16> addrspace(1)* %ptr, <2 x i16> %data) { +; GFX940-LABEL: global_atomic_fadd_v2bf16_rtn: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1v2i16(<2 x i16> addrspace(1)* %ptr, <2 x i16> %data) + ret <2 x i16> %ret +} + define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(<2 x half> addrspace(3)* %ptr, <2 x half> %data) { ; GFX940-LABEL: local_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: @@ -84,3 +183,37 @@ %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(<2 x half> addrspace(3)* %ptr, <2 x half> %data, i32 
0, i32 0, i1 0) ret <2 x half> %ret } + +define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(<2 x i16> addrspace(3)* %ptr, <2 x i16> %data) { +; GFX940-LABEL: local_atomic_fadd_v2bf16_noret: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dword s2, s[0:1], 0x28 +; GFX940-NEXT: s_load_dword s3, s[0:1], 0x24 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: ds_pk_add_bf16 v1, v0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: s_endpgm + %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(<2 x i16> addrspace(3)* %ptr, <2 x i16> %data) + ret void +} + +define <2 x i16> @local_atomic_fadd_v2bf16_rtn(<2 x i16> addrspace(3)* %ptr, <2 x i16> %data) { +; GFX940-LABEL: local_atomic_fadd_v2bf16_rtn: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] + %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(<2 x i16> addrspace(3)* %ptr, <2 x i16> %data) + ret <2 x i16> %ret +} + +attributes #0 = { "denormal-fp-math-f32"="ieee,ieee" } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll @@ -4,7 +4,7 @@ declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* nocapture, float) declare <2 x half> @llvm.amdgcn.global.atomic.fadd.f32.p1v2f16.v2f16(<2 x half> addrspace(1)* nocapture, <2 x half>) -; GFX908: error: {{.*}} return versions 
of fp atomics not supported +; GFX908: LLVM ERROR: cannot select: %4:vgpr_32(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.global.atomic.fadd), %0:vgpr(p1), %1:vgpr(s32) :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) (in function: global_atomic_fadd_f32_rtn) ; GFX90A-LABEL: {{^}}global_atomic_fadd_f32_rtn: ; GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off glc Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll @@ -63,11 +63,10 @@ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX908-NEXT: v_mov_b32_e32 v1, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, s2 -; GFX908-NEXT: v_mov_b32_e32 v0, s0 -; GFX908-NEXT: v_mov_b32_e32 v1, s1 -; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2048 +; GFX908-NEXT: v_mov_b32_e32 v0, s2 +; GFX908-NEXT: global_atomic_add_f32 v1, v0, s[0:1] offset:2048 ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: global_atomic_fadd_f32_off_ss: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd-with-ret.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd-with-ret.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd-with-ret.ll @@ -1,11 +1,10 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s -; RUN: not llc -global-isel < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908 +; RUN: not --crash llc -global-isel < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s 
-check-prefix=GFX908 declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg) declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg) -; GFX908: error: {{.*}} return versions of fp atomics not supported -; GFX908: error: {{.*}} return versions of fp atomics not supported +; GFX908: LLVM ERROR: cannot select: %24:vgpr_32(s32) = G_AMDGPU_BUFFER_ATOMIC_FADD %28:vgpr, %14:sgpr(<4 x s32>), %29:vgpr(s32), %30:vgpr, %27:sgpr, 0, 0, 0 :: (volatile dereferenceable load store (s32), align 1, addrspace 4) (in function: buffer_atomic_add_f32_rtn) ; GFX90A-LABEL: {{^}}buffer_atomic_add_f32_rtn: ; GFX90A: buffer_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9:]+}}], s{{[0-9]+}} offen glc Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd-with-ret.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd-with-ret.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd-with-ret.ll @@ -1,8 +1,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s -; RUN: not llc -global-isel < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908 +; RUN: not --crash llc -global-isel < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908 -; GFX908: error: {{.*}} return versions of fp atomics not supported -; GFX908: error: {{.*}} return versions of fp atomics not supported +; GFX908: LLVM ERROR: cannot select: %29:vgpr_32(s32) = G_AMDGPU_BUFFER_ATOMIC_FADD %40:vgpr, %15:sgpr(<4 x s32>), %41:vgpr(s32), %42:vgpr, %33:sgpr, 0, 0, -1 :: (volatile dereferenceable load store (s32), align 1, addrspace 4) (in function: buffer_atomic_add_f32_rtn) declare float 
@llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) Index: llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll +++ llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll @@ -117,29 +117,14 @@ ; GFX11-LABEL: global_atomic_fadd_ret_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc +; GFX11-NEXT: global_atomic_add_f32 v0, v0, v1, s[0:1] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_execnz .LBB0_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: global_store_b32 v[0:1], v1, off +; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst @@ -243,29 +228,14 @@ ; GFX11-LABEL: global_atomic_fadd_ret_f32_ieee: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc +; GFX11-NEXT: global_atomic_add_f32 v0, v0, v1, s[0:1] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_execnz .LBB1_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: global_store_b32 v[0:1], v1, off +; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst @@ -545,29 +515,14 @@ ; GFX11-LABEL: global_atomic_fadd_ret_f32_agent: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc +; GFX11-NEXT: global_atomic_add_f32 v0, v0, v1, 
s[0:1] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_execnz .LBB4_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: global_store_b32 v[0:1], v1, off +; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst @@ -687,29 +642,14 @@ ; GFX11-LABEL: global_atomic_fadd_ret_f32_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v2 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc +; GFX11-NEXT: global_atomic_add_f32 v0, v0, v1, s[0:1] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_execnz .LBB5_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: global_store_b32 v[0:1], v1, off +; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = atomicrmw 
fadd float addrspace(1)* %ptr, float 4.0 syncscope("one-as") seq_cst @@ -865,27 +805,13 @@ ; GFX11-LABEL: global_atomic_fadd_noret_f32_safe: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst ret void Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.rtn_no-rtn.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.rtn_no-rtn.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.rtn_no-rtn.ll @@ -1,48 +1,244 @@ -; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s +; NOTE: Assertions have been autogenerated by 
utils/update_mir_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -enable-var-scope -check-prefixes=SDAG %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -enable-var-scope -check-prefixes=GISEL %s ; no-rtn -; GFX11: BUFFER_ATOMIC_ADD_F32_OFFEN define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; SDAG-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset + ; SDAG: bb.0 (%ir-block.0): + ; SDAG-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 + ; SDAG-NEXT: {{ $}} + ; SDAG-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; SDAG-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; SDAG-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; SDAG-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; SDAG-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; SDAG-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; SDAG-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; SDAG-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 + ; SDAG-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) + ; SDAG-NEXT: S_ENDPGM 0 + ; GISEL-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + 
; GISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GISEL-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GISEL-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) + ; GISEL-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) ret void } -; GFX11: BUFFER_ATOMIC_ADD_F32_OFFEN define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; SDAG-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc + ; SDAG: bb.0 (%ir-block.0): + ; SDAG-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 + ; SDAG-NEXT: {{ $}} + ; SDAG-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; SDAG-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; SDAG-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; SDAG-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; SDAG-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; SDAG-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; SDAG-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; SDAG-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 + ; SDAG-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) + ; SDAG-NEXT: S_ENDPGM 0 + ; 
GISEL-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GISEL-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GISEL-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) + ; GISEL-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) ret void } -; GFX11: BUFFER_ATOMIC_ADD_F32_OFFSET define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { + ; SDAG-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_4095__sgpr_soffset + ; SDAG: bb.0 (%ir-block.0): + ; SDAG-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 + ; SDAG-NEXT: {{ $}} + ; SDAG-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; SDAG-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; SDAG-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; SDAG-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; SDAG-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; SDAG-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; SDAG-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, 
[[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3 + ; SDAG-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) + ; SDAG-NEXT: S_ENDPGM 0 + ; GISEL-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_4095__sgpr_soffset + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GISEL-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GISEL-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) + ; GISEL-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void } -; GFX11: BUFFER_ATOMIC_ADD_F32_IDXEN define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__4095_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { + ; SDAG-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__4095_voffset__sgpr_soffset + ; SDAG: bb.0 (%ir-block.0): + ; SDAG-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 + ; SDAG-NEXT: {{ $}} + ; SDAG-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; SDAG-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; SDAG-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; SDAG-NEXT: 
[[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; SDAG-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; SDAG-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; SDAG-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; SDAG-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 + ; SDAG-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) + ; SDAG-NEXT: S_ENDPGM 0 + ; GISEL-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__4095_voffset__sgpr_soffset + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GISEL-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GISEL-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) + ; GISEL-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 4095, i32 %soffset, i32 0) ret void } -; GFX11: BUFFER_ATOMIC_ADD_F32_IDXEN define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { + ; SDAG-LABEL: name: 
struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset_slc + ; SDAG: bb.0 (%ir-block.0): + ; SDAG-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 + ; SDAG-NEXT: {{ $}} + ; SDAG-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; SDAG-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; SDAG-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; SDAG-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; SDAG-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; SDAG-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; SDAG-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; SDAG-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 + ; SDAG-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) + ; SDAG-NEXT: S_ENDPGM 0 + ; GISEL-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset_slc + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GISEL-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GISEL-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) + ; GISEL-NEXT: S_ENDPGM 0 %ret = call 
float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 2) ret void } -; GFX11: BUFFER_ATOMIC_ADD_F32_BOTHEN define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { + ; SDAG-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset + ; SDAG: bb.0 (%ir-block.0): + ; SDAG-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4 + ; SDAG-NEXT: {{ $}} + ; SDAG-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; SDAG-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; SDAG-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; SDAG-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; SDAG-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; SDAG-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; SDAG-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; SDAG-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; SDAG-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; SDAG-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; SDAG-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) + ; SDAG-NEXT: S_ENDPGM 0 + ; GISEL-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + 
; GISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GISEL-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GISEL-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GISEL-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) + ; GISEL-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0) ret void } -; GFX11: BUFFER_ATOMIC_ADD_F32_BOTHEN define amdgpu_ps void @xstruct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { + ; SDAG-LABEL: name: xstruct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc + ; SDAG: bb.0 (%ir-block.0): + ; SDAG-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4 + ; SDAG-NEXT: {{ $}} + ; SDAG-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; SDAG-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; SDAG-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; SDAG-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; SDAG-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; SDAG-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; SDAG-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; SDAG-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; SDAG-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], 
%subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; SDAG-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; SDAG-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) + ; SDAG-NEXT: S_ENDPGM 0 + ; GISEL-LABEL: name: xstruct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GISEL-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GISEL-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GISEL-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) + ; GISEL-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void } @@ -50,50 +246,590 @@ ; rtn -; GFX11: BUFFER_ATOMIC_ADD_F32_OFFEN define amdgpu_ps float @raw_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg 
%soffset) { + ; SDAG-LABEL: name: raw_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset + ; SDAG: bb.0 (%ir-block.0): + ; SDAG-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 + ; SDAG-NEXT: {{ $}} + ; SDAG-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; SDAG-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; SDAG-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; SDAG-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; SDAG-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; SDAG-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; SDAG-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; SDAG-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 + ; SDAG-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) + ; SDAG-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]] + ; SDAG-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GISEL-LABEL: name: raw_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GISEL-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GISEL-NEXT: 
[[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) + ; GISEL-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]] + ; GISEL-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %voffset.add = add i32 %voffset, 4095 %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) ret float %ret } -; GFX11: BUFFER_ATOMIC_ADD_F32_OFFEN define amdgpu_ps float @raw_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; SDAG-LABEL: name: raw_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc + ; SDAG: bb.0 (%ir-block.0): + ; SDAG-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 + ; SDAG-NEXT: {{ $}} + ; SDAG-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; SDAG-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; SDAG-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; SDAG-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; SDAG-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; SDAG-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; SDAG-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; SDAG-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 + ; SDAG-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) + ; SDAG-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]] + ; SDAG-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GISEL-LABEL: name: raw_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc + ; GISEL: 
bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GISEL-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GISEL-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) + ; GISEL-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]] + ; GISEL-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) ret float %ret } -; GFX11: BUFFER_ATOMIC_ADD_F32_OFFSET define amdgpu_ps float @raw_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { + ; SDAG-LABEL: name: raw_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_4095__sgpr_soffset + ; SDAG: bb.0 (%ir-block.0): + ; SDAG-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 + ; SDAG-NEXT: {{ $}} + ; SDAG-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; SDAG-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; SDAG-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; SDAG-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; SDAG-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; SDAG-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; SDAG-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = 
REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3 + ; SDAG-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) + ; SDAG-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]] + ; SDAG-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GISEL-LABEL: name: raw_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_4095__sgpr_soffset + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GISEL-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GISEL-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) + ; GISEL-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]] + ; GISEL-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret float %ret } -; GFX11: BUFFER_ATOMIC_ADD_F32_IDXEN define amdgpu_ps float @struct_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__4095_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { + ; SDAG-LABEL: name: 
struct_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__4095_voffset__sgpr_soffset + ; SDAG: bb.0 (%ir-block.0): + ; SDAG-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 + ; SDAG-NEXT: {{ $}} + ; SDAG-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; SDAG-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; SDAG-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; SDAG-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; SDAG-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; SDAG-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; SDAG-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; SDAG-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 + ; SDAG-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) + ; SDAG-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]] + ; SDAG-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GISEL-LABEL: name: struct_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__4095_voffset__sgpr_soffset + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GISEL-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GISEL-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = 
BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) + ; GISEL-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]] + ; GISEL-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 4095, i32 %soffset, i32 0) ret float %ret } -; GFX11: BUFFER_ATOMIC_ADD_F32_IDXEN define amdgpu_ps float @struct_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { + ; SDAG-LABEL: name: struct_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset_slc + ; SDAG: bb.0 (%ir-block.0): + ; SDAG-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 + ; SDAG-NEXT: {{ $}} + ; SDAG-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; SDAG-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; SDAG-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; SDAG-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; SDAG-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; SDAG-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; SDAG-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; SDAG-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 + ; SDAG-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) + ; SDAG-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]] + ; SDAG-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GISEL-LABEL: name: struct_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset_slc + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, 
$vgpr0, $vgpr1 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GISEL-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GISEL-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) + ; GISEL-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]] + ; GISEL-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 2) ret float %ret } -; GFX11: BUFFER_ATOMIC_ADD_F32_BOTHEN define amdgpu_ps float @struct_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { + ; SDAG-LABEL: name: struct_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset + ; SDAG: bb.0 (%ir-block.0): + ; SDAG-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4 + ; SDAG-NEXT: {{ $}} + ; SDAG-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; SDAG-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; SDAG-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; SDAG-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; SDAG-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; SDAG-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; SDAG-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; SDAG-NEXT: 
[[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; SDAG-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; SDAG-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; SDAG-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) + ; SDAG-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]] + ; SDAG-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GISEL-LABEL: name: struct_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GISEL-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GISEL-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GISEL-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) + ; GISEL-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]] + ; 
GISEL-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %voffset.add = add i32 %voffset, 4095 %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0) ret float %ret } -; GFX11: BUFFER_ATOMIC_ADD_F32_BOTHEN define amdgpu_ps float @xstruct_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { + ; SDAG-LABEL: name: xstruct_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc + ; SDAG: bb.0 (%ir-block.0): + ; SDAG-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4 + ; SDAG-NEXT: {{ $}} + ; SDAG-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; SDAG-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; SDAG-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; SDAG-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; SDAG-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; SDAG-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; SDAG-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; SDAG-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; SDAG-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; SDAG-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; SDAG-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) + ; SDAG-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]] + ; SDAG-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GISEL-LABEL: name: xstruct_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $sgpr0, $sgpr1, 
$sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GISEL-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GISEL-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GISEL-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) + ; GISEL-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]] + ; GISEL-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret float %ret } +define amdgpu_ps void @global_atomic_fadd_f32_noret_intrinsic(float addrspace(1)* %ptr, float %data) { + ; SDAG-LABEL: name: global_atomic_fadd_f32_noret_intrinsic + ; SDAG: bb.0 (%ir-block.0): + ; SDAG-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; SDAG-NEXT: {{ $}} + ; SDAG-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; SDAG-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; SDAG-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; SDAG-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; SDAG-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; SDAG-NEXT: GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, 
implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; SDAG-NEXT: S_ENDPGM 0 + ; GISEL-LABEL: name: global_atomic_fadd_f32_noret_intrinsic + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GISEL-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GISEL-NEXT: S_ENDPGM 0 + %ret = call float @llvm.amdgcn.global.atomic.fadd(float addrspace(1)* %ptr, float %data) + ret void +} + +define amdgpu_ps float @global_atomic_fadd_f32_ret_intrinsic(float addrspace(1)* %ptr, float %data) { + ; SDAG-LABEL: name: global_atomic_fadd_f32_ret_intrinsic + ; SDAG: bb.0 (%ir-block.0): + ; SDAG-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; SDAG-NEXT: {{ $}} + ; SDAG-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; SDAG-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; SDAG-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; SDAG-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; SDAG-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; SDAG-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; SDAG-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] + ; SDAG-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GISEL-LABEL: name: global_atomic_fadd_f32_ret_intrinsic + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: 
[[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GISEL-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GISEL-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] + ; GISEL-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + %ret = call float @llvm.amdgcn.global.atomic.fadd(float addrspace(1)* %ptr, float %data) + ret float %ret +} + +define amdgpu_ps void @global_atomic_fadd_f32_saddr_noret_intrinsic(float addrspace(1)* inreg %ptr, float %data) { + ; SDAG-LABEL: name: global_atomic_fadd_f32_saddr_noret_intrinsic + ; SDAG: bb.0 (%ir-block.0): + ; SDAG-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; SDAG-NEXT: {{ $}} + ; SDAG-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; SDAG-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; SDAG-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; SDAG-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; SDAG-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; SDAG-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; SDAG-NEXT: S_ENDPGM 0 + ; GISEL-LABEL: name: global_atomic_fadd_f32_saddr_noret_intrinsic + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: 
[[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GISEL-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GISEL-NEXT: S_ENDPGM 0 + %ret = call float @llvm.amdgcn.global.atomic.fadd(float addrspace(1)* %ptr, float %data) + ret void +} + +define amdgpu_ps float @global_atomic_fadd_f32_saddr_ret_intrinsic(float addrspace(1)* inreg %ptr, float %data) { + ; SDAG-LABEL: name: global_atomic_fadd_f32_saddr_ret_intrinsic + ; SDAG: bb.0 (%ir-block.0): + ; SDAG-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; SDAG-NEXT: {{ $}} + ; SDAG-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; SDAG-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; SDAG-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; SDAG-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; SDAG-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; SDAG-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; SDAG-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] + ; SDAG-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GISEL-LABEL: name: global_atomic_fadd_f32_saddr_ret_intrinsic + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GISEL-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = 
GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GISEL-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] + ; GISEL-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + %ret = call float @llvm.amdgcn.global.atomic.fadd(float addrspace(1)* %ptr, float %data) + ret float %ret +} + + +define amdgpu_ps void @global_atomic_fadd_f32_noret_atomicrmw(float addrspace(1)* %ptr, float %data) { + ; SDAG-LABEL: name: global_atomic_fadd_f32_noret_atomicrmw + ; SDAG: bb.0 (%ir-block.0): + ; SDAG-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; SDAG-NEXT: {{ $}} + ; SDAG-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; SDAG-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; SDAG-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; SDAG-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; SDAG-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; SDAG-NEXT: GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (load store seq_cst (s32) on %ir.ptr, addrspace 1) + ; SDAG-NEXT: S_ENDPGM 0 + ; GISEL-LABEL: name: global_atomic_fadd_f32_noret_atomicrmw + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GISEL-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (load store seq_cst (s32) on %ir.ptr, addrspace 1) + ; GISEL-NEXT: S_ENDPGM 0 + %ret = atomicrmw fadd float addrspace(1)* %ptr, float %data seq_cst + ret void +} + +define amdgpu_ps float @global_atomic_fadd_f32_ret_atomicrmw(float addrspace(1)* %ptr, float %data) { + ; SDAG-LABEL: name: 
global_atomic_fadd_f32_ret_atomicrmw + ; SDAG: bb.0 (%ir-block.0): + ; SDAG-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; SDAG-NEXT: {{ $}} + ; SDAG-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; SDAG-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; SDAG-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; SDAG-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; SDAG-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; SDAG-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (load store seq_cst (s32) on %ir.ptr, addrspace 1) + ; SDAG-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] + ; SDAG-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GISEL-LABEL: name: global_atomic_fadd_f32_ret_atomicrmw + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GISEL-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (load store seq_cst (s32) on %ir.ptr, addrspace 1) + ; GISEL-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] + ; GISEL-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + %ret = atomicrmw fadd float addrspace(1)* %ptr, float %data seq_cst + ret float %ret +} + +define amdgpu_ps void @global_atomic_fadd_f32_saddr_noret_atomicrmw(float addrspace(1)* inreg %ptr, float %data) { + ; SDAG-LABEL: name: global_atomic_fadd_f32_saddr_noret_atomicrmw + ; SDAG: bb.0 (%ir-block.0): + ; SDAG-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; SDAG-NEXT: {{ $}} + ; SDAG-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; SDAG-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 
+ ; SDAG-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; SDAG-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; SDAG-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; SDAG-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store seq_cst (s32) on %ir.ptr, addrspace 1) + ; SDAG-NEXT: S_ENDPGM 0 + ; GISEL-LABEL: name: global_atomic_fadd_f32_saddr_noret_atomicrmw + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GISEL-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store seq_cst (s32) on %ir.ptr, addrspace 1) + ; GISEL-NEXT: S_ENDPGM 0 + %ret = atomicrmw fadd float addrspace(1)* %ptr, float %data seq_cst + ret void +} + +define amdgpu_ps float @global_atomic_fadd_f32_saddr_ret_atomicrmw(float addrspace(1)* inreg %ptr, float %data) { + ; SDAG-LABEL: name: global_atomic_fadd_f32_saddr_ret_atomicrmw + ; SDAG: bb.0 (%ir-block.0): + ; SDAG-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; SDAG-NEXT: {{ $}} + ; SDAG-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; SDAG-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; SDAG-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; SDAG-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; SDAG-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; SDAG-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = 
GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store seq_cst (s32) on %ir.ptr, addrspace 1) + ; SDAG-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] + ; SDAG-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GISEL-LABEL: name: global_atomic_fadd_f32_saddr_ret_atomicrmw + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GISEL-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store seq_cst (s32) on %ir.ptr, addrspace 1) + ; GISEL-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] + ; GISEL-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + %ret = atomicrmw fadd float addrspace(1)* %ptr, float %data seq_cst + ret float %ret +} + +define amdgpu_ps void @flat_atomic_fadd_f32_noret_intrinsic(float* %ptr, float %data) { + ; SDAG-LABEL: name: flat_atomic_fadd_f32_noret_intrinsic + ; SDAG: bb.0 (%ir-block.0): + ; SDAG-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; SDAG-NEXT: {{ $}} + ; SDAG-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; SDAG-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; SDAG-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; SDAG-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; SDAG-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; SDAG-NEXT: FLAT_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) + ; 
SDAG-NEXT: S_ENDPGM 0 + ; GISEL-LABEL: name: flat_atomic_fadd_f32_noret_intrinsic + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GISEL-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) + ; GISEL-NEXT: S_ENDPGM 0 + %ret = call float @llvm.amdgcn.flat.atomic.fadd(float* %ptr, float %data) + ret void +} + +define amdgpu_ps float @flat_atomic_fadd_f32_ret_intrinsic(float* %ptr, float %data) { + ; SDAG-LABEL: name: flat_atomic_fadd_f32_ret_intrinsic + ; SDAG: bb.0 (%ir-block.0): + ; SDAG-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; SDAG-NEXT: {{ $}} + ; SDAG-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; SDAG-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; SDAG-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; SDAG-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; SDAG-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; SDAG-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) + ; SDAG-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] + ; SDAG-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GISEL-LABEL: name: flat_atomic_fadd_f32_ret_intrinsic + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], 
%subreg.sub0, [[COPY1]], %subreg.sub1 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GISEL-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) + ; GISEL-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] + ; GISEL-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + %ret = call float @llvm.amdgcn.flat.atomic.fadd(float* %ptr, float %data) + ret float %ret +} + +define amdgpu_ps void @flat_atomic_fadd_f32_noret_atomicrmw(float* %ptr, float %data) { + ; SDAG-LABEL: name: flat_atomic_fadd_f32_noret_atomicrmw + ; SDAG: bb.0 (%ir-block.0): + ; SDAG-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; SDAG-NEXT: {{ $}} + ; SDAG-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; SDAG-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; SDAG-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; SDAG-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; SDAG-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; SDAG-NEXT: FLAT_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32) on %ir.ptr) + ; SDAG-NEXT: S_ENDPGM 0 + ; GISEL-LABEL: name: flat_atomic_fadd_f32_noret_atomicrmw + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GISEL-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32) on %ir.ptr) + ; GISEL-NEXT: S_ENDPGM 0 + %ret = atomicrmw fadd float* %ptr, float %data seq_cst + ret void +} + +define amdgpu_ps float 
@flat_atomic_fadd_f32_ret_atomicrmw(float* %ptr, float %data) { + ; SDAG-LABEL: name: flat_atomic_fadd_f32_ret_atomicrmw + ; SDAG: bb.0 (%ir-block.0): + ; SDAG-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; SDAG-NEXT: {{ $}} + ; SDAG-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; SDAG-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; SDAG-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; SDAG-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; SDAG-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; SDAG-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32) on %ir.ptr) + ; SDAG-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] + ; SDAG-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GISEL-LABEL: name: flat_atomic_fadd_f32_ret_atomicrmw + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GISEL-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32) on %ir.ptr) + ; GISEL-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] + ; GISEL-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + %ret = atomicrmw fadd float* %ptr, float %data seq_cst + ret float %ret +} + declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg) #0 declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #0 +declare float @llvm.amdgcn.global.atomic.fadd(float addrspace(1)*, float) +declare float 
@llvm.amdgcn.flat.atomic.fadd(float*, float) attributes #0 = { nounwind }