Index: llvm/lib/Target/AMDGPU/AMDGPUGISel.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -135,6 +135,20 @@
 def : GINodeEquiv;
 def : GINodeEquiv;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_SWAP, SIbuffer_atomic_swap>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_ADD, SIbuffer_atomic_add>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_SUB, SIbuffer_atomic_sub>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_SMIN, SIbuffer_atomic_smin>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_UMIN, SIbuffer_atomic_umin>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_SMAX, SIbuffer_atomic_smax>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_UMAX, SIbuffer_atomic_umax>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_AND, SIbuffer_atomic_and>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_OR, SIbuffer_atomic_or>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_XOR, SIbuffer_atomic_xor>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_INC, SIbuffer_atomic_inc>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_DEC, SIbuffer_atomic_dec>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>;
+
 class GISelSop2Pat <
   SDPatternOperator node,
   Instruction inst,
Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -116,6 +116,8 @@
   bool legalizeBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI,
                           MachineIRBuilder &B, bool IsTyped,
                           bool IsFormat) const;
+  bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B,
+                            Intrinsic::ID IID) const;

   bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
                          MachineIRBuilder &B) const override;
Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2607,6 +2607,114 @@
   return true;
 }

+// Map a buffer atomic intrinsic ID to the generic target pseudo that carries
+// it through register bank selection and instruction selection.
+static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
+  switch (IntrID) {
+  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
+  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
+    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
+  case Intrinsic::amdgcn_raw_buffer_atomic_add:
+  case Intrinsic::amdgcn_struct_buffer_atomic_add:
+    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
+  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
+  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
+    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
+  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
+  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
+    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
+  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
+  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
+    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
+  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
+  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
+    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
+  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
+  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
+    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
+  case Intrinsic::amdgcn_raw_buffer_atomic_and:
+  case Intrinsic::amdgcn_struct_buffer_atomic_and:
+    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
+  case Intrinsic::amdgcn_raw_buffer_atomic_or:
+  case Intrinsic::amdgcn_struct_buffer_atomic_or:
+    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
+  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
+  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
+    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
+  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
+  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
+    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
+  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
+  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
+    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
+  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
+  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
+    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
+  default:
+    llvm_unreachable("unhandled atomic opcode");
+  }
+}
+
+bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
+                                               MachineIRBuilder &B,
+                                               Intrinsic::ID IID) const {
+  B.setInstr(MI);
+
+  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
+                         IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
+
+  Register Dst = MI.getOperand(0).getReg();
+  Register VData = MI.getOperand(2).getReg();
+
+  Register CmpVal;
+  int OpOffset = 0;
+
+  if (IsCmpSwap) {
+    CmpVal = MI.getOperand(3 + OpOffset).getReg();
+    ++OpOffset;
+  }
+
+  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
+  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
+
+  // The struct intrinsic variants add one additional operand over raw.
+  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
+  Register VIndex;
+  if (HasVIndex) {
+    // OpOffset accounts for the extra cmp operand of the cmpswap variants.
+    VIndex = MI.getOperand(4 + OpOffset).getReg();
+    ++OpOffset;
+  }
+
+  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
+  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
+  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
+
+  MachineMemOperand *MMO = *MI.memoperands_begin();
+
+  unsigned ImmOffset;
+  unsigned TotalOffset;
+  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
+  if (TotalOffset != 0)
+    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
+
+  if (!VIndex)
+    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
+
+  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
+    .addDef(Dst)
+    .addUse(VData);               // vdata
+
+  if (IsCmpSwap)
+    MIB.addReg(CmpVal);           // cmp
+
+  MIB.addUse(RSrc)                // rsrc
+     .addUse(VIndex)              // vindex
+     .addUse(VOffset)             // voffset
+     .addUse(SOffset)             // soffset
+     .addImm(ImmOffset)           // offset(imm)
+     .addImm(AuxiliaryData)       // cachepolicy, swizzled buffer(imm)
+     .addImm(HasVIndex ? -1 : 0)  // idxen(imm)
+     .addMemOperand(MMO);
+
+  MI.eraseFromParent();
+  return true;
+}
+
 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                             MachineRegisterInfo &MRI,
                                             MachineIRBuilder &B) const {
@@ -2739,6 +2847,33 @@
   case Intrinsic::amdgcn_raw_tbuffer_load:
   case Intrinsic::amdgcn_struct_tbuffer_load:
     return legalizeBufferLoad(MI, MRI, B, true, true);
+  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
+  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
+  case Intrinsic::amdgcn_raw_buffer_atomic_add:
+  case Intrinsic::amdgcn_struct_buffer_atomic_add:
+  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
+  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
+  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
+  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
+  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
+  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
+  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
+  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
+  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
+  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
+  case Intrinsic::amdgcn_raw_buffer_atomic_and:
+  case Intrinsic::amdgcn_struct_buffer_atomic_and:
+  case Intrinsic::amdgcn_raw_buffer_atomic_or:
+  case Intrinsic::amdgcn_struct_buffer_atomic_or:
+  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
+  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
+  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
+  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
+  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
+  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
+  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
+  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
+    return legalizeBufferAtomic(MI, B, IntrID);
   default:
     return true;
   }
Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2124,6 +2124,27 @@
     executeInWaterfallLoop(MI, MRI, {1, 4});
     return;
   }
+  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
+  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
+  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
+  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
+  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
+  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
+  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
+  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
+  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
+  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
+  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
+  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
+    applyDefaultMapping(OpdMapper);
+    executeInWaterfallLoop(MI, MRI, {2, 5});
+    return;
+  }
+  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
+    applyDefaultMapping(OpdMapper);
+    executeInWaterfallLoop(MI, MRI, {3, 6});
+    return;
+  }
   case AMDGPU::G_INTRINSIC: {
     switch (MI.getIntrinsicID()) {
     case Intrinsic::amdgcn_s_buffer_load: {
@@ -2912,6 +2933,40 @@
     // initialized.
     break;
   }
+  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
+  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
+  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
+  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
+  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
+  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
+  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
+  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
+  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
+  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
+  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
+  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
+    // vdata_out
+    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+
+    // vdata_in
+    OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+
+    // rsrc
+    OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+
+    // vindex
+    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
+
+    // voffset
+    OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+
+    // soffset
+    OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
+
+    // Any remaining operands are immediates and were correctly null
+    // initialized.
+    break;
+  }
   case AMDGPU::G_INTRINSIC: {
     switch (MI.getIntrinsicID()) {
     default:
Index: llvm/lib/Target/AMDGPU/BUFInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1342,37 +1342,37 @@
 multiclass BufferAtomicPatterns<SDPatternOperator name, ValueType vt,
                                 string opcode> {
   def : GCNPat<
-    (vt (name vt:$vdata_in, v4i32:$rsrc, 0,
-        0, i32:$soffset, timm:$offset,
-        timm:$cachepolicy, 0)),
-    (!cast<MUBUF_Pseudo>(opcode # _OFFSET_RTN) $vdata_in, $rsrc, $soffset,
-        (as_i16imm $offset), (extract_slc $cachepolicy))
+    (vt (name vt:$vdata_in, v4i32:$rsrc, 0, 0, i32:$soffset,
+              timm:$offset, timm:$cachepolicy, 0)),
+    (!cast<MUBUF_Pseudo>(opcode # _OFFSET_RTN)
+      getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset,
+      (as_i16timm $offset), (extract_slc $cachepolicy))
   >;

   def : GCNPat<
-    (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
-        0, i32:$soffset, timm:$offset,
-        timm:$cachepolicy, timm)),
-    (!cast<MUBUF_Pseudo>(opcode # _IDXEN_RTN) $vdata_in, $vindex, $rsrc, $soffset,
-        (as_i16imm $offset), (extract_slc $cachepolicy))
+    (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset,
+              timm:$offset, timm:$cachepolicy, timm)),
+    (!cast<MUBUF_Pseudo>(opcode # _IDXEN_RTN) getVregSrcForVT<vt>.ret:$vdata_in,
+      VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
+      (as_i16timm $offset), (extract_slc $cachepolicy))
   >;

   def : GCNPat<
-    (vt (name vt:$vdata_in, v4i32:$rsrc, 0,
-        i32:$voffset, i32:$soffset, timm:$offset,
-        timm:$cachepolicy, 0)),
-    (!cast<MUBUF_Pseudo>(opcode # _OFFEN_RTN) $vdata_in, $voffset, $rsrc, $soffset,
-        (as_i16imm $offset), (extract_slc $cachepolicy))
+    (vt (name vt:$vdata_in, v4i32:$rsrc, 0, i32:$voffset,
+              i32:$soffset, timm:$offset, timm:$cachepolicy, 0)),
+    (!cast<MUBUF_Pseudo>(opcode # _OFFEN_RTN) getVregSrcForVT<vt>.ret:$vdata_in,
+      VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
+      (as_i16timm $offset), (extract_slc $cachepolicy))
   >;

   def : GCNPat<
-    (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
-        i32:$voffset, i32:$soffset, timm:$offset,
-        timm:$cachepolicy, timm)),
+    (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, i32:$voffset,
+              i32:$soffset, timm:$offset, timm:$cachepolicy, timm)),
     (!cast<MUBUF_Pseudo>(opcode # _BOTHEN_RTN)
-      $vdata_in,
-      (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
-      $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy))
+      getVregSrcForVT<vt>.ret:$vdata_in,
+      (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
+      SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+      (extract_slc $cachepolicy))
   >;
 }
Index: llvm/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2127,3 +2127,36 @@
   let mayLoad = 1;
   let mayStore = 1;
 }
+
+class BufferAtomicGenericInstruction : AMDGPUGenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
+                           type2:$soffset, untyped_imm_0:$offset,
+                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
+  let hasSideEffects = 0;
+  let mayLoad = 1;
+  let mayStore = 1;
+}
+
+def G_AMDGPU_BUFFER_ATOMIC_SWAP : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_ADD : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_SUB : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_SMIN : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_UMIN : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_SMAX : BufferAtomicGenericInstruction;
+def
G_AMDGPU_BUFFER_ATOMIC_UMAX : BufferAtomicGenericInstruction; +def G_AMDGPU_BUFFER_ATOMIC_AND : BufferAtomicGenericInstruction; +def G_AMDGPU_BUFFER_ATOMIC_OR : BufferAtomicGenericInstruction; +def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction; +def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction; +def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction; + +def G_AMDGPU_BUFFER_ATOMIC_CMPSWAP : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$vdata, type0:$cmp, type1:$rsrc, type2:$vindex, + type2:$voffset, type2:$soffset, untyped_imm_0:$offset, + untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen); + let hasSideEffects = 0; + let mayLoad = 1; + let mayStore = 1; +} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll @@ -0,0 +1,231 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s + +; Natural mapping +define amdgpu_ps float @raw_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: [[BUFFER_ATOMIC_ADD_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_OFFEN_RTN]] + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %cast = bitcast i32 %ret to float + ret float %cast +} + +define amdgpu_ps float @raw_buffer_atomic_add_i32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_atomic_add_i32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, 
[[COPY4]], %subreg.sub3 + ; CHECK: [[BUFFER_ATOMIC_ADD_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_OFFEN_RTN]] + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %cast = bitcast i32 %ret to float + ret float %cast +} + +define amdgpu_ps <2 x float> @raw_buffer_atomic_add_i64__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i64 %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_atomic_add_i64__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; CHECK: [[BUFFER_ATOMIC_ADD_X2_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_OFFEN_RTN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store 8 on custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_OFFEN_RTN]].sub0 + ; CHECK: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_OFFEN_RTN]].sub1 + ; CHECK: $vgpr0 = COPY [[COPY8]] + ; CHECK: $vgpr1 = COPY [[COPY9]] + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + %ret = call i64 @llvm.amdgcn.raw.buffer.atomic.add.i64(i64 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %cast = bitcast i64 %ret to <2 x float> + ret <2 x float> %cast +} + +define amdgpu_ps void @raw_buffer_atomic_add_i64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i64 %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_atomic_add_i64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; CHECK: [[BUFFER_ATOMIC_ADD_X2_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_OFFEN_RTN [[REG_SEQUENCE]], 
[[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store 8 on custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + %ret = call i64 @llvm.amdgcn.raw.buffer.atomic.add.i64(i64 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +; All operands need regbank legalization +define amdgpu_ps float @raw_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset(i32 inreg %val, <4 x i32> %rsrc, i32 inreg %voffset, i32 %soffset) { + ; CHECK-LABEL: name: raw_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; CHECK: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; CHECK: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: bb.2: + ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY9]], implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY10]], implicit $exec + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec + ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc + ; CHECK: [[BUFFER_ATOMIC_ADD_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_OFFEN_RTN [[COPY7]], [[COPY8]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile 
dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; CHECK: bb.3: + ; CHECK: successors: %bb.4(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: bb.4: + ; CHECK: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_OFFEN_RTN]] + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %cast = bitcast i32 %ret to float + ret float %cast +} + +; All operands need regbank legalization +define amdgpu_ps void @raw_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset(i32 inreg %val, <4 x i32> %rsrc, i32 inreg %voffset, i32 %soffset) { + ; CHECK-LABEL: name: raw_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; CHECK: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; CHECK: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: bb.2: + ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY9]], implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY10]], implicit $exec + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = 
V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec + ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc + ; CHECK: [[BUFFER_ATOMIC_ADD_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_OFFEN_RTN [[COPY7]], [[COPY8]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; CHECK: bb.3: + ; CHECK: successors: %bb.4(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: bb.4: + ; CHECK: S_ENDPGM 0 + %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +define amdgpu_ps float @raw_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095(i32 %val, <4 x i32> inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: [[BUFFER_ATOMIC_ADD_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 4095, align 1, addrspace 4) + ; CHECK: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_OFFEN_RTN]] + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + %voffset = add i32 %voffset.base, 4095 + %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %cast = bitcast i32 %ret to float + ret float %cast +} + +; Natural mapping + slc +define amdgpu_ps float @raw_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(i32 %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], 
%subreg.sub3 + ; CHECK: [[BUFFER_ATOMIC_ADD_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_OFFEN_RTN]] + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) + %cast = bitcast i32 %ret to float + ret float %cast +} + +declare i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32 immarg) #0 +declare i64 @llvm.amdgcn.raw.buffer.atomic.add.i64(i64, <4 x i32>, i32, i32, i32 immarg) #0 + +attributes #0 = { nounwind } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll @@ -0,0 +1,226 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s + +; Natural mapping +define amdgpu_ps float @struct_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: struct_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; CHECK: [[BUFFER_ATOMIC_ADD_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_BOTHEN_RTN]] + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %cast = bitcast i32 %ret to float + ret float %cast +} + +define amdgpu_ps float @struct_buffer_atomic_add_i32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: struct_buffer_atomic_add_i32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 
= COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; CHECK: [[BUFFER_ATOMIC_ADD_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_BOTHEN_RTN]] + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %cast = bitcast i32 %ret to float + ret float %cast +} + +define amdgpu_ps <2 x float> @struct_buffer_atomic_add_i64__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i64 %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: struct_buffer_atomic_add_i64__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; CHECK: [[BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (volatile dereferenceable load store 8 on custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN]].sub0 + ; CHECK: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN]].sub1 + ; CHECK: $vgpr0 = COPY [[COPY9]] + ; CHECK: $vgpr1 = COPY [[COPY10]] + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + %ret = call i64 @llvm.amdgcn.struct.buffer.atomic.add.i64(i64 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %cast = bitcast i64 %ret to <2 x float> + ret <2 x float> %cast +} + +define amdgpu_ps void @struct_buffer_atomic_add_i64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i64 %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: struct_buffer_atomic_add_i64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; 
CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; CHECK: [[BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (volatile dereferenceable load store 8 on custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + %ret = call i64 @llvm.amdgcn.struct.buffer.atomic.add.i64(i64 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +; All register operands need legalization +define amdgpu_ps float @struct_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset(i32 inreg %val, <4 x i32> %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { + ; CHECK-LABEL: name: struct_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; CHECK: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; CHECK: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; CHECK: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; CHECK: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: bb.2: + ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY11]], implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; 
CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY12]], implicit $exec + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec + ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc + ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 + ; CHECK: [[BUFFER_ATOMIC_ADD_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_BOTHEN_RTN [[COPY8]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; CHECK: bb.3: + ; CHECK: successors: %bb.4(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: bb.4: + ; CHECK: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_BOTHEN_RTN]] + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %cast = bitcast i32 %ret to float + ret float %cast +} + +; All register operands need legalization +define amdgpu_ps void @struct_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset(i32 inreg %val, <4 x i32> %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { + ; CHECK-LABEL: name: struct_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; CHECK: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; CHECK: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; CHECK: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; CHECK: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: bb.2: + ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub0, implicit 
$exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY11]], implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY12]], implicit $exec + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec + ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc + ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 + ; CHECK: [[BUFFER_ATOMIC_ADD_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_BOTHEN_RTN [[COPY8]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; CHECK: bb.3: + ; CHECK: successors: %bb.4(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: bb.4: + ; CHECK: S_ENDPGM 0 + %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +; Natural mapping + slc +define amdgpu_ps float @struct_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(i32 %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: struct_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; 
CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; CHECK: [[BUFFER_ATOMIC_ADD_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_BOTHEN_RTN]] + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) + %cast = bitcast i32 %ret to float + ret float %cast +} + +declare i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32, i32 immarg) #0 +declare i64 @llvm.amdgcn.struct.buffer.atomic.add.i64(i64, <4 x i32>, i32, i32, i32, i32 immarg) #0 + +attributes #0 = { nounwind }
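For reference, a hand-written sketch (not autogenerated output and not part of the diff) of the generic MIR that legalizeBufferAtomic() builds for the raw i32 add case, before register bank selection. The source operand order follows the buildInstr() call above (vdata, rsrc, vindex, voffset, soffset, offset, cachepolicy, idxen), which is also what the waterfall-loop operand indices in AMDGPURegisterBankInfo.cpp refer to: {2, 5} are rsrc/soffset for the plain atomics, and {3, 6} for cmpswap, whose extra cmp operand shifts them by one. Register names and the memory-operand text below are illustrative:

  ; Raw form: the intrinsic has no vindex operand, so a zero constant is inserted.
  %zeroidx:_(s32) = G_CONSTANT i32 0
  ; operands: vdata, rsrc, vindex, voffset, soffset, offset(imm), cachepolicy(imm), idxen(imm)
  %ret:_(s32) = G_AMDGPU_BUFFER_ATOMIC_ADD %val(s32), %rsrc(<4 x s32>), %zeroidx(s32), %voffset(s32), %soffset(s32), 0, 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)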