Index: llvm/include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -973,9 +973,9 @@ def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore; def int_amdgcn_struct_buffer_store : AMDGPUStructBufferStore; -class AMDGPURawBufferAtomic : Intrinsic < - [data_ty], - [LLVMMatchType<0>, // vdata(VGPR) +class AMDGPURawBufferAtomic : Intrinsic < + !if(NoRtn, [], [data_ty]), + [!if(NoRtn, data_ty, LLVMMatchType<0>), // vdata(VGPR) llvm_v4i32_ty, // rsrc(SGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) @@ -1005,9 +1005,12 @@ [ImmArg>, IntrWillReturn], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<2, 0>; -class AMDGPUStructBufferAtomic : Intrinsic < - [data_ty], - [LLVMMatchType<0>, // vdata(VGPR) +// gfx908 intrinsic +def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic; + +class AMDGPUStructBufferAtomic : Intrinsic < + !if(NoRtn, [], [data_ty]), + [!if(NoRtn, data_ty, LLVMMatchType<0>), // vdata(VGPR) llvm_v4i32_ty, // rsrc(SGPR) llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) @@ -1039,6 +1042,10 @@ [ImmArg>, IntrWillReturn], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<2, 0>; +// gfx908 intrinsic +def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic; + + // Obsolescent tbuffer intrinsics. def int_amdgcn_tbuffer_load : Intrinsic < [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 @@ -1804,9 +1811,11 @@ [IntrArgMemOnly, IntrWillReturn, NoCapture>], "", [SDNPMemOperand]>; -def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicNoRtn; def int_amdgcn_global_atomic_fadd : AMDGPUGlobalAtomicNoRtn; +// Legacy form of the intrinsic. raw and struct forms should be preferred. 
+def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicNoRtn; + // llvm.amdgcn.mfma.f32.* vdst, srcA, srcB, srcC, cbsz, abid, blgp def int_amdgcn_mfma_f32_32x32x1f32 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_32x32x1f32">, Intrinsic<[llvm_v32f32_ty], Index: llvm/lib/Target/AMDGPU/AMDGPUGISel.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -199,6 +199,7 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -3606,6 +3606,9 @@ case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; + case Intrinsic::amdgcn_raw_buffer_atomic_fadd: + case Intrinsic::amdgcn_struct_buffer_atomic_fadd: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; default: llvm_unreachable("unhandled atomic opcode"); } @@ -3616,12 +3619,20 @@ Intrinsic::ID IID) const { const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; + const bool HasReturn = MI.getNumExplicitDefs() != 0; - Register Dst = MI.getOperand(0).getReg(); - Register VData = MI.getOperand(2).getReg(); + Register Dst; - Register CmpVal; int OpOffset = 0; + if (HasReturn) { + // A few FP atomics do not support return values. + Dst = MI.getOperand(0).getReg(); + } else { + OpOffset = -1; + } + + Register VData = MI.getOperand(2 + OpOffset).getReg(); + Register CmpVal; if (IsCmpSwap) { CmpVal = MI.getOperand(3 + OpOffset).getReg(); @@ -3629,7 +3640,7 @@ } Register RSrc = MI.getOperand(3 + OpOffset).getReg(); - const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; + const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; // The struct intrinsic variants add one additional operand over raw. const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; @@ -3654,9 +3665,12 @@ if (!VIndex) VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); - auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) - .addDef(Dst) - .addUse(VData); // vdata + auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); + + if (HasReturn) + MIB.addDef(Dst); + + MIB.addUse(VData); // vdata if (IsCmpSwap) MIB.addReg(CmpVal); @@ -4419,6 +4433,8 @@ case Intrinsic::amdgcn_struct_buffer_atomic_inc: case Intrinsic::amdgcn_raw_buffer_atomic_dec: case Intrinsic::amdgcn_struct_buffer_atomic_dec: + case Intrinsic::amdgcn_raw_buffer_atomic_fadd: + case Intrinsic::amdgcn_struct_buffer_atomic_fadd: case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: return legalizeBufferAtomic(MI, B, IntrID); Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -2957,6 +2957,11 @@ executeInWaterfallLoop(MI, MRI, {2, 5}); return; } + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: { + applyDefaultMapping(OpdMapper); + executeInWaterfallLoop(MI, MRI, {1, 4}); + return; + } case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { applyDefaultMapping(OpdMapper); executeInWaterfallLoop(MI, MRI, {3, 6}); @@ -3906,6 +3911,23 @@ // initialized. 
break; } + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: { + // vdata_in + OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + + // rsrc + OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); + + // vindex + OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + + // voffset + OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); + + // soffset + OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + break; + } case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { // vdata_out OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); Index: llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -225,6 +225,7 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; @@ -238,6 +239,7 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; Index: llvm/lib/Target/AMDGPU/BUFInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/BUFInstructions.td +++ llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1399,24 +1399,24 @@ (name vt:$vdata_in, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, timm:$cachepolicy, 0), - (!cast(opcode # _OFFSET) $vdata_in, $rsrc, $soffset, - (as_i16imm $offset), (extract_slc $cachepolicy)) + (!cast(opcode # _OFFSET) getVregSrcForVT.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset, + (as_i16timm $offset), (extract_slc $cachepolicy)) >; def : GCNPat< (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, timm:$cachepolicy, timm), - (!cast(opcode # _IDXEN) $vdata_in, $vindex, $rsrc, $soffset, - (as_i16imm $offset), (extract_slc $cachepolicy)) + (!cast(opcode # _IDXEN) getVregSrcForVT.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, + (as_i16timm $offset), (extract_slc $cachepolicy)) >; def : GCNPat< (name vt:$vdata_in, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, timm:$cachepolicy, 0), - (!cast(opcode # _OFFEN) $vdata_in, $voffset, $rsrc, $soffset, - (as_i16imm $offset), (extract_slc $cachepolicy)) + (!cast(opcode # _OFFEN) getVregSrcForVT.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, + (as_i16timm $offset), (extract_slc $cachepolicy)) >; def : GCNPat< @@ -1424,9 +1424,9 @@ i32:$voffset, i32:$soffset, timm:$offset, timm:$cachepolicy, timm), (!cast(opcode # _BOTHEN) - $vdata_in, - (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), - $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)) + getVregSrcForVT.ret:$vdata_in, + (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), + SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy)) >; } Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1082,8 +1082,9 @@ Info.flags |= MachineMemOperand::MOStore; } else { // Atomic - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(CI.getType()); + Info.opc = CI.getType()->isVoidTy() ? 
ISD::INTRINSIC_VOID : + ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType()); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore | MachineMemOperand::MODereferenceable; @@ -7083,7 +7084,6 @@ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC); case Intrinsic::amdgcn_raw_buffer_atomic_dec: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC); - case Intrinsic::amdgcn_struct_buffer_atomic_swap: return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP); @@ -7506,7 +7506,10 @@ return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand()); } - + case Intrinsic::amdgcn_raw_buffer_atomic_fadd: + return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); + case Intrinsic::amdgcn_struct_buffer_atomic_fadd: + return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); case Intrinsic::amdgcn_buffer_atomic_fadd: { unsigned Slc = cast(Op.getOperand(6))->getZExtValue(); unsigned IdxEn = 1; Index: llvm/lib/Target/AMDGPU/SIInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/SIInstructions.td +++ llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2430,8 +2430,8 @@ def G_AMDGPU_ATOMIC_DEC : G_ATOMICRMW_OP; } -class BufferAtomicGenericInstruction : AMDGPUGenericInstruction { - let OutOperandList = (outs type0:$dst); +class BufferAtomicGenericInstruction : AMDGPUGenericInstruction { + let OutOperandList = !if(NoRtn, (outs), (outs type0:$dst)); let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset, type2:$soffset, untyped_imm_0:$offset, untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen); @@ -2452,6 +2452,7 @@ def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction; +def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction<1/*NoRtn*/>; def G_AMDGPU_BUFFER_ATOMIC_CMPSWAP : AMDGPUGenericInstruction { let OutOperandList = (outs type0:$dst); Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll @@ -0,0 +1,245 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s + +; Natural mapping +define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + 
; CHECK: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 4095, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + %voffset.add = add i32 %voffset, 4095 + call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) + ret void +} + +define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_4095__sgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 4095, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) + ret void +} + +; Natural mapping, no voffset +define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = 
COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) + ret void +} + +; All operands need regbank legalization +define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset(float inreg %val, <4 x i32> %rsrc, i32 inreg %voffset, i32 %soffset) { + ; CHECK-LABEL: name: raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; CHECK: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; CHECK: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: bb.2: + ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY9]], implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY10]], implicit $exec + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec + ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], 
implicit-def $scc + ; CHECK: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; CHECK: bb.3: + ; CHECK: successors: %bb.4(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: bb.4: + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +; All operands need regbank legalization, no voffset +define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_voffset__vgpr_soffset(float inreg %val, <4 x i32> %rsrc, i32 %soffset) { + ; CHECK-LABEL: name: raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_voffset__vgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; CHECK: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; CHECK: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: bb.2: + ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY7]], implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = 
V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec + ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc + ; CHECK: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; CHECK: bb.3: + ; CHECK: successors: %bb.4(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: bb.4: + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) + ret void +} + +define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095(float %val, <4 x i32> inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 4095, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + %voffset = add i32 %voffset.base, 4095 + call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +; Natural mapping + slc +define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 
%voffset, i32 %soffset, i32 2) + ret void +} + +define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg) #0 +declare void @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg) #0 + +attributes #0 = { nounwind } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll @@ -0,0 +1,260 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s + +; Natural mapping +define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset 
+ ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; CHECK: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; CHECK: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 4095, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + %voffset.add = add i32 %voffset, 4095 + call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0) + ret void +} + +define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__4095_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { + ; CHECK-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__4095_voffset__sgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], 
%subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 4095, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 4095, i32 %soffset, i32 0) + ret void +} + +; Natural mapping, no voffset +define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { + ; CHECK-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) + ret void +} + +; All register operands need legalization +define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset(float inreg %val, <4 x i32> %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { + ; CHECK-LABEL: name: struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; CHECK: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; CHECK: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; CHECK: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; CHECK: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: bb.2: + ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE 
[[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY11]], implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY12]], implicit $exec + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec + ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc + ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 + ; CHECK: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY8]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; CHECK: bb.3: + ; CHECK: successors: %bb.4(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: bb.4: + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +; All register operands need legalization, no voffset +define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_voffset__vgpr_soffset(float inreg %val, <4 x i32> %rsrc, i32 inreg %vindex, i32 %soffset) { + ; CHECK-LABEL: name: struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_voffset__vgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; CHECK: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; CHECK: [[COPY10:%[0-9]+]]:vreg_64 = COPY 
[[REG_SEQUENCE]].sub2_sub3 + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: bb.2: + ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY9]], implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY10]], implicit $exec + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec + ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc + ; CHECK: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; CHECK: bb.3: + ; CHECK: successors: %bb.4(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: bb.4: + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) + ret void +} + +; Natural mapping + slc +define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = 
REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; CHECK: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) + ret void +} + +define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { + ; CHECK-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset_slc + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 2) + ret void +} + +define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; CHECK: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { + ; 
CHECK-LABEL: name: struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #0 +declare void @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) #0 + +attributes #0 = { nounwind } Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll @@ -0,0 +1,72 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s | FileCheck %s -check-prefix=CHECK + +define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +; CHECK-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b32 s11, s5 +; CHECK-NEXT: s_mov_b32 s10, s4 +; CHECK-NEXT: s_mov_b32 s9, s3 +; CHECK-NEXT: s_mov_b32 s8, s2 +; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s6 offen +; CHECK-NEXT: s_endpgm + call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 24) + ret void +} + +define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { +; CHECK-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b32 s11, s5 +; CHECK-NEXT: s_mov_b32 s10, s4 +; CHECK-NEXT: s_mov_b32 s9, s3 +; CHECK-NEXT: s_mov_b32 s8, s2 +; CHECK-NEXT: buffer_atomic_add_f32 v0, off, s[8:11], s6 +; CHECK-NEXT: s_endpgm + call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) + ret void +} + +define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +; CHECK-LABEL: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b32 s11, s5 +; CHECK-NEXT: s_mov_b32 s10, s4 +; CHECK-NEXT: s_mov_b32 s9, s3 +; CHECK-NEXT: s_mov_b32 s8, s2 +; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], 
s6 offen +; CHECK-NEXT: s_endpgm + call void @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +; CHECK-LABEL: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b32 s11, s5 +; CHECK-NEXT: s_mov_b32 s10, s4 +; CHECK-NEXT: s_mov_b32 s9, s3 +; CHECK-NEXT: s_mov_b32 s8, s2 +; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, off, s[8:11], s6 offset:92 +; CHECK-NEXT: s_endpgm + call void @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 92, i32 %soffset, i32 0) + ret void +} + +define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +; CHECK-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b32 s11, s5 +; CHECK-NEXT: s_mov_b32 s10, s4 +; CHECK-NEXT: s_mov_b32 s9, s3 +; CHECK-NEXT: s_mov_b32 s8, s2 +; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s6 offen slc +; CHECK-NEXT: s_endpgm + call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) + ret void +} + +declare void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg) #0 +declare void @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg) #0 + +attributes #0 = { nounwind } Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll @@ -0,0 +1,61 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s | FileCheck %s -check-prefix=CHECK + + +define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +; CHECK-LABEL: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b32 s11, s5 +; CHECK-NEXT: s_mov_b32 s10, s4 +; CHECK-NEXT: s_mov_b32 s9, s3 +; CHECK-NEXT: s_mov_b32 s8, s2 +; CHECK-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[8:11], s6 idxen offen +; CHECK-NEXT: s_endpgm + call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +; Natural mapping, no voffset +define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { +; CHECK-LABEL: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b32 s11, s5 +; CHECK-NEXT: s_mov_b32 s10, s4 +; CHECK-NEXT: s_mov_b32 s9, s3 +; CHECK-NEXT: s_mov_b32 s8, s2 +; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s6 idxen +; CHECK-NEXT: s_endpgm + call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) + ret void +} 
+ +define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +; CHECK-LABEL: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b32 s11, s5 +; CHECK-NEXT: s_mov_b32 s10, s4 +; CHECK-NEXT: s_mov_b32 s9, s3 +; CHECK-NEXT: s_mov_b32 s8, s2 +; CHECK-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[8:11], s6 idxen offen slc +; CHECK-NEXT: s_endpgm + call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) + ret void +} + +define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +; CHECK-LABEL: struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b32 s11, s5 +; CHECK-NEXT: s_mov_b32 s10, s4 +; CHECK-NEXT: s_mov_b32 s9, s3 +; CHECK-NEXT: s_mov_b32 s8, s2 +; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[8:11], s6 idxen offen +; CHECK-NEXT: s_endpgm + call void @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #0 +declare void @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) #0 + +attributes #0 = { nounwind }
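Example usage of the new intrinsics (a minimal sketch, not part of the patch): the declarations are copied from the test files above, the function and value names are illustrative, and the trailing i32 immarg is the cachepolicy operand (the slc tests above pass 2 for it). Both gfx908 forms return void; the struct form carries one extra vindex operand between rsrc and voffset.

declare void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg)
declare void @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg)

define amdgpu_ps void @fadd_usage_sketch(float %v, <2 x half> %pk, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
  ; Raw form: vdata, rsrc, voffset, soffset, cachepolicy; no value is returned.
  call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %v, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  ; Struct form: same operands plus a vindex between rsrc and voffset.
  call void @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %pk, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
  ret void
}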