Index: lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -86,7 +86,9 @@

 // The queue ptr is only needed when casting to flat, not from it.
 static bool castRequiresQueuePtr(unsigned SrcAS) {
-  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
+  return SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
+         SrcAS == AMDGPUAS::PRIVATE_ADDRESS ||
+         SrcAS == AMDGPUAS::REGION_ADDRESS;
 }

 static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -499,8 +499,6 @@
   const SITargetLowering& Lowering =
       *static_cast<const SITargetLowering*>(getTargetLowering());

-  // Write max value to m0 before each load operation
-  assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");

   SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N),
@@ -518,10 +516,17 @@
 }

 SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
-  if (cast<MemSDNode>(N)->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS ||
-      !Subtarget->ldsRequiresM0Init())
-    return N;
-  return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
+  unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
+  if (AS == AMDGPUAS::LOCAL_ADDRESS) {
+    if (Subtarget->ldsRequiresM0Init())
+      return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
+  } else if (AS == AMDGPUAS::REGION_ADDRESS) {
+    MachineFunction &MF = CurDAG->getMachineFunction();
+    unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
+    return
+        glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
+  }
+  return N;
 }

 MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
Index: lib/Target/AMDGPU/AMDGPUInstructions.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstructions.td
+++ lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -353,6 +353,10 @@
   return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
 }]>;

+class RegionAddress : CodePatPred<[{
+  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
+}]>;
+
 class GlobalAddress : CodePatPred<[{
   return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
 }]>;
@@ -402,6 +406,9 @@
 class LocalLoad <SDPatternOperator op> : LoadFrag <op>, LocalAddress;
 class LocalStore <SDPatternOperator op> : StoreFrag <op>, LocalAddress;

+class RegionLoad <SDPatternOperator op> : LoadFrag <op>, RegionAddress;
+class RegionStore <SDPatternOperator op> : StoreFrag <op>, RegionAddress;
+
 class GlobalLoad <SDPatternOperator op> : LoadFrag<op>, GlobalLoadAddress;
 class GlobalStore <SDPatternOperator op> : StoreFrag<op>, GlobalAddress;

@@ -497,6 +504,13 @@
   return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
 }]>;

+class region_binary_atomic_op<SDNode atomic_op> :
+  PatFrag<(ops node:$ptr, node:$value),
+    (atomic_op node:$ptr, node:$value), [{
+  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
+}]>;
+
+
 def atomic_swap_local : local_binary_atomic_op<atomic_swap>;
 def atomic_load_add_local : local_binary_atomic_op<atomic_load_add>;
 def atomic_load_sub_local : local_binary_atomic_op<atomic_load_sub>;
@@ -521,6 +535,13 @@
   return AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
 }]>;

+class AtomicCmpSwapRegion <SDNode cmp_swap_node> : PatFrag<
+    (ops node:$ptr, node:$cmp, node:$swap),
+    (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{
+  AtomicSDNode *AN = cast<AtomicSDNode>(N);
+  return AN->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
+}]>;
+
 def atomic_cmp_swap_local : AtomicCmpSwapLocal <atomic_cmp_swap>;

 multiclass global_binary_atomic_op<SDNode atomic_op> {
Index: lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -134,6 +134,7 @@
   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
+  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

@@ -144,7 +145,7 @@
   };

   const std::initializer_list<LLT> AddrSpaces32 = {
-    LocalPtr, PrivatePtr
+    LocalPtr, RegionPtr, PrivatePtr
   };

   setAction({G_BRCOND, S1}, Legal);
Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -140,7 +140,7 @@
     unsigned Threshold = 0;
     if (AS == AMDGPUAS::PRIVATE_ADDRESS)
       Threshold = ThresholdPrivate;
-    else if (AS == AMDGPUAS::LOCAL_ADDRESS)
+    else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
       Threshold = ThresholdLocal;
     else
       continue;
@@ -158,7 +158,8 @@
       unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
       if (AllocaSize > MaxAlloca)
         continue;
-    } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
+    } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
+               AS == AMDGPUAS::REGION_ADDRESS) {
       LocalGEPsSeen++;
       // Inhibit unroll for local memory if we have seen addressing not to
       // a variable, most likely we will be unable to combine it.
Index: lib/Target/AMDGPU/DSInstructions.td
===================================================================
--- lib/Target/AMDGPU/DSInstructions.td
+++ lib/Target/AMDGPU/DSInstructions.td
@@ -595,19 +595,19 @@
   (DS_SWIZZLE_B32 $src, (as_i16imm $offset16), (i1 0))
 >;

-class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
+class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag, int gds=0> : GCNPat <
   (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))),
-  (inst $ptr, (as_i16imm $offset), (i1 0))
+  (inst $ptr, (as_i16imm $offset), (i1 gds))
 >;

 multiclass DSReadPat_mc <DS_Pseudo inst, ValueType vt, string frag> {

   let OtherPredicates = [LDSRequiresM0Init] in {
-    def : DSReadPat<inst, vt, !cast<PatFrag>(frag#"_m0")>;
+    def : DSReadPat<inst, vt, !cast<PatFrag>(frag#"_m0"), 0>;
   }

   let OtherPredicates = [NotLDSRequiresM0Init] in {
-    def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
+    def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag), 0>;
   }
 }

@@ -651,18 +651,18 @@
 def : DSReadPat_D16<DS_READ_I8_D16, sextloadi8_d16_lo_local, v2f16>;
 }

-class DSWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
+class DSWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag, int gds=0> : GCNPat <
   (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)),
-  (inst $ptr, $value, (as_i16imm $offset), (i1 0))
+  (inst $ptr, $value, (as_i16imm $offset), (i1 gds))
 >;

 multiclass DSWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> {
   let OtherPredicates = [LDSRequiresM0Init] in {
-    def : DSWritePat<inst, vt, !cast<PatFrag>(frag#"_m0")>;
+    def : DSWritePat<inst, vt, !cast<PatFrag>(frag#"_m0"), 0>;
   }

   let OtherPredicates = [NotLDSRequiresM0Init] in {
-    def : DSWritePat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
+    def : DSWritePat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag), 0>;
   }
 }

@@ -692,8 +692,8 @@
 defm : DSAtomicWritePat_mc <DS_WRITE_B32, i32, "atomic_store_local">;

 let OtherPredicates = [D16PreservesUnusedBits] in {
-def : DSWritePat <DS_WRITE_B16_D16_HI, i32, store_hi16_local>;
-def : DSWritePat <DS_WRITE_B8_D16_HI, i32, truncstorei8_hi16_local>;
+def : DSWritePat <DS_WRITE_B16_D16_HI, i32, store_hi16_local, 0>;
+def : DSWritePat <DS_WRITE_B8_D16_HI, i32, truncstorei8_hi16_local, 0>;
 }

@@ -728,20 +728,22 @@
 defm : DSWritePat_mc <DS_WRITE_B64, v2i32, "store_align8_local">;
 } // End AddedComplexity = 100

-class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
+class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat <
   (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
-  (inst $ptr, $value, (as_i16imm $offset), (i1 0))
+  (inst $ptr, $value, (as_i16imm $offset), (i1 gds))
 >;

 multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
   let OtherPredicates = [LDSRequiresM0Init] in {
-    def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_m0")>;
+    def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_local_m0"), 0>;
   }

   let OtherPredicates = [NotLDSRequiresM0Init] in {
     def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
-                         !cast<PatFrag>(frag)>;
+                         !cast<PatFrag>(frag#"_local"), 0>;
   }
+
+  def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0"), 1>;
 }
@@ -765,36 +767,36 @@

 // 32-bit atomics.
-defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B32, i32, "atomic_swap_local">;
-defm : DSAtomicRetPat_mc<DS_ADD_RTN_U32, i32, "atomic_load_add_local">;
-defm : DSAtomicRetPat_mc<DS_SUB_RTN_U32, i32, "atomic_load_sub_local">;
-defm : DSAtomicRetPat_mc<DS_INC_RTN_U32, i32, "atomic_inc_local">;
-defm : DSAtomicRetPat_mc<DS_DEC_RTN_U32, i32, "atomic_dec_local">;
-defm : DSAtomicRetPat_mc<DS_AND_RTN_B32, i32, "atomic_load_and_local">;
-defm : DSAtomicRetPat_mc<DS_OR_RTN_B32, i32, "atomic_load_or_local">;
-defm : DSAtomicRetPat_mc<DS_XOR_RTN_B32, i32, "atomic_load_xor_local">;
-defm : DSAtomicRetPat_mc<DS_MIN_RTN_I32, i32, "atomic_load_min_local">;
-defm : DSAtomicRetPat_mc<DS_MAX_RTN_I32, i32, "atomic_load_max_local">;
-defm : DSAtomicRetPat_mc<DS_MIN_RTN_U32, i32, "atomic_load_umin_local">;
-defm : DSAtomicRetPat_mc<DS_MAX_RTN_U32, i32, "atomic_load_umax_local">;
+defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B32, i32, "atomic_swap">;
+defm : DSAtomicRetPat_mc<DS_ADD_RTN_U32, i32, "atomic_load_add">;
+defm : DSAtomicRetPat_mc<DS_SUB_RTN_U32, i32, "atomic_load_sub">;
+defm : DSAtomicRetPat_mc<DS_INC_RTN_U32, i32, "atomic_inc">;
+defm : DSAtomicRetPat_mc<DS_DEC_RTN_U32, i32, "atomic_dec">;
+defm : DSAtomicRetPat_mc<DS_AND_RTN_B32, i32, "atomic_load_and">;
+defm : DSAtomicRetPat_mc<DS_OR_RTN_B32, i32, "atomic_load_or">;
+defm : DSAtomicRetPat_mc<DS_XOR_RTN_B32, i32, "atomic_load_xor">;
+defm : DSAtomicRetPat_mc<DS_MIN_RTN_I32, i32, "atomic_load_min">;
+defm : DSAtomicRetPat_mc<DS_MAX_RTN_I32, i32, "atomic_load_max">;
+defm : DSAtomicRetPat_mc<DS_MIN_RTN_U32, i32, "atomic_load_umin">;
+defm : DSAtomicRetPat_mc<DS_MAX_RTN_U32, i32, "atomic_load_umax">;
 defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B32, i32, "atomic_cmp_swap_local">;
-defm : DSAtomicRetPat_mc<DS_MIN_RTN_F32, f32, "atomic_load_fmin_local">;
-defm : DSAtomicRetPat_mc<DS_MAX_RTN_F32, f32, "atomic_load_fmax_local">;
-defm : DSAtomicRetPat_mc<DS_ADD_RTN_F32, f32, "atomic_load_fadd_local">;
+defm : DSAtomicRetPat_mc<DS_MIN_RTN_F32, f32, "atomic_load_fmin">;
+defm : DSAtomicRetPat_mc<DS_MAX_RTN_F32, f32, "atomic_load_fmax">;
+defm : DSAtomicRetPat_mc<DS_ADD_RTN_F32, f32, "atomic_load_fadd">;

 // 64-bit atomics.
-defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B64, i64, "atomic_swap_local">;
-defm : DSAtomicRetPat_mc<DS_ADD_RTN_U64, i64, "atomic_load_add_local">;
-defm : DSAtomicRetPat_mc<DS_SUB_RTN_U64, i64, "atomic_load_sub_local">;
-defm : DSAtomicRetPat_mc<DS_INC_RTN_U64, i64, "atomic_inc_local">;
-defm : DSAtomicRetPat_mc<DS_DEC_RTN_U64, i64, "atomic_dec_local">;
-defm : DSAtomicRetPat_mc<DS_AND_RTN_B64, i64, "atomic_load_and_local">;
-defm : DSAtomicRetPat_mc<DS_OR_RTN_B64, i64, "atomic_load_or_local">;
-defm : DSAtomicRetPat_mc<DS_XOR_RTN_B64, i64, "atomic_load_xor_local">;
-defm : DSAtomicRetPat_mc<DS_MIN_RTN_I64, i64, "atomic_load_min_local">;
-defm : DSAtomicRetPat_mc<DS_MAX_RTN_I64, i64, "atomic_load_max_local">;
-defm : DSAtomicRetPat_mc<DS_MIN_RTN_U64, i64, "atomic_load_umin_local">;
-defm : DSAtomicRetPat_mc<DS_MAX_RTN_U64, i64, "atomic_load_umax_local">;
+defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B64, i64, "atomic_swap">;
+defm : DSAtomicRetPat_mc<DS_ADD_RTN_U64, i64, "atomic_load_add">;
+defm : DSAtomicRetPat_mc<DS_SUB_RTN_U64, i64, "atomic_load_sub">;
+defm : DSAtomicRetPat_mc<DS_INC_RTN_U64, i64, "atomic_inc">;
+defm : DSAtomicRetPat_mc<DS_DEC_RTN_U64, i64, "atomic_dec">;
+defm : DSAtomicRetPat_mc<DS_AND_RTN_B64, i64, "atomic_load_and">;
+defm : DSAtomicRetPat_mc<DS_OR_RTN_B64, i64, "atomic_load_or">;
+defm : DSAtomicRetPat_mc<DS_XOR_RTN_B64, i64, "atomic_load_xor">;
+defm : DSAtomicRetPat_mc<DS_MIN_RTN_I64, i64, "atomic_load_min">;
+defm : DSAtomicRetPat_mc<DS_MAX_RTN_I64, i64, "atomic_load_max">;
+defm : DSAtomicRetPat_mc<DS_MIN_RTN_U64, i64, "atomic_load_umin">;
+defm : DSAtomicRetPat_mc<DS_MAX_RTN_U64, i64, "atomic_load_umax">;
 defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B64, i64, "atomic_cmp_swap_local">;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1151,7 +1151,7 @@
   } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
     unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
     return (MemVT.getSizeInBits() <= MaxPrivateBits);
-  } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
+  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
     return (MemVT.getSizeInBits() <= 2 * 32);
   }
   return true;
@@ -5706,7 +5706,8 @@
   SDValue Chain = M->getOperand(0);
   SDValue M0 = M->getOperand(2);
   SDValue Value = M->getOperand(3);
-  unsigned OrderedCountIndex = M->getConstantOperandVal(7);
+  unsigned OrderedCountIndex = M->getConstantOperandVal(7) & 0x3f;
+  unsigned CountDw = M->getConstantOperandVal(7) >> 24;
   unsigned WaveRelease = M->getConstantOperandVal(8);
   unsigned WaveDone = M->getConstantOperandVal(9);
   unsigned ShaderType;
@@ -5745,6 +5746,10 @@
   unsigned Offset0 = OrderedCountIndex << 2;
   unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
                      (Instruction << 4);
+
+  if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
+    Offset1 |= (CountDw - 1) << 6;
+
   unsigned Offset = Offset0 | (Offset1 << 8);

   SDValue Ops[] = {
@@ -6950,7 +6955,7 @@
     default:
       llvm_unreachable("unsupported private_element_size");
     }
-  } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
+  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
     // Use ds_read_b128 if possible.
     if (Subtarget->useDS128() && Load->getAlignment() >= 16 &&
         MemVT.getStoreSize() == 16)
@@ -7372,7 +7377,7 @@
     default:
       llvm_unreachable("unsupported private_element_size");
     }
-  } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
+  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
     // Use ds_write_b128 if possible.
     if (Subtarget->useDS128() && Store->getAlignment() >= 16 &&
         VT.getStoreSize() == 16 && NumElements != 3)
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -505,6 +505,7 @@
   >;

   def _local_m0 : local_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>;
+  def _region_m0 : region_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>;
 }

 defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">;
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -229,6 +229,7 @@
   unsigned GITPtrHigh;

   unsigned HighBitsOf32BitAddress;
+  unsigned GDSSize;

   // Current recorded maximum possible occupancy.
   unsigned Occupancy;
@@ -456,6 +457,10 @@
     return HighBitsOf32BitAddress;
   }

+  unsigned getGDSSize() const {
+    return GDSSize;
+  }
+
   unsigned getNumUserSGPRs() const {
     return NumUserSGPRs;
   }
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -46,7 +46,8 @@
     ImplicitBufferPtr(false),
     ImplicitArgPtr(false),
     GITPtrHigh(0xffffffff),
-    HighBitsOf32BitAddress(0) {
+    HighBitsOf32BitAddress(0),
+    GDSSize(4096) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const Function &F = MF.getFunction();
   FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
@@ -159,6 +160,10 @@
   S = A.getValueAsString();
   if (!S.empty())
     S.consumeInteger(0, HighBitsOf32BitAddress);
+
+  S = F.getFnAttribute("amdgpu-gds-size").getValueAsString();
+  if (!S.empty())
+    S.consumeInteger(0, GDSSize);
 }

 void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) {
Index: test/CodeGen/AMDGPU/gds-atomic.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/gds-atomic.ll
@@ -0,0 +1,28 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s
+
+; FUNC-LABEL: {{^}}atomic_add_ret_gds:
+; GCN-DAG: v_mov_b32_e32 v[[OFF:[0-9]+]]
+; GCN-DAG: s_movk_i32 m0, 0x1000
+; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[OFF]] gds
+define amdgpu_kernel void @atomic_add_ret_gds(i32 addrspace(1)* %out, i32 addrspace(2)* %gds) {
+  %val = atomicrmw volatile add i32 addrspace(2)* %gds, i32 5 seq_cst
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_add_ret_gds_const_offset:
+; GCN: s_movk_i32 m0, 0x80
+; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20 gds
+define amdgpu_kernel void @atomic_add_ret_gds_const_offset(i32 addrspace(1)* %out, i32 addrspace(2)* %gds) #0 {
+  %gep = getelementptr i32, i32 addrspace(2)* %gds, i32 5
+  %val = atomicrmw volatile add i32 addrspace(2)* %gep, i32 5 seq_cst
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* nocapture, i32, i32, i32, i1, i32, i32, i1, i1)
+
+attributes #0 = { nounwind "amdgpu-gds-size"="128" }
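
For reference, the ds_ordered_count offset encoding touched in the SIISelLowering.cpp hunks can be sanity-checked in isolation. Below is a minimal standalone C++ sketch assuming only the field layout visible in that hunk; the function name and the worked example are illustrative and not part of the patch.

    #include <cassert>
    #include <cstdint>

    // Mirrors the packing in the SIISelLowering.cpp hunk:
    //   Offset0 = OrderedCountIndex * 4 (byte offset of the counter dword)
    //   Offset1 = WaveRelease | WaveDone << 1 | ShaderType << 2 |
    //             Instruction << 4, plus (CountDw - 1) << 6 on gfx10+
    //   Offset  = Offset0 | Offset1 << 8
    uint32_t packDsOrderedOffset(uint32_t OrderedCountIndex, bool WaveRelease,
                                 bool WaveDone, uint32_t ShaderType,
                                 uint32_t Instruction, uint32_t CountDw,
                                 bool IsGfx10Plus) {
      assert(OrderedCountIndex <= 0x3f && "index field is 6 bits");
      uint32_t Offset0 = OrderedCountIndex << 2;
      uint32_t Offset1 = (WaveRelease ? 1u : 0u) | ((WaveDone ? 1u : 0u) << 1) |
                         (ShaderType << 2) | (Instruction << 4);
      if (IsGfx10Plus)
        Offset1 |= (CountDw - 1) << 6;
      return Offset0 | (Offset1 << 8);
    }

    // Example: index 1, WaveRelease set, everything else zero, CountDw = 1,
    // pre-gfx10: Offset0 = 4, Offset1 = 1, packed Offset = 0x104.

Pre-gfx10 targets have no dword-count field in the immediate, which is why the patch guards the CountDw bits with a generation check.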