Index: llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -12,6 +12,7 @@ #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/DenseMap.h" #include "llvm/CodeGen/MachineFunction.h" +#include "AMDGPU.h" namespace llvm { @@ -25,11 +26,13 @@ Align MaxKernArgAlign; // Cache for this. /// Number of bytes in the LDS that are being used. - unsigned LDSSize = 0; + uint32_t LDSSize = 0; + uint32_t GDSSize = 0; /// Number of bytes in the LDS allocated statically. This field is only used /// in the instruction selector and not part of the machine function info. - unsigned StaticLDSSize = 0; + uint32_t StaticLDSSize = 0; + uint32_t StaticGDSSize = 0; /// Align for dynamic shared memory if any. Dynamic shared memory is /// allocated directly after the static one, i.e., LDSSize. Need to pad @@ -69,6 +72,10 @@ return LDSSize; } + uint32_t getGDSSize() const { + return GDSSize; + } + AMDGPU::SIModeRegisterDefaults getMode() const { return Mode; } Index: llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -32,6 +32,15 @@ Attribute WaveLimitAttr = F.getFnAttribute("amdgpu-wave-limiter"); WaveLimiter = WaveLimitAttr.getValueAsBool(); + // FIXME: How is this attribute supposed to interact with statically known + // global sizes? + StringRef S = F.getFnAttribute("amdgpu-gds-size").getValueAsString(); + if (!S.empty()) + S.consumeInteger(0, GDSSize); + + // Assume the attribute allocates before any known GDS globals. 
+ StaticGDSSize = GDSSize; + CallingConv::ID CC = F.getCallingConv(); if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign); @@ -46,18 +55,29 @@ Align Alignment = DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType()); - /// TODO: We should sort these to minimize wasted space due to alignment - /// padding. Currently the padding is decided by the first encountered use - /// during lowering. - unsigned Offset = StaticLDSSize = alignTo(StaticLDSSize, Alignment); + unsigned Offset; + if (GV.getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + /// TODO: We should sort these to minimize wasted space due to alignment + /// padding. Currently the padding is decided by the first encountered use + /// during lowering. + Offset = StaticLDSSize = alignTo(StaticLDSSize, Alignment); - Entry.first->second = Offset; - StaticLDSSize += DL.getTypeAllocSize(GV.getValueType()); + StaticLDSSize += DL.getTypeAllocSize(GV.getValueType()); - // Update the LDS size considering the padding to align the dynamic shared - // memory. - LDSSize = alignTo(StaticLDSSize, DynLDSAlign); + // Update the LDS size considering the padding to align the dynamic shared + // memory. + LDSSize = alignTo(StaticLDSSize, DynLDSAlign); + } else { + assert(GV.getAddressSpace() == AMDGPUAS::REGION_ADDRESS && + "expected region address space"); + Offset = StaticGDSSize = alignTo(StaticGDSSize, Alignment); + StaticGDSSize += DL.getTypeAllocSize(GV.getValueType()); + // FIXME: Apply alignment of dynamic GDS + GDSSize = StaticGDSSize; + } + + Entry.first->second = Offset; return Offset; } Index: llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -437,7 +437,6 @@ unsigned GITPtrHigh; unsigned HighBitsOf32BitAddress; - unsigned GDSSize; // Current recorded maximum possible occupancy. 
unsigned Occupancy; @@ -748,10 +747,6 @@ return HighBitsOf32BitAddress; } - unsigned getGDSSize() const { - return GDSSize; - } - unsigned getNumUserSGPRs() const { return NumUserSGPRs; } Index: llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -48,8 +48,7 @@ ImplicitBufferPtr(false), ImplicitArgPtr(false), GITPtrHigh(0xffffffff), - HighBitsOf32BitAddress(0), - GDSSize(0) { + HighBitsOf32BitAddress(0) { const GCNSubtarget &ST = MF.getSubtarget(); const Function &F = MF.getFunction(); FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F); @@ -184,10 +183,6 @@ if (!S.empty()) S.consumeInteger(0, HighBitsOf32BitAddress); - S = F.getFnAttribute("amdgpu-gds-size").getValueAsString(); - if (!S.empty()) - S.consumeInteger(0, GDSSize); - // On GFX908, in order to guarantee copying between AGPRs, we need a scratch // VGPR available at all times. if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) { Index: llvm/test/CodeGen/AMDGPU/gds-allocation.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/gds-allocation.ll @@ -0,0 +1,132 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +@gds0 = internal addrspace(2) global [4 x i32] undef, align 4 +@lds0 = internal addrspace(3) global [4 x i32] undef, align 128 +@lds1 = internal addrspace(3) global [4 x i32] undef, align 256 + +; These two objects should be allocated at the same constant offsets +; from the base. 
+define amdgpu_kernel void @alloc_lds_gds(i32 addrspace(1)* %out) #1 { +; GCN-LABEL: alloc_lds_gds: +; GCN: ; %bb.0: +; GCN-NEXT: v_mov_b32_e32 v0, 5 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_mov_b32 m0, 16 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: ds_add_u32 v1, v0 offset:12 gds +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_wbinvl1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ds_add_u32 v1, v0 offset:12 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_endpgm + %gep.gds = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds0, i32 0, i32 3 + %val0 = atomicrmw add i32 addrspace(2)* %gep.gds, i32 5 acq_rel + %gep.lds = getelementptr [4 x i32], [4 x i32] addrspace(3)* @lds0, i32 0, i32 3 + %val1 = atomicrmw add i32 addrspace(3)* %gep.lds, i32 5 acq_rel + ret void +} + +; The LDS alignment shouldn't change offset of GDS. +define amdgpu_kernel void @alloc_lds_gds_align(i32 addrspace(1)* %out) #1 { +; GCN-LABEL: alloc_lds_gds_align: +; GCN: ; %bb.0: +; GCN-NEXT: v_mov_b32_e32 v0, 5 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_mov_b32 m0, 16 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: ds_add_u32 v1, v0 offset:12 gds +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_wbinvl1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ds_add_u32 v1, v0 offset:140 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ds_add_u32 v1, v0 offset:12 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_endpgm + %gep.gds = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds0, i32 0, i32 3 + %val0 = atomicrmw add i32 addrspace(2)* %gep.gds, i32 5 acq_rel + + %gep.lds0 = getelementptr [4 x i32], [4 x i32] addrspace(3)* @lds0, i32 0, i32 3 + %val1 = atomicrmw add i32 addrspace(3)* %gep.lds0, i32 5 acq_rel + + %gep.lds1 = getelementptr [4 x i32], [4 x i32] addrspace(3)* @lds1, i32 0, i32 3 + %val2 = atomicrmw add i32 addrspace(3)* %gep.lds1, i32 5 acq_rel + ret void +} + +@gds_align8 = internal addrspace(2) global [4 x i32] undef, align 8 +@gds_align32 = 
internal addrspace(2) global [4 x i32] undef, align 32 + +define amdgpu_kernel void @gds_global_align(i32 addrspace(1)* %out) { +; GCN-LABEL: gds_global_align: +; GCN: ; %bb.0: +; GCN-NEXT: v_mov_b32_e32 v0, 5 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_mov_b32 m0, 32 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: ds_add_u32 v1, v0 offset:28 gds +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_wbinvl1 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: ds_add_u32 v1, v0 offset:12 gds +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_wbinvl1 +; GCN-NEXT: s_endpgm + %gep.gds0 = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds_align8, i32 0, i32 3 + %val0 = atomicrmw add i32 addrspace(2)* %gep.gds0, i32 5 acq_rel + %gep.gds1 = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds_align32, i32 0, i32 3 + %val1 = atomicrmw add i32 addrspace(2)* %gep.gds1, i32 5 acq_rel + ret void +} + +define amdgpu_kernel void @gds_global_align_plus_attr(i32 addrspace(1)* %out) #0 { +; GCN-LABEL: gds_global_align_plus_attr: +; GCN: ; %bb.0: +; GCN-NEXT: v_mov_b32_e32 v0, 5 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_movk_i32 m0, 0x420 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: ds_add_u32 v1, v0 offset:1052 gds +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_wbinvl1 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: ds_add_u32 v1, v0 offset:1036 gds +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_wbinvl1 +; GCN-NEXT: s_endpgm + %gep.gds0 = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds_align8, i32 0, i32 3 + %val0 = atomicrmw add i32 addrspace(2)* %gep.gds0, i32 5 acq_rel + %gep.gds1 = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds_align32, i32 0, i32 3 + %val1 = atomicrmw add i32 addrspace(2)* %gep.gds1, i32 5 acq_rel + ret void +} + +@small.gds = internal addrspace(2) global i8 undef, align 1 +@gds.external = external unnamed_addr addrspace(3) global [0 x i32], align 4 + +define amdgpu_kernel void 
@gds_extern_align(i32 addrspace(1)* %out, [4 x i32] addrspace(2)* %gds.arg) #0 { +; GCN-LABEL: gds_extern_align: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s0, s[0:1], 0x8 +; GCN-NEXT: v_mov_b32_e32 v0, 5 +; GCN-NEXT: s_movk_i32 m0, 0x401 +; GCN-NEXT: s_movk_i32 s1, 0x400 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; use s1 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: ds_add_u32 v1, v0 offset:12 gds +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_wbinvl1 +; GCN-NEXT: s_endpgm + call void asm sideeffect "; use $0","s"(i8 addrspace(2)* @small.gds) + %gep.gds0 = getelementptr [4 x i32], [4 x i32] addrspace(2)* %gds.arg, i32 0, i32 3 + %val0 = atomicrmw add i32 addrspace(2)* %gep.gds0, i32 5 acq_rel + ret void +} + +attributes #0 = { "amdgpu-gds-size"="1024" } Index: llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll =================================================================== --- llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll +++ llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll @@ -53,7 +53,7 @@ ; CHECK-NEXT: explicitKernArgSize: 0 ; CHECK-NEXT: maxKernArgAlign: 4 ; CHECK-NEXT: ldsSize: 0 -; CHECK-NEXT: gdsSize: 0 +; CHECK-NEXT: gdsSize: 512 ; CHECK-NEXT: dynLDSAlign: 1 ; CHECK-NEXT: isEntryFunction: true ; CHECK-NEXT: noSignedZerosFPMath: false @@ -80,6 +80,8 @@ ; CHECK-NEXT: occupancy: 10 ; CHECK-NEXT: body: define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) { + %gep = getelementptr inbounds [128 x i32], [128 x i32] addrspace(2)* @gds, i32 0, i32 %arg0 + atomicrmw add i32 addrspace(2)* %gep, i32 8 seq_cst ret void }