diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -9,9 +9,10 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEFUNCTION_H
 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEFUNCTION_H
 
+#include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/CodeGen/MachineFunction.h"
-#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/Support/Alignment.h"
 
 namespace llvm {
 
@@ -27,7 +28,10 @@
   Align MaxKernArgAlign; // Cache for this.
 
   /// Number of bytes in the LDS that are being used.
-  unsigned LDSSize = 0;
+  // Mutable because getLDSSize() const applies dynamic-LDS padding lazily.
+  mutable unsigned LDSSize = 0;
+
+  /// Alignment of dynamic shared memory, if any. Dynamic shared memory is
+  /// allocated directly after the static LDS, i.e., at offset LDSSize, so
+  /// LDSSize may need padding to honor this alignment (see getLDSSize()).
+  MaybeAlign DynLDSAlign;
 
   // State of MODE register, assumed FP mode.
   AMDGPU::SIModeRegisterDefaults Mode;
@@ -54,6 +58,8 @@
   unsigned getMaxKernArgAlign() const { return MaxKernArgAlign.value(); }
 
   unsigned getLDSSize() const {
+    // Pad the static LDS size so that the dynamic shared memory placed
+    // right after it is properly aligned.
+    if (DynLDSAlign)
+      LDSSize = alignTo(LDSSize, *DynLDSAlign);
     return LDSSize;
   }
 
@@ -78,6 +84,11 @@
   }
 
   unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV);
+
+  /// Record a dynamic LDS alignment requirement, keeping the maximum seen.
+  void setDynLDSAlign(Align A) {
+    if (!DynLDSAlign || A > *DynLDSAlign)
+      DynLDSAlign = A;
+  }
 };
 
 }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5566,15 +5566,28 @@
                                              SDValue Op,
                                              SelectionDAG &DAG) const {
   GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
+  SDLoc DL(GSD);
+  EVT PtrVT = Op.getValueType();
+
   const GlobalValue *GV = GSD->getGlobal();
   if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
        shouldUseLDSConstAddress(GV)) ||
       GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
-      GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+      GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
+    if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+        GV->hasExternalLinkage()) {
+      ArrayType *ATy = dyn_cast<ArrayType>(GV->getValueType());
+      if (ATy && ATy->getNumElements() == 0) {
+        assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
+        // Adjust alignment for this dynamic shared memory array.
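+        // A zero-sized external array reserves no static LDS itself; record
+        // its element type's ABI alignment so that getLDSSize() can pad the
+        // static LDS size and the dynamic region starts suitably aligned.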
+        MFI->setDynLDSAlign(
+            DAG.getDataLayout().getABITypeAlign(ATy->getElementType()));
+        // Lower the address to GET_GROUPSTATICSIZE, which materializes the
+        // padded total static LDS size, i.e., the dynamic array's offset.
+        return SDValue(
+            DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
+      }
+    }
     return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
-
-  SDLoc DL(GSD);
-  EVT PtrVT = Op.getValueType();
+  }
 
   if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
     SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
diff --git a/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll b/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll
@@ -0,0 +1,98 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
+
+@lds0 = addrspace(3) global [512 x float] undef, align 4
+@lds1 = addrspace(3) global [256 x float] undef, align 4
+@lds2 = addrspace(3) global [4096 x float] undef, align 4
+@lds3 = addrspace(3) global [67 x i8] undef, align 4
+
+@dynamic_shared0 = external addrspace(3) global [0 x float], align 4
+@dynamic_shared1 = external addrspace(3) global [0 x double], align 4
+
+; CHECK-LABEL: {{^}}dynamic_shared_array_0:
+; CHECK: v_add_u32_e32 v{{[0-9]+}}, 0x800, v{{[0-9]+}}
+define amdgpu_kernel void @dynamic_shared_array_0(float addrspace(1)* %out) {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %tid.x
+  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val0, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}dynamic_shared_array_1:
+; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0xc00
+; CHECK: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
+define amdgpu_kernel void @dynamic_shared_array_1(float addrspace(1)* %out, i32 %cond) {
+entry:
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %idx.0 = add nsw i32 %tid.x, 64
+  %tmp = icmp eq i32 %cond, 0
+  br i1 %tmp, label %if, label %else
+
+if:                                               ; preds = %entry
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
+  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+  br label %endif
+
+else:                                             ; preds = %entry
+  %arrayidx1 = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @lds1, i32 0, i32 %idx.0
+  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
+  br label %endif
+
+endif:                                            ; preds = %else, %if
+  %val = phi float [ %val0, %if ], [ %val1, %else ]
+  %arrayidx = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val, float addrspace(3)* %arrayidx, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}dynamic_shared_array_2:
+; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x4000
+; CHECK: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
+define amdgpu_kernel void @dynamic_shared_array_2(i32 %idx) {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %vidx = add i32 %tid.x, %idx
+  %arrayidx0 = getelementptr inbounds [4096 x float], [4096 x float] addrspace(3)* @lds2, i32 0, i32 %vidx
+  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val0, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
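+; Note: the base offset checked in each test equals the padded static LDS
+; size; e.g., 0x4000 above is the 4096 floats * 4 bytes of @lds2.
+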
+; The offset to the dynamic shared memory array should be aligned to the
+; element type of that array.
+; CHECK-LABEL: {{^}}dynamic_shared_array_3:
+; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x44
+; CHECK: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
+define amdgpu_kernel void @dynamic_shared_array_3(i32 %idx) {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %vidx = add i32 %tid.x, %idx
+  %arrayidx0 = getelementptr inbounds [67 x i8], [67 x i8] addrspace(3)* @lds3, i32 0, i32 %vidx
+  %val0 = load i8, i8 addrspace(3)* %arrayidx0, align 4
+  %val1 = uitofp i8 %val0 to float
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; The offset to the dynamic shared memory array should be aligned to the
+; maximum alignment among all dynamic shared arrays (here, 8 for the double
+; array).
+; CHECK-LABEL: {{^}}dynamic_shared_array_4:
+; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x48
+; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
+; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 3, [[DYNLDS]]
+define amdgpu_kernel void @dynamic_shared_array_4(i32 %idx) {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %vidx = add i32 %tid.x, %idx
+  %arrayidx0 = getelementptr inbounds [67 x i8], [67 x i8] addrspace(3)* @lds3, i32 0, i32 %vidx
+  %val0 = load i8, i8 addrspace(3)* %arrayidx0, align 4
+  %val1 = uitofp i8 %val0 to float
+  %val2 = uitofp i8 %val0 to double
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds [0 x double], [0 x double] addrspace(3)* @dynamic_shared1, i32 0, i32 %tid.x
+  store double %val2, double addrspace(3)* %arrayidx2, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
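
For context (not part of the patch): the new test mirrors HIP's dynamically sized shared memory. A minimal HIP sketch of the source pattern that produces such a zero-sized external addrspace(3) global and relies on this lowering; the kernel and variable names here are illustrative, not taken from the patch:

#include <hip/hip_runtime.h>

// Declares a dynamically sized LDS array; clang emits it roughly as
//   @dynSmem = external addrspace(3) global [0 x float], align 4
extern __shared__ float dynSmem[];

__global__ void reverse(float *data, int n) {
  int tid = threadIdx.x;
  dynSmem[tid] = data[tid]; // base address = padded static LDS size
  __syncthreads();
  data[tid] = dynSmem[n - 1 - tid];
}

int main() {
  const int n = 256;
  float *d;
  hipMalloc(&d, n * sizeof(float));
  // The third launch argument is the dynamic LDS size in bytes.
  reverse<<<1, n, n * sizeof(float)>>>(d, n);
  hipDeviceSynchronize();
  hipFree(d);
}

Compiled with hipcc for gfx900, this should produce the zero-sized external global and the GET_GROUPSTATICSIZE-based addressing exercised by the test above.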