diff --git a/llvm/include/llvm/CodeGen/MIRYamlMapping.h b/llvm/include/llvm/CodeGen/MIRYamlMapping.h
--- a/llvm/include/llvm/CodeGen/MIRYamlMapping.h
+++ b/llvm/include/llvm/CodeGen/MIRYamlMapping.h
@@ -159,6 +159,22 @@
   static QuotingType mustQuote(StringRef) { return QuotingType::None; }
 };
 
+template <> struct ScalarTraits<Align> {
+  static void output(const Align &Alignment, void *, llvm::raw_ostream &OS) {
+    OS << Alignment.value();
+  }
+  static StringRef input(StringRef Scalar, void *, Align &Alignment) {
+    unsigned long long N;
+    if (getAsUnsignedInteger(Scalar, 10, N))
+      return "invalid number";
+    if (!isPowerOf2_64(N))
+      return "must be a power of two";
+    Alignment = Align(N);
+    return StringRef();
+  }
+  static QuotingType mustQuote(StringRef) { return QuotingType::None; }
+};
+
 } // end namespace yaml
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2279,6 +2279,25 @@
     return true; // Leave in place;
   }
 
+  if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
+    Type *Ty = GV->getValueType();
+    // HIP uses an unsized array `extern __shared__ T s[]`, or a similar
+    // zero-sized type in other languages, to declare dynamic shared memory
+    // whose size is not known at compile time. It is allocated by the
+    // runtime and placed directly after the statically allocated variables,
+    // so all such arrays share the same offset.
+    if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
+      // Adjust the alignment for that dynamic shared memory array.
+      MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
+      LLT S32 = LLT::scalar(32);
+      auto Sz =
+          B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
+      B.buildIntToPtr(DstReg, Sz);
+      MI.eraseFromParent();
+      return true;
+    }
+  }
+
   B.buildConstant(
       DstReg,
       MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
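For context, a minimal HIP-style sketch of the source pattern this lowering targets (a hypothetical kernel, not part of the patch): the `extern __shared__` declaration is what front ends emit as a zero-sized external global in the LDS address space, such as `@dynamic_shared0` in the tests further down.

```cpp
// Hypothetical HIP kernel; `dyn_smem` compiles to a zero-sized external
// addrspace(3) global, so its address resolves to the total size of the
// static LDS allocations (llvm.amdgcn.groupstaticsize), as lowered above.
#include <hip/hip_runtime.h>

__global__ void scale(float *out, float k) {
  extern __shared__ float dyn_smem[]; // size fixed at launch, not at compile time
  int i = threadIdx.x;
  dyn_smem[i] = out[i] * k; // placed directly after static LDS variables
  __syncthreads();
  out[i] = dyn_smem[i];
}
```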
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -9,9 +9,10 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEFUNCTION_H
 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEFUNCTION_H
 
+#include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/CodeGen/MachineFunction.h"
-#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/Support/Alignment.h"
 
 namespace llvm {
 
@@ -29,6 +30,17 @@
   /// Number of bytes in the LDS that are being used.
   unsigned LDSSize = 0;
 
+  /// Number of bytes in the LDS allocated statically. This field is only used
+  /// in the instruction selector and is not serialized as part of the machine
+  /// function info.
+  unsigned StaticLDSSize = 0;
+
+  /// Alignment of the dynamic shared memory, if any. Dynamic shared memory is
+  /// allocated directly after the static one, i.e., at offset LDSSize, so
+  /// LDSSize needs padding to keep the dynamic region aligned accordingly.
+  /// The maximal alignment is updated during the IR translation or lowering
+  /// stages.
+  Align DynLDSAlign;
+
   // State of MODE register, assumed FP mode.
   AMDGPU::SIModeRegisterDefaults Mode;
 
@@ -78,6 +90,10 @@
   }
 
   unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV);
+
+  Align getDynLDSAlign() const { return DynLDSAlign; }
+
+  void setDynLDSAlign(const DataLayout &DL, const GlobalVariable &GV);
 };
 
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -49,10 +49,27 @@
   /// TODO: We should sort these to minimize wasted space due to alignment
   /// padding. Currently the padding is decided by the first encountered use
   /// during lowering.
-  unsigned Offset = LDSSize = alignTo(LDSSize, Alignment);
+  unsigned Offset = StaticLDSSize = alignTo(StaticLDSSize, Alignment);
 
   Entry.first->second = Offset;
-  LDSSize += DL.getTypeAllocSize(GV.getValueType());
+  StaticLDSSize += DL.getTypeAllocSize(GV.getValueType());
+
+  // Update the LDS size considering the padding needed to align the dynamic
+  // shared memory.
+  LDSSize = alignTo(StaticLDSSize, DynLDSAlign);
 
   return Offset;
 }
+
+void AMDGPUMachineFunction::setDynLDSAlign(const DataLayout &DL,
+                                           const GlobalVariable &GV) {
+  assert(DL.getTypeAllocSize(GV.getValueType()).isZero());
+
+  Align Alignment =
+      DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType());
+  if (Alignment <= DynLDSAlign)
+    return;
+
+  LDSSize = alignTo(StaticLDSSize, Alignment);
+  DynLDSAlign = Alignment;
+}
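The padding rule above can be checked against the constants the tests below expect. A standalone sketch (plain C++, not the LLVM `alignTo` itself; `alignUp` is an illustrative name), using the 67-byte static `@lds3` allocation from the tests:

```cpp
#include <cassert>
#include <cstdint>

// Round Value up to the next multiple of the power-of-two Alignment;
// a plain restatement of the layout rule above, not llvm::alignTo.
static uint64_t alignUp(uint64_t Value, uint64_t Alignment) {
  return (Value + Alignment - 1) & ~(Alignment - 1);
}

int main() {
  const uint64_t StaticLDSSize = 67; // @lds3 = [67 x i8] in the tests below
  assert(alignUp(StaticLDSSize, 4) == 0x44);  // float-only dynamic array
  assert(alignUp(StaticLDSSize, 8) == 0x48);  // float + double: maximal align 8
  assert(alignUp(StaticLDSSize, 16) == 0x50); // explicit `align 16` on the global
  return 0;
}
```

These are exactly the 0x44, 0x48, and 0x50 offsets the FileCheck lines in `dynamic_shared_array_3` through `dynamic_shared_array_6` look for.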
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5571,15 +5571,32 @@
                                              SDValue Op,
                                              SelectionDAG &DAG) const {
   GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
+  SDLoc DL(GSD);
+  EVT PtrVT = Op.getValueType();
+
   const GlobalValue *GV = GSD->getGlobal();
   if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
        shouldUseLDSConstAddress(GV)) ||
       GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
-      GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+      GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
+    if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+        GV->hasExternalLinkage()) {
+      Type *Ty = GV->getValueType();
+      // HIP uses an unsized array `extern __shared__ T s[]`, or a similar
+      // zero-sized type in other languages, to declare dynamic shared memory
+      // whose size is not known at compile time. It is allocated by the
+      // runtime and placed directly after the statically allocated variables,
+      // so all such arrays share the same offset.
+      if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
+        assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
+        // Adjust the alignment for that dynamic shared memory array.
+        MFI->setDynLDSAlign(DAG.getDataLayout(), *cast<GlobalVariable>(GV));
+        return SDValue(
+            DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
+      }
+    }
     return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
-
-  SDLoc DL(GSD);
-  EVT PtrVT = Op.getValueType();
+  }
 
   if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
     SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -277,6 +277,7 @@
   uint64_t ExplicitKernArgSize = 0;
   unsigned MaxKernArgAlign = 0;
   unsigned LDSSize = 0;
+  Align DynLDSAlign;
   bool IsEntryFunction = false;
   bool NoSignedZerosFPMath = false;
   bool MemoryBound = false;
@@ -306,6 +307,7 @@
                        UINT64_C(0));
     YamlIO.mapOptional("maxKernArgAlign", MFI.MaxKernArgAlign, 0u);
     YamlIO.mapOptional("ldsSize", MFI.LDSSize, 0u);
+    YamlIO.mapOptional("dynLDSAlign", MFI.DynLDSAlign, Align());
     YamlIO.mapOptional("isEntryFunction", MFI.IsEntryFunction, false);
     YamlIO.mapOptional("noSignedZerosFPMath", MFI.NoSignedZerosFPMath, false);
     YamlIO.mapOptional("memoryBound", MFI.MemoryBound, false);
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -537,23 +537,20 @@
 }
 
 yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
-    const llvm::SIMachineFunctionInfo& MFI,
-    const TargetRegisterInfo &TRI)
-    : ExplicitKernArgSize(MFI.getExplicitKernArgSize()),
-      MaxKernArgAlign(MFI.getMaxKernArgAlign()),
-      LDSSize(MFI.getLDSSize()),
-      IsEntryFunction(MFI.isEntryFunction()),
-      NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()),
-      MemoryBound(MFI.isMemoryBound()),
-      WaveLimiter(MFI.needsWaveLimiter()),
-      HasSpilledSGPRs(MFI.hasSpilledSGPRs()),
-      HasSpilledVGPRs(MFI.hasSpilledVGPRs()),
-      HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
-      ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
-      FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)),
-      StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)),
-      ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)),
-      Mode(MFI.getMode()) {}
+    const llvm::SIMachineFunctionInfo &MFI, const TargetRegisterInfo &TRI)
+    : ExplicitKernArgSize(MFI.getExplicitKernArgSize()),
+      MaxKernArgAlign(MFI.getMaxKernArgAlign()), LDSSize(MFI.getLDSSize()),
+      DynLDSAlign(MFI.getDynLDSAlign()), IsEntryFunction(MFI.isEntryFunction()),
+      NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()),
+      MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()),
+      HasSpilledSGPRs(MFI.hasSpilledSGPRs()),
+      HasSpilledVGPRs(MFI.hasSpilledVGPRs()),
+      HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
+      ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
+      FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)),
+      StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)),
+      ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)), Mode(MFI.getMode()) {
+}
 
 void yaml::SIMachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) {
   MappingTraits<SIMachineFunctionInfo>::mapping(YamlIO, *this);
@@ -564,6 +561,7 @@
   ExplicitKernArgSize = YamlMFI.ExplicitKernArgSize;
   MaxKernArgAlign = assumeAligned(YamlMFI.MaxKernArgAlign);
   LDSSize = YamlMFI.LDSSize;
+  DynLDSAlign = YamlMFI.DynLDSAlign;
   HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress;
   IsEntryFunction = YamlMFI.IsEntryFunction;
   NoSignedZerosFPMath = YamlMFI.NoSignedZerosFPMath;
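On the runtime side, only the dynamic region's byte size remains open: the kernel-side offset is the padded static LDS size materialized by `GET_GROUPSTATICSIZE`. A hypothetical HIP launch-site sketch (`kern` and `launch` are illustrative names), assuming the usual `hipLaunchKernelGGL` macro:

```cpp
// Hypothetical launch site: the compiler bakes in only the *offset* of the
// dynamic region (the padded static LDS size); its byte size is chosen per
// launch via the sharedMemBytes launch parameter.
#include <hip/hip_runtime.h>

__global__ void kern(float *out); // defined elsewhere, uses extern __shared__

void launch(float *out, int nThreads) {
  size_t dynBytes = nThreads * sizeof(float); // runtime-chosen size
  hipLaunchKernelGGL(kern, dim3(1), dim3(nThreads),
                     dynBytes /*sharedMemBytes*/, /*stream=*/nullptr, out);
}
```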
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/hip.extern.shared.array.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/hip.extern.shared.array.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/hip.extern.shared.array.ll
@@ -0,0 +1,140 @@
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck %s
+
+@lds0 = addrspace(3) global [512 x float] undef
+@lds1 = addrspace(3) global [256 x float] undef
+@lds2 = addrspace(3) global [4096 x float] undef
+@lds3 = addrspace(3) global [67 x i8] undef
+
+@dynamic_shared0 = external addrspace(3) global [0 x float]
+@dynamic_shared1 = external addrspace(3) global [0 x double]
+@dynamic_shared2 = external addrspace(3) global [0 x double], align 4
+@dynamic_shared3 = external addrspace(3) global [0 x double], align 16
+
+; CHECK-LABEL: {{^}}dynamic_shared_array_0:
+; CHECK: v_add_u32_e32 v{{[0-9]+}}, 0x800, v{{[0-9]+}}
+define amdgpu_kernel void @dynamic_shared_array_0(float addrspace(1)* %out) {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %tid.x
+  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val0, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}dynamic_shared_array_1:
+; CHECK: v_lshlrev_b32_e32 {{v[0-9]+}}, 2, {{v[0-9]+}}
+; CHECK: v_lshlrev_b32_e32 {{v[0-9]+}}, 2, {{v[0-9]+}}
+; CHECK: v_lshlrev_b32_e32 [[IDX:v[0-9]+]], 2, {{v[0-9]+}}
+; CHECK: v_add_u32_e32 {{v[0-9]+}}, 0xc00, [[IDX]]
+define amdgpu_kernel void @dynamic_shared_array_1(float addrspace(1)* %out, i32 %cond) {
+entry:
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %idx.0 = add nsw i32 %tid.x, 64
+  %tmp = icmp eq i32 %cond, 0
+  br i1 %tmp, label %if, label %else
+
+if: ; preds = %entry
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
+  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+  br label %endif
+
+else: ; preds = %entry
+  %arrayidx1 = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @lds1, i32 0, i32 %idx.0
+  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
+  br label %endif
+
+endif: ; preds = %else, %if
+  %val = phi float [ %val0, %if ], [ %val1, %else ]
+  %arrayidx = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val, float addrspace(3)* %arrayidx, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}dynamic_shared_array_2:
+; CHECK: v_lshlrev_b32_e32 [[IDX:v[0-9]+]], 2, {{v[0-9]+}}
+; CHECK: v_add_u32_e32 {{v[0-9]+}}, 0x4000, [[IDX]]
+define amdgpu_kernel void @dynamic_shared_array_2(i32 %idx) {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %vidx = add i32 %tid.x, %idx
+  %arrayidx0 = getelementptr inbounds [4096 x float], [4096 x float] addrspace(3)* @lds2, i32 0, i32 %vidx
+  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val0, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; The offset to the dynamic shared memory array should be aligned to the
+; alignment of the specified type.
+; CHECK-LABEL: {{^}}dynamic_shared_array_3:
+; CHECK: v_lshlrev_b32_e32 [[IDX:v[0-9]+]], 2, {{v[0-9]+}}
+; CHECK: v_add_u32_e32 {{v[0-9]+}}, 0x44, [[IDX]]
+define amdgpu_kernel void @dynamic_shared_array_3(i32 %idx) {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %vidx = add i32 %tid.x, %idx
+  %arrayidx0 = getelementptr inbounds [67 x i8], [67 x i8] addrspace(3)* @lds3, i32 0, i32 %vidx
+  %val0 = load i8, i8 addrspace(3)* %arrayidx0, align 4
+  %val1 = uitofp i8 %val0 to float
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; The offset to the dynamic shared memory array should be aligned to the
+; maximal alignment among the arrays.
+; CHECK-LABEL: {{^}}dynamic_shared_array_4:
+; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x48
+; CHECK: v_lshlrev_b32_e32 [[IDX:v[0-9]+]], 2, {{v[0-9]+}}
+; CHECK: v_add_u32_e32 {{v[0-9]+}}, [[DYNLDS]], [[IDX]]
+define amdgpu_kernel void @dynamic_shared_array_4(i32 %idx) {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %vidx = add i32 %tid.x, %idx
+  %arrayidx0 = getelementptr inbounds [67 x i8], [67 x i8] addrspace(3)* @lds3, i32 0, i32 %vidx
+  %val0 = load i8, i8 addrspace(3)* %arrayidx0, align 4
+  %val1 = uitofp i8 %val0 to float
+  %val2 = uitofp i8 %val0 to double
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds [0 x double], [0 x double] addrspace(3)* @dynamic_shared1, i32 0, i32 %tid.x
+  store double %val2, double addrspace(3)* %arrayidx2, align 4
+  ret void
+}
+
+; Honor the explicit alignment from the specified variable.
+; CHECK-LABEL: {{^}}dynamic_shared_array_5:
+; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x44
+; CHECK: v_lshlrev_b32_e32 [[IDX:v[0-9]+]], 2, {{v[0-9]+}}
+; CHECK: v_add_u32_e32 {{v[0-9]+}}, [[DYNLDS]], [[IDX]]
+define amdgpu_kernel void @dynamic_shared_array_5(i32 %idx) {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %vidx = add i32 %tid.x, %idx
+  %arrayidx0 = getelementptr inbounds [67 x i8], [67 x i8] addrspace(3)* @lds3, i32 0, i32 %vidx
+  %val0 = load i8, i8 addrspace(3)* %arrayidx0, align 4
+  %val1 = uitofp i8 %val0 to float
+  %val2 = uitofp i8 %val0 to double
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds [0 x double], [0 x double] addrspace(3)* @dynamic_shared2, i32 0, i32 %tid.x
+  store double %val2, double addrspace(3)* %arrayidx2, align 4
+  ret void
+}
+
+; Honor the explicit alignment from the specified variable.
+; CHECK-LABEL: {{^}}dynamic_shared_array_6:
+; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x50
+; CHECK: v_lshlrev_b32_e32 [[IDX:v[0-9]+]], 2, {{v[0-9]+}}
+; CHECK: v_add_u32_e32 {{v[0-9]+}}, [[DYNLDS]], [[IDX]]
+define amdgpu_kernel void @dynamic_shared_array_6(i32 %idx) {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %vidx = add i32 %tid.x, %idx
+  %arrayidx0 = getelementptr inbounds [67 x i8], [67 x i8] addrspace(3)* @lds3, i32 0, i32 %vidx
+  %val0 = load i8, i8 addrspace(3)* %arrayidx0, align 4
+  %val1 = uitofp i8 %val0 to float
+  %val2 = uitofp i8 %val0 to double
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds [0 x double], [0 x double] addrspace(3)* @dynamic_shared3, i32 0, i32 %tid.x
+  store double %val2, double addrspace(3)* %arrayidx2, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll b/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll
@@ -0,0 +1,138 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck %s
+
+@lds0 = addrspace(3) global [512 x float] undef
+@lds1 = addrspace(3) global [256 x float] undef
+@lds2 = addrspace(3) global [4096 x float] undef
+@lds3 = addrspace(3) global [67 x i8] undef
+
+@dynamic_shared0 = external addrspace(3) global [0 x float]
+@dynamic_shared1 = external addrspace(3) global [0 x double]
+@dynamic_shared2 = external addrspace(3) global [0 x double], align 4
+@dynamic_shared3 = external addrspace(3) global [0 x double], align 16
+
+; CHECK-LABEL: {{^}}dynamic_shared_array_0:
+; CHECK: v_add_u32_e32 v{{[0-9]+}}, 0x800, v{{[0-9]+}}
+define amdgpu_kernel void @dynamic_shared_array_0(float addrspace(1)* %out) {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %tid.x
+  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val0, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}dynamic_shared_array_1:
+; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0xc00
+; CHECK: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
+define amdgpu_kernel void @dynamic_shared_array_1(float addrspace(1)* %out, i32 %cond) {
+entry:
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %idx.0 = add nsw i32 %tid.x, 64
+  %tmp = icmp eq i32 %cond, 0
+  br i1 %tmp, label %if, label %else
+
+if: ; preds = %entry
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
+  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+  br label %endif
+
+else: ; preds = %entry
+  %arrayidx1 = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @lds1, i32 0, i32 %idx.0
+  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
+  br label %endif
+
+endif: ; preds = %else, %if
+  %val = phi float [ %val0, %if ], [ %val1, %else ]
+  %arrayidx = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val, float addrspace(3)* %arrayidx, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}dynamic_shared_array_2:
+; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x4000
+; CHECK: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
+define amdgpu_kernel void @dynamic_shared_array_2(i32 %idx) {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %vidx = add i32 %tid.x, %idx
+  %arrayidx0 = getelementptr inbounds [4096 x float], [4096 x float] addrspace(3)* @lds2, i32 0, i32 %vidx
+  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val0, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; The offset to the dynamic shared memory array should be aligned to the
+; alignment of the specified type.
+; CHECK-LABEL: {{^}}dynamic_shared_array_3:
+; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x44
+; CHECK: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
+define amdgpu_kernel void @dynamic_shared_array_3(i32 %idx) {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %vidx = add i32 %tid.x, %idx
+  %arrayidx0 = getelementptr inbounds [67 x i8], [67 x i8] addrspace(3)* @lds3, i32 0, i32 %vidx
+  %val0 = load i8, i8 addrspace(3)* %arrayidx0, align 4
+  %val1 = uitofp i8 %val0 to float
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; The offset to the dynamic shared memory array should be aligned to the
+; maximal alignment among the arrays.
+; CHECK-LABEL: {{^}}dynamic_shared_array_4:
+; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x48
+; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
+; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 3, [[DYNLDS]]
+define amdgpu_kernel void @dynamic_shared_array_4(i32 %idx) {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %vidx = add i32 %tid.x, %idx
+  %arrayidx0 = getelementptr inbounds [67 x i8], [67 x i8] addrspace(3)* @lds3, i32 0, i32 %vidx
+  %val0 = load i8, i8 addrspace(3)* %arrayidx0, align 4
+  %val1 = uitofp i8 %val0 to float
+  %val2 = uitofp i8 %val0 to double
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds [0 x double], [0 x double] addrspace(3)* @dynamic_shared1, i32 0, i32 %tid.x
+  store double %val2, double addrspace(3)* %arrayidx2, align 4
+  ret void
+}
+
+; Honor the explicit alignment from the specified variable.
+; CHECK-LABEL: {{^}}dynamic_shared_array_5:
+; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x44
+; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
+; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 3, [[DYNLDS]]
+define amdgpu_kernel void @dynamic_shared_array_5(i32 %idx) {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %vidx = add i32 %tid.x, %idx
+  %arrayidx0 = getelementptr inbounds [67 x i8], [67 x i8] addrspace(3)* @lds3, i32 0, i32 %vidx
+  %val0 = load i8, i8 addrspace(3)* %arrayidx0, align 4
+  %val1 = uitofp i8 %val0 to float
+  %val2 = uitofp i8 %val0 to double
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds [0 x double], [0 x double] addrspace(3)* @dynamic_shared2, i32 0, i32 %tid.x
+  store double %val2, double addrspace(3)* %arrayidx2, align 4
+  ret void
+}
+
+; Honor the explicit alignment from the specified variable.
+; CHECK-LABEL: {{^}}dynamic_shared_array_6:
+; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x50
+; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
+; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 3, [[DYNLDS]]
+define amdgpu_kernel void @dynamic_shared_array_6(i32 %idx) {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %vidx = add i32 %tid.x, %idx
+  %arrayidx0 = getelementptr inbounds [67 x i8], [67 x i8] addrspace(3)* @lds3, i32 0, i32 %vidx
+  %val0 = load i8, i8 addrspace(3)* %arrayidx0, align 4
+  %val1 = uitofp i8 %val0 to float
+  %val2 = uitofp i8 %val0 to double
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds [0 x double], [0 x double] addrspace(3)* @dynamic_shared3, i32 0, i32 %tid.x
+  store double %val2, double addrspace(3)* %arrayidx2, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-dynlds-align-invalid-case.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-dynlds-align-invalid-case.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-dynlds-align-invalid-case.mir
@@ -0,0 +1,14 @@
+# RUN: not llc -mtriple=amdgcn-amd-amdhsa -run-pass=none -verify-machineinstrs %s -o - 2>&1 | FileCheck %s
+
+---
+# CHECK: error: YAML:8:16: must be a power of two
+
+name: dyn_lds_with_alignment
+machineFunctionInfo:
+  dynLDSAlign: 9
+
+body: |
+  bb.0:
+    S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
@@ -8,6 +8,7 @@
 # FULL-NEXT: explicitKernArgSize: 128
 # FULL-NEXT: maxKernArgAlign: 64
 # FULL-NEXT: ldsSize: 2048
+# FULL-NEXT: dynLDSAlign: 1
 # FULL-NEXT: isEntryFunction: true
 # FULL-NEXT: noSignedZerosFPMath: false
 # FULL-NEXT: memoryBound: true
@@ -81,6 +82,7 @@
 # FULL-NEXT: explicitKernArgSize: 0
 # FULL-NEXT: maxKernArgAlign: 1
 # FULL-NEXT: ldsSize: 0
+# FULL-NEXT: dynLDSAlign: 1
 # FULL-NEXT: isEntryFunction: false
 # FULL-NEXT: noSignedZerosFPMath: false
 # FULL-NEXT: memoryBound: false
@@ -121,6 +123,7 @@
 # FULL-NEXT: explicitKernArgSize: 0
 # FULL-NEXT: maxKernArgAlign: 1
 # FULL-NEXT: ldsSize: 0
+# FULL-NEXT: dynLDSAlign: 1
 # FULL-NEXT: isEntryFunction: false
 # FULL-NEXT: noSignedZerosFPMath: false
 # FULL-NEXT: memoryBound: false
@@ -162,6 +165,7 @@
 # FULL-NEXT: explicitKernArgSize: 0
 # FULL-NEXT: maxKernArgAlign: 1
 # FULL-NEXT: ldsSize: 0
+# FULL-NEXT: dynLDSAlign: 1
 # FULL-NEXT: isEntryFunction: true
 # FULL-NEXT: noSignedZerosFPMath: false
 # FULL-NEXT: memoryBound: false
@@ -285,3 +289,20 @@
     S_ENDPGM 0
 
 ...
+
+---
+# ALL-LABEL: name: dyn_lds_with_alignment
+
+# FULL: ldsSize: 0
+# FULL-NEXT: dynLDSAlign: 8
+
+# SIMPLE: dynLDSAlign: 8
+name: dyn_lds_with_alignment
+machineFunctionInfo:
+  dynLDSAlign: 8
+
+body: |
+  bb.0:
+    S_ENDPGM 0
+
+...
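The `dynLDSAlign: 9` rejection exercised by the invalid-case test above comes from the `ScalarTraits<Align>` specialization added at the top of this patch. A standalone sketch of that validation rule (plain C++, `parseAlign` is an illustrative name, not the LLVM implementation):

```cpp
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <string>

// Sketch of the validation in the ScalarTraits<Align> specialization above:
// an empty return string signals success; any other return value becomes
// the YAML parser's error message.
static std::string parseAlign(const std::string &Scalar, uint64_t &Alignment) {
  char *End = nullptr;
  unsigned long long N = std::strtoull(Scalar.c_str(), &End, 10);
  if (End == Scalar.c_str() || *End != '\0')
    return "invalid number";
  if (N == 0 || (N & (N - 1)) != 0) // mirrors !isPowerOf2_64(N)
    return "must be a power of two";
  Alignment = N;
  return {};
}

int main() {
  uint64_t A = 0;
  std::cout << (parseAlign("8", A).empty() ? "ok" : "error") << '\n'; // ok
  std::cout << parseAlign("9", A) << '\n'; // must be a power of two
}
```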
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
@@ -11,6 +11,7 @@
 ; CHECK-NEXT: explicitKernArgSize: 128
 ; CHECK-NEXT: maxKernArgAlign: 64
 ; CHECK-NEXT: ldsSize: 0
+; CHECK-NEXT: dynLDSAlign: 1
 ; CHECK-NEXT: isEntryFunction: true
 ; CHECK-NEXT: noSignedZerosFPMath: false
 ; CHECK-NEXT: memoryBound: false
@@ -46,6 +47,7 @@
 ; CHECK-NEXT: explicitKernArgSize: 0
 ; CHECK-NEXT: maxKernArgAlign: 1
 ; CHECK-NEXT: ldsSize: 0
+; CHECK-NEXT: dynLDSAlign: 1
 ; CHECK-NEXT: isEntryFunction: true
 ; CHECK-NEXT: noSignedZerosFPMath: false
 ; CHECK-NEXT: memoryBound: false
@@ -76,6 +78,7 @@
 ; CHECK-NEXT: explicitKernArgSize: 0
 ; CHECK-NEXT: maxKernArgAlign: 1
 ; CHECK-NEXT: ldsSize: 0
+; CHECK-NEXT: dynLDSAlign: 1
 ; CHECK-NEXT: isEntryFunction: false
 ; CHECK-NEXT: noSignedZerosFPMath: false
 ; CHECK-NEXT: memoryBound: false
@@ -105,6 +108,7 @@
 ; CHECK-NEXT: explicitKernArgSize: 0
 ; CHECK-NEXT: maxKernArgAlign: 1
 ; CHECK-NEXT: ldsSize: 0
+; CHECK-NEXT: dynLDSAlign: 1
 ; CHECK-NEXT: isEntryFunction: false
 ; CHECK-NEXT: noSignedZerosFPMath: true
 ; CHECK-NEXT: memoryBound: false