diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -164,8 +164,8 @@ bool runOnModule(Module &M) override { UsedList = getUsedList(M); - - bool Changed = processUsedLDS(M); + bool Changed = superAlignLDSGlobals(M); + Changed |= processUsedLDS(M); for (Function &F : M.functions()) { if (F.isDeclaration()) @@ -182,6 +182,50 @@ } private: + // Increase the alignment of LDS globals if necessary to maximise the chance + // that we can use aligned LDS instructions to access them. + static bool superAlignLDSGlobals(Module &M) { + const DataLayout &DL = M.getDataLayout(); + bool Changed = false; + if (!SuperAlignLDSGlobals) { + return Changed; + } + + for (auto &GV : M.globals()) { + if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) { + // Only changing alignment of LDS variables + continue; + } + if (!GV.hasInitializer()) { + // cuda/hip extern __shared__ variable, leave alignment alone + continue; + } + + Align Alignment = AMDGPU::getAlign(DL, &GV); + TypeSize GVSize = DL.getTypeAllocSize(GV.getValueType()); + + if (GVSize > 8) { + // We might want to use a b96 or b128 load/store + Alignment = std::max(Alignment, Align(16)); + } else if (GVSize > 4) { + // We might want to use a b64 load/store + Alignment = std::max(Alignment, Align(8)); + } else if (GVSize > 2) { + // We might want to use a b32 load/store + Alignment = std::max(Alignment, Align(4)); + } else if (GVSize > 1) { + // We might want to use a b16 load/store + Alignment = std::max(Alignment, Align(2)); + } + + if (Alignment != AMDGPU::getAlign(DL, &GV)) { + Changed = true; + GV.setAlignment(Alignment); + } + } + return Changed; + } + bool processUsedLDS(Module &M, Function *F = nullptr) { LLVMContext &Ctx = M.getContext(); const DataLayout &DL = M.getDataLayout(); @@ -195,31 +239,6 @@ return false; } - // Increase the alignment of LDS globals if necessary to maximise the chance - // that we can use aligned LDS instructions to access them. - if (SuperAlignLDSGlobals) { - for (auto *GV : FoundLocalVars) { - Align Alignment = AMDGPU::getAlign(DL, GV); - TypeSize GVSize = DL.getTypeAllocSize(GV->getValueType()); - - if (GVSize > 8) { - // We might want to use a b96 or b128 load/store - Alignment = std::max(Alignment, Align(16)); - } else if (GVSize > 4) { - // We might want to use a b64 load/store - Alignment = std::max(Alignment, Align(8)); - } else if (GVSize > 2) { - // We might want to use a b32 load/store - Alignment = std::max(Alignment, Align(4)); - } else if (GVSize > 1) { - // We might want to use a b16 load/store - Alignment = std::max(Alignment, Align(2)); - } - - GV->setAlignment(Alignment); - } - } - SmallVector LayoutFields; LayoutFields.reserve(FoundLocalVars.size()); for (GlobalVariable *GV : FoundLocalVars) { diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll --- a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll @@ -1,5 +1,5 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck --check-prefixes=CHECK,SUPER-ALIGN_ON %s -; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck --check-prefixes=CHECK,SUPER-ALIGN_ON %s +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds --amdgpu-super-align-lds-globals=true < %s | FileCheck --check-prefixes=CHECK,SUPER-ALIGN_ON %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-super-align-lds-globals=true < %s | FileCheck --check-prefixes=CHECK,SUPER-ALIGN_ON %s ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds --amdgpu-super-align-lds-globals=false < %s | FileCheck --check-prefixes=CHECK,SUPER-ALIGN_OFF %s ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-super-align-lds-globals=false < %s | FileCheck --check-prefixes=CHECK,SUPER-ALIGN_OFF %s @@ -8,6 +8,10 @@ ; CHECK: %llvm.amdgcn.kernel.k3.lds.t = type { [32 x i64], [32 x i32] } ; CHECK: %llvm.amdgcn.kernel.k4.lds.t = type { [2 x i32 addrspace(3)*] } +; SUPER-ALIGN_ON: @lds.unused = addrspace(3) global i32 undef, align 4 +; SUPER-ALIGN_OFF: @lds.unused = addrspace(3) global i32 undef, align 2 +@lds.unused = addrspace(3) global i32 undef, align 2 + ; CHECK-NOT: @lds.1 @lds.1 = internal unnamed_addr addrspace(3) global [32 x i8] undef, align 1