Index: lib/Target/AMDGPU/AMDGPUCallingConv.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -110,7 +110,7 @@
 
 // Calling convention for compute kernels
 def CC_AMDGPU_Kernel : CallingConv<[
-  CCCustom<"allocateStack">
+  CCCustom<"allocateKernArg">
 ]>;
 
 def CC_AMDGPU : CallingConv<[
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -31,13 +31,15 @@
 #include "SIInstrInfo.h"
 using namespace llvm;
 
-static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT,
-                          CCValAssign::LocInfo LocInfo,
-                          ISD::ArgFlagsTy ArgFlags, CCState &State) {
-  unsigned Offset = State.AllocateStack(ValVT.getStoreSize(),
-                                        ArgFlags.getOrigAlign());
-  State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
-
+static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+                            CCValAssign::LocInfo LocInfo,
+                            ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  MachineFunction &MF = State.getMachineFunction();
+  AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
+
+  uint64_t Offset = MFI->allocateKernArg(ValVT.getStoreSize(),
+                                         ArgFlags.getOrigAlign());
+  State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
   return true;
 }
 
Index: lib/Target/AMDGPU/AMDGPUMachineFunction.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -16,10 +16,25 @@
 
 namespace llvm {
 
 class AMDGPUMachineFunction : public MachineFunctionInfo {
+  uint64_t KernArgSize;
+  unsigned MaxKernArgAlign;
+
   virtual void anchor();
 public:
   AMDGPUMachineFunction(const MachineFunction &MF);
+
+  uint64_t allocateKernArg(uint64_t Size, unsigned Align) {
+    assert(isPowerOf2_32(Align));
+    KernArgSize = alignTo(KernArgSize, Align);
+
+    uint64_t Result = KernArgSize;
+    KernArgSize += Size;
+
+    MaxKernArgAlign = std::max(Align, MaxKernArgAlign);
+    return Result;
+  }
+
   /// A map to keep track of local memory objects and their offsets within
   /// the local memory space.
   std::map<const GlobalValue *, unsigned> LocalMemoryObjects;
Index: lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -1,8 +1,5 @@
 #include "AMDGPUMachineFunction.h"
-#include "AMDGPU.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/Function.h"
+
 using namespace llvm;
 
 // Pin the vtable to this file.
@@ -10,8 +7,9 @@
 
 AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
   MachineFunctionInfo(),
+  KernArgSize(0),
+  MaxKernArgAlign(0),
   LDSSize(0),
   ABIArgOffset(0),
   ScratchSize(0),
-  IsKernel(true) {
-}
+  IsKernel(true) {}
Index: test/CodeGen/AMDGPU/kernarg-stack-alignment.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/kernarg-stack-alignment.ll
@@ -0,0 +1,44 @@
+; RUN: llc -O0 -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+
+; Test that the alignment of kernel arguments does not impact the
+; alignment of the stack
+
+; CHECK-LABEL: {{^}}no_args:
+; CHECK: ScratchSize: 8{{$}}
+define void @no_args() {
+  %alloca = alloca i8
+  store volatile i8 0, i8* %alloca
+  ret void
+}
+
+; CHECK-LABEL: {{^}}force_align32:
+; CHECK: ScratchSize: 8{{$}}
+define void @force_align32(<8 x i32>) {
+  %alloca = alloca i8
+  store volatile i8 0, i8* %alloca
+  ret void
+}
+
+; CHECK-LABEL: {{^}}force_align64:
+; CHECK: ScratchSize: 8{{$}}
+define void @force_align64(<16 x i32>) {
+  %alloca = alloca i8
+  store volatile i8 0, i8* %alloca
+  ret void
+}
+
+; CHECK-LABEL: {{^}}force_align128:
+; CHECK: ScratchSize: 8{{$}}
+define void @force_align128(<32 x i32>) {
+  %alloca = alloca i8
+  store volatile i8 0, i8* %alloca
+  ret void
+}
+
+; CHECK-LABEL: {{^}}force_align256:
+; CHECK: ScratchSize: 8{{$}}
+define void @force_align256(<64 x i32>) {
+  %alloca = alloca i8
+  store volatile i8 0, i8* %alloca
+  ret void
+}
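Note (not part of the patch): the substance of the change is that kernel-argument
offsets are now tracked in AMDGPUMachineFunction instead of being pushed through
CCState::AllocateStack, so a large kernarg alignment no longer inflates the stack
(ScratchSize) accounting. Below is a minimal standalone sketch of the new
allocateKernArg bookkeeping; the local alignTo stands in for llvm::alignTo, and
main() with its sample sizes is purely illustrative.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdio>

// Round Value up to the next multiple of Align (Align must be a power of two).
static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) & ~(Align - 1);
}

struct KernArgState {
  uint64_t KernArgSize = 0;     // running size of the kernarg segment
  unsigned MaxKernArgAlign = 0; // largest argument alignment seen so far

  // Same bookkeeping as AMDGPUMachineFunction::allocateKernArg in the patch:
  // align the running offset, hand it out, then advance by the store size.
  uint64_t allocateKernArg(uint64_t Size, unsigned Align) {
    assert((Align & (Align - 1)) == 0 && "alignment must be a power of two");
    KernArgSize = alignTo(KernArgSize, Align);
    uint64_t Result = KernArgSize;
    KernArgSize += Size;
    MaxKernArgAlign = std::max(Align, MaxKernArgAlign);
    return Result;
  }
};

int main() {
  KernArgState S;
  // An i32 followed by a <8 x i32>: the vector lands on the next 32-byte
  // boundary of the kernarg segment, but no stack slots are touched, which
  // is what the new test pins down with its "ScratchSize: 8" checks.
  printf("i32       -> offset %llu\n",
         (unsigned long long)S.allocateKernArg(4, 4));   // offset 0
  printf("<8 x i32> -> offset %llu\n",
         (unsigned long long)S.allocateKernArg(32, 32)); // offset 32
  printf("kernarg size %llu, max align %u\n",
         (unsigned long long)S.KernArgSize, S.MaxKernArgAlign);
  return 0;
}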