Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -730,7 +730,8 @@
     header.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
 
   // FIXME: Should use getKernArgSize
-  header.kernarg_segment_byte_size = MFI->getABIArgOffset();
+  header.kernarg_segment_byte_size =
+      STM.getKernArgSegmentSize(MFI->getABIArgOffset());
   header.wavefront_sgpr_count = KernelInfo.NumSGPR;
   header.workitem_vgpr_count = KernelInfo.NumVGPR;
   header.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
Index: lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -142,6 +142,10 @@
     return TargetTriple.getOS() == Triple::Mesa3D;
   }
 
+  bool isOpenCLEnv() const {
+    return TargetTriple.getEnvironment() == Triple::OpenCL;
+  }
+
   Generation getGeneration() const {
     return Gen;
   }
@@ -288,6 +292,17 @@
     return isAmdHsaOS() ? 8 : 4;
   }
 
+  /// \returns The number of bytes added to the kernarg segment for implicit
+  /// arguments: 16 on Mesa3D, 32 on AMDHSA with the OpenCL environment, and 0
+  /// for every other target.
+  unsigned getImplicitArgNumBytes() const {
+    if (isMesa3DOS())
+      return 16;
+    if (isAmdHsaOS() && isOpenCLEnv())
+      return 32;
+    return 0;
+  }
+
   unsigned getStackAlignment() const {
     // Scratch is allocated in 256 dword per wave blocks.
     return 4 * 256 / getWavefrontSize();
@@ -521,6 +536,11 @@
     return SGPRInitBug;
   }
 
+  /// \returns The kernarg segment size in bytes for \p ExplicitArgBytes bytes
+  /// of explicit arguments. When the target has implicit arguments, the
+  /// explicit size is aligned up and the implicit bytes are added to it.
+  unsigned getKernArgSegmentSize(unsigned ExplicitArgBytes) const;
+
   /// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs
   unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
 
Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -297,6 +297,17 @@
   return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
 }
 
+unsigned SISubtarget::getKernArgSegmentSize(unsigned ExplicitArgBytes) const {
+  unsigned ImplicitBytes = getImplicitArgNumBytes();
+  if (ImplicitBytes == 0)
+    return ExplicitArgBytes;
+
+  // Round the explicit size up to the implicit argument pointer alignment
+  // before adding the implicit bytes.
+  unsigned Alignment = getAlignmentForImplicitArgPtr();
+  return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
+}
+
 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
   if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
     if (SGPRs <= 80)
Index: test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
@@ -1,4 +1,5 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=CO-V2,HSA,ALL %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=CO-V2,HSA,ALL,HSA-NOENV %s
+; RUN: llc -mtriple=amdgcn--amdhsa-opencl -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=CO-V2,HSA,ALL,HSA-OPENCL %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefixes=CO-V2,OS-MESA3D,MESA,ALL %s
 ; RUN: llc -mtriple=amdgcn-mesa-unknown -verify-machineinstrs < %s | FileCheck -check-prefixes=OS-UNKNOWN,MESA,ALL %s
 
@@ -29,6 +30,9 @@
 }
 
 ; ALL-LABEL: {{^}}test_implicit_alignment
+; HSA-NOENV: kernarg_segment_byte_size = 10
+; HSA-OPENCL: kernarg_segment_byte_size = 48
+; OS-MESA3D: kernarg_segment_byte_size = 28
 ; OS-UNKNOWN: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xc
 ; HSA: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x4
 ; OS-MESA3D: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x3