Index: llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -1092,6 +1092,7 @@ // kernarg_segment_alignment is specified as log of the alignment. // The minimum alignment is 16. + // FIXME: The metadata treats the minimum as 4? Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign)); } Index: llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -201,10 +201,11 @@ Align MaxKernArgAlign; HSACodeProps.mKernargSegmentSize = STM.getKernArgSegmentSize(F, MaxKernArgAlign); + HSACodeProps.mKernargSegmentAlign = + std::max(MaxKernArgAlign, Align(4)).value(); + HSACodeProps.mGroupSegmentFixedSize = ProgramInfo.LDSSize; HSACodeProps.mPrivateSegmentFixedSize = ProgramInfo.ScratchSize; - HSACodeProps.mKernargSegmentAlign = - std::max(MaxKernArgAlign, Align(4)).value(); HSACodeProps.mWavefrontSize = STM.getWavefrontSize(); HSACodeProps.mNumSGPRs = ProgramInfo.NumSGPR; HSACodeProps.mNumVGPRs = ProgramInfo.NumVGPR; @@ -868,6 +869,8 @@ Kern.getDocument()->getNode(ProgramInfo.LDSSize); Kern[".private_segment_fixed_size"] = Kern.getDocument()->getNode(ProgramInfo.ScratchSize); + + // FIXME: AMDGPUAsmPrinter's kernarg_segment_alignment treats the minimum as 16, but the metadata uses 4? 
Kern[".kernarg_segment_align"] = Kern.getDocument()->getNode(std::max(Align(4), MaxKernArgAlign).value()); Kern[".wavefront_size"] = Index: llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -697,6 +697,7 @@ if (ImplicitBytes != 0) { const Align Alignment = getAlignmentForImplicitArgPtr(); TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes; + MaxAlign = std::max(MaxAlign, Alignment); } // Being able to dereference past the end is useful for emitting scalar loads. Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll @@ -272,7 +272,7 @@ ; HSA-LABEL: - Name: kernel_implicitarg_ptr_empty ; HSA: CodeProps: ; HSA: KernargSegmentSize: 56 -; HSA: KernargSegmentAlign: 4 +; HSA: KernargSegmentAlign: 8 ; HSA-LABEL: - Name: kernel_implicitarg_ptr_empty_0implicit ; HSA: KernargSegmentSize: 0 @@ -280,19 +280,19 @@ ; HSA-LABEL: - Name: opencl_kernel_implicitarg_ptr_empty ; HSA: KernargSegmentSize: 48 -; HSA: KernargSegmentAlign: 4 +; HSA: KernargSegmentAlign: 8 ; HSA-LABEL: - Name: kernel_implicitarg_ptr ; HSA: KernargSegmentSize: 168 -; HSA: KernargSegmentAlign: 4 +; HSA: KernargSegmentAlign: 8 ; HSA-LABEL: - Name: opencl_kernel_implicitarg_ptr ; HSA: KernargSegmentSize: 160 -; HSA: KernargSegmentAlign: 4 +; HSA: KernargSegmentAlign: 8 ; HSA-LABEL: - Name: kernel_call_implicitarg_ptr_func_empty ; HSA: KernargSegmentSize: 56 -; HSA: KernargSegmentAlign: 4 +; HSA: KernargSegmentAlign: 8 ; HSA-LABEL: - Name: kernel_call_implicitarg_ptr_func_empty_implicit0 ; HSA: KernargSegmentSize: 0 @@ -300,19 +300,19 @@ ; HSA-LABEL: - Name: opencl_kernel_call_implicitarg_ptr_func_empty ; HSA: KernargSegmentSize: 48 -; HSA: 
KernargSegmentAlign: 4 +; HSA: KernargSegmentAlign: 8 ; HSA-LABEL: - Name: kernel_call_implicitarg_ptr_func ; HSA: KernargSegmentSize: 168 -; HSA: KernargSegmentAlign: 4 +; HSA: KernargSegmentAlign: 8 ; HSA-LABEL: - Name: opencl_kernel_call_implicitarg_ptr_func ; HSA: KernargSegmentSize: 160 -; HSA: KernargSegmentAlign: 4 +; HSA: KernargSegmentAlign: 8 ; HSA-LABEL: - Name: kernel_call_kernarg_implicitarg_ptr_func ; HSA: KernargSegmentSize: 168 -; HSA: KernargSegmentAlign: 4 +; HSA: KernargSegmentAlign: 8 ; HSA-LABEL: - Name: kernel_implicitarg_no_struct_align_padding ; HSA: KernargSegmentSize: 120