AMDGPU: derive the implicit-argument offset alignment from the subtarget.

getImplicitParameterOffset() now aligns the ABI argument offset to
Subtarget->getAlignmentForImplicitArgPtr() (8 when isAmdHsaOS(), otherwise 4)
instead of a hard-coded 4. The HSA check line in the kernarg segment pointer
test moves from offset 0x3 to 0x4 accordingly; the MESA check is unchanged.

Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2690,7 +2690,8 @@
 
 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
     const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
-  uint64_t ArgOffset = alignTo(MFI->getABIArgOffset(), 4);
+  unsigned Alignment = Subtarget->getAlignmentForImplicitArgPtr();
+  uint64_t ArgOffset = alignTo(MFI->getABIArgOffset(), Alignment);
   switch (Param) {
   case GRID_DIM:
     return ArgOffset;
Index: lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -276,6 +276,10 @@
     return isAmdHsaOS() ? 0 : 36;
   }
 
+  unsigned getAlignmentForImplicitArgPtr() const {
+    return isAmdHsaOS() ? 8 : 4;
+  }
+
   unsigned getStackAlignment() const {
     // Scratch is allocated in 256 dword per wave blocks.
     return 4 * 256 / getWavefrontSize();
Index: test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
@@ -29,7 +29,7 @@
 
 ; ALL-LABEL: {{^}}test_implicit_alignment
 ; MESA: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xc
-; HSA: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x3
+; HSA: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x4
 ; ALL: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[VAL]]
 ; MESA: buffer_store_dword [[V_VAL]]
 ; HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[V_VAL]]