Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -897,12 +897,27 @@
     if (STM.isVGPRSpillingEnabled(*MF.getFunction())) {
       OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
       OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
+      if (TM.getTargetTriple().getOS() == Triple::AMDPAL) {
+        unsigned Rsrc2Val
+            = S_00B84C_SCRATCH_EN(CurrentProgramInfo.ScratchBlocks > 0);
+        if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_PS)
+          Rsrc2Val |= S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks);
+        OutStreamer->EmitIntValue(RsrcReg + 4 /*rsrc2*/, 4);
+        OutStreamer->EmitIntValue(Rsrc2Val, 4);
+      }
     }
   }
 
   if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_PS) {
-    OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
-    OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks), 4);
+    // Ensure that we do not generate rsrc2 twice for a PS on AMDPAL.
+    // This is a slightly convoluted way of doing it, in an attempt to be
+    // non-intrusive for the non-AMDPAL cases. The AMDPAL case will get
+    // extracted into its own function in a future commit, thus restoring this
+    // function to its original state.
+    if (TM.getTargetTriple().getOS() != Triple::AMDPAL) {
+      OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
+      OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks), 4);
+    }
     OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
     OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4);
     OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
Index: lib/Target/AMDGPU/SIFrameLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIFrameLowering.cpp
+++ lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -356,7 +356,48 @@
       .addReg(PreloadedPrivateBufferReg, RegState::Kill);
   }
 
-  if (ResourceRegUsed && (ST.isMesaGfxShader(MF) || (PreloadedPrivateBufferReg == AMDGPU::NoRegister))) {
+  if (ResourceRegUsed && ST.isAmdPalOS()) {
+    // The pointer to the GIT is formed from the offset passed in and either
+    // the amdgpu-git-ptr-high function attribute or the top part of the PC
+    unsigned RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
+    unsigned RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
+    unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
+
+    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
+
+    if (MFI->getGITPtrHigh() != 0xffffffff) {
+      BuildMI(MBB, I, DL, SMovB32, RsrcHi)
+        .addImm(MFI->getGITPtrHigh())
+        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+    } else {
+      const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
+      BuildMI(MBB, I, DL, GetPC64, Rsrc01);
+    }
+    BuildMI(MBB, I, DL, SMovB32, RsrcLo)
+      .addReg(AMDGPU::SGPR0, RegState::ImplicitDefine) // Low address passed in
+      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+    // We now have the GIT ptr - now get the scratch descriptor from the entry
+    // at offset 0.
+    PointerType *PtrTy =
+        PointerType::get(Type::getInt64Ty(MF.getFunction()->getContext()),
+                         AMDGPUAS::CONSTANT_ADDRESS);
+    MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
+    const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
+    auto MMO = MF.getMachineMemOperand(PtrInfo,
+                                       MachineMemOperand::MOLoad |
+                                       MachineMemOperand::MOInvariant |
+                                       MachineMemOperand::MODereferenceable,
+                                       0, 0);
+    BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
+      .addReg(Rsrc01)
+      .addImm(0) // offset
+      .addImm(0) // glc
+      .addMemOperand(MMO)
+      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+  } else if (ResourceRegUsed && (ST.isMesaGfxShader(MF)
+                 || (PreloadedPrivateBufferReg == AMDGPU::NoRegister))) {
     assert(!ST.isAmdCodeObjectV2(MF));
 
     const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -184,6 +184,11 @@
   // user arguments. This is an offset from the KernargSegmentPtr.
   bool ImplicitArgPtr : 1;
 
+  // The hard-wired high half of the address of the global information table
+  // for AMDPAL OS type. 0xffffffff represents no hard-wired high half, since
+  // current hardware only allows a 16 bit value.
+  unsigned GITPtrHigh;
+
   MCPhysReg getNextUserSGPR() const {
     assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
     return AMDGPU::SGPR0 + NumUserSGPRs;
@@ -405,6 +410,10 @@
     return ArgInfo.getPreloadedValue(Value).first->getRegister();
   }
 
+  unsigned getGITPtrHigh() const {
+    return GITPtrHigh;
+  }
+
   unsigned getNumUserSGPRs() const {
     return NumUserSGPRs;
   }
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -46,7 +46,8 @@
     WorkItemIDY(false),
     WorkItemIDZ(false),
     ImplicitBufferPtr(false),
-    ImplicitArgPtr(false) {
+    ImplicitArgPtr(false),
+    GITPtrHigh(0xffffffff) {
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
   const Function *F = MF.getFunction();
   FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F);
@@ -158,6 +159,12 @@
     if (HasStackObjects || F->hasFnAttribute("amdgpu-flat-scratch"))
       FlatScratchInit = true;
   }
+
+  if (F->hasFnAttribute("amdgpu-git-ptr-high")) {
+    Attribute A = F->getFnAttribute("amdgpu-git-ptr-high");
+    StringRef S = A.getValueAsString();
+    S.consumeInteger(0, GITPtrHigh);
+  }
 }
 
 unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
Index: test/CodeGen/AMDGPU/amdpal.ll
===================================================================
--- test/CodeGen/AMDGPU/amdpal.ll
+++ test/CodeGen/AMDGPU/amdpal.ll
@@ -8,3 +8,48 @@
   ret void
 }
 
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=tahiti | FileCheck --check-prefix=PAL %s
+
+; Check code sequence for amdpal use of scratch for alloca. This is the case
+; where the high half of the address comes from s_getpc.
+
+; PAL: s_getpc_b64 s{{\[}}[[GITPTR:[0-9]+]]:
+; PAL: s_mov_b32 s[[GITPTR]], s0
+; PAL: s_load_dwordx4 s{{\[}}[[SCRATCHDESC:[0-9]+]]:{{[0-9]+]}}, s{{\[}}[[GITPTR]]:
+; PAL: buffer_store{{.*}}, s{{\[}}[[SCRATCHDESC]]:
+
+define amdgpu_kernel void @scratch(<2 x i32> %in, i32 %idx, i32* %out) {
+entry:
+  %v = alloca [2 x i32]
+  %vv = bitcast [2 x i32]* %v to <2 x i32>*
+  store <2 x i32> %in, <2 x i32>* %vv
+  %e = getelementptr [2 x i32], [2 x i32]* %v, i32 0, i32 %idx
+  %x = load i32, i32* %e
+  store i32 %x, i32* %out
+  ret void
+}
+
+; Check code sequence for amdpal use of scratch for alloca. This is the case
+; where the amdgpu-git-ptr-high function attribute gives the high half of the
+; address to use.
+; Looks like you can't do arithmetic on a filecheck variable, so we can't test
+; that the s_movk_i32 is into a reg that is one more than the following
+; s_mov_b32.
+
+; PAL: s_movk_i32 s{{[0-9]+}}, 0x1234
+; PAL: s_mov_b32 s[[GITPTR:[0-9]+]], s0
+; PAL: s_load_dwordx4 s{{\[}}[[SCRATCHDESC:[0-9]+]]:{{[0-9]+]}}, s{{\[}}[[GITPTR]]:
+; PAL: buffer_store{{.*}}, s{{\[}}[[SCRATCHDESC]]:
+
+define amdgpu_kernel void @scratch2(<2 x i32> %in, i32 %idx, i32* %out) #0 {
+entry:
+  %v = alloca [2 x i32]
+  %vv = bitcast [2 x i32]* %v to <2 x i32>*
+  store <2 x i32> %in, <2 x i32>* %vv
+  %e = getelementptr [2 x i32], [2 x i32]* %v, i32 0, i32 %idx
+  %x = load i32, i32* %e
+  store i32 %x, i32* %out
+  ret void
+}
+
+attributes #0 = { nounwind "amdgpu-git-ptr-high"="0x1234" }
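For reference, a front end targeting AMDPAL can pin the high half of the GIT address itself, rather than relying on the s_getpc_b64 fallback, by attaching the string attribute that SIMachineFunctionInfo parses above. The following is a minimal sketch and not part of the patch: the helper name setGITPtrHigh, the ShaderFn variable, and the value 0x8000 are illustrative only; the LLVM calls used are Function::addFnAttr(StringRef, StringRef) and llvm::utohexstr.

    // Hypothetical front-end helper (illustrative): record the hard-wired high
    // 32 bits of the GIT address on a shader function. The backend reads the
    // value back with StringRef::consumeInteger(0, ...), so a "0x"-prefixed hex
    // string is accepted; per the header comment, current hardware only allows
    // a 16 bit value.
    #include "llvm/ADT/StringExtras.h"
    #include "llvm/IR/Function.h"

    static void setGITPtrHigh(llvm::Function &F, unsigned HighHalf) {
      F.addFnAttr("amdgpu-git-ptr-high", "0x" + llvm::utohexstr(HighHalf));
    }

    // Example use (assumed): setGITPtrHigh(*ShaderFn, 0x8000);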