Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -311,7 +311,7 @@
 
   if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
     OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
-    OutStreamer->EmitIntValue(alignTo(MFI->LDSSize, 4) >> 2, 4);
+    OutStreamer->EmitIntValue(alignTo(MFI->getLDSSize(), 4) >> 2, 4);
   }
 }
 
@@ -491,10 +491,10 @@
     Ctx.diagnose(Diag);
   }
 
-  if (MFI->LDSSize > static_cast<uint32_t>(STM.getLocalMemorySize())) {
+  if (MFI->getLDSSize() > static_cast<uint32_t>(STM.getLocalMemorySize())) {
     LLVMContext &Ctx = MF.getFunction()->getContext();
     DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "local memory",
-                                     MFI->LDSSize, DS_Error);
+                                     MFI->getLDSSize(), DS_Error);
     Ctx.diagnose(Diag);
   }
 
@@ -528,7 +528,7 @@
   unsigned LDSSpillSize = MFI->LDSWaveSpillSize *
                           MFI->getMaximumWorkGroupSize(MF);
 
-  ProgInfo.LDSSize = MFI->LDSSize + LDSSpillSize;
+  ProgInfo.LDSSize = MFI->getLDSSize() + LDSSpillSize;
   ProgInfo.LDSBlocks =
       alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
 
@@ -704,7 +704,8 @@
   if (STM.isXNACKEnabled())
     header.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
 
-  header.kernarg_segment_byte_size = MFI->ABIArgOffset;
+  // FIXME: Should use getKernArgSize
+  header.kernarg_segment_byte_size = MFI->getABIArgOffset();
   header.wavefront_sgpr_count = KernelInfo.NumSGPR;
   header.workitem_vgpr_count = KernelInfo.NumVGPR;
   header.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -864,24 +864,8 @@
     if (hasDefinedInitializer(GV))
       break;
 
-    unsigned Offset;
-    if (MFI->LocalMemoryObjects.count(GV) == 0) {
-      unsigned Align = GV->getAlignment();
-      if (Align == 0)
-        Align = DL.getABITypeAlignment(GV->getValueType());
-
-      /// TODO: We should sort these to minimize wasted space due to alignment
-      /// padding. Currently the padding is decided by the first encountered use
-      /// during lowering.
-      Offset = MFI->LDSSize = alignTo(MFI->LDSSize, Align);
-      MFI->LocalMemoryObjects[GV] = Offset;
-      MFI->LDSSize += DL.getTypeAllocSize(GV->getValueType());
-    } else {
-      Offset = MFI->LocalMemoryObjects[GV];
-    }
-
-    return DAG.getConstant(Offset, SDLoc(Op),
-                           getPointerTy(DL, AMDGPUAS::LOCAL_ADDRESS));
+    unsigned Offset = MFI->allocateLDSGlobal(DL, *GV);
+    return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
   }
 }
 
@@ -2754,7 +2738,7 @@
 
 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
     const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
-  uint64_t ArgOffset = MFI->ABIArgOffset;
+  uint64_t ArgOffset = MFI->getABIArgOffset();
   switch (Param) {
   case GRID_DIM:
     return ArgOffset;
Index: lib/Target/AMDGPU/AMDGPUMachineFunction.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -11,15 +11,26 @@
 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEFUNCTION_H
 
 #include "llvm/CodeGen/MachineFunction.h"
-#include <map>
+#include "llvm/ADT/DenseMap.h"
 
 namespace llvm {
 
 class AMDGPUMachineFunction : public MachineFunctionInfo {
+  /// A map to keep track of local memory objects and their offsets within the
+  /// local memory space.
+  SmallDenseMap<const GlobalValue *, unsigned, 4> LocalMemoryObjects;
+
   uint64_t KernArgSize;
   unsigned MaxKernArgAlign;
 
-  virtual void anchor();
+  /// Number of bytes in the LDS that are being used.
+  unsigned LDSSize;
+
+  // FIXME: This should probably be removed.
+  /// Start of implicit kernel args
+  unsigned ABIArgOffset;
+
+  bool IsKernel;
 
 public:
   AMDGPUMachineFunction(const MachineFunction &MF);
@@ -35,19 +46,27 @@
     return Result;
   }
 
-  /// A map to keep track of local memory objects and their offsets within
-  /// the local memory space.
-  std::map<const GlobalValue *, unsigned> LocalMemoryObjects;
-  /// Number of bytes in the LDS that are being used.
-  unsigned LDSSize;
+  uint64_t getKernArgSize() const {
+    return KernArgSize;
+  }
 
-  /// Start of implicit kernel args
-  unsigned ABIArgOffset;
+  void setABIArgOffset(unsigned NewOffset) {
+    ABIArgOffset = NewOffset;
+  }
+
+  unsigned getABIArgOffset() const {
+    return ABIArgOffset;
+  }
 
-  bool isKernel() const;
+  unsigned getLDSSize() const {
+    return LDSSize;
+  }
 
-  unsigned ScratchSize;
-  bool IsKernel;
+  bool isKernel() const {
+    return IsKernel;
+  }
+
+  unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalValue &GV);
 };
 
 }
Index: lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -1,23 +1,47 @@
+//===-- AMDGPUMachineFunctionInfo.cpp ---------------------------------------=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
 #include "AMDGPUMachineFunction.h"
+#include "AMDGPUSubtarget.h"
 
 using namespace llvm;
 
-// Pin the vtable to this file.
-void AMDGPUMachineFunction::anchor() {}
-
 AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
   MachineFunctionInfo(),
+  LocalMemoryObjects(),
   KernArgSize(0),
   MaxKernArgAlign(0),
   LDSSize(0),
   ABIArgOffset(0),
-  ScratchSize(0),
-  IsKernel(MF.getFunction()->getCallingConv() == llvm::CallingConv::AMDGPU_KERNEL ||
-           MF.getFunction()->getCallingConv() == llvm::CallingConv::SPIR_KERNEL)
-{
+  IsKernel(MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_KERNEL ||
+           MF.getFunction()->getCallingConv() == CallingConv::SPIR_KERNEL) {
+  // FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset,
+  // except reserved size is not correctly aligned.
 }
 
-bool AMDGPUMachineFunction::isKernel() const
-{
-  return IsKernel;
+unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
+                                                  const GlobalValue &GV) {
+  auto Entry = LocalMemoryObjects.insert(std::make_pair(&GV, 0));
+  if (!Entry.second)
+    return Entry.first->second;
+
+  unsigned Align = GV.getAlignment();
+  if (Align == 0)
+    Align = DL.getABITypeAlignment(GV.getValueType());
+
+  /// TODO: We should sort these to minimize wasted space due to alignment
+  /// padding. Currently the padding is decided by the first encountered use
+  /// during lowering.
+  unsigned Offset = LDSSize = alignTo(LDSSize, Align);
+
+  Entry.first->second = Offset;
+  LDSSize += DL.getTypeAllocSize(GV.getValueType());
+
+  return Offset;
 }
Index: lib/Target/AMDGPU/R600ISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/R600ISelLowering.cpp
+++ lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -1762,7 +1762,7 @@
 
     unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
     unsigned PartOffset = VA.getLocMemOffset();
-    unsigned Offset = 36 + VA.getLocMemOffset();
+    unsigned Offset = Subtarget->getExplicitKernelArgOffset() + VA.getLocMemOffset();
 
     MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
     SDValue Arg = DAG.getLoad(
@@ -1773,7 +1773,7 @@
     // 4 is the preferred alignment for the CONSTANT memory space.
 
     InVals.push_back(Arg);
-    MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
+    MFI->setABIArgOffset(Offset + MemVT.getStoreSize());
   }
   return Chain;
 }
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -770,7 +770,7 @@
       }
 
       InVals.push_back(Arg);
-      Info->ABIArgOffset = Offset + MemVT.getStoreSize();
+      Info->setABIArgOffset(Offset + MemVT.getStoreSize());
       continue;
     }
     assert(VA.isRegLoc() && "Parameter must be in a register!");
@@ -1451,8 +1451,8 @@
     SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
     DebugLoc DL = MI.getDebugLoc();
     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOVK_I32))
-        .addOperand(MI.getOperand(0))
-        .addImm(MFI->LDSSize);
+      .addOperand(MI.getOperand(0))
+      .addImm(MFI->getLDSSize());
     MI.eraseFromParent();
     return BB;
   }
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -807,7 +807,7 @@
   }
 
   // Add FrameIndex to LDS offset
-  unsigned LDSOffset = MFI->LDSSize + (FrameOffset * WorkGroupSize);
+  unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
   BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
     .addImm(LDSOffset)
     .addReg(TIDReg);
Index: lib/Target/AMDGPU/SILowerControlFlow.cpp
===================================================================
--- lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -539,7 +539,7 @@
     }
   }
 
-  if (NeedFlat && MFI->IsKernel) {
+  if (NeedFlat && MFI->isKernel()) {
     // TODO: What to use with function calls?
     // We will need to Initialize the flat scratch register pair.
     if (NeedFlat)
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -28,7 +28,6 @@
 class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
   // FIXME: This should be removed and getPreloadedValue moved here.
   friend struct SIRegisterInfo;
-  void anchor() override;
 
   unsigned TIDReg;
 
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -26,9 +26,6 @@
   cl::ReallyHidden,
   cl::init(true));
 
-// Pin the vtable to this file.
-void SIMachineFunctionInfo::anchor() {}
-
 SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   : AMDGPUMachineFunction(MF),
     TIDReg(AMDGPU::NoRegister),
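A note for reviewers on the allocation scheme that allocateLDSGlobal now keeps in one place: each LDS global is assigned an offset the first time it is lowered, by rounding the running LDSSize up to the global's alignment; LDSSize is then bumped by the global's allocated size, so the padding cost depends on first-use order, which is what the in-code TODO about sorting refers to. The standalone C++ sketch below mirrors only that arithmetic; LDSAllocator, its local alignTo, and the sizes in main are hypothetical stand-ins for AMDGPUMachineFunction, llvm::alignTo, and the GlobalValue/DataLayout queries, not the patch's actual API.

// Standalone sketch of first-use-order LDS offset allocation with
// alignment padding, mirroring allocateLDSGlobal in the patch above.
#include <iostream>
#include <map>

// Stand-in for llvm::alignTo: round Value up to a multiple of Align.
static unsigned alignTo(unsigned Value, unsigned Align) {
  return (Value + Align - 1) / Align * Align;
}

struct LDSAllocator {
  std::map<const void *, unsigned> Offsets; // cached per-global offsets
  unsigned LDSSize = 0;                     // running high-water mark

  // Return the offset for GV, allocating it on first use.
  unsigned allocate(const void *GV, unsigned Size, unsigned Align) {
    auto It = Offsets.find(GV);
    if (It != Offsets.end())
      return It->second; // already placed by an earlier use

    // Pad up to the required alignment, then claim Size bytes.
    unsigned Offset = LDSSize = alignTo(LDSSize, Align);
    Offsets[GV] = Offset;
    LDSSize += Size;
    return Offset;
  }
};

int main() {
  LDSAllocator A;
  int G1, G2; // stand-ins for two LDS globals
  // A 1-byte global seen before a 16-byte-aligned one forces 15 bytes
  // of alignment padding; the reverse first-use order would need none.
  unsigned O1 = A.allocate(&G1, /*Size=*/1, /*Align=*/1);   // 0
  unsigned O2 = A.allocate(&G2, /*Size=*/16, /*Align=*/16); // 16
  std::cout << O1 << ' ' << O2 << ' ' << A.LDSSize << '\n'; // 0 16 32
}

Visiting G2 first would give offsets 0 and 16 and a total of 17 bytes instead of 32, which is the saving the TODO's sorting suggestion is after.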
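Similarly, for the ProgInfo.LDSBlocks computation in the AMDGPUAsmPrinter.cpp hunk: the LDS allocation is programmed in fixed-size hardware granules, so the byte total is rounded up to the granule size and converted to a granule count. Assuming the SI granule of 256 bytes (LDSAlignShift == 8; later generations use 512-byte granules), a kernel with ProgInfo.LDSSize = 1300 yields alignTo(1300, 256) >> 8 = 1536 >> 8 = 6 blocks.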