Index: llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td +++ llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td @@ -100,6 +100,10 @@ GCCBuiltin<"__builtin_amdgcn_dispatch_id">, Intrinsic<[llvm_i64_ty], [], [IntrNoMem]>; +def int_amdgcn_implicit_buffer_ptr : + GCCBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">, + Intrinsic<[LLVMQualPointerType], [], [IntrNoMem]>; + //===----------------------------------------------------------------------===// // Instruction Intrinsics //===----------------------------------------------------------------------===// Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -140,7 +140,7 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() { const AMDGPUSubtarget &STM = MF->getSubtarget(); SIProgramInfo KernelInfo; - if (STM.isAmdCodeObjectV2()) { + if (STM.isAmdCodeObjectV2(*MF)) { getSIProgramInfo(KernelInfo, *MF); EmitAmdKernelCodeT(*MF, KernelInfo); } @@ -149,7 +149,7 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { const SIMachineFunctionInfo *MFI = MF->getInfo(); const AMDGPUSubtarget &STM = MF->getSubtarget(); - if (MFI->isKernel() && STM.isAmdCodeObjectV2()) { + if (MFI->isKernel() && STM.isAmdCodeObjectV2(*MF)) { AMDGPUTargetStreamer *TS = static_cast(OutStreamer->getTargetStreamer()); SmallString<128> SymbolName; @@ -779,7 +779,7 @@ // FIXME: Should use getKernArgSize header.kernarg_segment_byte_size = - STM.getKernArgSegmentSize(MFI->getABIArgOffset()); + STM.getKernArgSegmentSize(MF, MFI->getABIArgOffset()); header.wavefront_sgpr_count = KernelInfo.NumSGPR; header.workitem_vgpr_count = KernelInfo.NumVGPR; header.workitem_private_segment_byte_size = KernelInfo.ScratchSize; Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -313,22 +313,31 @@ return EnableXNACK; } - bool isAmdCodeObjectV2() const { - return isAmdHsaOS() || isMesa3DOS(); + bool isMesaKernel(const MachineFunction &MF) const { + return isMesa3DOS() && !AMDGPU::isShader(MF.getFunction()->getCallingConv()); + } + + // Covers VS/PS/CS graphics shaders + bool isMesaGfxShader(const MachineFunction &MF) const { + return isMesa3DOS() && AMDGPU::isShader(MF.getFunction()->getCallingConv()); + } + + bool isAmdCodeObjectV2(const MachineFunction &MF) const { + return isAmdHsaOS() || isMesaKernel(MF); } /// \brief Returns the offset in bytes from the start of the input buffer /// of the first explicit kernel argument. - unsigned getExplicitKernelArgOffset() const { - return isAmdCodeObjectV2() ? 0 : 36; + unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const { + return isAmdCodeObjectV2(MF) ? 0 : 36; } unsigned getAlignmentForImplicitArgPtr() const { return isAmdHsaOS() ? 8 : 4; } - unsigned getImplicitArgNumBytes() const { - if (isMesa3DOS()) + unsigned getImplicitArgNumBytes(const MachineFunction &MF) const { + if (isMesaKernel(MF)) return 16; if (isAmdHsaOS() && isOpenCLEnv()) return 32; @@ -595,7 +604,7 @@ return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS; } - unsigned getKernArgSegmentSize(unsigned ExplictArgBytes) const; + unsigned getKernArgSegmentSize(const MachineFunction &MF, unsigned ExplictArgBytes) const; /// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -299,8 +299,9 @@ return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv()); } -unsigned SISubtarget::getKernArgSegmentSize(unsigned ExplicitArgBytes) const { - unsigned ImplicitBytes = getImplicitArgNumBytes(); +unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF, + unsigned ExplicitArgBytes) const { + unsigned ImplicitBytes = getImplicitArgNumBytes(MF); if (ImplicitBytes == 0) return ExplicitArgBytes; Index: llvm/trunk/lib/Target/AMDGPU/R600ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/R600ISelLowering.cpp +++ llvm/trunk/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -1587,7 +1587,7 @@ unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset(); unsigned PartOffset = VA.getLocMemOffset(); - unsigned Offset = Subtarget->getExplicitKernelArgOffset() + VA.getLocMemOffset(); + unsigned Offset = Subtarget->getExplicitKernelArgOffset(MF) + VA.getLocMemOffset(); MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase); SDValue Arg = DAG.getLoad( Index: llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -237,7 +237,7 @@ unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister; - if (ST.isAmdCodeObjectV2()) { + if (ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF)) { PreloadedPrivateBufferReg = TRI->getPreloadedValue( MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); } @@ -255,7 +255,7 @@ } if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) { - assert(ST.isAmdCodeObjectV2()); + assert(ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF)); MRI.addLiveIn(PreloadedPrivateBufferReg); MBB.addLiveIn(PreloadedPrivateBufferReg); } @@ -280,6 +280,7 @@ bool CopyBuffer = ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister && + ST.isAmdCodeObjectV2(MF) && ScratchRsrcReg != PreloadedPrivateBufferReg; // This needs to be careful of the copying order to avoid overwriting one of @@ -303,24 +304,57 @@ .addReg(PreloadedPrivateBufferReg, RegState::Kill); } - if (ResourceRegUsed && PreloadedPrivateBufferReg == AMDGPU::NoRegister) { - assert(!ST.isAmdCodeObjectV2()); + if (ResourceRegUsed && (ST.isMesaGfxShader(MF) || (PreloadedPrivateBufferReg == AMDGPU::NoRegister))) { + assert(!ST.isAmdCodeObjectV2(MF)); const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); - unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); - unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); // Use relocations to get the pointer, and setup the other bits manually. uint64_t Rsrc23 = TII->getScratchRsrcWords23(); - BuildMI(MBB, I, DL, SMovB32, Rsrc0) - .addExternalSymbol("SCRATCH_RSRC_DWORD0") - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - BuildMI(MBB, I, DL, SMovB32, Rsrc1) - .addExternalSymbol("SCRATCH_RSRC_DWORD1") - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + if (MFI->hasPrivateMemoryInputPtr()) { + unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); + + if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) { + const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64); + + BuildMI(MBB, I, DL, Mov64, Rsrc01) + .addReg(PreloadedPrivateBufferReg) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + } else { + const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM); + + PointerType *PtrTy = + PointerType::get(Type::getInt64Ty(MF.getFunction()->getContext()), + AMDGPUAS::CONSTANT_ADDRESS); + MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); + auto MMO = MF.getMachineMemOperand(PtrInfo, + MachineMemOperand::MOLoad | + MachineMemOperand::MOInvariant | + MachineMemOperand::MODereferenceable, + 0, 0); + BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01) + .addReg(PreloadedPrivateBufferReg) + .addImm(0) // offset + .addImm(0) // glc + .addMemOperand(MMO) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + } + } else { + unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); + unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); + + BuildMI(MBB, I, DL, SMovB32, Rsrc0) + .addExternalSymbol("SCRATCH_RSRC_DWORD0") + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, SMovB32, Rsrc1) + .addExternalSymbol("SCRATCH_RSRC_DWORD1") + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + } BuildMI(MBB, I, DL, SMovB32, Rsrc2) .addImm(Rsrc23 & 0xffffffff) Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp @@ -891,7 +891,7 @@ if (!AMDGPU::isShader(CallConv)) { assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()); } else { - assert(!Info->hasPrivateSegmentBuffer() && !Info->hasDispatchPtr() && + assert(!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() && !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && @@ -899,6 +899,12 @@ !Info->hasWorkItemIDZ()); } + if (Info->hasPrivateMemoryInputPtr()) { + unsigned PrivateMemoryPtrReg = Info->addPrivateMemoryPtr(*TRI); + MF.addLiveIn(PrivateMemoryPtrReg, &AMDGPU::SReg_64RegClass); + CCInfo.AllocateReg(PrivateMemoryPtrReg); + } + // FIXME: How should these inputs interact with inreg / custom SGPR inputs? if (Info->hasPrivateSegmentBuffer()) { unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI); @@ -956,7 +962,7 @@ if (VA.isMemLoc()) { VT = Ins[i].VT; EVT MemVT = VA.getLocVT(); - const unsigned Offset = Subtarget->getExplicitKernelArgOffset() + + const unsigned Offset = Subtarget->getExplicitKernelArgOffset(MF) + VA.getLocMemOffset(); // The first 36 bytes of the input buffer contains information about // thread group and global sizes. @@ -1080,7 +1086,7 @@ if (getTargetMachine().getOptLevel() == CodeGenOpt::None) HasStackObjects = true; - if (ST.isAmdCodeObjectV2()) { + if (ST.isAmdCodeObjectV2(MF)) { if (HasStackObjects) { // If we have stack objects, we unquestionably need the private buffer // resource. For the Code Object V2 ABI, this will be the first 4 user @@ -2504,9 +2510,13 @@ // TODO: Should this propagate fast-math-flags? switch (IntrinsicID) { + case Intrinsic::amdgcn_implicit_buffer_ptr: { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); + return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT); + } case Intrinsic::amdgcn_dispatch_ptr: case Intrinsic::amdgcn_queue_ptr: { - if (!Subtarget->isAmdCodeObjectV2()) { + if (!Subtarget->isAmdCodeObjectV2(MF)) { DiagnosticInfoUnsupported BadIntrin( *MF.getFunction(), "unsupported hsa intrinsic without hsa target", DL.getDebugLoc()); Index: llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -87,6 +87,9 @@ unsigned ScratchRSrcReg; unsigned ScratchWaveOffsetReg; + // Input registers for non-HSA ABI + unsigned PrivateMemoryPtrUserSGPR; + // Input registers setup for the HSA ABI. // User SGPRs in allocation order. unsigned PrivateSegmentBufferUserSGPR; @@ -166,6 +169,11 @@ bool WorkItemIDY : 1; bool WorkItemIDZ : 1; + // Private memory buffer + // Compute directly in sgpr[0:1] + // Other shaders indirect 64-bits at sgpr[0:1] + bool PrivateMemoryInputPtr : 1; + MCPhysReg getNextUserSGPR() const { assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs"); return AMDGPU::SGPR0 + NumUserSGPRs; @@ -204,6 +212,7 @@ unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI); unsigned addDispatchID(const SIRegisterInfo &TRI); unsigned addFlatScratchInit(const SIRegisterInfo &TRI); + unsigned addPrivateMemoryPtr(const SIRegisterInfo &TRI); // Add system SGPRs. unsigned addWorkGroupIDX() { @@ -308,6 +317,10 @@ return WorkItemIDZ; } + bool hasPrivateMemoryInputPtr() const { + return PrivateMemoryInputPtr; + } + unsigned getNumUserSGPRs() const { return NumUserSGPRs; } @@ -344,6 +357,10 @@ return QueuePtrUserSGPR; } + unsigned getPrivateMemoryPtrUserSGPR() const { + return PrivateMemoryPtrUserSGPR; + } + bool hasSpilledSGPRs() const { return HasSpilledSGPRs; } Index: llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -77,7 +77,8 @@ PrivateSegmentWaveByteOffset(false), WorkItemIDX(false), WorkItemIDY(false), - WorkItemIDZ(false) { + WorkItemIDZ(false), + PrivateMemoryInputPtr(false) { const SISubtarget &ST = MF.getSubtarget(); const Function *F = MF.getFunction(); @@ -114,7 +115,7 @@ if (HasStackObjects || MaySpill) PrivateSegmentWaveByteOffset = true; - if (ST.isAmdCodeObjectV2()) { + if (ST.isAmdCodeObjectV2(MF)) { if (HasStackObjects || MaySpill) PrivateSegmentBuffer = true; @@ -126,6 +127,9 @@ if (F->hasFnAttribute("amdgpu-dispatch-id")) DispatchID = true; + } else if (ST.isMesaGfxShader(MF)) { + if (HasStackObjects || MaySpill) + PrivateMemoryInputPtr = true; } // We don't need to worry about accessing spills with flat instructions. @@ -182,6 +186,13 @@ return FlatScratchInitUserSGPR; } +unsigned SIMachineFunctionInfo::addPrivateMemoryPtr(const SIRegisterInfo &TRI) { + PrivateMemoryPtrUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return PrivateMemoryPtrUserSGPR; +} + SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg ( MachineFunction *MF, unsigned FrameIndex, Index: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1108,10 +1108,12 @@ case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET: return MFI->PrivateSegmentWaveByteOffsetSystemSGPR; case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER: - assert(ST.isAmdCodeObjectV2() && - "Non-CodeObjectV2 ABI currently uses relocations"); - assert(MFI->hasPrivateSegmentBuffer()); - return MFI->PrivateSegmentBufferUserSGPR; + if (ST.isAmdCodeObjectV2(MF)) { + assert(MFI->hasPrivateSegmentBuffer()); + return MFI->PrivateSegmentBufferUserSGPR; + } + assert(MFI->hasPrivateMemoryInputPtr()); + return MFI->PrivateMemoryPtrUserSGPR; case SIRegisterInfo::KERNARG_SEGMENT_PTR: assert(MFI->hasKernargSegmentPtr()); return MFI->KernargSegmentPtrUserSGPR;