diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -710,8 +710,6 @@
     TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsShader);
   } else {
     CCInfo.AllocateReg(Info->getScratchRSrcReg());
-    CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
-    CCInfo.AllocateReg(Info->getFrameOffsetReg());
     TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1478,6 +1478,7 @@
 }
 
 std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
+  SDLoc DL(N);
   const MachineFunction &MF = CurDAG->getMachineFunction();
   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
 
@@ -1492,9 +1493,8 @@
   }
 
   // If we don't know this private access is a local stack object, it needs to
-  // be relative to the entry point's scratch wave offset register.
-  return std::make_pair(N, CurDAG->getRegister(Info->getScratchWaveOffsetReg(),
-                                               MVT::i32));
+  // be relative to the entry point's scratch wave offset.
+  return std::make_pair(N, CurDAG->getTargetConstant(0, DL, MVT::i32));
 }
 
 bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
@@ -1519,10 +1519,10 @@
     // In a call sequence, stores to the argument stack area are relative to the
     // stack pointer.
     const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
-    unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
-        Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();
-    SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32);
+    SOffset = isStackPtrRelative(PtrInfo)
+                  ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)
+                  : CurDAG->getTargetConstant(0, DL, MVT::i32);
     ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16);
     return true;
   }
@@ -1580,12 +1580,12 @@
   SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
 
   const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
-  unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
-      Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();
 
   // FIXME: Get from MachinePointerInfo? We should only be using the frame
   // offset if we know this is in a call sequence.
-  SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32);
+  SOffset = isStackPtrRelative(PtrInfo)
+                ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)
+                : CurDAG->getTargetConstant(0, DL, MVT::i32);
   Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
 
   return true;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2694,10 +2694,10 @@
         const MachineMemOperand *MMO = *MI->memoperands_begin();
         const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
 
-        Register SOffsetReg = isStackPtrRelative(PtrInfo)
-                                  ? Info->getStackPtrOffsetReg()
-                                  : Info->getScratchWaveOffsetReg();
-        MIB.addReg(SOffsetReg);
+        if (isStackPtrRelative(PtrInfo))
+          MIB.addReg(Info->getStackPtrOffsetReg());
+        else
+          MIB.addImm(0);
       },
       [=](MachineInstrBuilder &MIB) { // offset
         MIB.addImm(Offset & 4095);
@@ -2734,13 +2734,6 @@
     }
   }
 
-  // If we don't know this private access is a local stack object, it needs to
-  // be relative to the entry point's scratch wave offset register.
-  // TODO: Should split large offsets that don't fit like above.
-  // TODO: Don't use scratch wave offset just because the offset didn't fit.
-  Register SOffset = FI.hasValue() ? Info->getStackPtrOffsetReg()
-                                   : Info->getScratchWaveOffsetReg();
-
   return {{[=](MachineInstrBuilder &MIB) { // rsrc
              MIB.addReg(Info->getScratchRSrcReg());
            },
@@ -2751,7 +2744,15 @@
              MIB.addReg(VAddr);
            },
            [=](MachineInstrBuilder &MIB) { // soffset
-             MIB.addReg(SOffset);
+             // If we don't know this private access is a local stack object, it
+             // needs to be relative to the entry point's scratch wave offset.
+             // TODO: Should split large offsets that don't fit like above.
+             // TODO: Don't use scratch wave offset just because the offset
+             // didn't fit.
+             if (FI.hasValue())
+               MIB.addReg(Info->getStackPtrOffsetReg());
+             else
+               MIB.addImm(0);
            },
            [=](MachineInstrBuilder &MIB) { // offset
              MIB.addImm(Offset);
@@ -2789,15 +2790,17 @@
   const MachineMemOperand *MMO = *MI->memoperands_begin();
   const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
 
-  Register SOffsetReg = isStackPtrRelative(PtrInfo)
-                            ? Info->getStackPtrOffsetReg()
-                            : Info->getScratchWaveOffsetReg();
   return {{
-      [=](MachineInstrBuilder &MIB) {
+      [=](MachineInstrBuilder &MIB) { // rsrc
         MIB.addReg(Info->getScratchRSrcReg());
-      }, // rsrc
-      [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffsetReg); }, // soffset
-      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
+      },
+      [=](MachineInstrBuilder &MIB) { // soffset
+        if (isStackPtrRelative(PtrInfo))
+          MIB.addReg(Info->getStackPtrOffsetReg());
+        else
+          MIB.addImm(0);
+      },
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
   }};
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1065,7 +1065,6 @@
   };
 
   if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
-      parseRegister(YamlMFI.ScratchWaveOffsetReg, MFI->ScratchWaveOffsetReg) ||
       parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
       parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
     return true;
@@ -1075,11 +1074,6 @@
     return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
   }
 
-  if (MFI->ScratchWaveOffsetReg != AMDGPU::SCRATCH_WAVE_OFFSET_REG &&
-      !AMDGPU::SGPR_32RegClass.contains(MFI->ScratchWaveOffsetReg)) {
-    return diagnoseRegisterClass(YamlMFI.ScratchWaveOffsetReg);
-  }
-
   if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
       !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
     return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -296,7 +296,6 @@
   switch (RegNo) {
   case AMDGPU::FP_REG:
   case AMDGPU::SP_REG:
-  case AMDGPU::SCRATCH_WAVE_OFFSET_REG:
   case AMDGPU::PRIVATE_RSRC_REG:
     llvm_unreachable("pseudo-register should not ever be emitted");
   case AMDGPU::SCC:
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -613,8 +613,7 @@
   // Sanity check that this is a stack access.
   // FIXME: Should probably use stack pseudos before frame lowering.
   MachineOperand *SOff = TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
-  if (!SOff->isReg() || (SOff->getReg() != MFI->getScratchWaveOffsetReg() &&
-                         SOff->getReg() != MFI->getStackPtrOffsetReg()))
+  if (!SOff->isReg() || SOff->getReg() != MFI->getStackPtrOffsetReg())
     return;
 
   if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -64,14 +64,14 @@
   Register getEntryFunctionReservedScratchRsrcReg(MachineFunction &MF) const;
 
   Register
-  getEntryFunctionReservedScratchWaveOffsetReg(MachineFunction &MF) const;
-
-  void emitEntryFunctionScratchRsrcRegSetup(MachineFunction &MF,
-                                            MachineBasicBlock &MBB,
-                                            MachineBasicBlock::iterator I,
-                                            const DebugLoc &DL,
-                                            Register PreloadedPrivateBufferReg,
-                                            Register ScratchRsrcReg) const;
+  getEntryFunctionScratchRsrcRegSavedWord(MachineFunction &MF,
+                                          Register ScratchRsrcReg) const;
+
+  void emitEntryFunctionScratchRsrcRegSetup(
+      MachineFunction &MF, MachineBasicBlock &MBB,
+      MachineBasicBlock::iterator I, const DebugLoc &DL,
+      Register PreloadedPrivateBufferReg, Register ScratchRsrcReg,
+      Register ScratchWaveOffsetReg) const;
 
 public:
   bool hasFP(const MachineFunction &MF) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -313,76 +313,6 @@
   return ScratchRsrcReg;
 }
 
-// Shift down registers reserved for the scratch wave offset.
-Register SIFrameLowering::getEntryFunctionReservedScratchWaveOffsetReg(
-    MachineFunction &MF) const {
-
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  const SIInstrInfo *TII = ST.getInstrInfo();
-  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-
-  assert(MFI->isEntryFunction());
-
-  Register ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
-
-  if (ScratchWaveOffsetReg == AMDGPU::NoRegister ||
-      (!MRI.isPhysRegUsed(ScratchWaveOffsetReg) && !hasFP(MF) &&
-       !MFI->hasFlatScratchInit())) {
-    assert(!hasFP(MF) && !MFI->hasFlatScratchInit());
-    return AMDGPU::NoRegister;
-  }
-
-  if (ST.hasSGPRInitBug() ||
-      ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF))
-    return ScratchWaveOffsetReg;
-
-  unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
-
-  ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
-  if (NumPreloaded > AllSGPRs.size())
-    return ScratchWaveOffsetReg;
-
-  AllSGPRs = AllSGPRs.slice(NumPreloaded);
-
-  // We need to drop register from the end of the list that we cannot use
-  // for the scratch wave offset.
-  // + 2 s102 and s103 do not exist on VI.
-  // + 2 for vcc
-  // + 2 for xnack_mask
-  // + 2 for flat_scratch
-  // + 4 for registers reserved for scratch resource register
-  // + 1 for register reserved for scratch wave offset. (By exluding this
-  //     register from the list to consider, it means that when this
-  //     register is being used for the scratch wave offset and there
-  //     are no other free SGPRs, then the value will stay in this register.
-  // + 1 if stack pointer is used.
-  // ----
-  // 13 (+1)
-  unsigned ReservedRegCount = 13;
-
-  if (AllSGPRs.size() < ReservedRegCount)
-    return ScratchWaveOffsetReg;
-
-  for (MCPhysReg Reg : AllSGPRs.drop_back(ReservedRegCount)) {
-    // Pick the first unallocated SGPR. Be careful not to pick an alias of the
-    // scratch descriptor, since we haven't added its uses yet.
-    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
-      MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
-      if (MFI->getScratchWaveOffsetReg() == MFI->getStackPtrOffsetReg()) {
-        assert(!hasFP(MF));
-        MFI->setStackPtrOffsetReg(Reg);
-      }
-      MFI->setScratchWaveOffsetReg(Reg);
-      MFI->setFrameOffsetReg(Reg);
-      return Reg;
-    }
-  }
-
-  return ScratchWaveOffsetReg;
-}
-
 void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                                 MachineBasicBlock &MBB) const {
   assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
@@ -401,128 +331,135 @@
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
-  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   const Function &F = MF.getFunction();
 
   assert(MFI->isEntryFunction());
 
-  // We need to do the replacement of the private segment buffer and wave offset
-  // register even if there are no stack objects. There could be stores to undef
-  // or a constant without an associated object.
-  //
-  // These calls will return `AMDGPU::NoRegister` in cases where there are no
-  // actual uses of the respective registers.
-  Register ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
-  Register ScratchWaveOffsetReg =
-      getEntryFunctionReservedScratchWaveOffsetReg(MF);
-
-  // Make the selected registers live throughout the function.
-  for (MachineBasicBlock &OtherBB : MF) {
-    if (&OtherBB == &MBB)
-      continue;
-
-    if (ScratchWaveOffsetReg != AMDGPU::NoRegister)
-      OtherBB.addLiveIn(ScratchWaveOffsetReg);
-
-    if (ScratchRsrcReg != AMDGPU::NoRegister)
-      OtherBB.addLiveIn(ScratchRsrcReg);
-  }
-
-  // Now that we have fixed the reserved registers we need to locate the
-  // (potentially) preloaded registers. We should always have a preloaded
-  // scratch wave offset register, but we only have a preloaded scratch rsrc
-  // register for HSA.
-  Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
+  Register ScratchWaveOffsetReg = MFI->getPreloadedReg(
       AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
 
   // FIXME: Hack to not crash in situations which emitted an error.
-  if (PreloadedScratchWaveOffsetReg == AMDGPU::NoRegister)
+  if (ScratchWaveOffsetReg == AMDGPU::NoRegister)
     return;
 
-  // We added live-ins during argument lowering, but since they were not used
-  // they were deleted. We're adding the uses now, so add them back.
-  MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
-  MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
+  // We need to do the replacement of the private segment buffer register even
+  // if there are no stack objects. There could be stores to undef or a
+  // constant without an associated object.
+  //
+  // This will return `AMDGPU::NoRegister` in cases where there are no actual
+  // uses of the SRSRC.
+  Register ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
 
+  // Make the selected register live throughout the function.
+  if (ScratchRsrcReg != AMDGPU::NoRegister)
+    for (MachineBasicBlock &OtherBB : MF)
+      if (&OtherBB != &MBB)
+        OtherBB.addLiveIn(ScratchRsrcReg);
+
+  // Now that we have fixed the reserved SRSRC we need to locate the
+  // (potentially) preloaded SRSRC.
   Register PreloadedScratchRsrcReg = AMDGPU::NoRegister;
   if (ST.isAmdHsaOrMesa(F)) {
     PreloadedScratchRsrcReg =
         MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
     if (ScratchRsrcReg != AMDGPU::NoRegister &&
         PreloadedScratchRsrcReg != AMDGPU::NoRegister) {
+      // We added live-ins during argument lowering, but since they were not
+      // used they were deleted. We're adding the uses now, so add them back.
       MRI.addLiveIn(PreloadedScratchRsrcReg);
       MBB.addLiveIn(PreloadedScratchRsrcReg);
     }
   }
 
+  // Debug location must be unknown since the first debug location is used to
+  // determine the end of the prologue.
   DebugLoc DL;
   MachineBasicBlock::iterator I = MBB.begin();
 
-  const bool HasFP = hasFP(MF);
-
-  // If we are not HSA or we happened to reserved the original input registers,
-  // we don't need to copy to the reserved registers.
-  const bool CopyBuffer = ST.isAmdHsaOrMesa(F) &&
-                          ScratchRsrcReg != AMDGPU::NoRegister &&
-                          PreloadedScratchRsrcReg != AMDGPU::NoRegister &&
-                          ScratchRsrcReg != PreloadedScratchRsrcReg;
-
-  // This needs to be careful of the copying order to avoid overwriting one of
-  // the input registers before it's been copied to it's final
-  // destination. Usually the offset should be copied first.
-  const bool CopyBufferFirst =
-      TRI->isSubRegisterEq(PreloadedScratchRsrcReg, ScratchWaveOffsetReg);
-
-  if (CopyBuffer && CopyBufferFirst) {
-    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
-        .addReg(PreloadedScratchRsrcReg, RegState::Kill);
+  // FIXME: Currently there is no reason to use a frame pointer at all in the
+  // prologue as it can always be replaced with an inline constant 0. As no
+  // other target seems to have this property there are some nontrivial changes
+  // to things like TargetRegisterInfo::getFrameRegister which would be
+  // required to implement this, so for now we always ensure we have a frame
+  // register.
+  unsigned SPReg = MFI->getStackPtrOffsetReg();
+  assert(SPReg != AMDGPU::SP_REG);
+  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
+      .addImm(MF.getFrameInfo().hasCalls()
+                  ? MF.getFrameInfo().getStackSize() * ST.getWavefrontSize()
+                  : 0);
+
+  if (hasFP(MF)) {
+    unsigned FPReg = MFI->getFrameOffsetReg();
+    assert(FPReg != AMDGPU::FP_REG);
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
   }
 
-  if (ScratchWaveOffsetReg != AMDGPU::NoRegister) {
-    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
-        .addReg(PreloadedScratchWaveOffsetReg, HasFP ? RegState::Kill : 0);
+  if (MFI->hasFlatScratchInit() || ScratchRsrcReg != AMDGPU::NoRegister) {
+    MRI.addLiveIn(ScratchWaveOffsetReg);
+    MBB.addLiveIn(ScratchWaveOffsetReg);
   }
 
-  if (CopyBuffer && !CopyBufferFirst) {
-    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
-        .addReg(PreloadedScratchRsrcReg, RegState::Kill);
+  if (MFI->hasFlatScratchInit()) {
+    emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
   }
 
-  // FIXME: This should also implement the setup path for HSA.
   if (ScratchRsrcReg != AMDGPU::NoRegister) {
-    emitEntryFunctionScratchRsrcRegSetup(
-        MF, MBB, I, DL, PreloadedScratchRsrcReg, ScratchRsrcReg);
+    emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
+                                         PreloadedScratchRsrcReg,
+                                         ScratchRsrcReg, ScratchWaveOffsetReg);
   }
+}
 
-  if (HasFP) {
-    const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
-    int64_t StackSize = FrameInfo.getStackSize();
+// Find an SGPR to save part of the ScratchRsrcReg during setup for HSA.
+Register SIFrameLowering::getEntryFunctionScratchRsrcRegSavedWord(
+    MachineFunction &MF, Register ScratchRsrcReg) const {
 
-    Register SPReg = MFI->getStackPtrOffsetReg();
-    assert(SPReg != AMDGPU::SP_REG);
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
 
-    // On kernel entry, the private scratch wave offset is the SP value.
-    if (StackSize == 0) {
-      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), SPReg)
-          .addReg(MFI->getScratchWaveOffsetReg());
-    } else {
-      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), SPReg)
-          .addReg(MFI->getScratchWaveOffsetReg())
-          .addImm(StackSize * ST.getWavefrontSize());
-    }
-  }
+  assert(MFI->isEntryFunction());
 
-  if (MFI->hasFlatScratchInit()) {
-    emitEntryFunctionFlatScratchInit(MF, MBB, I, DL,
-                                     MFI->getScratchWaveOffsetReg());
+  ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
+  // We need to drop registers from the front of the list that we cannot use.
+  unsigned NumPreloadedSGPRs = MFI->getNumPreloadedSGPRs();
+  // We need to drop registers from the end of the list that we cannot use.
+  // + 2 s102 and s103 do not exist on VI.
+  // + 2 for vcc
+  // + 2 for xnack_mask
+  // + 2 for flat_scratch
+  // + 4 for registers reserved for scratch resource register
+  // + 1 if stack pointer is used.
+  // ----
+  // 13
+  unsigned NumReservedSGPRs = 13;
+
+  if (NumPreloadedSGPRs + NumReservedSGPRs >= AllSGPRs.size())
+    return AMDGPU::NoRegister;
+
+  ArrayRef<MCPhysReg> UnusedSGPRs =
+      getAllSGPRs(ST, MF).slice(NumPreloadedSGPRs).drop_back(NumReservedSGPRs);
+
+  for (MCPhysReg SGPR : UnusedSGPRs) {
+    // Even after avoiding both the preloaded SGPRs at the front of the list
+    // and the reserved SGPRs at the end we need to make sure we avoid the
+    // ScratchRsrcReg we are in the process of setting up. Nothing else is live
+    // in the kernel prologue which we need to avoid.
+    if (!TRI->isSubRegisterEq(ScratchRsrcReg, SGPR) && MRI.isAllocatable(SGPR))
+      return SGPR;
   }
+
+  return AMDGPU::NoRegister;
 }
 
-// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoRegister`
 void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
     MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
     const DebugLoc &DL, Register PreloadedScratchRsrcReg,
-    Register ScratchRsrcReg) const {
+    Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {
 
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
@@ -646,6 +583,53 @@
     BuildMI(MBB, I, DL, SMovB32, Rsrc3)
         .addImm(Rsrc23 >> 32)
         .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+  } else if (ST.isAmdHsaOrMesa(Fn)) {
+    assert(PreloadedScratchRsrcReg != AMDGPU::NoRegister);
+
+    Register SavedWord =
+        getEntryFunctionScratchRsrcRegSavedWord(MF, ScratchRsrcReg);
+    if (!SavedWord)
+      report_fatal_error("Cannot scavenge SGPR to set up SRSRC");
+
+    if (ScratchRsrcReg != PreloadedScratchRsrcReg)
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
+          .addReg(PreloadedScratchRsrcReg, RegState::Kill);
+
+    Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
+    Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
+    Register ScratchRsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
+
+    // Save and restore SRSRC bits [48:63]. We only want to update the base
+    // address in bits [0:47].
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_B32), SavedWord)
+        .addReg(ScratchRsrcSub1)
+        .addImm(0xffff << 0x10);
+
+    if (MFI->hasFlatScratchInit() && ST.flatScratchIsPointer()) {
+      // We have already initialized FLAT_SCRATCH with the same pointer we want
+      // in the SRSRC.
+      Register FlatScratchPointerReg =
+          ST.getGeneration() >= AMDGPUSubtarget::GFX10
+              ? MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT)
+              : AMDGPU::FLAT_SCR;
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), ScratchRsrcLo)
+          .addReg(FlatScratchPointerReg);
+    } else {
+      // We are either not using FLAT_SCRATCH, or it is not a pointer.
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
+          .addReg(ScratchRsrcSub0)
+          .addReg(ScratchWaveOffsetReg);
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
+          .addReg(ScratchRsrcSub1)
+          .addImm(0);
+    }
+
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_B32), ScratchRsrcSub1)
+        .addReg(ScratchRsrcSub1)
+        .addImm(0xffff);
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_OR_B32), ScratchRsrcSub1)
+        .addReg(ScratchRsrcSub1)
+        .addReg(SavedWord, RegState::Kill);
   }
 }
 
@@ -1121,8 +1105,7 @@
   if (MFI.getStackSize() != 0)
     return true;
 
-  // For the entry point, the input wave scratch offset must be copied to the
-  // API SP if there are calls.
+  // The entry point must set up the ABI SP if there are calls.
   if (MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction())
     return true;
 }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1898,68 +1898,36 @@
     Info.setScratchRSrcReg(ReservedBufferReg);
   }
 
-  // hasFP should be accurate for kernels even before the frame is finalized.
-  if (ST.getFrameLowering()->hasFP(MF)) {
-    MachineRegisterInfo &MRI = MF.getRegInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
 
-    // Try to use s32 as the SP, but move it if it would interfere with input
-    // arguments. This won't work with calls though.
-    //
-    // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
-    // registers.
-    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
-      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
-    } else {
-      assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
+  // Try to use s32 as the SP, but move it if it would interfere with input
+  // arguments. This won't work with calls though.
+  //
+  // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
+  // registers.
+  if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
+    Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
+  } else {
+    assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
 
-      if (MFI.hasCalls())
-        report_fatal_error("call in graphics shader with too many input SGPRs");
+    if (MFI.hasCalls())
+      report_fatal_error("call in graphics shader with too many input SGPRs");
 
-      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
-        if (!MRI.isLiveIn(Reg)) {
-          Info.setStackPtrOffsetReg(Reg);
-          break;
-        }
+    for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
+      if (!MRI.isLiveIn(Reg)) {
+        Info.setStackPtrOffsetReg(Reg);
+        break;
       }
-
-      if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
-        report_fatal_error("failed to find register for SP");
     }
 
-    if (MFI.hasCalls()) {
-      Info.setScratchWaveOffsetReg(AMDGPU::SGPR33);
-      Info.setFrameOffsetReg(AMDGPU::SGPR33);
-    } else {
-      unsigned ReservedOffsetReg =
-          TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
-      Info.setScratchWaveOffsetReg(ReservedOffsetReg);
-      Info.setFrameOffsetReg(ReservedOffsetReg);
-    }
-  } else if (RequiresStackAccess) {
-    assert(!MFI.hasCalls());
-    // We know there are accesses and they will be done relative to SP, so just
-    // pin it to the input.
-    //
-    // FIXME: Should not do this if inline asm is reading/writing these
-    // registers.
-    Register PreloadedSP = Info.getPreloadedReg(
-        AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
-
-    Info.setStackPtrOffsetReg(PreloadedSP);
-    Info.setScratchWaveOffsetReg(PreloadedSP);
-    Info.setFrameOffsetReg(PreloadedSP);
-  } else {
-    assert(!MFI.hasCalls());
-
-    // There may not be stack access at all. There may still be spills, or
-    // access of a constant pointer (in which cases an extra copy will be
-    // emitted in the prolog).
-    unsigned ReservedOffsetReg
-        = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
-    Info.setStackPtrOffsetReg(ReservedOffsetReg);
-    Info.setScratchWaveOffsetReg(ReservedOffsetReg);
-    Info.setFrameOffsetReg(ReservedOffsetReg);
+    if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
+      report_fatal_error("failed to find register for SP");
   }
+
+  if (ST.getFrameLowering()->hasFP(MF))
+    Info.setFrameOffsetReg(AMDGPU::SGPR34);
+  else
+    Info.setFrameOffsetReg(Info.getStackPtrOffsetReg());
 }
 
 bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
@@ -2213,8 +2181,6 @@
     allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
   } else {
     CCInfo.AllocateReg(Info->getScratchRSrcReg());
-    CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
-    CCInfo.AllocateReg(Info->getFrameOffsetReg());
     allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
   }
 
@@ -10612,11 +10578,6 @@
   if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
     MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
 
-  if (Info->getScratchWaveOffsetReg() != AMDGPU::SCRATCH_WAVE_OFFSET_REG) {
-    MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
-                       Info->getScratchWaveOffsetReg());
-  }
-
   Info->limitOccupancy(MF);
 
   if (ST.isWave32() && !MF.empty()) {
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -284,7 +284,6 @@
   uint32_t HighBitsOf32BitAddress = 0;
 
   StringValue ScratchRSrcReg = "$private_rsrc_reg";
-  StringValue ScratchWaveOffsetReg = "$scratch_wave_offset_reg";
   StringValue FrameOffsetReg = "$fp_reg";
   StringValue StackPtrOffsetReg = "$sp_reg";
 
@@ -311,8 +310,6 @@
     YamlIO.mapOptional("waveLimiter", MFI.WaveLimiter, false);
     YamlIO.mapOptional("scratchRSrcReg", MFI.ScratchRSrcReg,
                        StringValue("$private_rsrc_reg"));
-    YamlIO.mapOptional("scratchWaveOffsetReg", MFI.ScratchWaveOffsetReg,
-                       StringValue("$scratch_wave_offset_reg"));
     YamlIO.mapOptional("frameOffsetReg", MFI.FrameOffsetReg,
                        StringValue("$fp_reg"));
     YamlIO.mapOptional("stackPtrOffsetReg", MFI.StackPtrOffsetReg,
@@ -336,14 +333,15 @@
   // Registers that may be reserved for spilling purposes. These may be the same
   // as the input registers.
   unsigned ScratchRSrcReg = AMDGPU::PRIVATE_RSRC_REG;
-  unsigned ScratchWaveOffsetReg = AMDGPU::SCRATCH_WAVE_OFFSET_REG;
 
-  // This is the current function's incremented size from the kernel's scratch
-  // wave offset register. For an entry function, this is exactly the same as
-  // the ScratchWaveOffsetReg.
+  // This is the unswizzled offset from the current dispatch's scratch wave
+  // base to the beginning of the current function's frame. For an entry
+  // function, this is 0.
   unsigned FrameOffsetReg = AMDGPU::FP_REG;
 
-  // Top of the stack SGPR offset derived from the ScratchWaveOffsetReg.
+  // This is an ABI register used in the non-entry calling convention to
+  // communicate the unswizzled offset from the current dispatch's scratch wave
+  // base to the beginning of the new function's frame.
   unsigned StackPtrOffsetReg = AMDGPU::SP_REG;
 
   AMDGPUFunctionArgInfo ArgInfo;
@@ -713,10 +711,6 @@
     ScratchRSrcReg = Reg;
   }
 
-  unsigned getScratchWaveOffsetReg() const {
-    return ScratchWaveOffsetReg;
-  }
-
   unsigned getFrameOffsetReg() const {
     return FrameOffsetReg;
   }
@@ -739,11 +733,6 @@
     return StackPtrOffsetReg;
   }
 
-  void setScratchWaveOffsetReg(unsigned Reg) {
-    assert(Reg != 0 && "Should never be unset");
-    ScratchWaveOffsetReg = Reg;
-  }
-
   unsigned getQueuePtrUserSGPR() const {
     return ArgInfo.QueuePtr.getRegister();
   }
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -68,7 +68,6 @@
     // Non-entry functions have no special inputs for now, other registers
     // required for scratch access.
     ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
-    ScratchWaveOffsetReg = AMDGPU::SGPR33;
 
     // TODO: Pick a high register, and shift down, similar to a kernel.
     FrameOffsetReg = AMDGPU::SGPR34;
@@ -76,8 +75,6 @@
 
     ArgInfo.PrivateSegmentBuffer =
         ArgDescriptor::createRegister(ScratchRSrcReg);
-    ArgInfo.PrivateSegmentWaveByteOffset =
-        ArgDescriptor::createRegister(ScratchWaveOffsetReg);
 
     if (F.hasFnAttribute("amdgpu-implicitarg-ptr"))
       ImplicitArgPtr = true;
@@ -487,7 +484,6 @@
       WaveLimiter(MFI.needsWaveLimiter()),
       HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
       ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
-      ScratchWaveOffsetReg(regToString(MFI.getScratchWaveOffsetReg(), TRI)),
      FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)),
      StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)),
      ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)),
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -50,11 +50,6 @@
   /// spilling is needed.
   unsigned reservedPrivateSegmentBufferReg(const MachineFunction &MF) const;
 
-  /// Return the end register initially reserved for the scratch wave offset in
-  /// case spilling is needed.
-  unsigned reservedPrivateSegmentWaveByteOffsetReg(
-      const MachineFunction &MF) const;
-
   BitVector getReservedRegs(const MachineFunction &MF) const override;
 
   const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -177,29 +177,6 @@
   return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass);
 }
 
-static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) {
-  unsigned Reg;
-
-  // Try to place it in a hole after PrivateSegmentBufferReg.
-  if (RegCount & 3) {
-    // We cannot put the segment buffer in (Idx - 4) ... (Idx - 1) due to
-    // alignment constraints, so we have a hole where can put the wave offset.
-    Reg = RegCount - 1;
-  } else {
-    // We can put the segment buffer in (Idx - 4) ... (Idx - 1) and put the
-    // wave offset before it.
-    Reg = RegCount - 5;
-  }
-
-  return Reg;
-}
-
-unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
-    const MachineFunction &MF) const {
-  unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF));
-  return AMDGPU::SGPR_32RegClass.getRegister(Reg);
-}
-
 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
 
@@ -279,19 +256,12 @@
 
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
-  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
-  if (ScratchWaveOffsetReg != AMDGPU::NoRegister) {
-    // Reserve 1 SGPR for scratch wave offset in case we need to spill.
-    reserveRegisterTuples(Reserved, ScratchWaveOffsetReg);
-  }
-
   unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
   if (ScratchRSrcReg != AMDGPU::NoRegister) {
     // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need
     // to spill.
     // TODO: May need to reserve a VGPR if doing LDS spilling.
     reserveRegisterTuples(Reserved, ScratchRSrcReg);
-    assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
   }
 
   // We have to assume the SP is needed in case there are calls in the function,
@@ -825,8 +795,7 @@
   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
 
   assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() &&
-                         SuperReg != MFI->getFrameOffsetReg() &&
-                         SuperReg != MFI->getScratchWaveOffsetReg()));
+                         SuperReg != MFI->getFrameOffsetReg()));
 
   assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
 
@@ -1135,33 +1104,22 @@
     bool IsMUBUF = TII->isMUBUF(*MI);
 
     if (!IsMUBUF && !MFI->isEntryFunction()) {
-      // Convert to an absolute stack address by finding the offset from the
-      // scratch wave base and scaling by the wave size.
+      // Convert to a swizzled stack address by scaling by the wave size.
       //
-      // In an entry function/kernel the offset is already the absolute
+      // In an entry function/kernel the offset is already the swizzled
       // address relative to the frame register.
 
-      Register TmpDiffReg =
-          RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
-
-      // If there's no free SGPR, in-place modify the FP
-      Register DiffReg = TmpDiffReg.isValid() ? TmpDiffReg : FrameReg;
-
       bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
-      Register ResultReg = IsCopy ?
-          MI->getOperand(0).getReg() :
-          RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
-
-      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg)
-          .addReg(FrameReg)
-          .addReg(MFI->getScratchWaveOffsetReg());
+      Register ResultReg =
+          IsCopy ? MI->getOperand(0).getReg()
+                 : RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
 
       int64_t Offset = FrameInfo.getObjectOffset(Index);
       if (Offset == 0) {
         // XXX - This never happens because of emergency scavenging slot at 0?
         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
             .addImm(ST.getWavefrontSizeLog2())
-            .addReg(DiffReg);
+            .addReg(FrameReg);
       } else {
         if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) {
           Register ScaledReg =
@@ -1170,7 +1128,7 @@
 
           BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ScaledReg)
               .addImm(ST.getWavefrontSizeLog2())
-              .addReg(DiffReg, RegState::Kill);
+              .addReg(FrameReg);
 
           const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
@@ -1207,10 +1165,10 @@
           // unavailable. Only one additional mov is needed.
           Register TmpScaledReg =
              RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
-          Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : DiffReg;
+          Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
 
           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
-              .addReg(DiffReg, RegState::Kill)
+              .addReg(FrameReg)
               .addImm(ST.getWavefrontSizeLog2());
           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), ScaledReg)
               .addReg(ScaledReg, RegState::Kill)
@@ -1224,19 +1182,12 @@
               .addReg(ScaledReg, RegState::Kill)
              .addImm(Offset);
           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
-              .addReg(DiffReg, RegState::Kill)
+              .addReg(FrameReg)
               .addImm(ST.getWavefrontSizeLog2());
         }
       }
     }
 
-    if (!TmpDiffReg.isValid()) {
-      // Restore the FP.
-      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), FrameReg)
-          .addReg(FrameReg)
-          .addReg(MFI->getScratchWaveOffsetReg());
-    }
-
     // Don't introduce an extra copy if we're just materializing in a mov.
     if (IsCopy)
       MI->eraseFromParent();
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -101,7 +101,6 @@
 def PRIVATE_RSRC_REG : SIReg<"private_rsrc", 0>;
 def FP_REG : SIReg<"fp", 0>;
 def SP_REG : SIReg<"sp", 0>;
-def SCRATCH_WAVE_OFFSET_REG : SIReg<"scratch_wave_offset", 0>;
 
 // Pseudo-register to represent the program-counter DWARF register.
 def PC_REG : SIReg<"pc", 0>, DwarfRegNum<[16]> {
@@ -435,7 +434,7 @@
 //===----------------------------------------------------------------------===//
 
 def Pseudo_SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
-  (add FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG)> {
+  (add FP_REG, SP_REG)> {
   let isAllocatable = 0;
   let CopyCost = -1;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/cc-update-scavenge-fail.ll b/llvm/test/CodeGen/AMDGPU/cc-update-scavenge-fail.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/cc-update-scavenge-fail.ll
@@ -0,0 +1,15 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 %s
+
+; XFAIL: *
+
+define amdgpu_kernel void @test_kern_srsrc_saved_word_scavenge_fail() #1 {
+entry:
+  ; Stack use to force us to init the SRSRC
+  %x = alloca i32, align 4, addrspace(5)
+  store volatile i32 0, i32 addrspace(5)* %x, align 4
+  ret void
+}
+
+attributes #1 = { nounwind "amdgpu-num-sgpr"="27" }
diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll
@@ -0,0 +1,201 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 %s | FileCheck --check-prefix=GFX803 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 %s | FileCheck --check-prefix=GFX900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 %s | FileCheck --check-prefix=GFX1010 %s
+
+define amdgpu_kernel void @test_kern_empty() local_unnamed_addr #0 {
+; GFX803-LABEL: test_kern_empty:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_mov_b32 s32, 0
+; GFX803-NEXT:    s_endpgm
+;
+; GFX900-LABEL: test_kern_empty:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_mov_b32 s32, 0
+; GFX900-NEXT:    s_endpgm
+;
+; GFX1010-LABEL: test_kern_empty:
+; GFX1010:       ; %bb.0: ; %entry
+; GFX1010-NEXT:    s_mov_b32 s32, 0
+; GFX1010-NEXT:    s_endpgm
+entry:
+  ret void
+}
+
+define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 {
+; GFX803-LABEL: test_kern_stack:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_add_u32 s4, s4, s7
+; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; GFX803-NEXT:    s_and_b32 s8, s1, 0xffff0000
+; GFX803-NEXT:    s_add_u32 s0, s0, s7
+; GFX803-NEXT:    s_addc_u32 s1, s1, 0
+; GFX803-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX803-NEXT:    s_mov_b32 s32, 0
+; GFX803-NEXT:    s_or_b32 s1, s1, s8
+; GFX803-NEXT:    v_mov_b32_e32 v0, 0
+; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
+; GFX803-NEXT:    s_endpgm
+;
+; GFX900-LABEL: test_kern_stack:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
+; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
+; GFX900-NEXT:    s_and_b32 s8, s1, 0xffff0000
+; GFX900-NEXT:    s_mov_b64 s[0:1], flat_scratch
+; GFX900-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX900-NEXT:    s_mov_b32 s32, 0
+; GFX900-NEXT:    s_or_b32 s1, s1, s8
+; GFX900-NEXT:    v_mov_b32_e32 v0, 0
+; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
+; GFX900-NEXT:    s_endpgm
+;
+; GFX1010-LABEL: test_kern_stack:
+; GFX1010:       ; %bb.0: ; %entry
+; GFX1010-NEXT:    s_add_u32 s4, s4, s7
+; GFX1010-NEXT:    s_mov_b32 s32, 0
+; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
+; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
+; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
+; GFX1010-NEXT:    s_and_b32 s8, s1, 0xffff0000
+; GFX1010-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1010-NEXT:    ; implicit-def: $vcc_hi
+; GFX1010-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX1010-NEXT:    s_or_b32 s1, s1, s8
+; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
+; GFX1010-NEXT:    s_endpgm
+entry:
+  %x = alloca i32, align 4, addrspace(5)
+  store volatile i32 0, i32 addrspace(5)* %x, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
+; GFX803-LABEL: test_kern_call:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_add_u32 s4, s4, s7
+; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; GFX803-NEXT:    s_and_b32 s8, s1, 0xffff0000
+; GFX803-NEXT:    s_add_u32 s0, s0, s7
+; GFX803-NEXT:    s_addc_u32 s1, s1, 0
+; GFX803-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX803-NEXT:    s_or_b32 s1, s1, s8
+; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; GFX803-NEXT:    s_getpc_b64 s[4:5]
+; GFX803-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
+; GFX803-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+4
+; GFX803-NEXT:    s_mov_b32 s32, 0
+; GFX803-NEXT:    s_mov_b32 s34, 0
+; GFX803-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX803-NEXT:    s_endpgm
+;
+; GFX900-LABEL: test_kern_call:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
+; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
+; GFX900-NEXT:    s_and_b32 s8, s1, 0xffff0000
+; GFX900-NEXT:    s_mov_b64 s[0:1], flat_scratch
+; GFX900-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX900-NEXT:    s_or_b32 s1, s1, s8
+; GFX900-NEXT:    s_getpc_b64 s[4:5]
+; GFX900-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
+; GFX900-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+4
+; GFX900-NEXT:    s_mov_b32 s32, 0
+; GFX900-NEXT:    s_mov_b32 s34, 0
+; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX900-NEXT:    s_endpgm
+;
+; GFX1010-LABEL: test_kern_call:
+; GFX1010:       ; %bb.0: ; %entry
+; GFX1010-NEXT:    s_add_u32 s4, s4, s7
+; GFX1010-NEXT:    s_mov_b32 s32, 0
+; GFX1010-NEXT:    s_mov_b32 s34, 0
+; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
+; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
+; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
+; GFX1010-NEXT:    s_and_b32 s8, s1, 0xffff0000
+; GFX1010-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; GFX1010-NEXT:    ; implicit-def: $vcc_hi
+; GFX1010-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX1010-NEXT:    s_or_b32 s1, s1, s8
+; GFX1010-NEXT:    s_getpc_b64 s[4:5]
+; GFX1010-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
+; GFX1010-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+4
+; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX1010-NEXT:    s_endpgm
+entry:
+  tail call void @ex() #0
+  ret void
+}
+
+define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
+; GFX803-LABEL: test_kern_stack_and_call:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_add_u32 s4, s4, s7
+; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; GFX803-NEXT:    s_and_b32 s8, s1, 0xffff0000
+; GFX803-NEXT:    s_add_u32 s0, s0, s7
+; GFX803-NEXT:    s_addc_u32 s1, s1, 0
+; GFX803-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX803-NEXT:    s_mov_b32 s34, 0
+; GFX803-NEXT:    s_or_b32 s1, s1, s8
+; GFX803-NEXT:    v_mov_b32_e32 v0, 0
+; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; GFX803-NEXT:    s_getpc_b64 s[4:5]
+; GFX803-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
+; GFX803-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+4
+; GFX803-NEXT:    s_movk_i32 s32, 0x400
+; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], s34 offset:4
+; GFX803-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX803-NEXT:    s_endpgm
+;
+; GFX900-LABEL: test_kern_stack_and_call:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
+; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
+; GFX900-NEXT:    s_and_b32 s8, s1, 0xffff0000
+; GFX900-NEXT:    s_mov_b64 s[0:1], flat_scratch
+; GFX900-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX900-NEXT:    s_or_b32 s1, s1, s8
+; GFX900-NEXT:    s_mov_b32 s34, 0
+; GFX900-NEXT:    v_mov_b32_e32 v0, 0
+; GFX900-NEXT:    s_getpc_b64 s[4:5]
+; GFX900-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
+; GFX900-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+4
+; GFX900-NEXT:    s_movk_i32 s32, 0x400
+; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s34 offset:4
+; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX900-NEXT:    s_endpgm
+;
+; GFX1010-LABEL: test_kern_stack_and_call:
+; GFX1010:       ; %bb.0: ; %entry
+; GFX1010-NEXT:    s_add_u32 s4, s4, s7
+; GFX1010-NEXT:    s_movk_i32 s32, 0x200
+; GFX1010-NEXT:    s_mov_b32 s34, 0
+; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
+; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
+; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
+; GFX1010-NEXT:    s_and_b32 s8, s1, 0xffff0000
+; GFX1010-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1010-NEXT:    ; implicit-def: $vcc_hi
+; GFX1010-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX1010-NEXT:    s_or_b32 s1, s1, s8
+; GFX1010-NEXT:    s_getpc_b64 s[4:5]
+; GFX1010-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
+; GFX1010-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+4
+; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], s34 offset:4
+; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX1010-NEXT:    s_endpgm
+entry:
+  %x = alloca i32, align 4, addrspace(5)
+  store volatile i32 0, i32 addrspace(5)* %x, align 4
+  tail call void @ex() #0
+  ret void
+}
+
+declare hidden void @ex() local_unnamed_addr #0
+
+attributes #0 = { nounwind }
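
Reviewer note, not part of the patch: the new HSA setup path in emitEntryFunctionScratchRsrcRegSetup updates the scratch buffer descriptor with S_AND_B32 / S_ADD_U32 / S_ADDC_U32 / S_OR_B32. The standalone C++ sketch below models that word-level arithmetic for illustration only; the function name and sample values are invented, and only the instruction sequence mirrors the patch. It folds the wave scratch offset into the 48-bit base address spanning SRSRC words 0-1 while preserving descriptor bits [48:63], the part the prologue parks in the scavenged SavedWord SGPR.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Hypothetical model of the prologue's SRSRC base update. Sub0 and Sub1 are
// the low two 32-bit words of the descriptor: base bits [0:47] span Sub0 and
// the low half of Sub1, and bits [48:63] hold unrelated descriptor fields.
static void foldWaveOffsetIntoRsrc(uint32_t &Sub0, uint32_t &Sub1,
                                   uint32_t WaveOffset) {
  uint32_t Saved = Sub1 & 0xffff0000u;       // S_AND_B32 saved, sub1, 0xffff0000
  uint64_t Lo = (uint64_t)Sub0 + WaveOffset; // S_ADD_U32 sub0, sub0, soffset
  Sub0 = (uint32_t)Lo;
  Sub1 += (uint32_t)(Lo >> 32);              // S_ADDC_U32 sub1, sub1, 0 (carry)
  Sub1 &= 0xffffu;                           // S_AND_B32 sub1, sub1, 0xffff
  Sub1 |= Saved;                             // S_OR_B32 sub1, sub1, saved
}

int main() {
  uint32_t Sub0 = 0xffffff00u; // base bits [0:31]
  uint32_t Sub1 = 0x00040004u; // base bits [32:47] = 4, descriptor bits = 0x0004
  foldWaveOffsetIntoRsrc(Sub0, Sub1, 0x200u);
  // Base advanced from 0x4ffffff00 to 0x500000100; bits [48:63] are untouched.
  assert(Sub0 == 0x00000100u && Sub1 == 0x00040005u);
  printf("sub0=0x%08x sub1=0x%08x\n", Sub0, Sub1);
  return 0;
}

Running the sketch shows the carry out of word 0 propagating into base bits [32:47] while bits [48:63] come back unchanged, which is exactly why the save and restore around the add is needed.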