Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1361,10 +1361,10 @@
     SDValue TFI =
         CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
-    // If we can resolve this to a frame index access, this is relative to the
-    // frame pointer SGPR.
-    return std::make_pair(TFI, CurDAG->getRegister(Info->getFrameOffsetReg(),
-                                                   MVT::i32));
+    // If we can resolve this to a frame index access, this will be relative to
+    // either the stack or frame pointer SGPR.
+    return std::make_pair(
+        TFI, CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32));
   }
 
   // If we don't know this private access is a local stack object, it needs to
Index: lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
+++ lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
@@ -83,6 +83,9 @@
 }
 
 unsigned SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  const SIFrameLowering *TFI =
+      MF.getSubtarget<GCNSubtarget>().getFrameLowering();
   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
-  return FuncInfo->getFrameOffsetReg();
+  return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
+                        : FuncInfo->getStackPtrOffsetReg();
 }
Index: lib/Target/AMDGPU/SIFrameLowering.h
===================================================================
--- lib/Target/AMDGPU/SIFrameLowering.h
+++ lib/Target/AMDGPU/SIFrameLowering.h
@@ -58,12 +58,9 @@
       SIMachineFunctionInfo *MFI,
       MachineFunction &MF) const;
 
-  std::pair<unsigned, unsigned> getReservedPrivateSegmentWaveByteOffsetReg(
-      const GCNSubtarget &ST,
-      const SIInstrInfo *TII,
-      const SIRegisterInfo *TRI,
-      SIMachineFunctionInfo *MFI,
-      MachineFunction &MF) const;
+  unsigned getReservedPrivateSegmentWaveByteOffsetReg(
+      const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI,
+      SIMachineFunctionInfo *MFI, MachineFunction &MF) const;
 
   // Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set.
   void emitEntryFunctionScratchSetup(const GCNSubtarget &ST, MachineFunction &MF,
@@ -73,7 +70,6 @@
 public:
   bool hasFP(const MachineFunction &MF) const override;
-  bool hasSP(const MachineFunction &MF) const;
 };
 
 } // end namespace llvm
Index: lib/Target/AMDGPU/SIFrameLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIFrameLowering.cpp
+++ lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -164,34 +164,29 @@
   return ScratchRsrcReg;
 }
 
-// Shift down registers reserved for the scratch wave offset and stack pointer
-// SGPRs.
-std::pair<unsigned, unsigned>
-SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
-    const GCNSubtarget &ST,
-    const SIInstrInfo *TII,
-    const SIRegisterInfo *TRI,
-    SIMachineFunctionInfo *MFI,
-    MachineFunction &MF) const {
+// Shift down registers reserved for the scratch wave offset.
+unsigned SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
+    const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI,
+    SIMachineFunctionInfo *MFI, MachineFunction &MF) const {
   MachineRegisterInfo &MRI = MF.getRegInfo();
   unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
 
+  assert(MFI->isEntryFunction());
+
+  // No replacement necessary.
   if (ScratchWaveOffsetReg == AMDGPU::NoRegister ||
-      !MRI.isPhysRegUsed(ScratchWaveOffsetReg)) {
-    assert(MFI->getStackPtrOffsetReg() == AMDGPU::SP_REG);
-    return std::make_pair(AMDGPU::NoRegister, AMDGPU::NoRegister);
+      (!hasFP(MF) && !MRI.isPhysRegUsed(ScratchWaveOffsetReg))) {
+    return AMDGPU::NoRegister;
   }
 
-  unsigned SPReg = MFI->getStackPtrOffsetReg();
   if (ST.hasSGPRInitBug())
-    return std::make_pair(ScratchWaveOffsetReg, SPReg);
+    return ScratchWaveOffsetReg;
 
   unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
   ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
 
   if (NumPreloaded > AllSGPRs.size())
-    return std::make_pair(ScratchWaveOffsetReg, SPReg);
+    return ScratchWaveOffsetReg;
 
   AllSGPRs = AllSGPRs.slice(NumPreloaded);
@@ -212,7 +207,7 @@
   unsigned ReservedRegCount = 13;
 
   if (AllSGPRs.size() < ReservedRegCount)
-    return std::make_pair(ScratchWaveOffsetReg, SPReg);
+    return ScratchWaveOffsetReg;
 
   bool HandledScratchWaveOffsetReg =
     ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
@@ -225,14 +220,20 @@
         HandledScratchWaveOffsetReg = true;
 
         MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
+        if (MFI->getScratchWaveOffsetReg() == MFI->getStackPtrOffsetReg()) {
+          assert(!hasFP(MF));
+          MFI->setStackPtrOffsetReg(Reg);
+        }
+
+        MFI->setScratchWaveOffsetReg(Reg);
+        MFI->setFrameOffsetReg(Reg);
         ScratchWaveOffsetReg = Reg;
         break;
       }
     }
   }
 
-  return std::make_pair(ScratchWaveOffsetReg, SPReg);
+  return ScratchWaveOffsetReg;
 }
 
 void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
@@ -265,38 +266,11 @@
   if (MFI->hasFlatScratchInit())
     emitFlatScratchInit(ST, MF, MBB);
 
-  unsigned SPReg = MFI->getStackPtrOffsetReg();
-  if (SPReg != AMDGPU::SP_REG) {
-    assert(MRI.isReserved(SPReg) && "SPReg used but not reserved");
-
-    DebugLoc DL;
-    const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
-    int64_t StackSize = FrameInfo.getStackSize();
-
-    if (StackSize == 0) {
-      BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::COPY), SPReg)
-        .addReg(MFI->getScratchWaveOffsetReg());
-    } else {
-      BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::S_ADD_U32), SPReg)
-        .addReg(MFI->getScratchWaveOffsetReg())
-        .addImm(StackSize * ST.getWavefrontSize());
-    }
-  }
-
   unsigned ScratchRsrcReg
     = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF);
 
-  unsigned ScratchWaveOffsetReg;
-  std::tie(ScratchWaveOffsetReg, SPReg)
    = getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);
-
-  // It's possible to have uses of only ScratchWaveOffsetReg without
-  // ScratchRsrcReg if it's only used for the initialization of flat_scratch,
-  // but the inverse is not true.
-  if (ScratchWaveOffsetReg == AMDGPU::NoRegister) {
-    assert(ScratchRsrcReg == AMDGPU::NoRegister);
-    return;
-  }
+  unsigned ScratchWaveOffsetReg =
+      getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);
 
   // We need to insert initialization of the scratch resource descriptor.
   unsigned PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
@@ -308,18 +282,19 @@
       AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
   }
 
-  bool OffsetRegUsed = MRI.isPhysRegUsed(ScratchWaveOffsetReg);
+  bool OffsetRegUsed = ScratchWaveOffsetReg != AMDGPU::NoRegister &&
+                       MRI.isPhysRegUsed(ScratchWaveOffsetReg);
   bool ResourceRegUsed = ScratchRsrcReg != AMDGPU::NoRegister &&
                          MRI.isPhysRegUsed(ScratchRsrcReg);
 
+  // FIXME: Hack to not crash in situations which emitted an error.
+  if (PreloadedScratchWaveOffsetReg == AMDGPU::NoRegister)
+    return;
+
   // We added live-ins during argument lowering, but since they were not used
   // they were deleted.
   // We're adding the uses now, so add them back.
-  if (OffsetRegUsed) {
-    assert(PreloadedScratchWaveOffsetReg != AMDGPU::NoRegister &&
-           "scratch wave offset input is required");
-    MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
-    MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
-  }
+  MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
+  MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
 
   if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
     assert(ST.isAmdHsaOrMesa(F) || ST.isMesaGfxShader(F));
@@ -360,11 +335,16 @@
       .addReg(PreloadedPrivateBufferReg, RegState::Kill);
   }
 
-  if (OffsetRegUsed &&
-      PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
+  unsigned SPReg = MFI->getStackPtrOffsetReg();
+  assert(SPReg != AMDGPU::SP_REG);
+
+  // FIXME: Remove the isPhysRegUsed checks
+  const bool HasFP = hasFP(MF);
+
+  if (HasFP || OffsetRegUsed) {
+    assert(ScratchWaveOffsetReg);
     BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
-      .addReg(PreloadedScratchWaveOffsetReg,
-              MRI.isPhysRegUsed(ScratchWaveOffsetReg) ? 0 : RegState::Kill);
+      .addReg(PreloadedScratchWaveOffsetReg, HasFP ? RegState::Kill : 0);
   }
 
   if (CopyBuffer && !CopyBufferFirst) {
@@ -372,9 +352,26 @@
       .addReg(PreloadedPrivateBufferReg, RegState::Kill);
   }
 
-  if (ResourceRegUsed)
+  if (ResourceRegUsed) {
     emitEntryFunctionScratchSetup(ST, MF, MBB, MFI, I,
       PreloadedPrivateBufferReg, ScratchRsrcReg);
+  }
+
+  if (HasFP) {
+    DebugLoc DL;
+    const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+    int64_t StackSize = FrameInfo.getStackSize();
+
+    // On kernel entry, the private scratch wave offset is the SP value.
+    if (StackSize == 0) {
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), SPReg)
+        .addReg(MFI->getScratchWaveOffsetReg());
+    } else {
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), SPReg)
+        .addReg(MFI->getScratchWaveOffsetReg())
+        .addImm(StackSize * ST.getWavefrontSize());
+    }
+  }
 }
 
 // Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set.
@@ -567,15 +564,12 @@
   MachineBasicBlock::iterator MBBI = MBB.begin();
   DebugLoc DL;
 
-  // XXX - Is this the right predicate?
-
-  bool NeedFP = hasFP(MF);
+  bool HasFP = false;
   uint32_t NumBytes = MFI.getStackSize();
   uint32_t RoundedSize = NumBytes;
-  const bool NeedsRealignment = TRI.needsStackRealignment(MF);
-  if (NeedsRealignment) {
-    assert(NeedFP);
+  if (TRI.needsStackRealignment(MF)) {
+    HasFP = true;
     const unsigned Alignment = MFI.getMaxAlignment();
 
     RoundedSize += Alignment;
@@ -599,7 +593,7 @@
       .addImm(-Alignment * ST.getWavefrontSize())
       .setMIFlag(MachineInstr::FrameSetup);
     FuncInfo->setIsStackRealigned(true);
-  } else if (NeedFP) {
+  } else if ((HasFP = hasFP(MF))) {
     // If we need a base pointer, set it up here. It's whatever the value of
     // the stack pointer is at this point. Any variable size objects will be
    // allocated after this, so we can still use the base pointer to reference
@@ -609,7 +603,7 @@
       .setMIFlag(MachineInstr::FrameSetup);
   }
 
-  if (RoundedSize != 0 && hasSP(MF)) {
+  if (HasFP && RoundedSize != 0) {
     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
       .addReg(StackPtrReg)
       .addImm(RoundedSize * ST.getWavefrontSize())
       .setMIFlag(MachineInstr::FrameSetup);
@@ -693,23 +687,17 @@
       .addReg(ScratchExecCopy);
   }
 
-  unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
-  if (StackPtrReg == AMDGPU::NoRegister)
-    return;
-
-  const MachineFrameInfo &MFI = MF.getFrameInfo();
-  uint32_t NumBytes = MFI.getStackSize();
-
-  // FIXME: Clarify distinction between no set SP and SP. For callee functions,
-  // it's really whether we need SP to be accurate or not.
-
-  if (NumBytes != 0 && hasSP(MF)) {
+  if (hasFP(MF)) {
+    const MachineFrameInfo &MFI = MF.getFrameInfo();
+    uint32_t NumBytes = MFI.getStackSize();
     uint32_t RoundedSize = FuncInfo->isStackRealigned() ?
       NumBytes + MFI.getMaxAlignment() : NumBytes;
 
+    const unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
       .addReg(StackPtrReg)
-      .addImm(RoundedSize * ST.getWavefrontSize());
+      .addImm(RoundedSize * ST.getWavefrontSize())
+      .setMIFlag(MachineInstr::FrameDestroy);
   }
 }
 
@@ -849,18 +837,25 @@
 }
 
 bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
-  // All stack operations are relative to the frame offset SGPR.
-  // TODO: Still want to eliminate sometimes.
   const MachineFrameInfo &MFI = MF.getFrameInfo();
+  if (MFI.hasCalls()) {
+    // All offsets are unsigned, so need to be addressed in the same direction
+    // as stack growth.
+    if (MFI.getStackSize() != 0)
+      return true;
+
+    // For the entry point, the input wave scratch offset must be copied to the
+    // API SP if there are calls.
+    if (MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction())
+      return true;
+
+    // Retain behavior of always omitting the FP for leaf functions when
+    // possible.
+    if (MF.getTarget().Options.DisableFramePointerElim(MF))
+      return true;
+  }
 
-  // XXX - Is this only called after frame is finalized? Should be able to check
-  // frame size.
-  return MFI.hasStackObjects() && !allStackObjectsAreDead(MFI);
-}
-
-bool SIFrameLowering::hasSP(const MachineFunction &MF) const {
-  const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
-  // All stack operations are relative to the frame offset SGPR.
-  const MachineFrameInfo &MFI = MF.getFrameInfo();
-  return MFI.hasCalls() || MFI.hasVarSizedObjects() || TRI->needsStackRealignment(MF);
+  return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
+         MFI.hasStackMap() || MFI.hasPatchPoint() ||
+         MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->needsStackRealignment(MF);
 }
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1770,6 +1770,7 @@
   // should reserve the arguments and use them directly.
   MachineFrameInfo &MFI = MF.getFrameInfo();
   bool HasStackObjects = MFI.hasStackObjects();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
 
   // Record that we know we have non-spill stack objects so we don't need to
   // check all stack objects later.
@@ -1785,65 +1786,85 @@
   // the scratch registers to pass in.
   bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
 
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  if (ST.isAmdHsaOrMesa(MF.getFunction())) {
-    if (RequiresStackAccess) {
-      // If we have stack objects, we unquestionably need the private buffer
-      // resource. For the Code Object V2 ABI, this will be the first 4 user
-      // SGPR inputs. We can reserve those and use them directly.
-
-      unsigned PrivateSegmentBufferReg = Info.getPreloadedReg(
-        AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
-      Info.setScratchRSrcReg(PrivateSegmentBufferReg);
-
-      if (MFI.hasCalls()) {
-        // If we have calls, we need to keep the frame register in a register
-        // that won't be clobbered by a call, so ensure it is copied somewhere.
-
-        // This is not a problem for the scratch wave offset, because the same
-        // registers are reserved in all functions.
-
-        // FIXME: Nothing is really ensuring this is a call preserved register,
-        // it's just selected from the end so it happens to be.
-        unsigned ReservedOffsetReg
-          = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
-        Info.setScratchWaveOffsetReg(ReservedOffsetReg);
-      } else {
-        unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg(
-          AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
-        Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
-      }
-    } else {
-      unsigned ReservedBufferReg
-        = TRI.reservedPrivateSegmentBufferReg(MF);
-      unsigned ReservedOffsetReg
-        = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
-
-      // We tentatively reserve the last registers (skipping the last two
-      // which may contain VCC). After register allocation, we'll replace
-      // these with the ones immediately after those which were really
-      // allocated. In the prologue copies will be inserted from the argument
-      // to these reserved registers.
-      Info.setScratchRSrcReg(ReservedBufferReg);
-      Info.setScratchWaveOffsetReg(ReservedOffsetReg);
-    }
+  if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
+    // If we have stack objects, we unquestionably need the private buffer
+    // resource. For the Code Object V2 ABI, this will be the first 4 user
+    // SGPR inputs. We can reserve those and use them directly.
+
+    unsigned PrivateSegmentBufferReg =
+        Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
+    Info.setScratchRSrcReg(PrivateSegmentBufferReg);
   } else {
     unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
+    // We tentatively reserve the last registers (skipping the last registers
+    // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
+    // we'll replace these with the ones immediately after those which were
+    // really allocated. In the prologue copies will be inserted from the
+    // argument to these reserved registers.
 
     // Without HSA, relocations are used for the scratch pointer and the
     // buffer resource setup is always inserted in the prologue. Scratch wave
     // offset is still in an input SGPR.
     Info.setScratchRSrcReg(ReservedBufferReg);
+  }
 
-    if (HasStackObjects && !MFI.hasCalls()) {
-      unsigned ScratchWaveOffsetReg = Info.getPreloadedReg(
-        AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
-      Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
+  // This should be accurate for kernels even before the frame is finalized.
+  const bool HasFP = ST.getFrameLowering()->hasFP(MF);
+  if (HasFP) {
+    unsigned ReservedOffsetReg =
+        TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+
+    // Try to use s32 as the SP, but move it if it would interfere with input
+    // arguments. This won't work with calls though.
+    //
+    // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
+    // registers.
+    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
+      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
     } else {
-      unsigned ReservedOffsetReg
-        = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
-      Info.setScratchWaveOffsetReg(ReservedOffsetReg);
+      assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
+
+      if (MFI.hasCalls())
+        report_fatal_error("call in graphics shader with too many input SGPRs");
+
+      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
+        if (!MRI.isLiveIn(Reg)) {
+          Info.setStackPtrOffsetReg(Reg);
+          break;
+        }
+      }
+
+      if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
+        report_fatal_error("failed to find register for SP");
     }
+
+    Info.setScratchWaveOffsetReg(ReservedOffsetReg);
+    Info.setFrameOffsetReg(ReservedOffsetReg);
+  } else if (RequiresStackAccess) {
+    assert(!MFI.hasCalls());
+    // We know there are accesses and they will be done relative to SP, so just
+    // pin it to the input.
+    //
+    // FIXME: Should not do this if inline asm is reading/writing these
+    // registers.
+    unsigned PreloadedSP = Info.getPreloadedReg(
+        AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+
+    Info.setStackPtrOffsetReg(PreloadedSP);
+    Info.setScratchWaveOffsetReg(PreloadedSP);
+    Info.setFrameOffsetReg(PreloadedSP);
+  } else {
+    assert(!MFI.hasCalls());
+
+    // There may not be stack access at all. There may still be spills, or
+    // access of a constant pointer (in which cases an extra copy will be
+    // emitted in the prolog).
+    unsigned ReservedOffsetReg
+      = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
+    Info.setStackPtrOffsetReg(ReservedOffsetReg);
+    Info.setScratchWaveOffsetReg(ReservedOffsetReg);
+    Info.setFrameOffsetReg(ReservedOffsetReg);
   }
 }
 
@@ -9939,7 +9960,6 @@
 void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
   MachineRegisterInfo &MRI = MF.getRegInfo();
   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
-  const MachineFrameInfo &MFI = MF.getFrameInfo();
   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
 
   if (Info->isEntryFunction()) {
     reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
   }
 
-  // We have to assume the SP is needed in case there are calls in the function
-  // during lowering. Calls are only detected after the function is
-  // lowered. We're about to reserve registers, so don't bother using it if we
-  // aren't really going to use it.
-  bool NeedSP = !Info->isEntryFunction() ||
-    MFI.hasVarSizedObjects() ||
-    MFI.hasCalls();
-
-  if (NeedSP) {
-    unsigned ReservedStackPtrOffsetReg = TRI->reservedStackPtrOffsetReg(MF);
-    Info->setStackPtrOffsetReg(ReservedStackPtrOffsetReg);
-
-    assert(Info->getStackPtrOffsetReg() != Info->getFrameOffsetReg());
-    assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
-                               Info->getStackPtrOffsetReg()));
-    if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
-      MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
-  }
+  assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
+                             Info->getStackPtrOffsetReg()));
+  if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
+    MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
 
   // We need to worry about replacing the default register with itself in case
   // of MIR testcases missing the MFI.
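Note (not part of the patch): a minimal sketch of the base-register choice the source changes above converge on, assuming the SIFrameLowering::hasFP and SIRegisterInfo::getFrameRegister definitions from the hunks above; the helper name is made up for illustration.

// Sketch: which SGPR a private/scratch access is addressed from after this
// change. An FP is only materialized when it is actually needed (calls with a
// non-empty frame, variable-sized objects, stack realignment, ...); otherwise
// everything is addressed directly off the stack pointer SGPR.
static unsigned pickScratchBaseReg(const MachineFunction &MF) {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const SIFrameLowering *TFI =
      MF.getSubtarget<GCNSubtarget>().getFrameLowering();
  return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
                        : FuncInfo->getStackPtrOffsetReg();
}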
Index: lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.cpp +++ lib/Target/AMDGPU/SIInstrInfo.cpp @@ -928,7 +928,7 @@ .addFrameIndex(FrameIndex) // addr .addMemOperand(MMO) .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) - .addReg(MFI->getFrameOffsetReg(), RegState::Implicit); + .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit); // Add the scratch resource registers as implicit uses because we may end up // needing them, and need to ensure that the reserved registers are // correctly handled. @@ -950,7 +950,7 @@ .addReg(SrcReg, getKillRegState(isKill)) // data .addFrameIndex(FrameIndex) // addr .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc - .addReg(MFI->getFrameOffsetReg()) // scratch_offset + .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset .addImm(0) // offset .addMemOperand(MMO); } @@ -1032,7 +1032,7 @@ .addFrameIndex(FrameIndex) // addr .addMemOperand(MMO) .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) - .addReg(MFI->getFrameOffsetReg(), RegState::Implicit); + .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit); if (ST.hasScalarStores()) { // m0 is used for offset to scalar stores if used to spill. @@ -1046,10 +1046,10 @@ unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize); BuildMI(MBB, MI, DL, get(Opcode), DestReg) - .addFrameIndex(FrameIndex) // vaddr - .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc - .addReg(MFI->getFrameOffsetReg()) // scratch_offset - .addImm(0) // offset + .addFrameIndex(FrameIndex) // vaddr + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset + .addImm(0) // offset .addMemOperand(MMO); } Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h =================================================================== --- lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -444,7 +444,8 @@ } unsigned getPreloadedReg(AMDGPUFunctionArgInfo::PreloadedValue Value) const { - return ArgInfo.getPreloadedValue(Value).first->getRegister(); + auto Arg = ArgInfo.getPreloadedValue(Value).first; + return Arg ? 
Arg->getRegister() : 0; } unsigned getGITPtrHigh() const { @@ -486,6 +487,11 @@ return FrameOffsetReg; } + void setFrameOffsetReg(unsigned Reg) { + assert(Reg != 0 && "Should never be unset"); + FrameOffsetReg = Reg; + } + void setStackPtrOffsetReg(unsigned Reg) { assert(Reg != 0 && "Should never be unset"); StackPtrOffsetReg = Reg; @@ -502,8 +508,6 @@ void setScratchWaveOffsetReg(unsigned Reg) { assert(Reg != 0 && "Should never be unset"); ScratchWaveOffsetReg = Reg; - if (isEntryFunction()) - FrameOffsetReg = ScratchWaveOffsetReg; } unsigned getQueuePtrUserSGPR() const { Index: lib/Target/AMDGPU/SIRegisterInfo.h =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.h +++ lib/Target/AMDGPU/SIRegisterInfo.h @@ -56,8 +56,6 @@ unsigned reservedPrivateSegmentWaveByteOffsetReg( const MachineFunction &MF) const; - unsigned reservedStackPtrOffsetReg(const MachineFunction &MF) const; - BitVector getReservedRegs(const MachineFunction &MF) const override; const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; Index: lib/Target/AMDGPU/SIRegisterInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.cpp +++ lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -138,11 +138,6 @@ return AMDGPU::SGPR_32RegClass.getRegister(Reg); } -unsigned SIRegisterInfo::reservedStackPtrOffsetReg( - const MachineFunction &MF) const { - return AMDGPU::SGPR32; -} - BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); @@ -718,6 +713,8 @@ if (SpillToSMEM && OnlyToVGPR) return false; + unsigned FrameReg = getFrameRegister(*MF); + assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() && SuperReg != MFI->getFrameOffsetReg() && SuperReg != MFI->getScratchWaveOffsetReg())); @@ -777,11 +774,11 @@ int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i); if (Offset != 0) { BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg) - .addReg(MFI->getFrameOffsetReg()) + .addReg(FrameReg) .addImm(Offset); } else { BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) - .addReg(MFI->getFrameOffsetReg()); + .addReg(FrameReg); } BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp)) @@ -849,11 +846,11 @@ = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, EltSize, MinAlign(Align, EltSize * i)); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE)) - .addReg(TmpReg, RegState::Kill) // src - .addFrameIndex(Index) // vaddr - .addReg(MFI->getScratchRSrcReg()) // srrsrc - .addReg(MFI->getFrameOffsetReg()) // soffset - .addImm(i * 4) // offset + .addReg(TmpReg, RegState::Kill) // src + .addFrameIndex(Index) // vaddr + .addReg(MFI->getScratchRSrcReg()) // srrsrc + .addReg(MFI->getStackPtrOffsetReg()) // soffset + .addImm(i * 4) // offset .addMemOperand(MMO); } } @@ -909,6 +906,8 @@ unsigned EltSize = 4; unsigned ScalarLoadOp; + unsigned FrameReg = getFrameRegister(*MF); + const TargetRegisterClass *RC = getPhysRegClass(SuperReg); if (SpillToSMEM && isSGPRClass(RC)) { // XXX - if private_element_size is larger than 4 it might be useful to be @@ -940,11 +939,11 @@ int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i); if (Offset != 0) { BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg) - .addReg(MFI->getFrameOffsetReg()) + .addReg(FrameReg) .addImm(Offset); } else { BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) - .addReg(MFI->getFrameOffsetReg()); + .addReg(FrameReg); } auto MIB = @@ 
-988,10 +987,10 @@ MinAlign(Align, EltSize * i)); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg) - .addFrameIndex(Index) // vaddr - .addReg(MFI->getScratchRSrcReg()) // srsrc - .addReg(MFI->getFrameOffsetReg()) // soffset - .addImm(i * 4) // offset + .addFrameIndex(Index) // vaddr + .addReg(MFI->getScratchRSrcReg()) // srsrc + .addReg(MFI->getStackPtrOffsetReg()) // soffset + .addImm(i * 4) // offset .addMemOperand(MMO); auto MIB = @@ -1056,6 +1055,8 @@ MachineOperand &FIOp = MI->getOperand(FIOperandNum); int Index = MI->getOperand(FIOperandNum).getIndex(); + unsigned FrameReg = getFrameRegister(*MF); + switch (MI->getOpcode()) { // SGPR register spill case AMDGPU::SI_SPILL_S512_SAVE: @@ -1091,11 +1092,14 @@ case AMDGPU::SI_SPILL_V32_SAVE: { const MachineOperand *VData = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); + assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == + MFI->getStackPtrOffsetReg()); + buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, Index, VData->getReg(), VData->isKill(), TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(), - TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(), + FrameReg, TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), *MI->memoperands_begin(), RS); @@ -1112,12 +1116,14 @@ case AMDGPU::SI_SPILL_V512_RESTORE: { const MachineOperand *VData = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); + assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == + MFI->getStackPtrOffsetReg()); buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET, Index, VData->getReg(), VData->isKill(), TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(), - TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(), + FrameReg, TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), *MI->memoperands_begin(), RS); @@ -1129,13 +1135,12 @@ const DebugLoc &DL = MI->getDebugLoc(); bool IsMUBUF = TII->isMUBUF(*MI); - if (!IsMUBUF && - MFI->getFrameOffsetReg() != MFI->getScratchWaveOffsetReg()) { + if (!IsMUBUF && !MFI->isEntryFunction()) { // Convert to an absolute stack address by finding the offset from the // scratch wave base and scaling by the wave size. // - // In an entry function/kernel the stack address is already the - // absolute address relative to the scratch wave offset. + // In an entry function/kernel the offset is already the absolute + // address relative to the frame register. 
unsigned DiffReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); @@ -1146,7 +1151,7 @@ MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg) - .addReg(MFI->getFrameOffsetReg()) + .addReg(FrameReg) .addReg(MFI->getScratchWaveOffsetReg()); int64_t Offset = FrameInfo.getObjectOffset(Index); @@ -1196,8 +1201,10 @@ AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr)); - assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() - == MFI->getFrameOffsetReg()); + assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == + MFI->getStackPtrOffsetReg()); + + TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->setReg(FrameReg); int64_t Offset = FrameInfo.getObjectOffset(Index); int64_t OldImm Index: test/CodeGen/AMDGPU/byval-frame-setup.ll =================================================================== --- test/CodeGen/AMDGPU/byval-frame-setup.ll +++ test/CodeGen/AMDGPU/byval-frame-setup.ll @@ -4,15 +4,14 @@ %struct.ByValStruct = type { [4 x i32] } ; GCN-LABEL: {{^}}void_func_byval_struct: -; GCN: s_mov_b32 s5, s32 -; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:4{{$}} +; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s32 offset:4{{$}} ; GCN-NOT: s32 -; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s5 offset:4{{$}} +; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:4{{$}} ; GCN-NOT: s32 -; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:20{{$}} +; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s32 offset:20{{$}} ; GCN-NOT: s32 -; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s5 offset:20{{$}} +; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:20{{$}} ; GCN-NOT: s32 define hidden void @void_func_byval_struct(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg1) #1 { entry: @@ -183,15 +182,14 @@ } ; GCN-LABEL: {{^}}void_func_byval_struct_align8: -; GCN: s_mov_b32 s5, s32 -; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:8{{$}} +; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s32 offset:8{{$}} ; GCN-NOT: s32 -; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s5 offset:8{{$}} +; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:8{{$}} ; GCN-NOT: s32 -; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:24{{$}} +; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s32 offset:24{{$}} ; GCN-NOT: s32 -; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s5 offset:24{{$}} +; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:24{{$}} ; GCN-NOT: s32 define hidden void @void_func_byval_struct_align8(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 8 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 8 %arg1) #1 { entry: Index: test/CodeGen/AMDGPU/call-argument-types.ll =================================================================== --- test/CodeGen/AMDGPU/call-argument-types.ll +++ test/CodeGen/AMDGPU/call-argument-types.ll @@ -768,16 +768,17 @@ } ; GCN-LABEL: {{^}}tail_call_byval_align16: -; GCN: s_mov_b32 s5, s32 -; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:28 ; 4-byte Folded Spill -; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:24 ; 4-byte Folded Spill -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:32 -; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:36 -; GCN: buffer_store_dword v33, off, s[0:3], s5 
offset:20 -; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:16 +; GCN-NOT: s32 +; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:32 +; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:36 +; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:20 +; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:16 ; GCN: s_getpc_b64 -; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:24 ; 4-byte Folded Reload -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:28 ; 4-byte Folded Reload +; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NOT: s32 ; GCN: s_setpc_b64 define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { entry: @@ -787,16 +788,17 @@ } ; GCN-LABEL: {{^}}tail_call_stack_passed_arg_alignment_v32i32_f64: -; GCN: s_mov_b32 s5, s32 -; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Spill -; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:12 ; 4-byte Folded Spill -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 -; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:8 -; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4 -; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:8 +; GCN-NOT: s32 +; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:8 ; GCN: s_getpc_b64 -; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:12 ; 4-byte Folded Reload -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Reload +; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NOT: s32 ; GCN: s_setpc_b64 define void @tail_call_stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 { entry: Index: test/CodeGen/AMDGPU/callee-frame-setup.ll =================================================================== --- test/CodeGen/AMDGPU/callee-frame-setup.ll +++ test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -22,9 +22,8 @@ ; GCN-LABEL: {{^}}callee_with_stack: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_mov_b32 s5, s32 ; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}} -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 offset:4{{$}} +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @callee_with_stack() #0 { @@ -100,7 +99,7 @@ ; Make sure if a CSR vgpr is used for SGPR spilling, it is saved and restored ; GCN-LABEL: {{^}}callee_func_sgpr_spill_no_calls: ; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GCN: v_writelane_b32 v32 @@ -108,7 +107,7 @@ ; GCN: v_readlane_b32 s{{[0-9]+}}, v32 ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 
; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt Index: test/CodeGen/AMDGPU/callee-special-input-sgprs.ll =================================================================== --- test/CodeGen/AMDGPU/callee-special-input-sgprs.ll +++ test/CodeGen/AMDGPU/callee-special-input-sgprs.ll @@ -115,8 +115,8 @@ ; GCN-LABEL: {{^}}use_stack_workgroup_id_x: ; GCN: s_waitcnt -; GCN: s_mov_b32 s5, s32 -; GCN: buffer_store_dword v0, off, s[0:3], s5 offset:4 +; GCN-NOT: s32 +; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; GCN: ; use s6 ; GCN: s_setpc_b64 define void @use_stack_workgroup_id_x() #1 { @@ -429,7 +429,7 @@ } ; GCN-LABEL: {{^}}use_every_sgpr_input: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:4 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7 ; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} Index: test/CodeGen/AMDGPU/callee-special-input-vgprs.ll =================================================================== --- test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -230,12 +230,11 @@ } ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x: -; GCN: s_mov_b32 s5, s32 -; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4{{$}} +; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Reload +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @too_many_args_use_workitem_id_x( @@ -357,12 +356,12 @@ ; frame[3] = VGPR spill slot ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_byval: -; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:12 ; 4-byte Folded Spill -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8 +; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GCN-NEXT: s_waitcnt ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32 -; GCN: buffer_load_dword v0, off, s[0:3], s5 offset:4 -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:12 ; 4-byte Folded Reload +; GCN: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GCN: s_setpc_b64 define void @too_many_args_use_workitem_id_x_byval( i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, @@ -476,16 +475,15 @@ } ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_xyz: -; GCN: s_mov_b32 s5, s32 -; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Spill -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4{{$}} +; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8{{$}} +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:12{{$}} 
+; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Reload +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @too_many_args_use_workitem_id_xyz( @@ -574,11 +572,10 @@ ; frame[2] = workitem Z ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_stack_yz: -; GCN: s_mov_b32 s5, s32 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31 -; GCN: buffer_load_dword v31, off, s[0:3], s5 offset:4{{$}} +; GCN: buffer_load_dword v31, off, s[0:3], s32 offset:4{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31 -; GCN: buffer_load_dword v31, off, s[0:3], s5 offset:8{{$}} +; GCN: buffer_load_dword v31, off, s[0:3], s32 offset:8{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31 ; GCN: s_waitcnt Index: test/CodeGen/AMDGPU/frame-index-elimination.ll =================================================================== --- test/CodeGen/AMDGPU/frame-index-elimination.ll +++ test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -7,7 +7,7 @@ ; Materialize into a mov. Make sure there isn't an unnecessary copy. ; GCN-LABEL: {{^}}func_mov_fi_i32: ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN: s_sub_u32 s6, s5, s4 +; GCN: s_sub_u32 s6, s32, s4 ; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6 ; CI-NEXT: v_add_i32_e64 v0, s[6:7], 4, [[SCALED]] @@ -28,7 +28,7 @@ ; GCN-LABEL: {{^}}func_add_constant_to_fi_i32: ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN: s_sub_u32 s6, s5, s4 +; GCN: s_sub_u32 s6, s32, s4 ; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6 ; CI-NEXT: v_add_i32_e64 v0, s[6:7], 4, [[SCALED]] @@ -52,7 +52,7 @@ ; into. ; GCN-LABEL: {{^}}func_other_fi_user_i32: -; GCN: s_sub_u32 s6, s5, s4 +; GCN: s_sub_u32 s6, s32, s4 ; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6 ; CI-NEXT: v_add_i32_e64 v0, s[6:7], 4, [[SCALED]] @@ -89,8 +89,7 @@ ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr: ; GCN: s_waitcnt -; GCN-NEXT: s_mov_b32 s5, s32 -; GCN-NEXT: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s5, s4 +; GCN-NEXT: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s32, s4 ; CI-NEXT: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], [[SUB_OFFSET]], 6 ; CI-NEXT: v_add_i32_e64 [[ADD:v[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 4, [[SHIFT]] @@ -112,9 +111,8 @@ ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_value: ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s5, s32 -; GCN-NEXT: buffer_load_ubyte v0, off, s[0:3], s5 -; GCN_NEXT: buffer_load_dword v1, off, s[0:3], s5 offset:4 +; GCN-NEXT: buffer_load_ubyte v0, off, s[0:3], s32 +; GCN_NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 define void @void_func_byval_struct_i8_i32_ptr_value({ i8, i32 } addrspace(5)* byval %arg0) #0 { %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %arg0, i32 0, i32 0 %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %arg0, i32 0, i32 1 @@ -129,7 +127,7 @@ ; FrameIndex is hidden behind a CopyFromReg in the second block. 
; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_nonentry_block: -; GCN: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s5, s4 +; GCN: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s32, s4 ; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], [[SUB_OFFSET]], 6 ; CI: v_add_i32_e64 [[ADD:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 4, [[SHIFT]] @@ -163,7 +161,7 @@ ; Added offset can't be used with VOP3 add ; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32: -; GCN: s_sub_u32 s6, s5, s4 +; GCN: s_sub_u32 s6, s32, s4 ; GCN-DAG: s_movk_i32 s6, 0x204 ; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6 @@ -187,7 +185,7 @@ } ; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32_vcc_live: -; GCN: s_sub_u32 [[DIFF:s[0-9]+]], s5, s4 +; GCN: s_sub_u32 [[DIFF:s[0-9]+]], s32, s4 ; GCN-DAG: s_movk_i32 [[OFFSET:s[0-9]+]], 0x204 ; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[DIFF]], 6 @@ -243,7 +241,7 @@ ; GCN-LABEL: {{^}}alloca_ptr_nonentry_block: ; GCN: s_and_saveexec_b64 -; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s5 offset:12 +; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:12 define void @alloca_ptr_nonentry_block(i32 %arg0) #0 { %alloca0 = alloca { i8, i32 }, align 4, addrspace(5) %cmp = icmp eq i32 %arg0, 0 Index: test/CodeGen/AMDGPU/function-args.ll =================================================================== --- test/CodeGen/AMDGPU/function-args.ll +++ test/CodeGen/AMDGPU/function-args.ll @@ -220,7 +220,7 @@ ; GCN-DAG: buffer_store_dwordx4 v[4:7], off ; GCN-DAG: buffer_store_dwordx4 v[8:11], off ; GCN-DAG: buffer_store_dwordx4 v[12:15], off -; GCN-DAG: buffer_load_dword [[STACKLOAD:v[0-9]+]], off, s[0:3], s5 +; GCN-DAG: buffer_load_dword [[STACKLOAD:v[0-9]+]], off, s[0:3], s32 ; GCN-DAG: buffer_store_dwordx4 v[16:19], off ; GCN-DAG: buffer_store_dwordx4 v[20:23], off ; GCN-DAG: buffer_store_dwordx4 v[24:27], off @@ -516,8 +516,8 @@ } ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32: -; GCN-DAG: buffer_load_ubyte v[[ELT0:[0-9]+]], off, s[0:3], s5 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[ELT1:[0-9]+]], off, s[0:3], s5 offset:8{{$}} +; GCN-DAG: buffer_load_ubyte v[[ELT0:[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[ELT1:[0-9]+]], off, s[0:3], s32 offset:8{{$}} ; GCN-DAG: buffer_store_dword v[[ELT1]] ; GCN-DAG: buffer_store_byte v[[ELT0]] define void @void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* byval %arg0) #0 { @@ -527,10 +527,10 @@ } ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_x2: -; GCN: buffer_load_ubyte v[[ELT0_0:[0-9]+]], off, s[0:3], s5 offset:4{{$}} -; GCN: buffer_load_dword v[[ELT1_0:[0-9]+]], off, s[0:3], s5 offset:8{{$}} -; GCN: buffer_load_ubyte v[[ELT0_1:[0-9]+]], off, s[0:3], s5 offset:12{{$}} -; GCN: buffer_load_dword v[[ELT1_1:[0-9]+]], off, s[0:3], s5 offset:16{{$}} +; GCN: buffer_load_ubyte v[[ELT0_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; GCN: buffer_load_dword v[[ELT1_0:[0-9]+]], off, s[0:3], s32 offset:8{{$}} +; GCN: buffer_load_ubyte v[[ELT0_1:[0-9]+]], off, s[0:3], s32 offset:12{{$}} +; GCN: buffer_load_dword v[[ELT1_1:[0-9]+]], off, s[0:3], s32 offset:16{{$}} ; GCN: ds_write_b32 v0, v0 ; GCN: s_setpc_b64 @@ -544,9 +544,9 @@ } ; GCN-LABEL: {{^}}void_func_byval_i32_byval_i64: -; GCN-DAG: buffer_load_dword v[[ARG0_LOAD:[0-9]+]], off, s[0:3], s5 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[ARG1_LOAD0:[0-9]+]], off, s[0:3], s5 offset:8{{$}} -; GCN-DAG: buffer_load_dword v[[ARG1_LOAD1:[0-9]+]], off, s[0:3], s5 offset:12{{$}} +; GCN-DAG: buffer_load_dword v[[ARG0_LOAD:[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; GCN-DAG: 
buffer_load_dword v[[ARG1_LOAD0:[0-9]+]], off, s[0:3], s32 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[ARG1_LOAD1:[0-9]+]], off, s[0:3], s32 offset:12{{$}} ; GCN-DAG: buffer_store_dword v[[ARG0_LOAD]], off ; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[ARG1_LOAD0]]:[[ARG1_LOAD1]]{{\]}}, off define void @void_func_byval_i32_byval_i64(i32 addrspace(5)* byval %arg0, i64 addrspace(5)* byval %arg1) #0 { @@ -566,9 +566,9 @@ ; GCN-DAG: buffer_store_dwordx4 v[20:23], off ; GCN-DAG: buffer_store_dwordx4 v[24:27], off ; GCN-DAG: buffer_store_dwordx4 v[28:31], off -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1:[0-9]+]], off, s[0:3], s5 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:8 -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:12 +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1:[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:8 +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:12 ; GCN: buffer_store_dword v[[LOAD_ARG1]] ; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]{{\]}}, off @@ -581,14 +581,14 @@ ; FIXME: Different ext load types on CI vs. VI ; GCN-LABEL: {{^}}void_func_v32i32_i1_i8_i16: -; GCN-DAG: buffer_load_ubyte [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s5 offset:4{{$}} -; VI-DAG: buffer_load_ushort [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s5 offset:8{{$}} -; VI-DAG: buffer_load_ushort [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s5 offset:12{{$}} -; VI-DAG: buffer_load_ushort [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s5 offset:16{{$}} +; GCN-DAG: buffer_load_ubyte [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; VI-DAG: buffer_load_ushort [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s32 offset:8{{$}} +; VI-DAG: buffer_load_ushort [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s32 offset:12{{$}} +; VI-DAG: buffer_load_ushort [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s32 offset:16{{$}} -; CI-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s5 offset:8{{$}} -; CI-DAG: buffer_load_dword [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s5 offset:12{{$}} -; CI-DAG: buffer_load_dword [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s5 offset:16{{$}} +; CI-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s32 offset:8{{$}} +; CI-DAG: buffer_load_dword [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s32 offset:12{{$}} +; CI-DAG: buffer_load_dword [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s32 offset:16{{$}} ; GCN-DAG: v_and_b32_e32 [[TRUNC_ARG1_I1:v[0-9]+]], 1, [[LOAD_ARG1]] ; CI-DAG: v_cvt_f16_f32_e32 [[CVT_ARG4:v[0-9]+]], [[LOAD_ARG4]] @@ -609,10 +609,10 @@ } ; GCN-LABEL: {{^}}void_func_v32i32_v2i32_v2f32: -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:8{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:12{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:16{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:12{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:16{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_1]]{{\]}}, off ; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]{{\]}}, off @@ -624,8 +624,8 @@ } ; GCN-LABEL: 
{{^}}void_func_v32i32_v2i16_v2f16: -; GFX9-DAG: buffer_load_dword [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s5 offset:4{{$}} -; GFX9-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s5 offset:8{{$}} +; GFX9-DAG: buffer_load_dword [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; GFX9-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s32 offset:8{{$}} ; GFX9: buffer_store_dword [[LOAD_ARG1]], off ; GFX9: buffer_store_short [[LOAD_ARG2]], off define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2 x half> %arg2) #0 { @@ -636,15 +636,15 @@ } ; GCN-LABEL: {{^}}void_func_v32i32_v2i64_v2f64: -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:8{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 offset:12{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s5 offset:16{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:12{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:16{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:20{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:24{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:28{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:32{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:20{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:24{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:28{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:32{{$}} ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off @@ -656,15 +656,15 @@ } ; GCN-LABEL: {{^}}void_func_v32i32_v4i32_v4f32: -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:8{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 offset:12{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s5 offset:16{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:12{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:16{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:20{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:24{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:28{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:32{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:20{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:24{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:28{{$}} +; GCN-DAG: 
buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:32{{$}} ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off @@ -676,23 +676,23 @@ } ; GCN-LABEL: {{^}}void_func_v32i32_v8i32_v8f32: -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:8{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 offset:12{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s5 offset:16{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s5 offset:20{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s5 offset:24{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s5 offset:28{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s5 offset:32{{$}} - -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:36{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:40{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:44{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:48{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s5 offset:52{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s5 offset:56{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s5 offset:60{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s5 offset:64{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:12{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:16{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s32 offset:20{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s32 offset:24{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s32 offset:28{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s32 offset:32{{$}} + +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:36{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:40{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:44{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:48{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s32 offset:52{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s32 offset:56{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s32 offset:60{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s32 offset:64{{$}} ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_4]]:[[LOAD_ARG1_7]]{{\]}}, off ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off @@ -706,39 +706,39 @@ } ; GCN-LABEL: {{^}}void_func_v32i32_v16i32_v16f32: -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:8{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 
offset:12{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_:[0-9]+]], off, s[0:3], s5 offset:16{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s5 offset:20{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s5 offset:24{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s5 offset:28{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s5 offset:32{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_8:[0-9]+]], off, s[0:3], s5 offset:36{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_9:[0-9]+]], off, s[0:3], s5 offset:40{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_10:[0-9]+]], off, s[0:3], s5 offset:44{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_11:[0-9]+]], off, s[0:3], s5 offset:48{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_12:[0-9]+]], off, s[0:3], s5 offset:52{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_13:[0-9]+]], off, s[0:3], s5 offset:56{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_14:[0-9]+]], off, s[0:3], s5 offset:60{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_15:[0-9]+]], off, s[0:3], s5 offset:64{{$}} - -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:68{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:72{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:76{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:80{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s5 offset:84{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s5 offset:88{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s5 offset:92{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s5 offset:96{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_8:[0-9]+]], off, s[0:3], s5 offset:100{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_9:[0-9]+]], off, s[0:3], s5 offset:104{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_10:[0-9]+]], off, s[0:3], s5 offset:108{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_11:[0-9]+]], off, s[0:3], s5 offset:112{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_12:[0-9]+]], off, s[0:3], s5 offset:116{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_13:[0-9]+]], off, s[0:3], s5 offset:120{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_14:[0-9]+]], off, s[0:3], s5 offset:124{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_15:[0-9]+]], off, s[0:3], s5 offset:128{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:12{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_:[0-9]+]], off, s[0:3], s32 offset:16{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s32 offset:20{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s32 offset:24{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s32 offset:28{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s32 offset:32{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_8:[0-9]+]], off, s[0:3], s32 offset:36{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_9:[0-9]+]], off, s[0:3], s32 offset:40{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_10:[0-9]+]], off, s[0:3], s32 offset:44{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_11:[0-9]+]], off, s[0:3], 
s32 offset:48{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_12:[0-9]+]], off, s[0:3], s32 offset:52{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_13:[0-9]+]], off, s[0:3], s32 offset:56{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_14:[0-9]+]], off, s[0:3], s32 offset:60{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_15:[0-9]+]], off, s[0:3], s32 offset:64{{$}} + +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:68{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:72{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:76{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:80{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s32 offset:84{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s32 offset:88{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s32 offset:92{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s32 offset:96{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_8:[0-9]+]], off, s[0:3], s32 offset:100{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_9:[0-9]+]], off, s[0:3], s32 offset:104{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_10:[0-9]+]], off, s[0:3], s32 offset:108{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_11:[0-9]+]], off, s[0:3], s32 offset:112{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_12:[0-9]+]], off, s[0:3], s32 offset:116{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_13:[0-9]+]], off, s[0:3], s32 offset:120{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_14:[0-9]+]], off, s[0:3], s32 offset:124{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_15:[0-9]+]], off, s[0:3], s32 offset:128{{$}} define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, <16 x float> %arg2) #0 { store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef store volatile <16 x i32> %arg1, <16 x i32> addrspace(1)* undef Index: test/CodeGen/AMDGPU/load-hi16.ll =================================================================== --- test/CodeGen/AMDGPU/load-hi16.ll +++ test/CodeGen/AMDGPU/load-hi16.ll @@ -494,13 +494,13 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg: ; GCN: s_waitcnt -; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s5 offset:4094{{$}} +; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}} +; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}} define void @load_private_hi_v2i16_reglo_vreg(i16 addrspace(5)* byval %in, i16 %reg) #0 { entry: %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2045 @@ -513,13 +513,13 @@ ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg: ; GCN: s_waitcnt -; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s5 offset:4094{{$}} +; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}} +; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}} define void @load_private_hi_v2f16_reglo_vreg(half addrspace(5)* byval %in, half %reg) #0 { entry: %gep = getelementptr 
inbounds half, half addrspace(5)* %in, i64 2045 @@ -568,13 +568,13 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8: ; GCN: s_waitcnt -; GFX900: buffer_load_ubyte_d16_hi v0, off, s[0:3], s5 offset:4095{{$}} +; GFX900: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}} +; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}} define void @load_private_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, i16 %reg) #0 { entry: %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091 @@ -588,13 +588,13 @@ ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_zexti8: ; GCN: s_waitcnt -; GFX900: buffer_load_ubyte_d16_hi v0, off, s[0:3], s5 offset:4095{{$}} +; GFX900: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}} +; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}} define void @load_private_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, half %reg) #0 { entry: %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091 @@ -609,13 +609,13 @@ ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_sexti8: ; GCN: s_waitcnt -; GFX900: buffer_load_sbyte_d16_hi v0, off, s[0:3], s5 offset:4095{{$}} +; GFX900: buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}} +; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}} define void @load_private_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, half %reg) #0 { entry: %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091 @@ -630,13 +630,13 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8: ; GCN: s_waitcnt -; GFX900: buffer_load_sbyte_d16_hi v0, off, s[0:3], s5 offset:4095{{$}} +; GFX900: buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}} +; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}} define void @load_private_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, i16 %reg) #0 { entry: %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091 @@ -789,7 +789,7 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_to_offset: ; GFX900: buffer_store_dword -; GFX900-NEXT: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4094 +; GFX900-NEXT: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4094 define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg) #0 { entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) @@ -806,7 +806,7 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8_to_offset: ; GFX900: buffer_store_dword -; GFX900-NEXT: buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4095 +; GFX900-NEXT: buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], 
s32 offset:4095 define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg) #0 { entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) @@ -824,7 +824,7 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8_to_offset: ; GFX900: buffer_store_dword -; GFX900-NEXT: buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4095 +; GFX900-NEXT: buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095 define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg) #0 { entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) @@ -975,9 +975,9 @@ ; FIXME: Is there a cost to using the extload over not? ; GCN-LABEL: {{^}}load_private_v2i16_split: ; GCN: s_waitcnt -; GFX900: buffer_load_ushort v0, off, s[0:3], s5 offset:4{{$}} +; GFX900: buffer_load_ushort v0, off, s[0:3], s32 offset:4{{$}} ; GFX900-NEXT: s_waitcnt -; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s5 offset:6 +; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:6 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 define <2 x i16> @load_private_v2i16_split(i16 addrspace(5)* byval %in) #0 { Index: test/CodeGen/AMDGPU/load-lo16.ll =================================================================== --- test/CodeGen/AMDGPU/load-lo16.ll +++ test/CodeGen/AMDGPU/load-lo16.ll @@ -590,13 +590,13 @@ ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg: ; GCN: s_waitcnt -; GFX900: buffer_load_short_d16 v0, off, s[0:3], s5 offset:4094{{$}} +; GFX900: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}} +; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}} define void @load_private_lo_v2i16_reglo_vreg(i16 addrspace(5)* byval %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -609,7 +609,7 @@ ; GCN-LABEL: {{^}}load_private_lo_v2i16_reghi_vreg: ; GCN: s_waitcnt -; GFX900: buffer_load_ushort v1, off, s[0:3], s5 offset:4094{{$}} +; GFX900: buffer_load_ushort v1, off, s[0:3], s32 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900: v_and_b32 ; GFX900: v_lshl_or_b32 @@ -618,7 +618,7 @@ ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}} +; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}} define void @load_private_lo_v2i16_reghi_vreg(i16 addrspace(5)* byval %in, i16 %reg) #0 { entry: %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2045 @@ -631,13 +631,13 @@ ; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg: ; GCN: s_waitcnt -; GFX900: buffer_load_short_d16 v0, off, s[0:3], s5 offset:4094{{$}} +; GFX900: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}} +; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}} define void @load_private_lo_v2f16_reglo_vreg(half addrspace(5)* byval %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x half> @@ -704,13 +704,13 @@ ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_zexti8: ; GCN: s_waitcnt -; GFX900: buffer_load_ubyte_d16 v0, off, s[0:3], s5 offset:4095{{$}} +; GFX900: buffer_load_ubyte_d16 v0, off, s[0:3], s32 
offset:4095{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}} +; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}} define void @load_private_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -724,13 +724,13 @@ ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_sexti8: ; GCN: s_waitcnt -; GFX900: buffer_load_sbyte_d16 v0, off, s[0:3], s5 offset:4095{{$}} +; GFX900: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}} +; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}} define void @load_private_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -895,7 +895,7 @@ ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_to_offset: ; GFX900: buffer_store_dword -; GFX900-NEXT: buffer_load_short_d16 v0, off, s[0:3], s5 offset:4094 +; GFX900-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 ; NO-D16-HI: buffer_load_ushort v define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 { @@ -914,7 +914,7 @@ ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_sexti8_to_offset: ; GFX900: buffer_store_dword -; GFX900-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s5 offset:4095 +; GFX900-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 ; NO-D16-HI: buffer_load_sbyte v define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 { @@ -934,7 +934,7 @@ ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_zexti8_to_offset: ; GFX900: buffer_store_dword -; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s5 offset:4095 +; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 ; NO-D16-HI: buffer_load_ubyte v define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 { @@ -954,7 +954,7 @@ ; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg_sexti8_to_offset: ; GFX900: buffer_store_dword -; GFX900-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s5 offset:4095 +; GFX900-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 ; NO-D16-HI: buffer_load_sbyte v define void @load_private_lo_v2f16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 { @@ -975,7 +975,7 @@ ; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg_zexti8_to_offset: ; GFX900: buffer_store_dword -; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s5 offset:4095 +; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 ; NO-D16-HI: buffer_load_ubyte v define void @load_private_lo_v2f16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 { Index: test/CodeGen/AMDGPU/mubuf-legalize-operands.ll =================================================================== --- test/CodeGen/AMDGPU/mubuf-legalize-operands.ll +++ test/CodeGen/AMDGPU/mubuf-legalize-operands.ll @@ -126,7 +126,7 @@ ; CHECK-O0-DAG: s_mov_b32 [[IDX_S:s[0-9]+]], s4 ; CHECK-O0-DAG: v_mov_b32_e32 [[IDX_V:v[0-9]+]], [[IDX_S]] ; CHECK-O0-DAG: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec -; CHECK-O0-DAG: buffer_store_dword [[IDX_V]], off, s[0:3], s5 offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill +; CHECK-O0-DAG: buffer_store_dword [[IDX_V]], off, s[0:3], 
s32 offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill ; CHECK-O0: [[LOOPBB0:BB[0-9]+_[0-9]+]]: ; CHECK-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], {{.*}} ; 4-byte Folded Reload @@ -149,22 +149,22 @@ ; CHECK-O0: v_cmp_eq_u64_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}} ; CHECK-O0: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]] ; CHECK-O0: s_and_saveexec_b64 [[CMP]], [[CMP]] -; CHECK-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s[0:3], s5 offset:[[IDX_OFF]] ; 4-byte Folded Reload +; CHECK-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s[0:3], s32 offset:[[IDX_OFF]] ; 4-byte Folded Reload ; CHECK-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, {{.*}} idxen ; CHECK-O0: s_waitcnt vmcnt(0) -; CHECK-O0: buffer_store_dword [[RES]], off, s[0:3], s5 offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill +; CHECK-O0: buffer_store_dword [[RES]], off, s[0:3], s32 offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill ; CHECK-O0: s_xor_b64 exec, exec, [[CMP]] ; CHECK-O0-NEXT: s_cbranch_execnz [[LOOPBB0]] ; CHECK-O0: v_readlane_b32 s[[S1:[0-9]+]], v{{[0-9]+}}, 4 ; CHECK-O0: v_readlane_b32 s[[S2:[0-9]+]], v{{[0-9]+}}, 5 ; CHECK-O0: s_mov_b64 exec, s{{\[}}[[S1]]:[[S2]]{{\]}} -; CHECK-O0: buffer_load_dword [[RES:v[0-9]+]], off, s[0:3], s5 offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload -; CHECK-O0: buffer_store_dword [[RES]], off, s[0:3], s5 offset:[[RES_OFF:[0-9]+]] ; 4-byte Folded Spill +; CHECK-O0: buffer_load_dword [[RES:v[0-9]+]], off, s[0:3], s32 offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload +; CHECK-O0: buffer_store_dword [[RES]], off, s[0:3], s32 offset:[[RES_OFF:[0-9]+]] ; 4-byte Folded Spill ; CHECK-O0: s_cbranch_execz [[TERMBB:BB[0-9]+_[0-9]+]] ; CHECK-O0: BB{{[0-9]+_[0-9]+}}: ; CHECK-O0-DAG: s_mov_b64 s{{\[}}[[SAVEEXEC0:[0-9]+]]:[[SAVEEXEC1:[0-9]+]]{{\]}}, exec -; CHECK-O0-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s5 offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill +; CHECK-O0-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s32 offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill ; CHECK-O0: v_writelane_b32 [[VSAVEEXEC:v[0-9]+]], s[[SAVEEXEC0]], [[SAVEEXEC_IDX0:[0-9]+]] ; CHECK-O0: v_writelane_b32 [[VSAVEEXEC:v[0-9]+]], s[[SAVEEXEC1]], [[SAVEEXEC_IDX1:[0-9]+]] @@ -189,21 +189,21 @@ ; CHECK-O0: v_cmp_eq_u64_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}} ; CHECK-O0: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]] ; CHECK-O0: s_and_saveexec_b64 [[CMP]], [[CMP]] -; CHECK-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s[0:3], s5 offset:[[IDX_OFF]] ; 4-byte Folded Reload +; CHECK-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s[0:3], s32 offset:[[IDX_OFF]] ; 4-byte Folded Reload ; CHECK-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, {{.*}} idxen ; CHECK-O0: s_waitcnt vmcnt(0) -; CHECK-O0: buffer_store_dword [[RES]], off, s[0:3], s5 offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill +; CHECK-O0: buffer_store_dword [[RES]], off, s[0:3], s32 offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill ; CHECK-O0: s_xor_b64 exec, exec, [[CMP]] ; CHECK-O0-NEXT: s_cbranch_execnz [[LOOPBB1]] ; CHECK-O0: v_readlane_b32 s[[SAVEEXEC0:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX0]] ; CHECK-O0: v_readlane_b32 s[[SAVEEXEC1:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX1]] ; CHECK-O0: s_mov_b64 exec, s{{\[}}[[SAVEEXEC0]]:[[SAVEEXEC1]]{{\]}} -; CHECK-O0: buffer_load_dword [[RES:v[0-9]+]], off, s[0:3], s5 offset:[[RES_OFF_TMP]] ; 4-byte Folded 
Reload -; CHECK-O0: buffer_store_dword [[RES]], off, s[0:3], s5 offset:[[RES_OFF]] ; 4-byte Folded Spill +; CHECK-O0: buffer_load_dword [[RES:v[0-9]+]], off, s[0:3], s32 offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload +; CHECK-O0: buffer_store_dword [[RES]], off, s[0:3], s32 offset:[[RES_OFF]] ; 4-byte Folded Spill ; CHECK-O0: [[TERMBB]]: -; CHECK-O0: buffer_load_dword [[RES:v[0-9]+]], off, s[0:3], s5 offset:[[RES_OFF]] ; 4-byte Folded Reload +; CHECK-O0: buffer_load_dword [[RES:v[0-9]+]], off, s[0:3], s32 offset:[[RES_OFF]] ; 4-byte Folded Reload ; CHECK-O0: global_store_dword v[{{[0-9]+:[0-9]+}}], [[RES]], off define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, float addrspace(1)* %in, float addrspace(1)* %out) #0 { Index: test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir =================================================================== --- test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir +++ test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir @@ -19,26 +19,26 @@ scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 scratchWaveOffsetReg: $sgpr5 frameOffsetReg: $sgpr5 + stackPtrOffsetReg: $sgpr32 body: | ; CHECK-LABEL: name: scavenge_register_position ; CHECK: bb.0: ; CHECK: successors: %bb.1(0x80000000) ; CHECK: liveins: $sgpr4, $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK: $sgpr5 = COPY $sgpr4 - ; CHECK: $sgpr6 = S_ADD_U32 $sgpr5, 524288, implicit-def $scc + ; CHECK: $sgpr6 = S_ADD_U32 $sgpr32, 524288, implicit-def $scc ; CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, align 8192, addrspace 5) ; CHECK: S_BRANCH %bb.1 ; CHECK: bb.1: - ; CHECK: liveins: $sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK: $sgpr4 = S_ADD_U32 $sgpr5, 524288, implicit-def $scc + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK: $sgpr4 = S_ADD_U32 $sgpr32, 524288, implicit-def $scc ; CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, align 8192, addrspace 5) ; CHECK: S_ENDPGM 0, implicit $vgpr0 bb.0: - $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) + $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) S_BRANCH %bb.1 bb.1: - $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) + $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) S_ENDPGM 0, implicit $vgpr0 ... 
Index: test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir =================================================================== --- test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir +++ test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir @@ -33,16 +33,16 @@ # SHARE: stack-id: 1, callee-saved-register: '', callee-saved-restored: true, # SHARE: debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } -# SHARE: SI_SPILL_S32_SAVE $sgpr5, %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr5 :: (store 4 into %stack.2, addrspace 5) -# SHARE: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) -# SHARE: SI_SPILL_S64_SAVE killed renamable $sgpr6_sgpr7, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr5 :: (store 8 into %stack.1, align 4, addrspace 5) -# SHARE: renamable $sgpr6_sgpr7 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr5 :: (load 8 from %stack.1, align 4, addrspace 5) +# SHARE: SI_SPILL_S32_SAVE $sgpr32, %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 4 into %stack.2, addrspace 5) +# SHARE: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) +# SHARE: SI_SPILL_S64_SAVE killed renamable $sgpr6_sgpr7, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 8 into %stack.1, align 4, addrspace 5) +# SHARE: renamable $sgpr6_sgpr7 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5) # SHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit undef $vgpr0 -# SHARE: $sgpr5 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr5 :: (load 4 from %stack.2, addrspace 5) -# SHARE: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) -# SHARE: renamable $sgpr6_sgpr7 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr5 :: (load 8 from %stack.1, align 4, addrspace 5) +# SHARE: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 4 from %stack.2, addrspace 5) +# SHARE: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) +# SHARE: renamable $sgpr6_sgpr7 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5) # SHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit $vgpr0 -# SHARE: $sgpr5 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr5 :: (load 4 from %stack.2, addrspace 5) +# SHARE: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 4 from %stack.2, addrspace 5) # NOSHARE: stack: # NOSHARE: - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4, @@ -58,17 +58,17 @@ # NOSHARE: stack-id: 1, callee-saved-register: '', 
callee-saved-restored: true, # NOSHARE: debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } -# NOSHARE: SI_SPILL_S32_SAVE $sgpr5, %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr5 :: (store 4 into %stack.2, addrspace 5) -# NOSHARE: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) -# NOSHARE: SI_SPILL_S64_SAVE killed renamable $sgpr6_sgpr7, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr5 :: (store 8 into %stack.1, align 4, addrspace 5) -# NOSHARE: renamable $sgpr6_sgpr7 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr5 :: (load 8 from %stack.1, align 4, addrspace 5) +# NOSHARE: SI_SPILL_S32_SAVE $sgpr32, %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 4 into %stack.2, addrspace 5) +# NOSHARE: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) +# NOSHARE: SI_SPILL_S64_SAVE killed renamable $sgpr6_sgpr7, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 8 into %stack.1, align 4, addrspace 5) +# NOSHARE: renamable $sgpr6_sgpr7 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5) # NOSHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit undef $vgpr0 -# NOSHARE: $sgpr5 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr5 :: (load 4 from %stack.2, addrspace 5) -# NOSHARE: SI_SPILL_S32_SAVE $sgpr5, %stack.3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr5 :: (store 4 into %stack.3, addrspace 5) -# NOSHARE: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) -# NOSHARE: renamable $sgpr6_sgpr7 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr5 :: (load 8 from %stack.1, align 4, addrspace 5) +# NOSHARE: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 4 from %stack.2, addrspace 5) +# NOSHARE: SI_SPILL_S32_SAVE $sgpr32, %stack.3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 4 into %stack.3, addrspace 5) +# NOSHARE: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) +# NOSHARE: renamable $sgpr6_sgpr7 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5) # NOSHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit $vgpr0 -# NOSHARE: $sgpr5 = SI_SPILL_S32_RESTORE %stack.3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr5 :: (load 4 from %stack.3, addrspace 5) +# NOSHARE: $sgpr32 = SI_SPILL_S32_RESTORE %stack.3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 4 from %stack.3, addrspace 5) ... 
@@ -79,23 +79,23 @@ machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 scratchWaveOffsetReg: $sgpr4 - frameOffsetReg: $sgpr5 + frameOffsetReg: $sgpr32 stackPtrOffsetReg: $sgpr32 body: | bb.0: - %0:sreg_32_xm0 = COPY $sgpr5 + %0:sreg_32_xm0 = COPY $sgpr32 %1:vreg_64 = IMPLICIT_DEF %2:vgpr_32 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %3:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc - ADJCALLSTACKUP 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr5 + ADJCALLSTACKUP 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32 dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit undef $vgpr0 - $sgpr5 = COPY %0 - %4:sreg_32_xm0 = COPY $sgpr5 - ADJCALLSTACKDOWN 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr5 - ADJCALLSTACKUP 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr5 + $sgpr32 = COPY %0 + %4:sreg_32_xm0 = COPY $sgpr32 + ADJCALLSTACKDOWN 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32 + ADJCALLSTACKUP 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32 $vgpr0 = COPY %2 dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit killed $vgpr0 - $sgpr5 = COPY %4 - ADJCALLSTACKDOWN 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr5 + $sgpr32 = COPY %4 + ADJCALLSTACKDOWN 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32 ... Index: test/CodeGen/AMDGPU/sibling-call.ll =================================================================== --- test/CodeGen/AMDGPU/sibling-call.ll +++ test/CodeGen/AMDGPU/sibling-call.ll @@ -16,10 +16,10 @@ ; GCN-LABEL: {{^}}i32_fastcc_i32_i32_stack_object: ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 [[K:v[0-9]+]], 9 ; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GCN: s_mov_b32 s5, s32 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:24 +; GCN: buffer_store_dword [[K]], off, s[0:3], s32 offset:24 ; GCN: s_waitcnt vmcnt(0) ; GCN: s_setpc_b64 ; GCN: ; ScratchSize: 68 @@ -40,7 +40,7 @@ ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_stack_object: ; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 -; GCN: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:24 +; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:24 ; GCN: s_setpc_b64 ; GCN: ; ScratchSize: 68 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b, i32 %c) #1 { @@ -54,7 +54,7 @@ ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_callee_stack_object: ; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 -; GCN: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:24 +; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:24 ; GCN: s_setpc_b64 ; GCN: ; ScratchSize: 136 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_callee_stack_object(i32 %a, i32 %b, i32 %c) #1 { @@ -84,8 +84,7 @@ ; GCN-LABEL: {{^}}i32_fastcc_i32_byval_i32: ; GCN: s_waitcnt -; GCN-NEXT: s_mov_b32 s5, s32 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s5 offset:4 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1 @@ -116,8 +115,7 @@ ; GCN-NOT: v0 ; GCN-NOT: s32 ; GCN: buffer_load_dword v1, off, s[0:3], s4 offset:16 -; GCN: s_mov_b32 s5, s32 -; GCN: buffer_store_dword v1, off, s[0:3], s5 offset:4 +; GCN: 
buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GCN-NEXT: s_setpc_b64 define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [16 x i32] %large) #1 { entry: @@ -127,8 +125,8 @@ ; GCN-LABEL: {{^}}i32_fastcc_i32_i32_a32i32: ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s5 offset:4 -; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s5 offset:8 +; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32 offset:4 +; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:8 ; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1 ; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_0]] @@ -150,21 +148,20 @@ ; FIXME: Why load and store same location for stack args? ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32: -; GCN: s_mov_b32 s5, s32 -; GCN-DAG: buffer_store_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Spill -; GCN-DAG: buffer_store_dword v33, off, s[0:3], s5 offset:12 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword v33, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s5 offset:4 -; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s5 offset:8 +; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32 offset:4 +; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:8 ; GCN-NOT: s32 -; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s5 offset:4 -; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s5 offset:8 +; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s32 offset:4 +; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s32 offset:8 -; GCN-DAG: buffer_load_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Reload -; GCN-DAG: buffer_load_dword v33, off, s[0:3], s5 offset:12 ; 4-byte Folded Reload +; GCN-DAG: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-DAG: buffer_load_dword v33, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GCN-NOT: s32 ; GCN: s_setpc_b64 @@ -175,12 +172,8 @@ } ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32_stack_object: -; GCN-DAG: s_mov_b32 s5, s32 -; GCN-NOT: s32 ; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 -; GCN: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:44 - -; GCN-NOT: s32 +; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:44 ; GCN: s_setpc_b64 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 { entry: @@ -217,7 +210,6 @@ ; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill ; GCN-DAG: v_writelane_b32 v34, s33, 0 ; GCN-DAG: v_writelane_b32 v34, s34, 1 -; GCN-DAG: v_writelane_b32 v34, s35, 2 ; GCN-DAG: s_getpc_b64 ; GCN: s_swappc_b64 @@ -228,7 +220,6 @@ ; GCN-DAG: v_readlane_b32 s33, v34, 0 ; GCN-DAG: v_readlane_b32 s34, v34, 1 -; GCN-DAG: v_readlane_b32 s35, v34, 2 ; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:4 ; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8 @@ -249,8 +240,12 @@ ; in same place at function exit. 
; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32: -; GCN: s_mov_b32 s5, s32 -; GCN-NOT: s32 +; GCN-NOT: s33 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset: + +; GCN-NOT: s33 + +; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset: ; GCN: s_setpc_b64 s[6:7] define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 { entry: @@ -262,8 +257,10 @@ } ; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area: -; GCN: s_mov_b32 s5, s32 -; GCN-NOT: s32 +; GCN-NOT: s33 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:48 + +; GCN-NOT: s33 ; GCN: s_setpc_b64 s[6:7] define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area(i32 %a, i32 %b, [36 x i32] %c) #1 { entry: Index: test/CodeGen/AMDGPU/sp-too-many-input-sgprs.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/sp-too-many-input-sgprs.ll @@ -0,0 +1,102 @@ +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefixes=MESA3D,ALL %s +; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefixes=UNKNOWN,ALL %s + +; Make sure shaders pick a workable SP with > 32 input SGPRs. +; FIXME: Doesn't seem to be getting initial value from right register? + +; ALL-LABEL: {{^}}too_many_input_sgprs_32: +; MESA3D-NOT: s34 +; MESA3D: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s34 offset:4 + +; Happens to end up in s32 anyway +; UNKNOWN-NOT: s32 +; UNKNOWN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:4 +define amdgpu_ps i32 @too_many_input_sgprs_32(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 inreg %arg7, + i32 inreg %arg8, i32 inreg %arg9, i32 inreg %arg10, i32 inreg %arg11, i32 inreg %arg12, i32 inreg %arg13, i32 inreg %arg14, i32 inreg %arg15, + i32 inreg %arg16, i32 inreg %arg17, i32 inreg %arg18, i32 inreg %arg19, i32 inreg %arg20, i32 inreg %arg21, i32 inreg %arg22, i32 inreg %arg23, + i32 inreg %arg24, i32 inreg %arg25, i32 inreg %arg26, i32 inreg %arg27, i32 inreg %arg28, i32 inreg %arg29, i32 inreg %arg30, i32 inreg %arg31) { +bb: + %alloca = alloca i32, align 4, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %alloca + %tmp = add i32 %arg, %arg1 + %tmp32 = add i32 %tmp, %arg2 + %tmp33 = add i32 %tmp32, %arg3 + %tmp34 = add i32 %tmp33, %arg4 + %tmp35 = add i32 %tmp34, %arg5 + %tmp36 = add i32 %tmp35, %arg6 + %tmp37 = add i32 %tmp36, %arg7 + %tmp38 = add i32 %tmp37, %arg8 + %tmp39 = add i32 %tmp38, %arg9 + %tmp40 = add i32 %tmp39, %arg10 + %tmp41 = add i32 %tmp40, %arg11 + %tmp42 = add i32 %tmp41, %arg12 + %tmp43 = add i32 %tmp42, %arg13 + %tmp44 = add i32 %tmp43, %arg14 + %tmp45 = add i32 %tmp44, %arg15 + %tmp46 = add i32 %tmp45, %arg16 + %tmp47 = add i32 %tmp46, %arg17 + %tmp48 = add i32 %tmp47, %arg18 + %tmp49 = add i32 %tmp48, %arg19 + %tmp50 = add i32 %tmp49, %arg20 + %tmp51 = add i32 %tmp50, %arg21 + %tmp52 = add i32 %tmp51, %arg22 + %tmp53 = add i32 %tmp52, %arg23 + %tmp54 = add i32 %tmp53, %arg24 + %tmp55 = add i32 %tmp54, %arg25 + %tmp56 = add i32 %tmp55, %arg26 + %tmp57 = add i32 %tmp56, %arg27 + %tmp58 = add i32 %tmp57, %arg28 + %tmp59 = add i32 %tmp58, %arg29 + %tmp60 = add i32 %tmp59, %arg30 + %tmp61 = add i32 %tmp60, %arg31 + ret i32 %tmp61 +} + +; ALL-LABEL: {{^}}too_many_input_sgprs_33: +; MESA3D-NOT: s35 +; MESA3D: buffer_store_dword v{{[0-9]+}}, 
off, s{{\[[0-9]+:[0-9]+\]}}, s35 offset:4 + +; UNKNOWN-NOT: s33 +; UNKNOWN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s33 offset:4 +define amdgpu_ps i32 @too_many_input_sgprs_33(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 inreg %arg7, + i32 inreg %arg8, i32 inreg %arg9, i32 inreg %arg10, i32 inreg %arg11, i32 inreg %arg12, i32 inreg %arg13, i32 inreg %arg14, i32 inreg %arg15, + i32 inreg %arg16, i32 inreg %arg17, i32 inreg %arg18, i32 inreg %arg19, i32 inreg %arg20, i32 inreg %arg21, i32 inreg %arg22, i32 inreg %arg23, + i32 inreg %arg24, i32 inreg %arg25, i32 inreg %arg26, i32 inreg %arg27, i32 inreg %arg28, i32 inreg %arg29, i32 inreg %arg30, i32 inreg %arg31, + i32 inreg %arg32) { +bb: + %alloca = alloca i32, align 4, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %alloca + %tmp = add i32 %arg, %arg1 + %tmp32 = add i32 %tmp, %arg2 + %tmp33 = add i32 %tmp32, %arg3 + %tmp34 = add i32 %tmp33, %arg4 + %tmp35 = add i32 %tmp34, %arg5 + %tmp36 = add i32 %tmp35, %arg6 + %tmp37 = add i32 %tmp36, %arg7 + %tmp38 = add i32 %tmp37, %arg8 + %tmp39 = add i32 %tmp38, %arg9 + %tmp40 = add i32 %tmp39, %arg10 + %tmp41 = add i32 %tmp40, %arg11 + %tmp42 = add i32 %tmp41, %arg12 + %tmp43 = add i32 %tmp42, %arg13 + %tmp44 = add i32 %tmp43, %arg14 + %tmp45 = add i32 %tmp44, %arg15 + %tmp46 = add i32 %tmp45, %arg16 + %tmp47 = add i32 %tmp46, %arg17 + %tmp48 = add i32 %tmp47, %arg18 + %tmp49 = add i32 %tmp48, %arg19 + %tmp50 = add i32 %tmp49, %arg20 + %tmp51 = add i32 %tmp50, %arg21 + %tmp52 = add i32 %tmp51, %arg22 + %tmp53 = add i32 %tmp52, %arg23 + %tmp54 = add i32 %tmp53, %arg24 + %tmp55 = add i32 %tmp54, %arg25 + %tmp56 = add i32 %tmp55, %arg26 + %tmp57 = add i32 %tmp56, %arg27 + %tmp58 = add i32 %tmp57, %arg28 + %tmp59 = add i32 %tmp58, %arg29 + %tmp60 = add i32 %tmp59, %arg30 + %tmp61 = add i32 %tmp60, %arg31 + %tmp62 = add i32 %tmp61, %arg32 + ret i32 %tmp62 +} Index: test/CodeGen/AMDGPU/spill-empty-live-interval.mir =================================================================== --- test/CodeGen/AMDGPU/spill-empty-live-interval.mir +++ test/CodeGen/AMDGPU/spill-empty-live-interval.mir @@ -8,12 +8,12 @@ # CHECK-LABEL: name: expecting_non_empty_interval # CHECK: undef %7.sub1:vreg_64 = V_MAC_F32_e32 0, undef %1:vgpr_32, undef %7.sub1, implicit $exec -# CHECK-NEXT: SI_SPILL_V64_SAVE %7, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5) +# CHECK-NEXT: SI_SPILL_V64_SAVE %7, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5) # CHECK-NEXT: undef %5.sub1:vreg_64 = V_MOV_B32_e32 1786773504, implicit $exec # CHECK-NEXT: dead %3:vgpr_32 = V_MUL_F32_e32 0, %5.sub1, implicit $exec # CHECK: S_NOP 0, implicit %6.sub1 -# CHECK-NEXT: %8:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5) +# CHECK-NEXT: %8:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5) # CHECK-NEXT: S_NOP 0, implicit %8.sub1 # CHECK-NEXT: S_NOP 0, implicit undef %9.sub0 @@ -22,7 +22,6 @@ machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 scratchWaveOffsetReg: $sgpr4 - frameOffsetReg: $sgpr5 stackPtrOffsetReg: $sgpr32 body: | bb.0: @@ -57,7 +56,6 @@ machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 scratchWaveOffsetReg: 
$sgpr4 - frameOffsetReg: $sgpr5 stackPtrOffsetReg: $sgpr32 body: | bb.0: Index: test/CodeGen/AMDGPU/spill-offset-calculation.ll =================================================================== --- test/CodeGen/AMDGPU/spill-offset-calculation.ll +++ test/CodeGen/AMDGPU/spill-offset-calculation.ll @@ -171,7 +171,7 @@ %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 ; 0x40000 / 64 = 4096 (for wave64) - ; CHECK: s_add_u32 s6, s5, 0x40000 + ; CHECK: s_add_u32 s6, s32, 0x40000 ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill %a = load volatile i32, i32 addrspace(5)* %aptr @@ -223,7 +223,7 @@ %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)* ; 0x3ff00 / 64 = 4092 (for wave64) - ; CHECK: s_add_u32 s6, s5, 0x3ff00 + ; CHECK: s_add_u32 s6, s32, 0x3ff00 ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 offset:4 ; 4-byte Folded Spill %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1 Index: test/CodeGen/AMDGPU/stack-realign.ll =================================================================== --- test/CodeGen/AMDGPU/stack-realign.ll +++ test/CodeGen/AMDGPU/stack-realign.ll @@ -9,7 +9,10 @@ ; = 144 bytes with padding between them ; GCN-LABEL: {{^}}needs_align16_default_stack_align: -; GCN: s_mov_b32 s5, s32 +; GCN: s_sub_u32 [[SUB:s[0-9]+]], s32, s4 +; GCN-NEXT: v_lshrrev_b32_e64 [[FRAMEDIFF:v[0-9]+]], 6, [[SUB]] +; GCN: v_add_u32_e64 [[FI:v[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 16, [[FRAMEDIFF]] + ; GCN-NOT: s32 ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen @@ -135,9 +138,7 @@ ; GCN-LABEL: {{^}}disable_realign_align128: ; GCN-NOT: s32 -; GCN: s_mov_b32 s5, s32 -; GCN-NOT: s32 -; GCN: buffer_store_dword v0, off, s[0:3], s5 offset:16 +; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:16 ; GCN-NOT: s32 define void @disable_realign_align128(i32 %idx) #3 { %alloca.align = alloca i32, align 128, addrspace(5) Index: test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir =================================================================== --- test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir +++ test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir @@ -9,11 +9,11 @@ # CHECK: - { id: 1, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4, # CHECK-NEXT: stack-id: 1, -# CHECK: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) -# CHECK: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) +# CHECK: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) +# CHECK: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) -# CHECK: SI_SPILL_S32_SAVE killed renamable $sgpr6, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr5, implicit-def dead $m0 :: (store 4 into %stack.1, addrspace 5) -# CHECK: $sgpr6 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr5, implicit-def dead $m0 :: (load 4 from %stack.1, addrspace 5) +# CHECK: SI_SPILL_S32_SAVE killed renamable $sgpr6, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32, implicit-def dead $m0 :: (store 4 into 
%stack.1, addrspace 5) +# CHECK: $sgpr6 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32, implicit-def dead $m0 :: (load 4 from %stack.1, addrspace 5) name: no_merge_sgpr_vgpr_spill_slot tracksRegLiveness: true Index: test/CodeGen/AMDGPU/store-hi16.ll =================================================================== --- test/CodeGen/AMDGPU/store-hi16.ll +++ test/CodeGen/AMDGPU/store-hi16.ll @@ -481,10 +481,10 @@ ; GCN-LABEL: {{^}}store_private_hi_v2i16_max_offset: ; GCN: s_waitcnt -; GFX900: buffer_store_short_d16_hi v0, off, s[0:3], s5 offset:4094{{$}} +; GFX900: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}} ; NO-D16-HI: v_lshrrev_b32_e32 v0, 16, v0 -; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], s5 offset:4094{{$}} +; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], s32 offset:4094{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -635,7 +635,7 @@ ; GCN-LABEL: {{^}}store_private_hi_v2i16_to_offset: ; GCN: s_waitcnt ; GFX900: buffer_store_dword -; GFX900-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s5 offset:4094 +; GFX900-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4094 define void @store_private_hi_v2i16_to_offset(i32 %arg) #0 { entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) @@ -652,7 +652,7 @@ ; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_to_offset: ; GCN: s_waitcnt ; GFX900: buffer_store_dword -; GFX900-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s5 offset:4095 +; GFX900-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s32 offset:4095 define void @store_private_hi_v2i16_i8_to_offset(i32 %arg) #0 { entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) Index: test/CodeGen/AMDGPU/subreg-split-live-in-error.mir =================================================================== --- test/CodeGen/AMDGPU/subreg-split-live-in-error.mir +++ test/CodeGen/AMDGPU/subreg-split-live-in-error.mir @@ -42,7 +42,6 @@ machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 scratchWaveOffsetReg: $sgpr4 - frameOffsetReg: $sgpr5 stackPtrOffsetReg: $sgpr32 liveins: - { reg: '$vgpr2', virtual-reg: '%0' } @@ -112,7 +111,7 @@ ; and inserting a spill. Here we just check that the point where the error ; occurs we see a correctly generated spill. 
; GCN-LABEL: bb.7: - ; GCN: SI_SPILL_V128_SAVE %{{[0-9]+}}, %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec + ; GCN: SI_SPILL_V128_SAVE %{{[0-9]+}}, %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec undef %15.sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec %15.sub1:vreg_128 = COPY %15.sub0 @@ -128,7 +127,7 @@ successors: %bb.12(0x80000000) ; GCN-LABEL: bb.9: - ; GCN: SI_SPILL_V128_SAVE %{{[0-9]+}}, %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec + ; GCN: SI_SPILL_V128_SAVE %{{[0-9]+}}, %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec undef %15.sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec %15.sub1:vreg_128 = COPY %15.sub0 @@ -139,7 +138,7 @@ successors: %bb.12(0x80000000) ; GCN-LABEL: bb.10: - ; GCN: SI_SPILL_V128_SAVE %{{[0-9]+}}, %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec + ; GCN: SI_SPILL_V128_SAVE %{{[0-9]+}}, %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec undef %15.sub0:vreg_128 = V_MOV_B32_e32 2143289344, implicit $exec %15.sub1:vreg_128 = COPY %15.sub0 Index: test/CodeGen/MIR/AMDGPU/machine-function-info.ll =================================================================== --- test/CodeGen/MIR/AMDGPU/machine-function-info.ll +++ test/CodeGen/MIR/AMDGPU/machine-function-info.ll @@ -18,7 +18,7 @@ ; CHECK-NEXT: scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' ; CHECK-NEXT: scratchWaveOffsetReg: '$sgpr101' ; CHECK-NEXT: frameOffsetReg: '$sgpr101' -; CHECK-NEXT: stackPtrOffsetReg: '$sp_reg' +; CHECK-NEXT: stackPtrOffsetReg: '$sgpr101' ; CHECK-NEXT: body: define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) { %gep = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %arg0 @@ -38,7 +38,7 @@ ; CHECK-NEXT: scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' ; CHECK-NEXT: scratchWaveOffsetReg: '$sgpr101' ; CHECK-NEXT: frameOffsetReg: '$sgpr101' -; CHECK-NEXT: stackPtrOffsetReg: '$sp_reg' +; CHECK-NEXT: stackPtrOffsetReg: '$sgpr101' ; CHECK-NEXT: body: define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) { ret void