diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -887,7 +887,11 @@
   ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
   ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
-  ProgInfo.LDSSize = MFI->getLDSSize();
+  unsigned MaxWorkGroupSize = STM.getFlatWorkGroupSizes(F).second;
+  unsigned LDSSpillSize = MFI->getLdsSpill().TotalSize * MaxWorkGroupSize;
+
+  ProgInfo.LDSSize = MFI->getLDSSize() + LDSSpillSize;
+
   ProgInfo.LDSBlocks =
       alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -181,7 +181,8 @@
     Info.UsesFlatScratch = false;
   }
 
-  Info.PrivateSegmentSize = FrameInfo.getStackSize();
+  unsigned LdsSpillTotalSize = MFI->getLdsSpill().TotalSize;
+  Info.PrivateSegmentSize = FrameInfo.getStackSize() - LdsSpillTotalSize;
 
   // Assume a big number if there are any unknown sized objects.
   Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -795,6 +795,11 @@
   return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
 }
 
+unsigned GCNSubtarget::getLdsSpillLimitDwords(const MachineFunction &MF) const {
+  const Function &F = MF.getFunction();
+  return AMDGPU::getIntegerAttribute(F, "amdgpu-lds-spill-limit-dwords", 0);
+}
+
 void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                          int UseOpIdx, SDep &Dep) const {
   if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1064,6 +1064,8 @@
   // hasGFX90AInsts is also true.
   bool hasGFX940Insts() const { return GFX940Insts; }
 
+  bool hasDSAddTid() const { return getGeneration() >= GFX9; }
+
   /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
   /// SGPRs
   unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
@@ -1228,6 +1230,11 @@
   /// unit requirement.
   unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
 
+  /// \returns Maximum amount of LDS space to be used for spilling, as
+  /// explicitly requested with the "amdgpu-lds-spill-limit-dwords" attribute
+  /// attached to the function of \p MF.
+  unsigned getLdsSpillLimitDwords(const MachineFunction &MF) const;
+
   void getPostRAMutations(
       std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
       const override;
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -70,6 +70,10 @@
       Register PreloadedPrivateBufferReg, Register ScratchRsrcReg,
       Register ScratchWaveOffsetReg) const;
 
+  void setupLDSSpilling(MachineFunction &MF, MachineBasicBlock &MBB,
+                        MachineBasicBlock::iterator I,
+                        const DebugLoc &DL) const;
+
 public:
   bool hasFP(const MachineFunction &MF) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -400,6 +400,157 @@
   return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize();
 }
 
+// Determine which stack objects should be spilled to LDS, set up the
+// SIMachineFunctionInfo::LdsSpill structure, and initialize m0 for LDS
+// spilling if possible.
+void SIFrameLowering::setupLDSSpilling(MachineFunction &MF,
+                                       MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator I,
+                                       const DebugLoc &DL) const {
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const Function &F = MF.getFunction();
+  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+  unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(F).second;
+
+  assert(MFI->isEntryFunction());
+
+  int LDSSpillLimitInBytes = ST.getLdsSpillLimitDwords(MF) * 4;
+  LDSSpillLimitInBytes =
+      std::max(0, LDSSpillLimitInBytes - (int)MFI->getLDSSize());
+
+  // Go through the stack slots starting from the end and assign them to LDS
+  // as long as they fit in the remaining size.
+  SmallVector<int64_t> LdsOffsets(FrameInfo.getObjectIndexEnd(), -1);
+  bool AllStackSlotsHandled = true;
+  int TotalSize = 0;
+  int RemainingSize = LDSSpillLimitInBytes;
+  for (int i = FrameInfo.getObjectIndexEnd() - 1; i >= 0; --i) {
+    if (FrameInfo.isDeadObjectIndex(i)) {
+      continue;
+    }
+    if (FrameInfo.isObjectPreAllocated(i)) {
+      AllStackSlotsHandled = false;
+      break;
+    }
+    int ObjSize = FrameInfo.getObjectSize(i);
+    assert(ObjSize > 0);
+    int ObjSizeForAllThreads = ObjSize * WorkGroupSize;
+
+    if (ObjSizeForAllThreads <= RemainingSize) {
+      RemainingSize -= ObjSizeForAllThreads;
+      LdsOffsets[i] = TotalSize;
+      TotalSize += ObjSize;
+    } else {
+      AllStackSlotsHandled = false;
+      break;
+    }
+  }
+
+  // No stack slots will use LDS - exit early.
+  if (TotalSize == 0)
+    return;
+
+  // Register used to save/restore m0 around each spill, or NoRegister if the
+  // save/restore is not needed and the initialization takes place here once.
+  Register M0SaveRestoreReg;
+  if (MRI.isPhysRegUsed(AMDGPU::M0)) {
+    if (requiresStackPointerReference(MF)) {
+      unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
+      ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
+      AllSGPRs = AllSGPRs.slice(
+          std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
+      for (MCPhysReg Reg : AllSGPRs) {
+        if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
+          M0SaveRestoreReg = Reg;
+          break;
+        }
+      }
+    } else {
+      assert(!requiresStackPointerReference(MF));
+      M0SaveRestoreReg = MFI->getStackPtrOffsetReg();
+    }
+    // Could not find a free SGPR to save/restore m0 into, so exit early.
+    if (M0SaveRestoreReg == AMDGPU::NoRegister)
+      return;
+  }
+
+  Register M0InitVal;
+  // The addtid addressing is as follows:
+  // LDS_Addr = LDS_BASE + {Inst_offset1, Inst_offset0} + TID(0..63)*4 + M0
+  // If the workgroup size is not larger than the wave size, we can safely
+  // initialize m0 with 0. Otherwise, we have to make sure that waves do not
+  // overwrite each other's spill slots, so we initialize m0 to
+  // current_wave_id_in_group * wave_size * 4.
+  if (WorkGroupSize > ST.getWavefrontSize()) {
+    Register PreloadedWorkgroupInfoReg = MFI->getWorkgroupInfoReg();
+    if (!PreloadedWorkgroupInfoReg) {
+      // This should never happen, but it depends on how the front-end sets up
+      // the input SGPRs, so an early out is safer here than an assert.
+      return;
+    }
+
+    if (!MRI.isPhysRegUsed(PreloadedWorkgroupInfoReg)) {
+      M0InitVal = PreloadedWorkgroupInfoReg;
+    } else {
+      unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
+      ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
+      AllSGPRs = AllSGPRs.slice(
+          std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
+      for (MCPhysReg Reg : AllSGPRs) {
+        if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
+          M0InitVal = Reg;
+          break;
+        }
+      }
+    }
+
+    // Could not find a free SGPR for the m0 init value, so exit early.
+    // FIXME: We could also check some of the preloads to see if one of them
+    // could be re-used.
+    if (M0InitVal == AMDGPU::NoRegister)
+      return;
+
+    // Extract ordered_append_term to get the current wave id in the group.
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_BFE_U32), M0InitVal)
+        .addReg(PreloadedWorkgroupInfoReg)
+        .addImm(0xc0006);
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MUL_I32), M0InitVal)
+        .addReg(M0InitVal)
+        .addImm(ST.getWavefrontSize() * 4);
+  }
+
+  // If no save/restore is needed, we can init m0 here once and be done with it.
+  if (M0SaveRestoreReg == AMDGPU::NoRegister) {
+    if (M0InitVal == AMDGPU::NoRegister)
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).addImm(0);
+    else
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+          .addReg(M0InitVal);
+  }
+
+  SIMachineFunctionInfo::LdsSpill LdsSpillInfo;
+  LdsSpillInfo.M0InitVal = M0InitVal;
+  LdsSpillInfo.M0SaveRestoreReg = M0SaveRestoreReg;
+  LdsSpillInfo.LdsOffsets = LdsOffsets;
+  LdsSpillInfo.TotalSize = TotalSize;
+  MFI->setLdsSpill(LdsSpillInfo);
+
+  // Earlier we set ScavengeFI based on the fact that there were stack
+  // accesses. If no slots will use the stack anymore, we can safely remove it.
+  if (AllStackSlotsHandled) {
+    int ScavengeFI = MFI->getScavengeFI(FrameInfo, *TRI);
+    FrameInfo.setStackSize(FrameInfo.getStackSize() -
+                           FrameInfo.getObjectSize(ScavengeFI));
+    FrameInfo.RemoveStackObject(ScavengeFI);
+  }
+}
+
 void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                                 MachineBasicBlock &MBB) const {
   assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
@@ -425,6 +576,14 @@
   assert(MFI->isEntryFunction());
 
+  // Debug location must be unknown since the first debug location is used to
+  // determine the end of the prologue.
+  DebugLoc DL;
+  MachineBasicBlock::iterator I = MBB.begin();
+
+  if (FrameInfo.getStackSize() > 0 && MFI->ldsSpillingEnabled(MF))
+    setupLDSSpilling(MF, MBB, I, DL);
+
   Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
       AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
 
@@ -461,11 +620,6 @@
     }
   }
 
-  // Debug location must be unknown since the first debug location is used to
-  // determine the end of the prologue.
-  DebugLoc DL;
-  MachineBasicBlock::iterator I = MBB.begin();
-
   // We found the SRSRC first because it needs four registers and has an
   // alignment requirement. If the SRSRC that we found is clobbering with
   // the scratch wave offset, which may be in a fixed SGPR or a free SGPR
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2555,6 +2555,31 @@
     InVals.push_back(Val);
   }
 
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  if (MFI->ldsSpillingEnabled(MF) &&
+      ST.getFlatWorkGroupSizes(Fn).second > ST.getWavefrontSize()) {
+
+    int WorkGroupInfoSgprNo =
+        AMDGPU::getIntegerAttribute(Fn, "amdgpu-work-group-info-arg-no", -1);
+    if (WorkGroupInfoSgprNo != -1)
+      for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+        const ISD::InputArg &Arg = Ins[i];
+        if (Arg.getOrigArgIndex() == (unsigned)WorkGroupInfoSgprNo) {
+
+          CCValAssign &VA = ArgLocs[i];
+          Register WorkGroupInfoReg = VA.getLocReg();
+          assert(AMDGPU::SGPR_32RegClass.contains(WorkGroupInfoReg));
+
+          Info->setWorkgroupInfoReg(WorkGroupInfoReg);
+          MF.addLiveIn(WorkGroupInfoReg, &AMDGPU::SGPR_32RegClass);
+          MF.front().addLiveIn(WorkGroupInfoReg, &AMDGPU::SGPR_32RegClass);
+
+          break;
+        }
+      }
+  }
+
   // Start adding system SGPRs.
   if (IsEntryFunc) {
     allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -364,6 +364,11 @@
   // base to the beginning of the new function's frame.
   Register StackPtrOffsetReg = AMDGPU::SP_REG;
 
+  // The workgroup info register set up for LDS spilling in cases where the
+  // workgroup size is larger than the wave size. It relies on user input
+  // registers set up by the front-end.
+  Register WorkgroupInfoReg = 0;
+
   AMDGPUFunctionArgInfo ArgInfo;
 
   // Graphics info.
@@ -472,6 +477,20 @@
     bool IsDead = false;
   };
 
+  struct LdsSpill {
+    // Value to init m0 with.
+    Register M0InitVal;
+    // Register used to save/restore the current value of m0 around each
+    // spill. If NoRegister, m0 is initialized once in the prologue instead.
+    Register M0SaveRestoreReg;
+    // Offset in LDS indexed by stack object index. A value of -1 means the
+    // stack object is not spilled to LDS. The values are properly
+    // initialized only if TotalSize > 0.
+    SmallVector<int64_t> LdsOffsets;
+    // Total size of all LDS spill objects in bytes (per thread).
+    unsigned TotalSize = 0;
+  };
+
   // Track VGPRs reserved for WWM.
   SmallSetVector<Register, 8> WWMReservedRegs;
 
@@ -509,6 +528,8 @@
   // frame, so save it here and add it to the RegScavenger later.
   Optional<int> ScavengeFI;
 
+  LdsSpill LdsSpillInfo;
+
 private:
   Register VGPRForAGPRCopy;
 
@@ -800,6 +821,13 @@
     StackPtrOffsetReg = Reg;
   }
 
+  void setWorkgroupInfoReg(Register Reg) {
+    assert(Reg != 0);
+    WorkgroupInfoReg = Reg;
+  }
+
+  Register getWorkgroupInfoReg() const { return WorkgroupInfoReg; }
+
   // Note the unset value for this is AMDGPU::SP_REG rather than
   // NoRegister. This is mostly a workaround for MIR tests where state that
   // can't be directly computed from the function is not preserved in serialized
@@ -988,6 +1016,12 @@
   // \returns true if a function needs or may need AGPRs.
   bool usesAGPRs(const MachineFunction &MF) const;
+
+  void setLdsSpill(LdsSpill Info) { LdsSpillInfo = Info; }
+
+  LdsSpill getLdsSpill() const { return LdsSpillInfo; }
+
+  bool ldsSpillingEnabled(const MachineFunction &MF) const;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -734,3 +734,18 @@
   UsesAGPRs = false;
   return false;
 }
+
+bool SIMachineFunctionInfo::ldsSpillingEnabled(
+    const MachineFunction &MF) const {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  if (!ST.hasDSAddTid())
+    return false;
+
+  if (MF.getFrameInfo().hasCalls())
+    return false;
+
+  if (ST.getLdsSpillLimitDwords(MF) == 0)
+    return false;
+
+  return true;
+}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -405,6 +405,12 @@
   /// of the subtarget.
   ArrayRef<MCPhysReg> getAllSGPR32(const MachineFunction &MF) const;
 
+  bool buildLdsSpillLoadStore(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator MI,
+                              const DebugLoc &DL, bool IsLoad, int Index,
+                              Register ValueReg, bool ValueIsKill,
+                              int64_t InstrOffset,
+                              MachineMemOperand *MMO) const;
   // Insert spill or restore instructions.
   // When lowering spill pseudos, the RegScavenger should be set.
   // For creating spill instructions during frame lowering, where no scavenger
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1267,6 +1267,89 @@
   return LoadStoreOp;
 }
 
+bool SIRegisterInfo::buildLdsSpillLoadStore(MachineBasicBlock &MBB,
+                                            MachineBasicBlock::iterator MI,
+                                            const DebugLoc &DL, bool IsLoad,
+                                            int Index, Register ValueReg,
+                                            bool IsKill, int64_t InstOffset,
+                                            MachineMemOperand *MMO) const {
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  MachineFunction *MF = MBB.getParent();
+  const MachineFrameInfo &MFI = MF->getFrameInfo();
+  const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
+
+  SIMachineFunctionInfo::LdsSpill LdsSpillInfo = FuncInfo->getLdsSpill();
+  int64_t LdsOffsetForIndex = LdsSpillInfo.LdsOffsets[Index];
+  if (LdsOffsetForIndex == -1)
+    return false;
+
+  if (LdsSpillInfo.M0SaveRestoreReg) {
+
+    // FIXME: If we could prove that there are no m0 defs/uses between two LDS
+    // spill instructions we could avoid doing some of the save/restore.
+
+    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32),
+            LdsSpillInfo.M0SaveRestoreReg)
+        .addReg(AMDGPU::M0);
+    if (LdsSpillInfo.M0InitVal == AMDGPU::NoRegister)
+      BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).addImm(0x0);
+    else
+      BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+          .addReg(LdsSpillInfo.M0InitVal);
+  }
+
+  const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
+  unsigned EltCount = AMDGPU::getRegBitWidth(RC->getID()) / 32;
+
+  Align Alignment = MFI.getObjectAlign(Index);
+  const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
+  for (unsigned R = 0; R < EltCount; ++R) {
+    MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(4 * R);
+    MachineMemOperand *NewMMO = MF->getMachineMemOperand(
+        PInfo, MMO->getFlags(), 4, commonAlignment(Alignment, 4 * R));
+
+    Register SubReg =
+        EltCount == 1 ? ValueReg
+                      : Register(getSubReg(ValueReg, getSubRegFromChannel(R)));
+
+    unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(MF->getFunction()).second;
+    // The addtid addressing is as follows:
+    // LDS_Addr = LDS_BASE + {Inst_offset1, Inst_offset0} + TID(0..63)*4 + M0
+    // We calculate the offset for the zeroth lane and make room for the other
+    // lanes by multiplying by the workgroup size. The earlier m0 setup handles
+    // the case when the workgroup size is larger than the wave size.
+    int64_t StackOffset = InstOffset + LdsOffsetForIndex + 4 * R;
+    int64_t StackOffsetZerothLane =
+        StackOffset * WorkGroupSize + FuncInfo->getLDSSize();
+
+    if (IsLoad) {
+      BuildMI(MBB, MI, DL, TII->get(AMDGPU::DS_READ_ADDTID_B32), SubReg)
+          .addImm(StackOffsetZerothLane)
+          .addImm(0 /* gds */)
+          .addMemOperand(NewMMO);
+    } else {
+      BuildMI(MBB, MI, DL, TII->get(AMDGPU::DS_WRITE_ADDTID_B32))
+          .addReg(SubReg, getKillRegState(R == EltCount - 1 ? IsKill : false))
+          .addImm(StackOffsetZerothLane)
+          .addImm(0 /* gds */)
+          .addMemOperand(NewMMO);
+    }
+  }
+
+  if (LdsSpillInfo.M0SaveRestoreReg) {
+    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+        .addReg(LdsSpillInfo.M0SaveRestoreReg, RegState::Kill);
+  }
+  return true;
+}
+
 void SIRegisterInfo::buildSpillLoadStore(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL,
     unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
@@ -2030,7 +2113,28 @@
   case AMDGPU::SI_SPILL_V128_SAVE:
   case AMDGPU::SI_SPILL_V96_SAVE:
   case AMDGPU::SI_SPILL_V64_SAVE:
-  case AMDGPU::SI_SPILL_V32_SAVE:
+  case AMDGPU::SI_SPILL_V32_SAVE: {
+    if (MFI->getLdsSpill().TotalSize > 0) {
+
+      const MachineOperand *VData =
+          TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
+      assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
+             MFI->getStackPtrOffsetReg());
+
+      bool SpilledToLds = buildLdsSpillLoadStore(
+          *MBB, MI, DL, /*IsLoad*/ false, Index, VData->getReg(),
+          /*IsKill*/ VData->isKill(),
+          TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
+          *MI->memoperands_begin());
+
+      if (SpilledToLds) {
+        MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
+        MI->eraseFromParent();
+        break;
+      }
+    }
+    LLVM_FALLTHROUGH;
+  }
   case AMDGPU::SI_SPILL_A1024_SAVE:
   case AMDGPU::SI_SPILL_A512_SAVE:
   case AMDGPU::SI_SPILL_A256_SAVE:
@@ -2076,7 +2180,25 @@
   case AMDGPU::SI_SPILL_V224_RESTORE:
   case AMDGPU::SI_SPILL_V256_RESTORE:
   case AMDGPU::SI_SPILL_V512_RESTORE:
-  case AMDGPU::SI_SPILL_V1024_RESTORE:
+  case AMDGPU::SI_SPILL_V1024_RESTORE: {
+    if (MFI->getLdsSpill().TotalSize > 0) {
+      const MachineOperand *VData =
+          TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
+      assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
+             MFI->getStackPtrOffsetReg());
+
+      bool SpilledToLds = buildLdsSpillLoadStore(
+          *MBB, MI, DL, /*IsLoad*/ true, Index, VData->getReg(), false,
+          TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
+          *MI->memoperands_begin());
+      if (SpilledToLds) {
+        MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
+        MI->eraseFromParent();
+        break;
+      }
+    }
+    LLVM_FALLTHROUGH;
+  }
   case AMDGPU::SI_SPILL_A32_RESTORE:
   case AMDGPU::SI_SPILL_A64_RESTORE:
   case AMDGPU::SI_SPILL_A96_RESTORE:
diff --git a/llvm/test/CodeGen/AMDGPU/lds-spill-cs.ll b/llvm/test/CodeGen/AMDGPU/lds-spill-cs.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-spill-cs.ll
@@ -0,0 +1,64 @@
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=W32
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=W64
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=W32
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=W64
+
+; The test checks that part of the spilling goes to LDS with the right m0 setup.
+; Without a VGPR limit the test would need 16 VGPRs, since four vec4 values are in flight.
+; With the "amdgpu-num-vgpr"="12" limit, one vec4 has to be spilled to memory (16 bytes per lane).
+; Across the 64-lane workgroup that is 256 dwords, so "amdgpu-lds-spill-limit-dwords"="256" suffices.
+; Note: 16 bytes * 64 (workgroup size) = 1024 bytes = 256 dwords.
+
+define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg %globalTable, i32 inreg %perShaderTable, i32 inreg %descTable0, i32 inreg %spillTable, <3 x i32> inreg %WorkgroupId, i32 inreg %MultiDispatchInfo, <3 x i32> %LocalInvocationId, <2 x i32> inreg %ptr) #3 {
+; W32-LABEL: _amdgpu_cs_main:
+; W32: ; %bb.0: ; %.entry
+; W32: s_bfe_u32 s7, s7, 0xc0006
+; W32: s_mulk_i32 s7, 0x80
+; W32: s_mov_b32 m0, s7
+; W32: ds_write_addtid_b32 v0 ; 4-byte Folded Spill
+; W32: ds_write_addtid_b32 v1 offset:256 ; 4-byte Folded Spill
+; W32: ds_write_addtid_b32 v2 offset:512 ; 4-byte Folded Spill
+; W32: ds_write_addtid_b32 v3 offset:768 ; 4-byte Folded Spill
+; W32: ds_read_addtid_b32 v0 ; 4-byte Folded Reload
+; W32: ds_read_addtid_b32 v1 offset:256 ; 4-byte Folded Reload
+; W32: ds_read_addtid_b32 v2 offset:512 ; 4-byte Folded Reload
+; W32: ds_read_addtid_b32 v3 offset:768 ; 4-byte Folded Reload
+;
+; W64-LABEL: _amdgpu_cs_main:
+; W64: ; %bb.0: ; %.entry
+; W64: s_mov_b32 m0, 0
+; W64: ds_write_addtid_b32 v0 ; 4-byte Folded Spill
+; W64: ds_write_addtid_b32 v1 offset:256 ; 4-byte Folded Spill
+; W64: ds_write_addtid_b32 v2 offset:512 ; 4-byte Folded Spill
+; W64: ds_write_addtid_b32 v3 offset:768 ; 4-byte Folded Spill
+; W64: ds_read_addtid_b32 v0 ; 4-byte Folded Reload
+; W64: ds_read_addtid_b32 v1 offset:256 ; 4-byte Folded Reload
+; W64: ds_read_addtid_b32 v2 offset:512 ; 4-byte Folded Reload
+; W64: ds_read_addtid_b32 v3 offset:768 ; 4-byte Folded Reload
+.entry:
+  %i6 = bitcast <2 x i32> %ptr to i64
+  %i7 = inttoptr i64 %i6 to <4 x i32> addrspace(4)*
+  %i8 = load <4 x i32>, <4 x i32> addrspace(4)* %i7, align 16
+  %i9 = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %i8, i32 0, i32 0, i32 0)
+  %i10 = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %i8, i32 16, i32 0, i32 0)
+  %i11 = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %i8, i32 32, i32 0, i32 0)
+  %i12 = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %i8, i32 48, i32 0, i32 0)
+  fence syncscope("workgroup") acq_rel
+  call void @llvm.amdgcn.s.barrier()
+  call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %i9, <4 x i32> %i8, i32 64, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %i10, <4 x i32> %i8, i32 80, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %i11, <4 x i32> %i8, i32 96, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %i12, <4 x i32> %i8, i32 112, i32 0, i32 0)
+  ret void
+}
+
+; Function Attrs: convergent nounwind willreturn
+declare void @llvm.amdgcn.s.barrier()
+
+; Function Attrs: nounwind readonly willreturn
+declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32 immarg)
+
+; Function Attrs: nounwind willreturn writeonly
+declare void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32 immarg)
+
+attributes #3 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-lds-spill-limit-dwords"="256" "amdgpu-work-group-info-arg-no"="5" "amdgpu-num-vgpr"="12" }
diff --git a/llvm/test/CodeGen/AMDGPU/lds-spill-ps.ll b/llvm/test/CodeGen/AMDGPU/lds-spill-ps.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-spill-ps.ll
@@ -0,0 +1,64 @@
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=W32
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=W64
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=W32
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=W64
+
+; The test checks that part of the spilling goes to LDS with the right m0 setup.
+; The "amdgpu-lds-spill-limit-dwords"="256" limit is respected:
+; - In wave32, 8 dword slots get allocated to LDS (8 * 32 = 256 dwords), equaling 1024 bytes.
+; - In wave64, 4 dword slots get allocated to LDS (4 * 64 = 256 dwords), equaling 1024 bytes.
+
+define dllexport amdgpu_ps void @_amdgpu_ps_main(i32 inreg %globalTable, i32 inreg %perShaderTable, i32 inreg %descTable0, i32 inreg %spillTable, i32 inreg %PrimMask, <2 x float> %PerspInterpSample, <2 x float> %PerspInterpCenter, <2 x float> %PerspInterpCentroid, <3 x float> %PerspInterpPullMode, <2 x float> %LinearInterpSample, <2 x float> %LinearInterpCenter, <2 x float> %LinearInterpCentroid, float %LineStipple, float %FragCoordX, float %FragCoordY, float %FragCoordZ, float %FragCoordW, i32 %FrontFacing, i32 %Ancillary, i32 %SampleCoverage, i32 %FixedXY, <2 x i32> inreg %ptr) #0 {
+; W32-LABEL: _amdgpu_ps_main:
+; W32: ; %bb.0: ; %.entry
+; W32: s_mov_b32 m0, 0
+; W32: ds_write_addtid_b32 v0 ; 4-byte Folded Spill
+; W32: ds_write_addtid_b32 v1 offset:128 ; 4-byte Folded Spill
+; W32: ds_write_addtid_b32 v2 offset:256 ; 4-byte Folded Spill
+; W32: ds_write_addtid_b32 v3 offset:384 ; 4-byte Folded Spill
+; W32: ds_write_addtid_b32 v0 offset:512 ; 4-byte Folded Spill
+; W32: ds_write_addtid_b32 v1 offset:640 ; 4-byte Folded Spill
+; W32: ds_write_addtid_b32 v2 offset:768 ; 4-byte Folded Spill
+; W32: ds_write_addtid_b32 v3 offset:896 ; 4-byte Folded Spill
+; W32: ds_read_addtid_b32 v0 ; 4-byte Folded Reload
+; W32: ds_read_addtid_b32 v1 offset:128 ; 4-byte Folded Reload
+; W32: ds_read_addtid_b32 v2 offset:256 ; 4-byte Folded Reload
+; W32: ds_read_addtid_b32 v3 offset:384 ; 4-byte Folded Reload
+; W32: ds_read_addtid_b32 v0 offset:512 ; 4-byte Folded Reload
+; W32: ds_read_addtid_b32 v1 offset:640 ; 4-byte Folded Reload
+; W32: ds_read_addtid_b32 v2 offset:768 ; 4-byte Folded Reload
+; W32: ds_read_addtid_b32 v3 offset:896 ; 4-byte Folded Reload
+;
+; W64-LABEL: _amdgpu_ps_main:
+; W64: ; %bb.0: ; %.entry
+; W64: s_mov_b32 m0, 0
+; W64: ds_write_addtid_b32 v0 ; 4-byte Folded Spill
+; W64: ds_write_addtid_b32 v1 offset:256 ; 4-byte Folded Spill
+; W64: ds_write_addtid_b32 v2 offset:512 ; 4-byte Folded Spill
+; W64: ds_write_addtid_b32 v3 offset:768 ; 4-byte Folded Spill
+; W64: ds_read_addtid_b32 v0 ; 4-byte Folded Reload
+; W64: ds_read_addtid_b32 v1 offset:256 ; 4-byte Folded Reload
+; W64: ds_read_addtid_b32 v2 offset:512 ; 4-byte Folded Reload
+; W64: ds_read_addtid_b32 v3 offset:768 ; 4-byte Folded Reload
+.entry:
+  %i6 = bitcast <2 x i32> %ptr to i64
+  %i7 = inttoptr i64 %i6 to <4 x i32> addrspace(4)*
+  %i8 = load <4 x i32>, <4 x i32> addrspace(4)* %i7, align 16
+  %i9 = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %i8, i32 0, i32 0, i32 0)
+  %i10 = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %i8, i32 16, i32 0, i32 0)
+  %i11 = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %i8, i32 32, i32 0, i32 0)
+  %i12 = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %i8, i32 48, i32 0, i32 0)
+  fence acq_rel
+  call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %i9, <4 x i32> %i8, i32 64, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %i10, <4 x i32> %i8, i32 80, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %i11, <4 x i32> %i8, i32 96, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %i12, <4 x i32> %i8, i32 112, i32 0, i32 0)
+  ret void
+}
+
+; Function Attrs: nounwind readonly willreturn
+declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32 immarg)
+; Function Attrs: nounwind willreturn writeonly
+declare void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32 immarg)
+
+attributes #0 = { "amdgpu-lds-spill-limit-dwords"="256" "amdgpu-num-vgpr"="12" }
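
Reviewer note (not part of the patch): the sketch below models the address computation that buildLdsSpillLoadStore performs, StackOffset * WorkGroupSize + LDSSize, evaluated with the wave64 parameters used by the tests above (one 16-byte slot, 64-lane workgroup, no other LDS). The helper name modelLdsSpillOffset and the driver program are illustrative assumptions only.

// Standalone model of the LDS-spill offset computation from
// SIRegisterInfo::buildLdsSpillLoadStore (sketch; modelLdsSpillOffset is a
// hypothetical helper, not part of the patch).
#include <cstdint>
#include <cstdio>

// Byte offset used by the addtid instruction for dword R of a spilled stack
// slot: the per-lane offset is scaled by the workgroup size (lane 0 of the
// workgroup) and placed after the LDS the kernel already uses.
static int64_t modelLdsSpillOffset(int64_t InstOffset, int64_t LdsOffsetForIndex,
                                   unsigned R, unsigned WorkGroupSize,
                                   unsigned LDSSize) {
  int64_t StackOffset = InstOffset + LdsOffsetForIndex + 4 * R;
  return StackOffset * WorkGroupSize + LDSSize;
}

int main() {
  // Wave64 runs of the tests above: a single 16-byte slot (4 dwords) is
  // spilled, the workgroup has 64 lanes, and no other LDS is in use.
  const unsigned WorkGroupSize = 64, LDSSize = 0;
  for (unsigned R = 0; R < 4; ++R)
    std::printf("dword %u -> ds offset %lld\n", R,
                (long long)modelLdsSpillOffset(0, 0, R, WorkGroupSize, LDSSize));
  // Prints 0, 256, 512 and 768, matching the offset: operands in the W64
  // CHECK lines (and, with WorkGroupSize = 32, the 128-byte stride in W32).
  return 0;
}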