Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -417,14 +417,16 @@
     }
   }
 
-  if (VCCUsed || FlatUsed)
+  if (VCCUsed)
     MaxSGPR += 2;
 
   if (FlatUsed) {
     MaxSGPR += 2;
     // 2 additional for VI+.
-    if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
-      MaxSGPR += 2;
+    if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+      // Assume XNACK_MASK is not used
+      //MaxSGPR += 2;
+    }
   }
 
   // We found the maximum register index. They start at 0, so add one to get the
Index: lib/Target/AMDGPU/SIFrameLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIFrameLowering.cpp
+++ lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -105,51 +105,53 @@
     MBB.addLiveIn(PreloadedPrivateBufferReg);
   }
 
-  // We reserved the last registers for this. Shift it down to the end of those
-  // which were actually used.
-  //
-  // FIXME: It might be safer to use a pseudoregister before replacement.
-
-  // FIXME: We should be able to eliminate unused input registers. We only
-  // cannot do this for the resources required for scratch access. For now we
-  // skip over user SGPRs and may leave unused holes.
-
-  // We find the resource first because it has an alignment requirement.
-  if (ScratchRsrcReg == TRI->reservedPrivateSegmentBufferReg(MF)) {
-    MachineRegisterInfo &MRI = MF.getRegInfo();
-
-    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4;
-    // Skip the last 2 elements because the last one is reserved for VCC, and
-    // this is the 2nd to last element already.
-    for (MCPhysReg Reg : getAllSGPR128().drop_back(2).slice(NumPreloaded)) {
-      // Pick the first unallocated one. Make sure we don't clobber the other
-      // reserved input we needed.
-      if (!MRI.isPhysRegUsed(Reg)) {
-        assert(MRI.isAllocatable(Reg));
-        MRI.replaceRegWith(ScratchRsrcReg, Reg);
-        ScratchRsrcReg = Reg;
-        MFI->setScratchRSrcReg(ScratchRsrcReg);
-        break;
+  if (!ST.hasSGPRInitBug()) {
+    // We reserved the last registers for this. Shift it down to the end of those
+    // which were actually used.
+    //
+    // FIXME: It might be safer to use a pseudoregister before replacement.
+
+    // FIXME: We should be able to eliminate unused input registers. We only
+    // cannot do this for the resources required for scratch access. For now we
+    // skip over user SGPRs and may leave unused holes.
+
+    // We find the resource first because it has an alignment requirement.
+    if (ScratchRsrcReg == TRI->reservedPrivateSegmentBufferReg(MF)) {
+      MachineRegisterInfo &MRI = MF.getRegInfo();
+
+      unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4;
+      // Skip the last 2 elements because the last one is reserved for VCC, and
+      // this is the 2nd to last element already.
+      for (MCPhysReg Reg : getAllSGPR128().drop_back(2).slice(NumPreloaded)) {
+        // Pick the first unallocated one. Make sure we don't clobber the other
+        // reserved input we needed.
+        if (!MRI.isPhysRegUsed(Reg)) {
+          assert(MRI.isAllocatable(Reg));
+          MRI.replaceRegWith(ScratchRsrcReg, Reg);
+          ScratchRsrcReg = Reg;
+          MFI->setScratchRSrcReg(ScratchRsrcReg);
+          break;
+        }
       }
     }
-  }
 
-  if (ScratchWaveOffsetReg == TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) {
-    MachineRegisterInfo &MRI = MF.getRegInfo();
-    // Skip the last 2 elements because the last one is reserved for VCC, and
-    // this is the 2nd to last element already.
-    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
-    for (MCPhysReg Reg : getAllSGPRs().drop_back(6).slice(NumPreloaded)) {
-      // Pick the first unallocated SGPR. Be careful not to pick an alias of the
-      // scratch descriptor, since we haven't added its uses yet.
-      if (!MRI.isPhysRegUsed(Reg)) {
-        assert(MRI.isAllocatable(Reg) &&
-               !TRI->isSubRegisterEq(ScratchRsrcReg, Reg));
-
-        MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
-        ScratchWaveOffsetReg = Reg;
-        MFI->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
-        break;
+    if (ScratchWaveOffsetReg == TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) {
+      MachineRegisterInfo &MRI = MF.getRegInfo();
+      // Skip the last 2 elements because the last one is reserved for VCC, and
+      // this is the 2nd to last element already.
+      unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
+      for (MCPhysReg Reg : getAllSGPRs().drop_back(6).slice(NumPreloaded)) {
+        // Pick the first unallocated SGPR. Be careful not to pick an alias of the
+        // scratch descriptor, since we haven't added its uses yet.
+        if (!MRI.isPhysRegUsed(Reg)) {
+          assert(MRI.isAllocatable(Reg) &&
+                 !TRI->isSubRegisterEq(ScratchRsrcReg, Reg));
+
+          MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
+          ScratchWaveOffsetReg = Reg;
+          MFI->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
+          break;
+        }
      }
    }
  }
Index: test/CodeGen/AMDGPU/spill-stress.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/spill-stress.ll
@@ -0,0 +1,40 @@
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=TONGA %s
+
+; On Tonga and Iceland, limited SGPR availability means care must be taken to
+; allocate scratch registers correctly.
+; TONGA-LABEL: test
+define void @test(<60 x i32> addrspace(1)* %out, i32 addrspace(1)* %foil, <60 x i32> addrspace(1)* %in, <60 x i32> %f) {
+entry:
+  %tid = call i32 @llvm.SI.tid() nounwind readnone
+  %aptr = getelementptr <60 x i32>, <60 x i32> addrspace(1)* %in, i32 %tid
+  %a = load <60 x i32>, <60 x i32> addrspace(1)* %aptr
+  store i32 %tid, i32 addrspace(1)* %foil
+  %bidx = add i32 %tid, 60
+  %bptr = getelementptr <60 x i32>, <60 x i32> addrspace(1)* %in, i32 %bidx
+  %b = load <60 x i32>, <60 x i32> addrspace(1)* %bptr
+  %bfoil = getelementptr i32, i32 addrspace(1)* %foil, i32 1
+  store i32 %bidx, i32 addrspace(1)* %bfoil
+  %cidx = add i32 %tid, 128
+  %cptr = getelementptr <60 x i32>, <60 x i32> addrspace(1)* %in, i32 %cidx
+  %c = load <60 x i32>, <60 x i32> addrspace(1)* %cptr
+  %cfoil = getelementptr i32, i32 addrspace(1)* %bfoil, i32 1
+  store i32 %cidx, i32 addrspace(1)* %cfoil
+  %didx = add i32 %tid, 196
+  %dptr = getelementptr <60 x i32>, <60 x i32> addrspace(1)* %in, i32 %didx
+  %d = load <60 x i32>, <60 x i32> addrspace(1)* %dptr
+  %dfoil = getelementptr i32, i32 addrspace(1)* %cfoil, i32 1
+  store i32 %didx, i32 addrspace(1)* %dfoil
+  %eidx = add i32 %tid, 256
+  %eptr = getelementptr <60 x i32>, <60 x i32> addrspace(1)* %in, i32 %eidx
+  %e = load <60 x i32>, <60 x i32> addrspace(1)* %eptr
+  %am = mul <60 x i32> %a, %f
+  %s1 = add <60 x i32> %b, %am
+  %s2 = add <60 x i32> %s1, %c
+  %s3 = add <60 x i32> %s2, %d
+  %s = add <60 x i32> %e, %s3
+  %outptr = getelementptr <60 x i32>, <60 x i32> addrspace(1)* %out, i32 %tid
+  store <60 x i32> %s, <60 x i32> addrspace(1)* %outptr
+  ret void
+}
+
+declare i32 @llvm.SI.tid() nounwind readnone
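
Note for reviewers: the special-register accounting in the AMDGPUAsmPrinter.cpp hunk reduces to the sketch below. This is an illustrative summary only, not code from the patch; the function name and parameters are hypothetical stand-ins for the VCCUsed/FlatUsed flags and the generation check used above.

  // Sketch (assumed semantics, illustrative names): how many SGPRs the
  // special registers consume after this change.
  static unsigned countSpecialSGPRs(bool VCCUsed, bool FlatUsed,
                                    bool IsVIOrLater) {
    unsigned Count = 0;
    if (VCCUsed)
      Count += 2;  // VCC_LO/VCC_HI occupy an aligned SGPR pair.
    if (FlatUsed) {
      Count += 2;  // FLAT_SCR_LO/FLAT_SCR_HI.
      if (IsVIOrLater) {
        // XNACK_MASK would take another pair on VI+, but this patch
        // assumes it is unused (hence the commented-out "MaxSGPR += 2").
      }
    }
    return Count;
  }

The new test keeps five <60 x i32> values (%a through %e) live at once, on the order of 300 32-bit values, which should exceed the available registers and force spilling; the scalar stores through %foil appear intended to keep the index values live so the loads are not merged or reordered.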