Index: lib/Target/AMDGPU/SIFrameLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIFrameLowering.cpp
+++ lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -21,12 +21,6 @@
 using namespace llvm;
 
-static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo,
-                              const MachineFrameInfo &MFI) {
-  return FuncInfo->hasSpilledSGPRs() &&
-    (!FuncInfo->hasSpilledVGPRs() && !FuncInfo->hasNonSpillStackObjects());
-}
-
 static ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF,
                                          const SIRegisterInfo *TRI) {
   return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
@@ -75,7 +69,6 @@
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
 
-  // Add wave offset in bytes to private base offset.
   // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
@@ -97,7 +90,8 @@
   // We need to insert initialization of the scratch resource descriptor.
   unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
-  assert(ScratchRsrcReg != AMDGPU::NoRegister);
+  if (ScratchRsrcReg == AMDGPU::NoRegister)
+    return AMDGPU::NoRegister;
 
   if (ST.hasSGPRInitBug() ||
       ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
@@ -116,14 +110,17 @@
   MachineRegisterInfo &MRI = MF.getRegInfo();
 
-  unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4;
+  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
+  ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(MF, TRI);
+  AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));
+
   // Skip the last 2 elements because the last one is reserved for VCC, and
   // this is the 2nd to last element already.
-  for (MCPhysReg Reg : getAllSGPR128(MF, TRI).drop_back(2).slice(NumPreloaded)) {
+  for (MCPhysReg Reg : AllSGPR128s) {
     // Pick the first unallocated one. Make sure we don't clobber the other
     // reserved input we needed.
-    if (!MRI.isPhysRegUsed(Reg)) {
-      assert(MRI.isAllocatable(Reg));
+    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
+      //assert(MRI.isAllocatable(Reg));
       MRI.replaceRegWith(ScratchRsrcReg, Reg);
       MFI->setScratchRSrcReg(Reg);
       return Reg;
@@ -146,8 +143,15 @@
   unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
   MachineRegisterInfo &MRI = MF.getRegInfo();
 
+  unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
+  ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(MF, TRI);
+  if (NumPreloaded > AllSGPRs.size())
+    return ScratchWaveOffsetReg;
+
+  AllSGPRs = AllSGPRs.slice(NumPreloaded);
+
   // We need to drop register from the end of the list that we cannot use
   // for the scratch wave offset.
   // + 2 s102 and s103 do not exist on VI.
@@ -161,7 +165,10 @@
   // are no other free SGPRs, then the value will stay in this register.
   // ----
   //  13
-  for (MCPhysReg Reg : getAllSGPRs(MF, TRI).drop_back(13).slice(NumPreloaded)) {
+  if (AllSGPRs.size() < 13)
+    return ScratchWaveOffsetReg;
+
+  for (MCPhysReg Reg : AllSGPRs.drop_back(13)) {
     // Pick the first unallocated SGPR. Be careful not to pick an alias of the
     // scratch descriptor, since we haven't added its uses yet.
     if (!MRI.isPhysRegUsed(Reg)) {
@@ -186,9 +193,6 @@
   if (ST.debuggerEmitPrologue())
     emitDebuggerPrologue(MF, MBB);
 
-  if (!MF.getFrameInfo().hasStackObjects())
-    return;
-
   assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
 
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -198,8 +202,6 @@
   //
   // FIXME: We should be cleaning up these unused SGPR spill frame indices
   // somewhere.
-  if (hasOnlySGPRSpills(MFI, MF.getFrameInfo()))
-    return;
 
   const SIInstrInfo *TII = ST.getInstrInfo();
   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
@@ -209,38 +211,51 @@
     = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF);
   unsigned ScratchWaveOffsetReg
     = getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);
-  assert(ScratchRsrcReg != AMDGPU::NoRegister);
-  assert(ScratchWaveOffsetReg != AMDGPU::NoRegister);
+
+  if (ScratchRsrcReg == AMDGPU::NoRegister) {
+    assert(ScratchWaveOffsetReg == AMDGPU::NoRegister);
+    return;
+  }
+
   assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg));
 
-  if (MFI->hasFlatScratchInit())
+  // We need to do the replacement of the private segment buffer and wave offset
+  // register even if there are no stack objects. There could be stores to undef
+  // or a constant without an associated object.
+
+  // FIXME: We still have implicit uses on SGPR spill instructions in case they
+  // need to spill to vector memory. It's likely that will not happen, but at
+  // this point it appears we need the setup. This part of the prolog should be
+  // emitted after frame indices are eliminated.
+
+  if (MF.getFrameInfo().hasStackObjects() && MFI->hasFlatScratchInit())
     emitFlatScratchInit(TII, TRI, MF, MBB);
 
   // We need to insert initialization of the scratch resource descriptor.
   unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue(
     MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+
   unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
   if (ST.isAmdCodeObjectV2()) {
     PreloadedPrivateBufferReg = TRI->getPreloadedValue(
       MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
   }
 
-  // If we reserved the original input registers, we don't need to copy to the
-  // reserved registers.
-  if (ScratchRsrcReg == PreloadedPrivateBufferReg) {
-    // We should always reserve these 5 registers at the same time.
-    assert(ScratchWaveOffsetReg == PreloadedScratchWaveOffsetReg &&
-           "scratch wave offset and private segment buffer inconsistent");
-    return;
-  }
+  bool OffsetRegUsed = !MRI.use_empty(ScratchWaveOffsetReg);
+  bool ResourceRegUsed = !MRI.use_empty(ScratchRsrcReg);
 
   // We added live-ins during argument lowering, but since they were not used
   // they were deleted. We're adding the uses now, so add them back.
-  MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
-  MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
+  if (OffsetRegUsed) {
+    assert(PreloadedScratchWaveOffsetReg != AMDGPU::NoRegister &&
+           "scratch wave offset input is required");
+    MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
+    MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
+  }
 
-  if (ST.isAmdCodeObjectV2()) {
+  if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
+    assert(ST.isAmdCodeObjectV2());
     MRI.addLiveIn(PreloadedPrivateBufferReg);
     MBB.addLiveIn(PreloadedPrivateBufferReg);
   }
@@ -250,30 +265,46 @@
     if (&OtherBB == &MBB)
       continue;
 
-    OtherBB.addLiveIn(ScratchRsrcReg);
-    OtherBB.addLiveIn(ScratchWaveOffsetReg);
+    if (OffsetRegUsed)
+      OtherBB.addLiveIn(ScratchWaveOffsetReg);
+
+    if (ResourceRegUsed)
+      OtherBB.addLiveIn(ScratchRsrcReg);
   }
 
   DebugLoc DL;
   MachineBasicBlock::iterator I = MBB.begin();
 
-  if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
-    // Make sure we emit the copy for the offset first. We may have chosen to
-    // copy the buffer resource into a register that aliases the input offset
-    // register.
+  // If we reserved the original input registers, we don't need to copy to the
+  // reserved registers.
+
+  bool CopyBuffer = ResourceRegUsed &&
+    PreloadedPrivateBufferReg != AMDGPU::NoRegister &&
+    ScratchRsrcReg != PreloadedPrivateBufferReg;
+
+  // This needs to be careful of the copying order to avoid overwriting one of
+  // the input registers before it's been copied to its final
+  // destination. Usually the offset should be copied first.
+  bool CopyBufferFirst = TRI->isSubRegisterEq(PreloadedPrivateBufferReg,
+                                              ScratchWaveOffsetReg);
+  if (CopyBuffer && CopyBufferFirst) {
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
+      .addReg(PreloadedPrivateBufferReg, RegState::Kill);
+  }
+
+  if (OffsetRegUsed &&
+      PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
     BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
       .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
   }
 
-  if (ST.isAmdCodeObjectV2()) {
-    // Insert copies from argument register.
-    assert(
-      !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchRsrcReg) &&
-      !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchWaveOffsetReg));
-
+  if (CopyBuffer && !CopyBufferFirst) {
     BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
       .addReg(PreloadedPrivateBufferReg, RegState::Kill);
-  } else {
+  }
+
+  if (ResourceRegUsed && PreloadedPrivateBufferReg == AMDGPU::NoRegister) {
+    assert(!ST.isAmdCodeObjectV2());
     const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
 
     unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -614,7 +614,12 @@
     BuildMI(MBB, MI, DL, OpDesc)
       .addReg(SrcReg, getKillRegState(isKill)) // data
       .addFrameIndex(FrameIndex)               // addr
-      .addMemOperand(MMO);
+      .addMemOperand(MMO)
+      .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
+      .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit);
+    // Add the scratch resource registers as implicit uses because we may end up
+    // needing them, and need to ensure that the reserved registers are
+    // correctly handled.
     return;
   }
@@ -707,7 +712,9 @@
     BuildMI(MBB, MI, DL, OpDesc, DestReg)
       .addFrameIndex(FrameIndex) // addr
-      .addMemOperand(MMO);
+      .addMemOperand(MMO)
+      .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
+      .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit);
     return;
   }
Index: lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -973,9 +973,20 @@
       F, "amdgpu-num-sgpr", MaxNumSGPRs);
 
   // Make sure requested value does not violate subtarget's specifications.
-  if (Requested && Requested <= getNumReservedSGPRs(ST))
+  if (Requested && (Requested <= getNumReservedSGPRs(ST)))
     Requested = 0;
 
+  // If more SGPRs are required to support the input user/system SGPRs,
+  // increase to accommodate them.
+  //
+  // FIXME: This really ends up using the requested number of SGPRs + number
+  // of reserved special registers in total. Theoretically you could re-use
+  // the last input registers for these special registers, but this would
+  // require a lot of complexity to deal with the weird aliasing.
+  unsigned NumInputSGPRs = MFI.getNumPreloadedSGPRs();
+  if (Requested && Requested < NumInputSGPRs)
+    Requested = NumInputSGPRs;
+
   // Make sure requested value is compatible with values implied by
   // default/requested minimum/maximum number of waves per execution unit.
   if (Requested && Requested > getMaxNumSGPRs(ST, WavesPerEU.first))
Index: test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
===================================================================
--- test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
+++ test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
@@ -1,9 +1,16 @@
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
 
-; CHECK-LABEL: {{^}}max_18_sgprs:
+; CHECK-LABEL: {{^}}max_14_sgprs:
+
+; FIXME: Should be able to skip this copying of the private segment
+; buffer because all the SGPR spills are to VGPRs.
+
+; CHECK: s_mov_b64 s[6:7], s[2:3]
+; CHECK: s_mov_b64 s[4:5], s[0:1]
+
 ; CHECK: SGPRBlocks: 1
-; CHECK: NumSGPRsForWavesPerEU: 13
-define void @max_18_sgprs(i32 addrspace(1)* %out1,
+; CHECK: NumSGPRsForWavesPerEU: 14
+define void @max_14_sgprs(i32 addrspace(1)* %out1,
                           i32 addrspace(1)* %out2,
                           i32 addrspace(1)* %out3,
                           i32 addrspace(1)* %out4,
@@ -14,4 +21,102 @@
   store i32 %four, i32 addrspace(1)* %out4
   ret void
 }
-attributes #0 = {"amdgpu-num-sgpr"="18"}
+
+; private resource: 4
+; scratch wave offset: 1
+; workgroup ids: 3
+; dispatch id: 2
+; queue ptr: 2
+; flat scratch init: 2
+; ---------------------
+; total: 14
+
+; + reserved vcc, flat_scratch = 18
+
+; Because we can't handle re-using the last few input registers as the
+; special vcc etc. registers (as well as decide to not use the unused
+; features when the number of registers is frozen), this ends up using
+; more than expected.
+
+; ALL-LABEL: {{^}}max_12_sgprs_14_input_sgprs:
+; TOSGPR: SGPRBlocks: 2
+; TOSGPR: NumSGPRsForWavesPerEU: 18
+
+; TOSMEM: s_mov_b64 s[6:7], s[2:3]
+; TOSMEM: s_mov_b32 s9, s13
+; TOSMEM: s_mov_b64 s[4:5], s[0:1]
+
+; TOSMEM: SGPRBlocks: 2
+; TOSMEM: NumSGPRsForWavesPerEU: 18
+define void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
+                                         i32 addrspace(1)* %out2,
+                                         i32 addrspace(1)* %out3,
+                                         i32 addrspace(1)* %out4,
+                                         i32 %one, i32 %two, i32 %three, i32 %four) #2 {
+  store volatile i32 0, i32* undef
+  %x.0 = call i32 @llvm.amdgcn.workgroup.id.x()
+  store volatile i32 %x.0, i32 addrspace(1)* undef
+  %x.1 = call i32 @llvm.amdgcn.workgroup.id.y()
+  store volatile i32 %x.0, i32 addrspace(1)* undef
+  %x.2 = call i32 @llvm.amdgcn.workgroup.id.z()
+  store volatile i32 %x.0, i32 addrspace(1)* undef
+  %x.3 = call i64 @llvm.amdgcn.dispatch.id()
+  store volatile i64 %x.3, i64 addrspace(1)* undef
+  %x.4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+  store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef
+  %x.5 = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
+  store volatile i8 addrspace(2)* %x.5, i8 addrspace(2)* addrspace(1)* undef
+
+  store i32 %one, i32 addrspace(1)* %out1
+  store i32 %two, i32 addrspace(1)* %out2
+  store i32 %three, i32 addrspace(1)* %out3
+  store i32 %four, i32 addrspace(1)* %out4
+  ret void
+}
+
+; ALL-LABEL: max_12_sgprs_12_input_sgprs{{$}}
+; ; Make sure copies for input buffer are not clobbered. This requires
+; ; swapping the order the registers are copied from what normally
+; ; happens.
+
+; TOSMEM: s_mov_b64 s[6:7], s[2:3]
+; TOSMEM: s_mov_b64 s[4:5], s[0:1]
+; TOSMEM: s_mov_b32 s3, s11
+
+; ALL: SGPRBlocks: 1
+; ALL: NumSGPRsForWavesPerEU: 16
+define void @max_12_sgprs_12_input_sgprs(i32 addrspace(1)* %out1,
+                                         i32 addrspace(1)* %out2,
+                                         i32 addrspace(1)* %out3,
+                                         i32 addrspace(1)* %out4,
+                                         i32 %one, i32 %two, i32 %three, i32 %four) #2 {
+  store volatile i32 0, i32* undef
+  %x.0 = call i32 @llvm.amdgcn.workgroup.id.x()
+  store volatile i32 %x.0, i32 addrspace(1)* undef
+  %x.1 = call i32 @llvm.amdgcn.workgroup.id.y()
+  store volatile i32 %x.0, i32 addrspace(1)* undef
+  %x.2 = call i32 @llvm.amdgcn.workgroup.id.z()
+  store volatile i32 %x.0, i32 addrspace(1)* undef
+  %x.3 = call i64 @llvm.amdgcn.dispatch.id()
+  store volatile i64 %x.3, i64 addrspace(1)* undef
+  %x.4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+  store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef
+
+  store i32 %one, i32 addrspace(1)* %out1
+  store i32 %two, i32 addrspace(1)* %out2
+  store i32 %three, i32 addrspace(1)* %out3
+  store i32 %four, i32 addrspace(1)* %out4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workgroup.id.x() #1
+declare i32 @llvm.amdgcn.workgroup.id.y() #1
+declare i32 @llvm.amdgcn.workgroup.id.z() #1
+declare i64 @llvm.amdgcn.dispatch.id() #1
+declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #1
+declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #1
+
+attributes #0 = { nounwind "amdgpu-num-sgpr"="14" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind "amdgpu-num-sgpr"="12" }
+attributes #3 = { nounwind "amdgpu-num-sgpr"="11" }
Index: test/CodeGen/AMDGPU/private-access-no-objects.ll
===================================================================
--- test/CodeGen/AMDGPU/private-access-no-objects.ll
+++ test/CodeGen/AMDGPU/private-access-no-objects.ll
@@ -1,6 +1,17 @@
-; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=OPTNONE %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=OPT %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=OPT %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=iceland -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=OPT %s
+; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=OPTNONE %s
+
+; There are no stack objects, but still a private memory access. The
+; private access registers need to be correctly initialized anyway, and
+; shifted down to the end of the used registers.
 
 ; GCN-LABEL: {{^}}store_to_undef:
+; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
+; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
+; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
+; OPT: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
 ; -O0 should assume spilling, so the input scratch resource descriptor
 ; should be used directly without any copies.
@@ -13,18 +24,30 @@
 }
 
 ; GCN-LABEL: {{^}}store_to_inttoptr:
+; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
+; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
+; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
+; OPT: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
 define void @store_to_inttoptr() #0 {
   store volatile i32 0, i32* inttoptr (i32 123 to i32*)
   ret void
 }
 
 ; GCN-LABEL: {{^}}load_from_undef:
+; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
+; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
+; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
+; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
 define void @load_from_undef() #0 {
   %ld = load volatile i32, i32* undef
   ret void
 }
 
 ; GCN-LABEL: {{^}}load_from_inttoptr:
+; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
+; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
+; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
+; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
 define void @load_from_inttoptr() #0 {
   %ld = load volatile i32, i32* inttoptr (i32 123 to i32*)
   ret void
Index: test/CodeGen/AMDGPU/si-sgpr-spill.ll
===================================================================
--- test/CodeGen/AMDGPU/si-sgpr-spill.ll
+++ test/CodeGen/AMDGPU/si-sgpr-spill.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s
 
 ; These tests check that the compiler won't crash when it needs to spill
 ; SGPRs.
Index: test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll
===================================================================
--- test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll
+++ test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=TONGA %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=TONGA %s
 
 ; On Tonga and Iceland, limited SGPR availability means care must be taken to
 ; allocate scratch registers correctly. Check that this test compiles without
Index: test/CodeGen/AMDGPU/spill-m0.ll
===================================================================
--- test/CodeGen/AMDGPU/spill-m0.ll
+++ test/CodeGen/AMDGPU/spill-m0.ll
@@ -6,6 +6,8 @@
 ; XXX - Why does it like to use vcc?
 
 ; GCN-LABEL: {{^}}spill_m0:
+; TOSMEM: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+
 ; GCN: s_cmp_lg_u32
 
 ; TOVGPR: s_mov_b32 vcc_hi, m0
Index: test/CodeGen/AMDGPU/wqm.ll
===================================================================
--- test/CodeGen/AMDGPU/wqm.ll
+++ test/CodeGen/AMDGPU/wqm.ll
@@ -459,7 +459,7 @@
   br i1 %cc, label %if, label %else
 
 if:
-  store volatile <4 x float> %dtex, <4 x float>* undef
+  store volatile <4 x float> %dtex, <4 x float> addrspace(1)* undef
   unreachable
 
 else: