Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -587,17 +587,18 @@
   if (RI.isSGPRClass(RC)) {
     MFI->setHasSpilledSGPRs();
 
+    // We are only allowed to create one new instruction when spilling
+    // registers, so we need to use a pseudo instruction for spilling SGPRs.
+    const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(RC->getSize()));
+
+    // The SGPR spill/restore instructions only work on numbered SGPRs, so we
+    // need to make sure we are using the correct register class.
     if (TargetRegisterInfo::isVirtualRegister(SrcReg) && RC->getSize() == 4) {
-      // m0 may not be allowed for readlane.
       MachineRegisterInfo &MRI = MF->getRegInfo();
       MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
     }
 
-    // We are only allowed to create one new instruction when spilling
-    // registers, so we need to use pseudo instruction for spilling
-    // SGPRs.
-    unsigned Opcode = getSGPRSpillSaveOpcode(RC->getSize());
-    BuildMI(MBB, MI, DL, get(Opcode))
+    BuildMI(MBB, MI, DL, OpDesc)
       .addReg(SrcReg, getKillRegState(isKill)) // src
       .addFrameIndex(FrameIndex) // frame_idx
       .addMemOperand(MMO);
@@ -621,10 +622,10 @@
     MFI->setHasSpilledVGPRs();
     BuildMI(MBB, MI, DL, get(Opcode))
       .addReg(SrcReg, getKillRegState(isKill)) // src
-      .addFrameIndex(FrameIndex) // frame_idx
-      .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
-      .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
-      .addImm(0) // offset
+      .addFrameIndex(FrameIndex)               // frame_idx
+      .addReg(MFI->getScratchRSrcReg())        // scratch_rsrc
+      .addReg(MFI->getScratchWaveOffsetReg())  // scratch_offset
+      .addImm(0)                               // offset
       .addMemOperand(MMO);
   }
 
@@ -685,15 +686,13 @@
   if (RI.isSGPRClass(RC)) {
     // FIXME: Maybe this should not include a memoperand because it will be
     // lowered to non-memory instructions.
-    unsigned Opcode = getSGPRSpillRestoreOpcode(RC->getSize());
-
+    const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(RC->getSize()));
     if (TargetRegisterInfo::isVirtualRegister(DestReg) && RC->getSize() == 4) {
-      // m0 may not be allowed for readlane.
       MachineRegisterInfo &MRI = MF->getRegInfo();
       MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
     }
 
-    BuildMI(MBB, MI, DL, get(Opcode), DestReg)
+    BuildMI(MBB, MI, DL, OpDesc, DestReg)
       .addFrameIndex(FrameIndex) // frame_idx
       .addMemOperand(MMO);
 
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -1452,10 +1452,11 @@
   } // End UseNamedOperandTable = 1
 }
 
-// It's unclear whether you can use M0 as the output of v_readlane_b32
-// instructions, so use the SReg_32_XM0 register class for spills to prevent
-// this from happening.
-defm SI_SPILL_S32 : SI_SPILL_SGPR <SReg_32_XM0>;
+// You cannot use M0 as the output of v_readlane_b32 instructions or
+// use it in the sdata operand of SMEM instructions. We still need to
+// be able to spill the physical register m0, so allow it for the
+// SI_SPILL_S32_* instructions.
+defm SI_SPILL_S32 : SI_SPILL_SGPR <SReg_32>;
 defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>;
 defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
 defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
Index: lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -540,9 +540,9 @@
   case AMDGPU::SI_SPILL_S32_SAVE: {
     unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
     unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
     unsigned SuperReg = MI->getOperand(0).getReg();
     bool IsKill = MI->getOperand(0).isKill();
+    // SubReg carries the "Kill" flag when SubReg == SuperReg.
     unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
 
     for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
@@ -551,8 +551,19 @@
       struct SIMachineFunctionInfo::SpilledReg Spill =
         MFI->getSpilledReg(MF, Index, i);
 
-      if (Spill.hasReg()) {
+      if (SuperReg == AMDGPU::M0) {
+        assert(NumSubRegs == 1);
+        unsigned CopyM0
+          = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), CopyM0)
+          .addReg(SuperReg, getKillRegState(IsKill));
+
+        // The real spill now kills the temp copy.
+        SubReg = SuperReg = CopyM0;
+        IsKill = true;
+      }
+
       BuildMI(*MBB, MI, DL,
               TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
               Spill.VGPR)
@@ -611,6 +622,14 @@
     unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     unsigned SuperReg = MI->getOperand(0).getReg();
 
+    // m0 is not allowed with readlane/writelane, so a temporary SGPR and an
+    // extra copy are needed.
+    bool IsM0 = (SuperReg == AMDGPU::M0);
+    if (IsM0) {
+      assert(NumSubRegs == 1);
+      SuperReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+    }
+
     for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
       unsigned SubReg = NumSubRegs == 1 ?
         SuperReg : getSubReg(SuperReg, getSubRegFromChannel(i));
@@ -651,6 +670,11 @@
       }
     }
 
+    if (IsM0 && SuperReg != AMDGPU::M0) {
+      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+        .addReg(SuperReg);
+    }
+
     MI->eraseFromParent();
     break;
   }
Index: test/CodeGen/AMDGPU/m0-spill.ll
===================================================================
--- test/CodeGen/AMDGPU/m0-spill.ll
+++ /dev/null
@@ -1,35 +0,0 @@
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
-
-@lds = external addrspace(3) global [64 x float]
-
-; CHECK-LABEL: {{^}}main:
-; CHECK-NOT: v_readlane_b32 m0
-define amdgpu_ps void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) {
-main_body:
-  %4 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3)
-  %cmp = fcmp ueq float 0.0, %4
-  br i1 %cmp, label %if, label %else
-
-if:
-  %lds_ptr = getelementptr [64 x float], [64 x float] addrspace(3)* @lds, i32 0, i32 0
-  %lds_data = load float, float addrspace(3)* %lds_ptr
-  br label %endif
-
-else:
-  %interp = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3)
-  br label %endif
-
-endif:
-  %export = phi float [%lds_data, %if], [%interp, %else]
-  %5 = call i32 @llvm.SI.packf16(float %export, float %export)
-  %6 = bitcast i32 %5 to float
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %6, float %6, float %6, float %6)
-  ret void
-}
-
-declare float @llvm.SI.fs.constant(i32, i32, i32) readnone
-
-declare i32 @llvm.SI.packf16(float, float) readnone
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
Index: test/CodeGen/AMDGPU/spill-m0.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/spill-m0.ll
@@ -0,0 +1,78 @@
+; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVGPR -check-prefix=GCN %s
+; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVGPR -check-prefix=GCN %s
+; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVMEM -check-prefix=GCN %s
+; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -march=amdgcn -mattr=+vgpr-spilling -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=TOVMEM -check-prefix=GCN %s
+
+; XXX - Why does it like to use vcc?
+
+; GCN-LABEL: {{^}}spill_m0:
+; GCN: s_cmp_lg_i32
+
+; TOVGPR: s_mov_b32 vcc_hi, m0
+; TOVGPR: v_writelane_b32 [[SPILL_VREG:v[0-9]+]], vcc_hi, 0
+
+; TOVMEM: v_mov_b32_e32 [[SPILL_VREG:v[0-9]+]], m0
+; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; 4-byte Folded Spill
+; TOVMEM: s_waitcnt vmcnt(0)
+; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]]
+
+; GCN: [[ENDIF]]:
+; TOVGPR: v_readlane_b32 vcc_hi, [[SPILL_VREG]], 0
+; TOVGPR: s_mov_b32 m0, vcc_hi
+
+; TOVMEM: buffer_load_dword [[RELOAD_VREG:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; 4-byte Folded Reload
+; TOVMEM: s_waitcnt vmcnt(0)
+; TOVMEM: v_readfirstlane_b32 vcc_hi, [[RELOAD_VREG]]
+; TOVMEM: s_mov_b32 m0, vcc_hi
+
+; GCN: s_add_i32 m0, m0, 1
+define void @spill_m0(i32 %cond, i32 addrspace(1)* %out) #0 {
+entry:
+  %m0 = call i32 asm sideeffect "s_mov_b32 m0, 0", "={M0}"() #0
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %endif
+
+if:
+  call void asm sideeffect "v_nop", ""() #0
+  br label %endif
+
+endif:
+  %foo = call i32 asm sideeffect "s_add_i32 $0, $1, 1", "=s,{M0}"(i32 %m0) #0
+  store i32 %foo, i32 addrspace(1)* %out
+  ret void
+}
+
+@lds = internal addrspace(3) global [64 x float] undef
+
+; GCN-LABEL: {{^}}spill_m0_lds:
+; GCN-NOT: v_readlane_b32 m0
+define amdgpu_ps void @spill_m0_lds(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) #0 {
+main_body:
+  %4 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3)
+  %cmp = fcmp ueq float 0.0, %4
+  br i1 %cmp, label %if, label %else
+
+if:
+  %lds_ptr = getelementptr [64 x float], [64 x float] addrspace(3)* @lds, i32 0, i32 0
+  %lds_data = load float, float addrspace(3)* %lds_ptr
+  br label %endif
+
+else:
+  %interp = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3)
+  br label %endif
+
+endif:
+  %export = phi float [%lds_data, %if], [%interp, %else]
+  %5 = call i32 @llvm.SI.packf16(float %export, float %export)
+  %6 = bitcast i32 %5 to float
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %6, float %6, float %6, float %6)
+  ret void
+}
+
+declare float @llvm.SI.fs.constant(i32, i32, i32) readnone
+
+declare i32 @llvm.SI.packf16(float, float) readnone
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { nounwind }
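
For reference, a minimal sketch of the save/restore sequence this lowering
is expected to emit for an m0 spill when SGPR-to-VGPR spilling is enabled.
The register names are illustrative stand-ins (the test above happens to
get vcc_hi for the intermediate SGPR; s4 and v0 here are arbitrary):

  s_mov_b32 s4, m0            ; copy m0 into a spillable SGPR (s4 is arbitrary)
  v_writelane_b32 v0, s4, 0   ; save: write the copy into lane 0 of the spill VGPR
  ...
  v_readlane_b32 s4, v0, 0    ; restore: read the lane back into the temp SGPR
  s_mov_b32 m0, s4            ; copy the reloaded value back into m0

The extra s_mov_b32 copies are the whole point of the patch: readlane and
writelane cannot use m0 directly, so the spill pseudos go through a
virtual SReg_32_XM0 register on both the save and restore paths.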