Index: lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -499,6 +499,17 @@
 
   assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
 
+  unsigned OffsetReg = AMDGPU::M0;
+  unsigned M0CopyReg = AMDGPU::NoRegister;
+
+  if (SpillToSMEM) {
+    if (RS->isRegUsed(AMDGPU::M0)) {
+      M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
+        .addReg(AMDGPU::M0);
+    }
+  }
+
   // SubReg carries the "Kill" flag when SubReg == SuperReg.
   unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
   for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
@@ -515,7 +526,6 @@
         = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                                    Size, Align);
 
-      unsigned OffsetReg = AMDGPU::M0;
       // Add i * 4 wave offset.
       //
       // SMEM instructions only support a single offset, so increment the wave
@@ -534,7 +544,7 @@
       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BUFFER_STORE_DWORD_SGPR))
         .addReg(SubReg, getKillRegState(IsKill)) // sdata
         .addReg(MFI->getScratchRSrcReg())        // sbase
-        .addReg(OffsetReg)                       // soff
+        .addReg(OffsetReg, RegState::Kill)       // soff
         .addImm(0)                               // glc
         .addMemOperand(MMO);
 
@@ -591,6 +601,11 @@
     }
   }
 
+  if (M0CopyReg != AMDGPU::NoRegister) {
+    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
+      .addReg(M0CopyReg, RegState::Kill);
+  }
+
   MI->eraseFromParent();
   MFI->addToSpilledSGPRs(NumSubRegs);
 }
@@ -613,6 +628,18 @@
 
   assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
 
+  unsigned OffsetReg = AMDGPU::M0;
+  unsigned M0CopyReg = AMDGPU::NoRegister;
+
+  if (SpillToSMEM) {
+    if (RS->isRegUsed(AMDGPU::M0)) {
+      M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
+        .addReg(AMDGPU::M0);
+    }
+  }
+
+  // SubReg carries the "Kill" flag when SubReg == SuperReg.
   int64_t FrOffset = FrameInfo.getObjectOffset(Index);
 
   for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
@@ -628,8 +655,6 @@
         = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
                                    Size, Align);
 
-      unsigned OffsetReg = AMDGPU::M0;
-
       // Add i * 4 offset
       int64_t Offset = ST.getWavefrontSize() * (FrOffset + 4 * i);
       if (Offset != 0) {
@@ -642,9 +667,9 @@
       }
 
       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BUFFER_LOAD_DWORD_SGPR), SubReg)
-        .addReg(MFI->getScratchRSrcReg()) // sbase
-        .addReg(OffsetReg)                // soff
-        .addImm(0)                        // glc
+        .addReg(MFI->getScratchRSrcReg())  // sbase
+        .addReg(OffsetReg, RegState::Kill) // soff
+        .addImm(0)                         // glc
         .addMemOperand(MMO)
         .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
 
@@ -688,6 +713,11 @@
     }
   }
 
+  if (M0CopyReg != AMDGPU::NoRegister) {
+    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
+      .addReg(M0CopyReg, RegState::Kill);
+  }
+
   MI->eraseFromParent();
 }
 
Index: test/CodeGen/AMDGPU/spill-m0.ll
===================================================================
--- test/CodeGen/AMDGPU/spill-m0.ll
+++ test/CodeGen/AMDGPU/spill-m0.ll
@@ -61,6 +61,30 @@
 @lds = internal addrspace(3) global [64 x float] undef
 
 ; GCN-LABEL: {{^}}spill_m0_lds:
+; GCN: s_mov_b32 m0, s6
+; GCN: v_interp_mov_f32
+
+; TOSMEM: s_mov_b32 vcc_hi, m0
+; TOSMEM: s_mov_b32 m0, s7
+; TOSMEM-NEXT: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill
+; TOSMEM: s_mov_b32 m0, vcc_hi
+
+; TOSMEM: s_mov_b32 vcc_hi, m0
+; TOSMEM: s_add_u32 m0, s7, 0x100
+; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill
+; TOSMEM: s_add_u32 m0, s7, 0x200
+; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill
+; TOSMEM: s_mov_b32 m0, vcc_hi
+
+; TOSMEM: s_mov_b64 exec,
+; TOSMEM: s_cbranch_execz
+; TOSMEM: s_branch
+
+; TOSMEM: BB{{[0-9]+_[0-9]+}}:
+; TOSMEM-NEXT: s_add_u32 m0, s7, 0x100
+; TOSMEM-NEXT: s_buffer_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload
+
+
 ; GCN-NOT: v_readlane_b32 m0
 ; GCN-NOT: s_buffer_store_dword m0
 ; GCN-NOT: s_buffer_load_dword m0
@@ -87,6 +111,52 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}restore_m0_lds:
+; TOSMEM: s_cmp_eq_u32
+; TOSMEM: s_mov_b32 vcc_hi, m0
+; TOSMEM: s_mov_b32 m0, s3
+; TOSMEM: s_buffer_store_dword s4, s[84:87], m0 ; 4-byte Folded Spill
+; TOSMEM: s_mov_b32 m0, vcc_hi
+; TOSMEM: s_cbranch_scc1
+
+; TOSMEM: s_mov_b32 m0, -1
+
+; TOSMEM: s_mov_b32 vcc_hi, m0
+; TOSMEM: s_mov_b32 m0, s3
+; TOSMEM: s_buffer_load_dword s4, s[84:87], m0 ; 8-byte Folded Reload
+; TOSMEM: s_add_u32 m0, s3, 0x100
+; TOSMEM: s_waitcnt lgkmcnt(0)
+; TOSMEM: s_buffer_load_dword s5, s[84:87], m0 ; 8-byte Folded Reload
+; TOSMEM: s_mov_b32 m0, vcc_hi
+; TOSMEM: s_waitcnt lgkmcnt(0)
+
+; TOSMEM: ds_write_b64
+
+; TOSMEM: s_mov_b32 vcc_hi, m0
+; TOSMEM: s_add_u32 m0, s3, 0x200
+; TOSMEM: s_buffer_load_dword s0, s[84:87], m0 ; 4-byte Folded Reload
+; TOSMEM: s_mov_b32 m0, vcc_hi
+; TOSMEM: s_waitcnt lgkmcnt(0)
+; TOSMEM: s_mov_b32 m0, s0
+; TOSMEM: ; use m0
+
+; TOSMEM: s_dcache_wb
+; TOSMEM: s_endpgm
+define void @restore_m0_lds(i32 %arg) {
+  %m0 = call i32 asm sideeffect "s_mov_b32 m0, 0", "={M0}"() #0
+  %sval = load volatile i64, i64 addrspace(2)* undef
+  %cmp = icmp eq i32 %arg, 0
+  br i1 %cmp, label %ret, label %bb
+
+bb:
+  store volatile i64 %sval, i64 addrspace(3)* undef
+  call void asm sideeffect "; use $0", "{M0}"(i32 %m0) #0
+  br label %ret
+
+ret:
+  ret void
+}
+
 declare float @llvm.SI.fs.constant(i32, i32, i32) readnone
 
 declare i32 @llvm.SI.packf16(float, float) readnone
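
Note: the core of the SIRegisterInfo.cpp change above is a save/use/restore idiom.
SGPR spills to SMEM use m0 as the scratch offset register (soff), so if m0 holds a
live value at the spill point, it is first copied out to a virtual SGPR and copied
back afterwards. The copy is allocated from SReg_32_XM0, which excludes m0 itself,
so it can never be assigned back to the register being preserved; in the tests above
it happens to land in vcc_hi. The following is a minimal sketch of that idiom pulled
out into a standalone helper, assuming only the LLVM MI APIs the diff already uses
(BuildMI, RegScavenger::isRegUsed, MachineRegisterInfo::createVirtualRegister); the
helper name emitWithM0Preserved and the function_ref callback shape are hypothetical,
not part of the patch, and the snippet only compiles inside an AMDGPU backend build:

// Hypothetical helper illustrating the m0 save/use/restore idiom from the diff.
#include "SIInstrInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"

using namespace llvm;

static void emitWithM0Preserved(MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator MI,
                                const DebugLoc &DL, const SIInstrInfo *TII,
                                MachineRegisterInfo &MRI, RegScavenger *RS,
                                function_ref<void()> EmitM0ClobberingCode) {
  // Only save m0 if it actually holds a live value here; the register
  // scavenger tracks liveness at the current insertion point.
  unsigned M0CopyReg = AMDGPU::NoRegister;
  if (RS->isRegUsed(AMDGPU::M0)) {
    // SReg_32_XM0 excludes m0, so the saved copy cannot be allocated back
    // into the register we are trying to preserve.
    M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
      .addReg(AMDGPU::M0);
  }

  // Emit the spill/reload sequence that writes m0 (the SMEM soffset).
  EmitM0ClobberingCode();

  // Restore the original value, killing the temporary copy.
  if (M0CopyReg != AMDGPU::NoRegister)
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0CopyReg, RegState::Kill);
}

In the patch itself the copy-out and copy-back are emitted inline in the spill and
restore paths rather than through a helper, and the soff operand is additionally
marked RegState::Kill so the last use of m0 in each spill sequence ends its live
range there.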