Index: lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -478,6 +478,20 @@
   }
 }
 
+static std::pair<unsigned, unsigned> getSpillEltSize(unsigned SuperRegSize,
+                                                     bool Store) {
+  if (SuperRegSize % 16 == 0)
+    return { 16, Store ? AMDGPU::S_BUFFER_STORE_DWORDX4_SGPR :
+                         AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR };
+
+  if (SuperRegSize % 8 == 0)
+    return { 8, Store ? AMDGPU::S_BUFFER_STORE_DWORDX2_SGPR :
+                        AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR };
+
+  return { 4, Store ? AMDGPU::S_BUFFER_STORE_DWORD_SGPR :
+                      AMDGPU::S_BUFFER_LOAD_DWORD_SGPR};
+}
+
 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
                                int Index,
                                RegScavenger *RS,
@@ -488,7 +502,6 @@
   const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
 
-  unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
   unsigned SuperReg = MI->getOperand(0).getReg();
   bool IsKill = MI->getOperand(0).isKill();
   const DebugLoc &DL = MI->getDebugLoc();
@@ -500,11 +513,30 @@
   if (SpillToSMEM && OnlyToVGPR)
     return false;
 
+  unsigned EltSize = 4;
+  unsigned ScalarStoreOp;
+
+  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
+  if (SpillToSMEM && isSGPRClass(RC)) {
+    // XXX - if private_element_size is larger than 4 it might be useful to be
+    // able to spill wider vmem spills.
+    std::tie(EltSize, ScalarStoreOp) = getSpillEltSize(RC->getSize(), true);
+  }
+
+  const TargetRegisterClass *SubRC = nullptr;
+  unsigned NumSubRegs = 1;
+  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
+
+  if (!SplitParts.empty()) {
+    NumSubRegs = SplitParts.size();
+    SubRC = getSubRegClass(RC, SplitParts[0]);
+  }
+
   // SubReg carries the "Kill" flag when SubReg == SuperReg.
   unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
   for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
     unsigned SubReg = NumSubRegs == 1 ?
-      SuperReg : getSubReg(SuperReg, getSubRegFromChannel(i));
+      SuperReg : getSubReg(SuperReg, SplitParts[i]);
 
     if (SpillToSMEM) {
       if (SuperReg == AMDGPU::M0) {
@@ -521,7 +553,15 @@
       }
 
       int64_t FrOffset = FrameInfo.getObjectOffset(Index);
-      unsigned Size = FrameInfo.getObjectSize(Index);
+
+      // The allocated memory size is really the wavefront size * the frame
+      // index size. The widest register class is 64 bytes, so a 4-byte scratch
+      // allocation is enough to spill this in a single stack object.
+      //
+      // FIXME: Frame size/offsets are computed earlier than this, so the extra
+      // space is still unnecessarily allocated.
+      unsigned Size = 4;
+
       unsigned Align = FrameInfo.getObjectAlignment(Index);
       MachinePointerInfo PtrInfo
         = MachinePointerInfo::getFixedStack(*MF, Index);
@@ -530,12 +570,11 @@
                                                  Size, Align);
 
       unsigned OffsetReg = AMDGPU::M0;
-      // Add i * 4 wave offset.
-      //
+
       // SMEM instructions only support a single offset, so increment the wave
       // offset.
-      int64_t Offset = ST.getWavefrontSize() * (FrOffset + 4 * i);
+      int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
       if (Offset != 0) {
         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
           .addReg(MFI->getScratchWaveOffsetReg())
           .addImm(Offset);
       } else {
         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
           .addReg(MFI->getScratchWaveOffsetReg());
       }
 
@@ -545,7 +584,7 @@
-      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BUFFER_STORE_DWORD_SGPR))
+      BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp))
        .addReg(SubReg, getKillRegState(IsKill)) // sdata
        .addReg(MFI->getScratchRSrcReg()) // sbase
        .addReg(OffsetReg) // soff
@@ -639,12 +678,30 @@
   const SIInstrInfo *TII = ST.getInstrInfo();
   const DebugLoc &DL = MI->getDebugLoc();
-  unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
   unsigned SuperReg = MI->getOperand(0).getReg();
 
   bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM;
   if (SpillToSMEM && OnlyToVGPR)
     return false;
 
+  unsigned EltSize = 4;
+  unsigned ScalarLoadOp;
+
+  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
+  if (SpillToSMEM && isSGPRClass(RC)) {
+    // XXX - if private_element_size is larger than 4 it might be useful to be
+    // able to spill wider vmem spills.
+    std::tie(EltSize, ScalarLoadOp) = getSpillEltSize(RC->getSize(), false);
+  }
+
+  const TargetRegisterClass *SubRC = nullptr;
+  unsigned NumSubRegs = 1;
+  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
+
+  if (!SplitParts.empty()) {
+    NumSubRegs = SplitParts.size();
+    SubRC = getSubRegClass(RC, SplitParts[0]);
+  }
+
   // m0 is not allowed as with readlane/writelane, so a temporary SGPR and
   // extra copy is needed.
   bool IsM0 = (SuperReg == AMDGPU::M0);
@@ -657,10 +714,11 @@
 
   for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
     unsigned SubReg = NumSubRegs == 1 ?
-      SuperReg : getSubReg(SuperReg, getSubRegFromChannel(i));
+      SuperReg : getSubReg(SuperReg, SplitParts[i]);
 
     if (SpillToSMEM) {
-      unsigned Size = FrameInfo.getObjectSize(Index);
+      // FIXME: Size may be > 4 but extra bytes wasted.
+      unsigned Size = 4;
       unsigned Align = FrameInfo.getObjectAlignment(Index);
       MachinePointerInfo PtrInfo
         = MachinePointerInfo::getFixedStack(*MF, Index);
@@ -671,7 +729,7 @@
 
       unsigned OffsetReg = AMDGPU::M0;
       // Add i * 4 offset
-      int64_t Offset = ST.getWavefrontSize() * (FrOffset + 4 * i);
+      int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
       if (Offset != 0) {
         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
           .addReg(MFI->getScratchWaveOffsetReg())
           .addImm(Offset);
       } else {
         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
           .addReg(MFI->getScratchWaveOffsetReg());
       }
 
-      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BUFFER_LOAD_DWORD_SGPR), SubReg)
+      BuildMI(*MBB, MI, DL, TII->get(ScalarLoadOp), SubReg)
        .addReg(MFI->getScratchRSrcReg()) // sbase
        .addReg(OffsetReg) // soff
        .addImm(0) // glc
        .addMemOperand(MMO)
-       .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
+       .addReg(SuperReg, RegState::ImplicitDefine);
 
       continue;
     }
@@ -700,7 +758,7 @@
               SubReg)
         .addReg(Spill.VGPR)
         .addImm(Spill.Lane)
-        .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
+        .addReg(SuperReg, RegState::ImplicitDefine);
     } else {
       if (OnlyToVGPR)
         return false;
Index: test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
===================================================================
--- test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
+++ test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
@@ -15,26 +15,10 @@
 
 ; Make sure scratch wave offset register is correctly incremented and
 ; then restored.
 ; SMEM: s_mov_b32 m0, s97{{$}}
-; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Spill
-; SMEM: s_add_u32 m0, s97, 0x100{{$}}
-; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Spill
-; SMEM: s_add_u32 m0, s97, 0x200{{$}}
-; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Spill
-; SMEM: s_add_u32 m0, s97, 0x300{{$}}
-; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Spill
-
+; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]\]}}, s[92:95], m0 ; 4-byte Folded Spill
 ; SMEM: s_mov_b32 m0, s97{{$}}
-; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Reload
-; SMEM: s_add_u32 m0, s97, 0x100{{$}}
-; SMEM: s_waitcnt lgkmcnt(0)
-; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Reload
-; SMEM: s_add_u32 m0, s97, 0x200{{$}}
-; SMEM: s_waitcnt lgkmcnt(0)
-; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Reload
-; SMEM: s_add_u32 m0, s97, 0x300{{$}}
-; SMEM: s_waitcnt lgkmcnt(0)
-; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Reload
+; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[92:95], m0 ; 4-byte Folded Reload
 
 ; ALL: s_endpgm
 define void @test(i32 addrspace(1)* %out, i32 %in) {
Index: test/CodeGen/AMDGPU/spill-wide-sgpr.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/spill-wide-sgpr.ll
@@ -0,0 +1,176 @@
+; RUN: llc -O0 -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=VGPR %s
+; RUN: llc -O0 -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=SMEM %s
+; RUN: llc -O0 -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -amdgpu-spill-sgpr-to-vgpr=0 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=VMEM %s
+
+; ALL-LABEL: {{^}}spill_sgpr_x2:
+; SMEM: s_mov_b32 m0, s3{{$}}
+; SMEM: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:11], m0 ; 4-byte Folded Spill
+; SMEM: s_cbranch_scc1
+
+; SMEM: s_mov_b32 m0, s3{{$}}
+; SMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:11], m0 ; 4-byte Folded Reload
+
+; SMEM: s_dcache_wb
+; SMEM: s_endpgm
+
+; FIXME: Should only need 4 bytes
+; SMEM: ScratchSize: 12
+
+
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 0
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1
+; VGPR: s_cbranch_scc1
+
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1
+
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: s_cbranch_scc1
+
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+define amdgpu_kernel void @spill_sgpr_x2(i32 addrspace(1)* %out, i32 %in) #0 {
+  %wide.sgpr = call <2 x i32> asm sideeffect "; def $0", "=s" () #0
+  %cmp = icmp eq i32 %in, 0
+  br i1 %cmp, label %bb0, label %ret
+
+bb0:
+  call void asm sideeffect "; use $0", "s"(<2 x i32> %wide.sgpr) #0
+  br label %ret
+
+ret:
+  ret void
+}
+
+; ALL-LABEL: {{^}}spill_sgpr_x4:
+; SMEM: s_mov_b32 m0, s3{{$}}
+; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[12:15], m0 ; 4-byte Folded Spill
+; SMEM: s_cbranch_scc1
+
+; SMEM: s_mov_b32 m0, s3{{$}}
+; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[12:15], m0 ; 4-byte Folded Reload
+; SMEM: s_dcache_wb
+; SMEM: s_endpgm
+
+; FIXME: Should only need 4 bytes
+; SMEM: ScratchSize: 20
+
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 0
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 2
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 3
+; VGPR: s_cbranch_scc1
+
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 2
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 3
+
+
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: s_cbranch_scc1
+
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+define amdgpu_kernel void @spill_sgpr_x4(i32 addrspace(1)* %out, i32 %in) #0 {
+  %wide.sgpr = call <4 x i32> asm sideeffect "; def $0", "=s" () #0
+  %cmp = icmp eq i32 %in, 0
+  br i1 %cmp, label %bb0, label %ret
+
+bb0:
+  call void asm sideeffect "; use $0", "s"(<4 x i32> %wide.sgpr) #0
+  br label %ret
+
+ret:
+  ret void
+}
+
+; ALL-LABEL: {{^}}spill_sgpr_x8:
+
+; SMEM: s_mov_b32 m0, s3{{$}}
+; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 4-byte Folded Spill
+; SMEM: s_add_u32 m0, s3, 16
+; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 4-byte Folded Spill
+; SMEM: s_cbranch_scc1
+
+; SMEM: s_mov_b32 m0, s3{{$}}
+; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 4-byte Folded Reload
+; SMEM: s_add_u32 m0, s3, 16
+; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 4-byte Folded Reload
+
+; SMEM: s_dcache_wb
+; SMEM: s_endpgm
+
+; SMEM: ScratchSize: 36
+
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 0
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 2
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 3
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 4
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 5
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 6
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 7
+; VGPR: s_cbranch_scc1
+
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 2
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 3
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 4
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 5
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 6
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 7
+
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: s_cbranch_scc1
+
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+define amdgpu_kernel void @spill_sgpr_x8(i32 addrspace(1)* %out, i32 %in) #0 {
+  %wide.sgpr = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+  %cmp = icmp eq i32 %in, 0
+  br i1 %cmp, label %bb0, label %ret
+
+bb0:
+  call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr) #0
+  br label %ret
+
+ret:
+  ret void
+}
+
+; FIXME: x16 inlineasm seems broken
+; define amdgpu_kernel void @spill_sgpr_x16(i32 addrspace(1)* %out, i32 %in) #0 {
+;   %wide.sgpr = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
+;   %cmp = icmp eq i32 %in, 0
+;   br i1 %cmp, label %bb0, label %ret

+; bb0:
+;   call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr) #0
+;   br label %ret

+; ret:
+;   ret void
+; }
+
+attributes #0 = { nounwind }
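
Note (not part of the patch): the following is a minimal standalone C++ sketch of the element-size selection and per-element offset stepping that getSpillEltSize and the new spill loop implement. It is an illustration only; SketchOpcode, pickSpillEltSize, and the printed output are placeholders rather than LLVM APIs, and the wave64 and zero frame-offset values are assumptions for the example.

    // spill_elt_size_sketch.cpp -- hypothetical illustration, not LLVM code.
    #include <cstdio>
    #include <utility>

    // Placeholder for the real AMDGPU::S_BUFFER_{STORE,LOAD}_* opcodes.
    enum SketchOpcode { StoreDword, StoreDwordX2, StoreDwordX4 };

    // Mirrors the store-side selection in getSpillEltSize: prefer the widest
    // scalar buffer access (16, 8, then 4 bytes) that evenly divides the
    // super-register size.
    static std::pair<unsigned, SketchOpcode> pickSpillEltSize(unsigned SuperRegSize) {
      if (SuperRegSize % 16 == 0)
        return {16, StoreDwordX4};
      if (SuperRegSize % 8 == 0)
        return {8, StoreDwordX2};
      return {4, StoreDword};
    }

    int main() {
      const unsigned WavefrontSize = 64; // wave64, as on SI/VI targets
      const long FrOffset = 0;           // assumed frame-index offset of the slot

      // SGPR tuple sizes in bytes: s32, s64, s128, s256, s512.
      for (unsigned RegSize : {4u, 8u, 16u, 32u, 64u}) {
        auto [EltSize, Op] = pickSpillEltSize(RegSize);
        unsigned NumPieces = RegSize / EltSize;
        for (unsigned i = 0; i != NumPieces; ++i) {
          // Same formula as the patch: wave offset plus EltSize * i,
          // replacing the old fixed 4 * i per-dword stepping.
          long Offset = (long)WavefrontSize * FrOffset + (long)EltSize * i;
          std::printf("reg %2u bytes: piece %u, %u-byte op %d, offset %ld\n",
                      RegSize, i, EltSize, (int)Op, Offset);
        }
      }
      return 0;
    }

Under these assumptions, a 32-byte (x8) register splits into two 16-byte pieces at offsets 0 and 16, which lines up with the two s_buffer_store_dwordx4 checks and the s_add_u32 m0, s3, 16 increment in spill_sgpr_x8 above.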