diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -35,7 +35,6 @@ BitVector VGPRPressureSets; BitVector AGPRPressureSets; bool SpillSGPRToVGPR; - bool SpillSGPRToSMEM; bool isWave32; void classifyPressureSet(unsigned PSetID, unsigned Reg, @@ -47,10 +46,6 @@ return SpillSGPRToVGPR; } - bool spillSGPRToSMEM() const { - return SpillSGPRToSMEM; - } - /// Return the end register initially reserved for the scratch buffer in case /// spilling is needed. unsigned reservedPrivateSegmentBufferReg(const MachineFunction &MF) const; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -48,11 +48,6 @@ } } -static cl::opt EnableSpillSGPRToSMEM( - "amdgpu-spill-sgpr-to-smem", - cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"), - cl::init(false)); - static cl::opt EnableSpillSGPRToVGPR( "amdgpu-spill-sgpr-to-vgpr", cl::desc("Enable spilling VGPRs to SGPRs"), @@ -65,14 +60,8 @@ SGPRPressureSets(getNumRegPressureSets()), VGPRPressureSets(getNumRegPressureSets()), AGPRPressureSets(getNumRegPressureSets()), - SpillSGPRToVGPR(false), - SpillSGPRToSMEM(false), + SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) { - if (EnableSpillSGPRToSMEM && ST.hasScalarStores()) - SpillSGPRToSMEM = true; - else if (EnableSpillSGPRToVGPR) - SpillSGPRToVGPR = true; - unsigned NumRegPressureSets = getNumRegPressureSets(); SGPRSetID = NumRegPressureSets; @@ -759,22 +748,6 @@ } } -static std::pair getSpillEltSize(unsigned SuperRegSize, - bool Store) { - if (SuperRegSize % 16 == 0) { - return { 16, Store ? AMDGPU::S_BUFFER_STORE_DWORDX4_SGPR : - AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR }; - } - - if (SuperRegSize % 8 == 0) { - return { 8, Store ? AMDGPU::S_BUFFER_STORE_DWORDX2_SGPR : - AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR }; - } - - return { 4, Store ? AMDGPU::S_BUFFER_STORE_DWORD_SGPR : - AMDGPU::S_BUFFER_LOAD_DWORD_SGPR}; -} - bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, RegScavenger *RS, @@ -799,38 +772,16 @@ MachineFrameInfo &FrameInfo = MF->getFrameInfo(); - bool SpillToSMEM = spillSGPRToSMEM(); - if (SpillToSMEM && OnlyToVGPR) - return false; - - Register FrameReg = getFrameRegister(*MF); - assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() && SuperReg != MFI->getFrameOffsetReg() && SuperReg != MFI->getScratchWaveOffsetReg())); assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); - unsigned OffsetReg = AMDGPU::M0; unsigned M0CopyReg = AMDGPU::NoRegister; - if (SpillToSMEM) { - if (RS->isRegUsed(AMDGPU::M0)) { - M0CopyReg = RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false); - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg) - .addReg(AMDGPU::M0); - } - } - - unsigned ScalarStoreOp; unsigned EltSize = 4; const TargetRegisterClass *RC = getPhysRegClass(SuperReg); - if (SpillToSMEM && isSGPRClass(RC)) { - // XXX - if private_element_size is larger than 4 it might be useful to be - // able to spill wider vmem spills. - std::tie(EltSize, ScalarStoreOp) = - getSpillEltSize(getRegSizeInBits(*RC) / 8, true); - } ArrayRef SplitParts = getRegSplitParts(RC, EltSize); unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); @@ -845,47 +796,6 @@ Register SubReg = NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]); - if (SpillToSMEM) { - int64_t FrOffset = FrameInfo.getObjectOffset(Index); - - // The allocated memory size is really the wavefront size * the frame - // index size. The widest register class is 64 bytes, so a 4-byte scratch - // allocation is enough to spill this in a single stack object. - // - // FIXME: Frame size/offsets are computed earlier than this, so the extra - // space is still unnecessarily allocated. - - unsigned Align = FrameInfo.getObjectAlignment(Index); - MachinePointerInfo PtrInfo - = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i); - MachineMemOperand *MMO - = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, - EltSize, MinAlign(Align, EltSize * i)); - - // SMEM instructions only support a single offset, so increment the wave - // offset. - - int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i); - if (Offset != 0) { - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg) - .addReg(FrameReg) - .addImm(Offset); - } else { - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) - .addReg(FrameReg); - } - - BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp)) - .addReg(SubReg, getKillRegState(IsKill)) // sdata - .addReg(MFI->getScratchRSrcReg()) // sbase - .addReg(OffsetReg, RegState::Kill) // soff - .addImm(0) // glc - .addImm(0) // dlc - .addMemOperand(MMO); - - continue; - } - if (SpillToVGPR) { SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; @@ -914,10 +824,8 @@ return false; // Spill SGPR to a frame index. - // TODO: Should VI try to spill to VGPR and then spill to SMEM? if (!TmpVGPR.isValid()) TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); - // TODO: Should VI try to spill to VGPR and then spill to SMEM? MachineInstrBuilder Mov = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) @@ -979,82 +887,24 @@ const DebugLoc &DL = MI->getDebugLoc(); Register SuperReg = MI->getOperand(0).getReg(); - bool SpillToSMEM = spillSGPRToSMEM(); - if (SpillToSMEM && OnlyToVGPR) - return false; assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); - unsigned OffsetReg = AMDGPU::M0; unsigned M0CopyReg = AMDGPU::NoRegister; - if (SpillToSMEM) { - if (RS->isRegUsed(AMDGPU::M0)) { - M0CopyReg = RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false); - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg) - .addReg(AMDGPU::M0); - } - } - unsigned EltSize = 4; - unsigned ScalarLoadOp; - - Register FrameReg = getFrameRegister(*MF); const TargetRegisterClass *RC = getPhysRegClass(SuperReg); - if (SpillToSMEM && isSGPRClass(RC)) { - // XXX - if private_element_size is larger than 4 it might be useful to be - // able to spill wider vmem spills. - std::tie(EltSize, ScalarLoadOp) = - getSpillEltSize(getRegSizeInBits(*RC) / 8, false); - } ArrayRef SplitParts = getRegSplitParts(RC, EltSize); unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); - // SubReg carries the "Kill" flag when SubReg == SuperReg. - int64_t FrOffset = FrameInfo.getObjectOffset(Index); - Register TmpVGPR; for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { Register SubReg = NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]); - if (SpillToSMEM) { - // FIXME: Size may be > 4 but extra bytes wasted. - unsigned Align = FrameInfo.getObjectAlignment(Index); - MachinePointerInfo PtrInfo - = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i); - MachineMemOperand *MMO - = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, - EltSize, MinAlign(Align, EltSize * i)); - - // Add i * 4 offset - int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i); - if (Offset != 0) { - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg) - .addReg(FrameReg) - .addImm(Offset); - } else { - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) - .addReg(FrameReg); - } - - auto MIB = - BuildMI(*MBB, MI, DL, TII->get(ScalarLoadOp), SubReg) - .addReg(MFI->getScratchRSrcReg()) // sbase - .addReg(OffsetReg, RegState::Kill) // soff - .addImm(0) // glc - .addImm(0) // dlc - .addMemOperand(MMO); - - if (NumSubRegs > 1 && i == 0) - MIB.addReg(SuperReg, RegState::ImplicitDefine); - - continue; - } - if (SpillToVGPR) { SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; auto MIB = diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr-spill-to-smem.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr-spill-to-smem.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr-spill-to-smem.ll +++ /dev/null @@ -1,33 +0,0 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=ALL %s - -; FIXME: SGPR-to-SMEM requires an additional SGPR always to scavenge m0 - -; ALL-LABEL: {{^}}max_9_sgprs: -; ALL: SGPRBlocks: 1 -; ALL: NumSGPRsForWavesPerEU: 9 -define amdgpu_kernel void @max_9_sgprs() #0 { - %one = load volatile i32, i32 addrspace(4)* undef - %two = load volatile i32, i32 addrspace(4)* undef - %three = load volatile i32, i32 addrspace(4)* undef - %four = load volatile i32, i32 addrspace(4)* undef - %five = load volatile i32, i32 addrspace(4)* undef - %six = load volatile i32, i32 addrspace(4)* undef - %seven = load volatile i32, i32 addrspace(4)* undef - %eight = load volatile i32, i32 addrspace(4)* undef - %nine = load volatile i32, i32 addrspace(4)* undef - %ten = load volatile i32, i32 addrspace(4)* undef - call void asm sideeffect "", "s,s,s,s,s,s,s,s"(i32 %one, i32 %two, i32 %three, i32 %four, i32 %five, i32 %six, i32 %seven, i32 %eight) - store volatile i32 %one, i32 addrspace(1)* undef - store volatile i32 %two, i32 addrspace(1)* undef - store volatile i32 %three, i32 addrspace(1)* undef - store volatile i32 %four, i32 addrspace(1)* undef - store volatile i32 %five, i32 addrspace(1)* undef - store volatile i32 %six, i32 addrspace(1)* undef - store volatile i32 %seven, i32 addrspace(1)* undef - store volatile i32 %eight, i32 addrspace(1)* undef - store volatile i32 %nine, i32 addrspace(1)* undef - store volatile i32 %ten, i32 addrspace(1)* undef - ret void -} - -attributes #0 = { nounwind "amdgpu-num-sgpr"="14" } diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll @@ -1,10 +1,6 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSGPR -check-prefix=ALL %s - -; If spilling to smem, additional registers are used for the resource -; descriptor. +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=TOSGPR -check-prefix=ALL %s ; FIXME: Vectorization can increase required SGPR count beyond limit. -; FIXME: SGPR-to-SMEM requires an additional SGPR always to scavenge m0 ; ALL-LABEL: {{^}}max_9_sgprs: @@ -55,13 +51,6 @@ ; XTOSGPR: SGPRBlocks: 1 ; XTOSGPR: NumSGPRsForWavesPerEU: 16 -; XTOSMEM: s_mov_b64 s[10:11], s[2:3] -; XTOSMEM: s_mov_b64 s[8:9], s[0:1] -; XTOSMEM: s_mov_b32 s7, s13 - -; XTOSMEM: SGPRBlocks: 1 -; XTOSMEM: NumSGPRsForWavesPerEU: 16 -; ; This test case is disabled: When calculating the spillslot addresses AMDGPU ; creates an extra vreg to save/restore m0 which in a point of maximum register ; pressure would trigger an endless loop; the compiler aborts earlier with @@ -101,10 +90,6 @@ ; ; swapping the order the registers are copied from what normally ; ; happens. -; XTOSMEM: s_mov_b32 s5, s11 -; XTOSMEM: s_add_u32 m0, s5, -; XTOSMEM: s_buffer_store_dword vcc_lo, s[0:3], m0 - ; XALL: SGPRBlocks: 2 ; XALL: NumSGPRsForWavesPerEU: 18 ;define amdgpu_kernel void @max_12_sgprs_12_input_sgprs(i32 addrspace(1)* %out1, diff --git a/llvm/test/CodeGen/AMDGPU/basic-branch.ll b/llvm/test/CodeGen/AMDGPU/basic-branch.ll --- a/llvm/test/CodeGen/AMDGPU/basic-branch.ll +++ b/llvm/test/CodeGen/AMDGPU/basic-branch.ll @@ -1,6 +1,6 @@ ; RUN: llc -O0 -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s -; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s -; RUN: llc -O0 -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global,-WavefrontSize32,+WavefrontSize64 -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s +; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s +; RUN: llc -O0 -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global,-WavefrontSize32,+WavefrontSize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNOPT -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNOPT -check-prefix=GCN %s diff --git a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll --- a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll +++ b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll @@ -1,5 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=SGPR %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=SMEM %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=SGPR %s ; Make sure this doesn't crash. ; ALL-LABEL: {{^}}test: @@ -14,15 +13,6 @@ ; SGPR-NEXT: s_nop 4 ; SGPR-NEXT: buffer_store_dword v0, off, s[0:[[HI]]{{\]}}, 0 -; Make sure scratch wave offset register is correctly incremented and -; then restored. -; SMEM: s_add_u32 m0, s[[OFF]], 0x100{{$}} -; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[LO]]:[[HI]]], m0 ; 16-byte Folded Spill - -; SMEM: s_add_u32 m0, s[[OFF]], 0x100{{$}} -; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[LO]]:[[HI]]], m0 ; 16-byte Folded Reload - -; SMEM: s_dcache_wb ; ALL: s_endpgm define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %in) { call void asm sideeffect "", "~{s[0:7]}" () diff --git a/llvm/test/CodeGen/AMDGPU/spill-m0.ll b/llvm/test/CodeGen/AMDGPU/spill-m0.ll --- a/llvm/test/CodeGen/AMDGPU/spill-m0.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-m0.ll @@ -1,14 +1,11 @@ ; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=TOVGPR -check-prefix=GCN %s -; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -amdgpu-spill-sgpr-to-smem=0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=TOVGPR -check-prefix=GCN %s +; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=TOVGPR -check-prefix=GCN %s ; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=TOVMEM -check-prefix=GCN %s -; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-spill-sgpr-to-smem=0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=TOVMEM -check-prefix=GCN %s -; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-spill-sgpr-to-smem=1 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=GCN %s +; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=TOVMEM -check-prefix=GCN %s ; XXX - Why does it like to use vcc? ; GCN-LABEL: {{^}}spill_m0: -; TOSMEM: s_mov_b32 s[[LO:[0-9]+]], SCRATCH_RSRC_DWORD0 -; TOSMEM: s_mov_b32 s[[HI:[0-9]+]], 0xe80000 ; GCN-DAG: s_cmp_lg_u32 @@ -19,11 +16,6 @@ ; TOVMEM-DAG: v_mov_b32_e32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]] ; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12 ; 4-byte Folded Spill -; TOSMEM-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0 -; TOSMEM: s_add_u32 m0, s3, 0x300{{$}} -; TOSMEM-NOT: [[M0_COPY]] -; TOSMEM: s_buffer_store_dword [[M0_COPY]], s{{\[}}[[LO]]:[[HI]]], m0 ; 4-byte Folded Spill - ; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]] ; GCN: [[ENDIF]]: @@ -35,11 +27,6 @@ ; TOVMEM: v_readfirstlane_b32 [[M0_RESTORE:s[0-9]+]], [[RELOAD_VREG]] ; TOVMEM: s_mov_b32 m0, [[M0_RESTORE]] -; TOSMEM: s_add_u32 m0, s3, 0x300{{$}} -; TOSMEM: s_buffer_load_dword [[M0_RESTORE:s[0-9]+]], s{{\[}}[[LO]]:[[HI]]], m0 ; 4-byte Folded Reload -; TOSMEM-NOT: [[M0_RESTORE]] -; TOSMEM: s_mov_b32 m0, [[M0_RESTORE]] - ; GCN: s_add_i32 s{{[0-9]+}}, m0, 1 define amdgpu_kernel void @spill_m0(i32 %cond, i32 addrspace(1)* %out) #0 { entry: @@ -64,26 +51,6 @@ ; GCN: s_mov_b32 m0, s6 ; GCN: v_interp_mov_f32 -; TOSMEM-NOT: s_m0 -; TOSMEM: s_add_u32 m0, s7, 0x100 -; TOSMEM-NEXT: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill -; FIXME: RegScavenger::isRegUsed() always returns true if m0 is reserved, so we have to save and restore it -; FIXME-TOSMEM-NOT: m0 - -; FIXME-TOSMEM-NOT: m0 -; TOSMEM: s_add_u32 m0, s7, 0x300 -; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill -; FIXME-TOSMEM-NOT: m0 - -; TOSMEM: s_mov_b64 exec, -; TOSMEM: s_cbranch_execz -; TOSMEM: s_branch - -; TOSMEM: BB{{[0-9]+_[0-9]+}}: -; TOSMEM: s_add_u32 m0, s7, 0x500 -; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload - - ; GCN-NOT: v_readlane_b32 m0 ; GCN-NOT: s_buffer_store_dword m0 ; GCN-NOT: s_buffer_load_dword m0 @@ -110,101 +77,6 @@ ret void } -; Force save and restore of m0 during SMEM spill -; GCN-LABEL: {{^}}m0_unavailable_spill: - -; GCN: ; def m0, 1 - -; GCN: s_mov_b32 m0, s2 -; GCN: v_interp_mov_f32 - -; GCN: ; clobber m0 - -; TOSMEM: s_mov_b32 vcc_hi, m0 -; TOSMEM: s_add_u32 m0, s3, 0x100 -; TOSMEM-NEXT: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill -; TOSMEM: s_mov_b32 m0, vcc_hi - -; TOSMEM: s_mov_b64 exec, -; TOSMEM: s_cbranch_execz -; TOSMEM: s_branch - -; TOSMEM: BB{{[0-9]+_[0-9]+}}: -; TOSMEM: s_add_u32 m0, s3, 0x100 -; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload - -; GCN-NOT: v_readlane_b32 m0 -; GCN-NOT: s_buffer_store_dword m0 -; GCN-NOT: s_buffer_load_dword m0 -define amdgpu_kernel void @m0_unavailable_spill(i32 %m0.arg) #0 { -main_body: - %m0 = call i32 asm sideeffect "; def $0, 1", "={m0}"() #0 - %tmp = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %m0.arg) - call void asm sideeffect "; clobber $0", "~{m0}"() #0 - %cmp = fcmp ueq float 0.000000e+00, %tmp - br i1 %cmp, label %if, label %else - -if: ; preds = %main_body - store volatile i32 8, i32 addrspace(1)* undef - br label %endif - -else: ; preds = %main_body - store volatile i32 11, i32 addrspace(1)* undef - br label %endif - -endif: - ret void -} - -; GCN-LABEL: {{^}}restore_m0_lds: -; TOSMEM: s_load_dwordx2 [[REG:s\[[0-9]+:[0-9]+\]]] -; TOSMEM: s_cmp_eq_u32 -; FIXME: RegScavenger::isRegUsed() always returns true if m0 is reserved, so we have to save and restore it -; FIXME-TOSMEM-NOT: m0 -; TOSMEM: s_add_u32 m0, s3, 0x100 -; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s[88:91], m0 ; 4-byte Folded Spill -; FIXME-TOSMEM-NOT: m0 -; TOSMEM: s_add_u32 m0, s3, 0x200 -; TOSMEM: s_buffer_store_dwordx2 [[REG]], s[88:91], m0 ; 8-byte Folded Spill -; FIXME-TOSMEM-NOT: m0 -; TOSMEM: s_cbranch_scc1 - -; TOSMEM: s_mov_b32 m0, -1 - -; TOSMEM: s_mov_b32 vcc_hi, m0 -; TOSMEM: s_add_u32 m0, s3, 0x200 -; TOSMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[88:91], m0 ; 8-byte Folded Reload -; TOSMEM: s_mov_b32 m0, vcc_hi -; TOSMEM: s_waitcnt lgkmcnt(0) - -; TOSMEM: ds_write_b64 - -; FIXME-TOSMEM-NOT: m0 -; TOSMEM: s_add_u32 m0, s3, 0x100 -; TOSMEM: s_buffer_load_dword s0, s[88:91], m0 ; 4-byte Folded Reload -; FIXME-TOSMEM-NOT: m0 -; TOSMEM: s_waitcnt lgkmcnt(0) -; TOSMEM-NOT: m0 -; TOSMEM: s_mov_b32 m0, s0 -; TOSMEM: ; use m0 - -; TOSMEM: s_dcache_wb -; TOSMEM: s_endpgm -define amdgpu_kernel void @restore_m0_lds(i32 %arg) { - %m0 = call i32 asm sideeffect "s_mov_b32 m0, 0", "={m0}"() #0 - %sval = load volatile i64, i64 addrspace(4)* undef - %cmp = icmp eq i32 %arg, 0 - br i1 %cmp, label %ret, label %bb - -bb: - store volatile i64 %sval, i64 addrspace(3)* undef - call void asm sideeffect "; use $0", "{m0}"(i32 %m0) #0 - br label %ret - -ret: - ret void -} - declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #1 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 diff --git a/llvm/test/CodeGen/AMDGPU/spill-to-smem-m0.ll b/llvm/test/CodeGen/AMDGPU/spill-to-smem-m0.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-to-smem-m0.ll +++ /dev/null @@ -1,22 +0,0 @@ -; RUN: llc -O0 -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs -stop-before=prologepilog < %s - -; Spill to SMEM clobbers M0. Check that the implicit-def dead operand is present -; in the pseudo instructions. - -; CHECK-LABEL: {{^}}spill_sgpr: -; CHECK: SI_SPILL_S32_SAVE {{.*}}, implicit-def dead %m0 -; CHECK: SI_SPILL_S32_RESTORE {{.*}}, implicit-def dead %m0 -define amdgpu_kernel void @spill_sgpr(i32 addrspace(1)* %out, i32 %in) #0 { - %sgpr = call i32 asm sideeffect "; def $0", "=s" () #0 - %cmp = icmp eq i32 %in, 0 - br i1 %cmp, label %bb0, label %ret - -bb0: - call void asm sideeffect "; use $0", "s"(i32 %sgpr) #0 - br label %ret - -ret: - ret void -} - -attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll b/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll --- a/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll @@ -1,21 +1,7 @@ -; RUN: llc -O0 -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=VGPR %s -; RUN: llc -O0 -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=SMEM %s -; RUN: llc -O0 -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -amdgpu-spill-sgpr-to-vgpr=0 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=VMEM %s +; RUN: llc -O0 -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=VGPR %s +; RUN: llc -O0 -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-vgpr=0 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=VMEM %s ; ALL-LABEL: {{^}}spill_sgpr_x2: -; SMEM: s_add_u32 m0, s3, 0x100{{$}} -; SMEM: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:11], m0 ; 8-byte Folded Spill -; SMEM: s_cbranch_scc1 - -; SMEM: s_add_u32 m0, s3, 0x100{{$}} -; SMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:11], m0 ; 8-byte Folded Reload - -; SMEM: s_dcache_wb -; SMEM: s_endpgm - -; FIXME: Should only need 4 bytes -; SMEM: ScratchSize: 12 - ; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 0 ; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1 @@ -24,6 +10,7 @@ ; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0 ; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1 + ; VMEM: buffer_store_dword ; VMEM: buffer_store_dword ; VMEM: s_cbranch_scc1 @@ -44,21 +31,6 @@ } ; ALL-LABEL: {{^}}spill_sgpr_x3: -; SMEM: s_add_u32 m0, s3, 0x100{{$}} -; SMEM: s_buffer_store_dword s -; SMEM: s_buffer_store_dword s -; SMEM: s_buffer_store_dword s -; SMEM: s_cbranch_scc1 - -; SMEM: s_add_u32 m0, s3, 0x100{{$}} -; SMEM: s_buffer_load_dword s -; SMEM: s_buffer_load_dword s -; SMEM: s_buffer_load_dword s -; SMEM: s_dcache_wb -; SMEM: s_endpgm - -; FIXME: Should only need 4 bytes -; SMEM: ScratchSize: 16 ; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 0 ; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1 @@ -92,17 +64,6 @@ } ; ALL-LABEL: {{^}}spill_sgpr_x4: -; SMEM: s_add_u32 m0, s3, 0x100{{$}} -; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[VALS:[0-9]+:[0-9]+]]{{\]}}, m0 ; 16-byte Folded Spill -; SMEM: s_cbranch_scc1 - -; SMEM: s_add_u32 m0, s3, 0x100{{$}} -; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[VALS]]{{\]}}, m0 ; 16-byte Folded Reload -; SMEM: s_dcache_wb -; SMEM: s_endpgm - -; FIXME: Should only need 4 bytes -; SMEM: ScratchSize: 20 ; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 0 ; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1 @@ -140,25 +101,6 @@ } ; ALL-LABEL: {{^}}spill_sgpr_x5: -; SMEM: s_add_u32 m0, s3, 0x100{{$}} -; SMEM: s_buffer_store_dword s -; SMEM: s_buffer_store_dword s -; SMEM: s_buffer_store_dword s -; SMEM: s_buffer_store_dword s -; SMEM: s_buffer_store_dword s -; SMEM: s_cbranch_scc1 - -; SMEM: s_add_u32 m0, s3, 0x100{{$}} -; SMEM: s_buffer_load_dword s -; SMEM: s_buffer_load_dword s -; SMEM: s_buffer_load_dword s -; SMEM: s_buffer_load_dword s -; SMEM: s_buffer_load_dword s -; SMEM: s_dcache_wb -; SMEM: s_endpgm - -; FIXME: Should only need 4 bytes -; SMEM: ScratchSize: 24 ; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 0 ; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1 @@ -201,22 +143,6 @@ ; ALL-LABEL: {{^}}spill_sgpr_x8: -; SMEM: s_add_u32 m0, s3, 0x100{{$}} -; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[VALS:[0-9]+:[0-9]+]]{{\]}}, m0 ; 16-byte Folded Spill -; SMEM: s_add_u32 m0, s3, 0x110{{$}} -; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[VALS]]{{\]}}, m0 ; 16-byte Folded Spill -; SMEM: s_cbranch_scc1 - -; SMEM: s_add_u32 m0, s3, 0x100{{$}} -; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[VALS]]{{\]}}, m0 ; 16-byte Folded Reload -; SMEM: s_add_u32 m0, s3, 0x110{{$}} -; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[VALS]]{{\]}}, m0 ; 16-byte Folded Reload - -; SMEM: s_dcache_wb -; SMEM: s_endpgm - -; SMEM: ScratchSize: 36 - ; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 0 ; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1 ; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 2