Index: llvm/lib/Target/AMDGPU/SIFrameLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -134,7 +134,8 @@ // We need to specially emit stack operations here because a different frame // register is used than in the rest of the function, as getFrameRegister would // use. -static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, +static void buildPrologSpill(const GCNSubtarget &ST, LivePhysRegs &LiveRegs, + MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const SIInstrInfo *TII, Register SpillReg, Register ScratchRsrcReg, Register SPReg, int FI) { @@ -147,7 +148,19 @@ MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4, MFI.getObjectAlign(FI)); - if (SIInstrInfo::isLegalMUBUFImmOffset(Offset)) { + if (ST.enableFlatScratch()) { + if (TII->isLegalFLATOffset(Offset, AMDGPUAS::PRIVATE_ADDRESS, true)) { + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::SCRATCH_STORE_DWORD_SADDR)) + .addReg(SpillReg, RegState::Kill) + .addReg(SPReg) + .addImm(Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // dlc + .addMemOperand(MMO); + return; + } + } else if (SIInstrInfo::isLegalMUBUFImmOffset(Offset)) { BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET)) .addReg(SpillReg, RegState::Kill) .addReg(ScratchRsrcReg) @@ -166,29 +179,48 @@ // offset in the spill. 
LiveRegs.addReg(SpillReg); - MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister( - MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass); + if (ST.enableFlatScratch()) { + MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister( + MF->getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0RegClass); - BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg) - .addImm(Offset); + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_ADD_U32), OffsetReg) + .addReg(SPReg) + .addImm(Offset); - BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN)) - .addReg(SpillReg, RegState::Kill) - .addReg(OffsetReg, RegState::Kill) - .addReg(ScratchRsrcReg) - .addReg(SPReg) - .addImm(0) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0) // tfe - .addImm(0) // dlc - .addImm(0) // swz - .addMemOperand(MMO); + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::SCRATCH_STORE_DWORD_SADDR)) + .addReg(SpillReg, RegState::Kill) + .addReg(OffsetReg, RegState::Kill) + .addImm(0) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // dlc + .addMemOperand(MMO); + } else { + MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister( + MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass); + + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg) + .addImm(Offset); + + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN)) + .addReg(SpillReg, RegState::Kill) + .addReg(OffsetReg, RegState::Kill) + .addReg(ScratchRsrcReg) + .addReg(SPReg) + .addImm(0) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .addImm(0) // dlc + .addImm(0) // swz + .addMemOperand(MMO); + } LiveRegs.removeReg(SpillReg); } -static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, +static void buildEpilogReload(const GCNSubtarget &ST, LivePhysRegs &LiveRegs, + MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const SIInstrInfo *TII, Register SpillReg, Register ScratchRsrcReg, Register SPReg, int FI) { @@ -200,6 +232,35 @@ 
MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, 4, MFI.getObjectAlign(FI)); + if (ST.enableFlatScratch()) { + if (TII->isLegalFLATOffset(Offset, AMDGPUAS::PRIVATE_ADDRESS, true)) { + BuildMI(MBB, I, DebugLoc(), + TII->get(AMDGPU::SCRATCH_LOAD_DWORD_SADDR), SpillReg) + .addReg(SPReg) + .addImm(Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // dlc + .addMemOperand(MMO); + return; + } + MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister( + MF->getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0RegClass); + + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_ADD_U32), OffsetReg) + .addReg(SPReg) + .addImm(Offset); + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::SCRATCH_LOAD_DWORD_SADDR), + SpillReg) + .addReg(OffsetReg, RegState::Kill) + .addImm(0) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // dlc + .addMemOperand(MMO); + return; + } + if (SIInstrInfo::isLegalMUBUFImmOffset(Offset)) { BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFSET), SpillReg) @@ -784,7 +845,7 @@ if (!ScratchExecCopy) ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true); - buildPrologSpill(LiveRegs, MBB, MBBI, TII, Reg.VGPR, + buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, Reg.VGPR, FuncInfo->getScratchRSrcReg(), StackPtrReg, Reg.FI.getValue()); @@ -802,7 +863,7 @@ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) .addReg(FramePtrReg); - buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR, + buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR, FuncInfo->getScratchRSrcReg(), StackPtrReg, FuncInfo->FramePointerSaveIndex.getValue()); } @@ -819,7 +880,7 @@ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) .addReg(BasePtrReg); - buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR, + buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR, FuncInfo->getScratchRSrcReg(), StackPtrReg, *FuncInfo->BasePointerSaveIndex); } @@ -1006,7 +1067,7 @@ MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister( MRI, 
LiveRegs, AMDGPU::VGPR_32RegClass); - buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR, + buildEpilogReload(ST, LiveRegs, MBB, MBBI, TII, TempVGPR, FuncInfo->getScratchRSrcReg(), StackPtrReg, FI); BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), FramePtrReg) .addReg(TempVGPR, RegState::Kill); @@ -1032,7 +1093,7 @@ MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister( MRI, LiveRegs, AMDGPU::VGPR_32RegClass); - buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR, + buildEpilogReload(ST, LiveRegs, MBB, MBBI, TII, TempVGPR, FuncInfo->getScratchRSrcReg(), StackPtrReg, BasePtrFI); BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), BasePtrReg) .addReg(TempVGPR, RegState::Kill); @@ -1057,7 +1118,7 @@ if (!ScratchExecCopy) ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false); - buildEpilogReload(LiveRegs, MBB, MBBI, TII, Reg.VGPR, + buildEpilogReload(ST, LiveRegs, MBB, MBBI, TII, Reg.VGPR, FuncInfo->getScratchRSrcReg(), StackPtrReg, Reg.FI.getValue()); } Index: llvm/lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -507,11 +507,20 @@ return (Flags & SIInstrFlags::FLAT) && !(Flags & SIInstrFlags::LGKM_CNT); } + bool isSegmentSpecificFLAT(uint16_t Opcode) const { + auto Flags = get(Opcode).TSFlags; + return (Flags & SIInstrFlags::FLAT) && !(Flags & SIInstrFlags::LGKM_CNT); + } + // FIXME: Make this more precise static bool isFLATScratch(const MachineInstr &MI) { return isSegmentSpecificFLAT(MI); } + bool isFLATScratch(uint16_t Opcode) const { + return isSegmentSpecificFLAT(Opcode); + } + // Any FLAT encoded instruction, including global_* and scratch_*. 
bool isFLAT(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::FLAT; Index: llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -754,9 +754,10 @@ const MachineFrameInfo &MFI = MF->getFrameInfo(); const SIMachineFunctionInfo *FuncInfo = MF->getInfo(); - const MCInstrDesc &Desc = TII->get(LoadStoreOp); + const MCInstrDesc *Desc = &TII->get(LoadStoreOp); const DebugLoc &DL = MI->getDebugLoc(); - bool IsStore = Desc.mayStore(); + bool IsStore = Desc->mayStore(); + bool IsFlat = TII->isFLATScratch(LoadStoreOp); bool Scavenged = false; MCRegister SOffset = ScratchOffsetReg; @@ -766,6 +767,7 @@ unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / (EltSize * CHAR_BIT); unsigned Size = NumSubRegs * EltSize; int64_t Offset = InstOffset + MFI.getObjectOffset(Index); + int64_t MaxOffset = Offset + Size - EltSize; int64_t ScratchOffsetRegDelta = 0; Align Alignment = MFI.getObjectAlign(Index); @@ -773,13 +775,16 @@ assert((Offset % EltSize) == 0 && "unexpected VGPR spill offset"); - if (!SIInstrInfo::isLegalMUBUFImmOffset(Offset + Size - EltSize)) { + if (IsFlat + ? !TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS, true) + : !SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset)) { SOffset = MCRegister(); // We currently only support spilling VGPRs to EltSize boundaries, meaning // we can simplify the adjustment of Offset here to just scale with // WavefrontSize. - Offset *= ST.getWavefrontSize(); + if (!IsFlat) + Offset *= ST.getWavefrontSize(); // We don't have access to the register scavenger if this function is called // during PEI::scavengeFrameVirtualRegs(). 
@@ -817,8 +822,33 @@ Offset = 0; } + if (IsFlat && SOffset == AMDGPU::NoRegister) { + assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0 + && "Unexpected vaddr for flat scratch with a FI operand"); + + if (ST.hasFlatScratchSTMode()) { + LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp); + Desc = &TII->get(LoadStoreOp); + } else { + // We need a register for SADDR even if to just hold a zero. + if (RS) + SOffset = RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, + false); + // This can fail unless we have a way to spill without extra registers + // needed during spilling. There is no such problem on GFX10 though + // because we can use NULL register. + if (!SOffset) + report_fatal_error("could not scavenge SGPR to spill in entry function"); + + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset) + .addImm(0); + Scavenged = true; + } + } + Register TmpReg; + // FIXME: Flat scratch does not have to be limited to a dword per store. for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) { Register SubReg = NumSubRegs == 1 ? 
Register(ValueReg) @@ -863,22 +893,26 @@ MF->getMachineMemOperand(PInfo, MMO->getFlags(), EltSize, commonAlignment(Alignment, EltSize * i)); - MIB = BuildMI(*MBB, MI, DL, Desc) + MIB = BuildMI(*MBB, MI, DL, *Desc) .addReg(SubReg, - getDefRegState(!IsStore) | getKillRegState(IsKill)) - .addReg(ScratchRsrcReg); + getDefRegState(!IsStore) | getKillRegState(IsKill)); + if (!IsFlat) + MIB.addReg(ScratchRsrcReg); + if (SOffset == AMDGPU::NoRegister) { - MIB.addImm(0); + if (!IsFlat) + MIB.addImm(0); } else { MIB.addReg(SOffset, SOffsetRegState); } MIB.addImm(Offset) .addImm(0) // glc .addImm(0) // slc - .addImm(0) // tfe - .addImm(0) // dlc - .addImm(0) // swz - .addMemOperand(NewMMO); + .addImm(0); // tfe for MUBUF or dlc for FLAT + if (!IsFlat) + MIB.addImm(0) // dlc + .addImm(0); // swz + MIB.addMemOperand(NewMMO); if (!IsAGPR && NeedSuperRegDef) MIB.addReg(ValueReg, RegState::ImplicitDefine); @@ -979,14 +1013,18 @@ EltSize, Alignment); if (IsLoad) { - buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET, + unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR + : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; + buildSpillLoadStore(MI, Opc, Index, VGPR, false, MFI->getScratchRSrcReg(), FrameReg, Offset * EltSize, MMO, RS); } else { - buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, Index, VGPR, + unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR + : AMDGPU::BUFFER_STORE_DWORD_OFFSET; + buildSpillLoadStore(MI, Opc, Index, VGPR, IsKill, MFI->getScratchRSrcReg(), FrameReg, Offset * EltSize, MMO, RS); // This only ever adds one VGPR spill @@ -1326,7 +1364,9 @@ assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == MFI->getStackPtrOffsetReg()); - buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, + unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_STORE_DWORD_SADDR + : AMDGPU::BUFFER_STORE_DWORD_OFFSET; + buildSpillLoadStore(MI, Opc, Index, VData->getReg(), VData->isKill(), TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(), @@ -1360,7 +1400,9 @@ assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == MFI->getStackPtrOffsetReg()); - buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET, + unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR + : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; + buildSpillLoadStore(MI, Opc, Index, VData->getReg(), VData->isKill(), TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(), Index: llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll +++ llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -23,7 +23,8 @@ } ; GCN-LABEL: {{^}}test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: -; GCN: buffer_store_dword +; MUBUF: buffer_store_dword +; FLATSCR: scratch_store_dword ; GCN: v_writelane_b32 v40, s33, 4 ; GCN: v_writelane_b32 v40, s34, 0 ; GCN: v_writelane_b32 v40, s35, 1 @@ -40,7 +41,8 @@ ; GCN: v_readlane_b32 s34, v40, 0 ; GCN: v_readlane_b32 s33, v40, 4 -; GCN: buffer_load_dword +; MUBUF: buffer_load_dword +; FLATSCR: scratch_load_dword ; GCN: s_setpc_b64 define void @test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 { call void @external_void_func_void() @@ -50,7 +52,8 @@ } ; GCN-LABEL: {{^}}test_func_call_external_void_funcx2: -; GCN: buffer_store_dword v40 +; MUBUF: buffer_store_dword v40 +; FLATSCR: scratch_store_dword off, v40 ; GCN: v_writelane_b32 v40, s33, 4 ; GCN: s_mov_b32 s33, s32 @@ -60,7 +63,8 @@ ; GCN-NEXT: s_swappc_b64 ; GCN: v_readlane_b32 s33, v40, 4 -; GCN: buffer_load_dword v40, +; MUBUF: buffer_load_dword v40 +; FLATSCR: scratch_load_dword v40 define void @test_func_call_external_void_funcx2() #0 { call 
void @external_void_func_void() call void @external_void_func_void() Index: llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -84,7 +84,8 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt ; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GCN: v_writelane_b32 [[CSR_VGPR]], s33, 2 ; GCN-DAG: s_mov_b32 s33, s32 @@ -106,7 +107,8 @@ ; FLATSCR: s_sub_u32 s32, s32, 16{{$}} ; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -127,7 +129,8 @@ ; GCN-LABEL: {{^}}callee_no_stack_with_call: ; GCN: s_waitcnt ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; MUBUF-DAG: s_add_u32 s32, s32, 0x400 ; FLATSCR-DAG: s_add_u32 s32, s32, 16 @@ -144,7 +147,8 @@ ; FLATSCR: s_sub_u32 s32, s32, 16 ; GCN-NEXT: 
v_readlane_b32 s33, [[CSR_VGPR]], [[FP_SPILL_LANE]] ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 @@ -160,7 +164,8 @@ ; ; GCN-LABEL: {{^}}callee_func_sgpr_spill_no_calls: ; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GCN: v_writelane_b32 [[CSR_VGPR]], s ; GCN: v_writelane_b32 [[CSR_VGPR]], s @@ -170,7 +175,8 @@ ; GCN: v_readlane_b32 s{{[0-9]+}}, [[CSR_VGPR]] ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -219,7 +225,8 @@ ; GCN-NEXT:s_mov_b32 [[FP_COPY:s[0-9]+]], s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill ; MUBUF-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:8 ; FLATSCR-DAG: scratch_store_dword off, [[ZERO]], s33 offset:8 @@ -227,7 
+234,8 @@ ; GCN-NEXT: ; clobber v41 ; GCN-NEXT: ;;#ASMEND -; GCN: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; MUBUF: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; FLATSCR: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload ; MUBUF: s_add_u32 s32, s32, 0x300 ; MUBUF-NEXT: s_sub_u32 s32, s32, 0x300 ; FLATSCR: s_add_u32 s32, s32, 12 @@ -247,7 +255,8 @@ ; GCN: s_waitcnt ; GCN-NEXT: v_writelane_b32 v1, s33, 63 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill ; GCN-COUNT-63: v_writelane_b32 v1 ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:8 ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33 offset:8 @@ -282,7 +291,8 @@ ; GCN: s_waitcnt ; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill ; GCN-COUNT-64: v_writelane_b32 v1, ; MUBUF: buffer_store_dword @@ -290,7 +300,8 @@ ; GCN: ;;#ASMSTART ; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v1 -; GCN: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; MUBUF: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; FLATSCR: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload ; MUBUF: s_add_u32 s32, s32, 0x300 ; MUBUF-NEXT: s_sub_u32 s32, s32, 0x300 ; FLATSCR: s_add_u32 s32, s32, 12 @@ -374,7 +385,8 @@ ; GCN-LABEL: {{^}}no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr: ; GCN: s_waitcnt ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword 
[[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s33, 2 ; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s30, 0 @@ -394,7 +406,8 @@ ; FLATSCR-NEXT: s_sub_u32 s32, s32, 12{{$}} ; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 @@ -423,8 +436,11 @@ ; GCN-LABEL: {{^}}scratch_reg_needed_mubuf_offset: ; GCN: s_waitcnt ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: v_mov_b32_e32 [[SCRATCH_VGPR:v[0-9]+]], 0x1008 -; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Spill +; MUBUF-NEXT: v_mov_b32_e32 [[SCRATCH_VGPR:v[0-9]+]], 0x1008 +; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Spill +; FLATSCR-NEXT: s_add_u32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x1008 +; FLATSCR-NEXT: s_nop 0 +; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], [[SCRATCH_SGPR]] ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s33, 2 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s30, 0 @@ -443,8 +459,10 @@ ; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x100c{{$}} ; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: v_mov_b32_e32 [[SCRATCH_VGPR:v[0-9]+]], 0x1008 -; GCN-NEXT: buffer_load_dword 
[[CSR_VGPR]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Reload +; MUBUF-NEXT: v_mov_b32_e32 [[SCRATCH_VGPR:v[0-9]+]], 0x1008 +; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Reload +; FLATSCR-NEXT: s_add_u32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x1008 +; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, [[SCRATCH_SGPR]] ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 @@ -500,11 +518,13 @@ ; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory: ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33 -; GCN: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:4 +; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:4 +; FLATSCR: scratch_store_dword off, [[TMP_VGPR1]], s32 offset:4 ; GCN: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN: s_mov_b32 s33, s32 ; GCN: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 offset:4 +; MUBUF: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 offset:4 +; FLATSCR: scratch_load_dword [[TMP_VGPR2:v[0-9]+]], off, s32 offset:4 ; GCN: s_waitcnt vmcnt(0) ; GCN: v_readfirstlane_b32 s33, [[TMP_VGPR2]] ; GCN: s_mov_b64 exec, [[COPY_EXEC2]] @@ -531,13 +551,15 @@ ; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory_full_reserved_vgpr: ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33 -; GCN: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:[[OFF:[0-9]+]] +; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:[[OFF:[0-9]+]] +; FLATSCR: scratch_store_dword off, [[TMP_VGPR1]], s32 offset:[[OFF:[0-9]+]] ; GCN: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NOT: v_writelane_b32 v40, s33 ; GCN: s_mov_b32 s33, s32 ; GCN-NOT: v_readlane_b32 s33, v40 ; GCN: s_or_saveexec_b64 
[[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 offset:[[OFF]] +; MUBUF: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 offset:[[OFF]] +; FLATSCR: scratch_load_dword [[TMP_VGPR2:v[0-9]+]], off, s32 offset:[[OFF]] ; GCN: v_readfirstlane_b32 s33, [[TMP_VGPR2]] ; GCN: s_mov_b64 exec, [[COPY_EXEC2]] ; GCN: s_setpc_b64 @@ -566,10 +588,13 @@ ; scratch VGPR to hold the offset. ; GCN-LABEL: {{^}}spill_fp_to_memory_scratch_reg_needed_mubuf_offset ; GCN: s_or_saveexec_b64 s[4:5], -1 -; GCN: v_mov_b32_e32 v0, s33 +; MUBUF: v_mov_b32_e32 v0, s33 ; GCN-NOT: v_mov_b32_e32 v0, 0x1008 -; GCN-NEXT: v_mov_b32_e32 v1, 0x1008 -; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], s32 offen +; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1008 +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], s32 offen ; 4-byte Folded Spill +; FLATSCR-NEXT: s_add_u32 [[SOFF:s[0-9]+]], s32, 0x1008 +; FLATSCR-NEXT: v_mov_b32_e32 v0, s33 +; FLATSCR-NEXT: scratch_store_dword off, v0, [[SOFF]] ; 4-byte Folded Spill define void @spill_fp_to_memory_scratch_reg_needed_mubuf_offset([4096 x i8] addrspace(5)* byval align 4 %arg) #3 { %alloca = alloca i32, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca Index: llvm/test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll +++ llvm/test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll @@ -1,12 +1,17 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s - -; CHECK-LABEL: spill_v2i32: -; CHECK-DAG: buffer_store_dword v{{.*}} offset:16 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:20 ; 4-byte Folded Spill -; CHECK: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-DAG: buffer_load_dword v{{.*}} offset:16 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:20 ; 4-byte Folded Reload +; RUN: 
llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GCN,MUBUF +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 -amdgpu-enable-flat-scratch < %s | FileCheck %s -check-prefixes=GCN,FLATSCR + +; GCN-LABEL: spill_v2i32: +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:16 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:20 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:16 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:20 ; 4-byte Folded Spill +; GCN: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:16 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:20 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:16 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:20 ; 4-byte Folded Reload define void @spill_v2i32() { entry: @@ -24,13 +29,17 @@ ret void } -; CHECK-LABEL: spill_v2f32: -; CHECK-DAG: buffer_store_dword v{{.*}} offset:16 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:20 ; 4-byte Folded Spill -; CHECK: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-DAG: buffer_load_dword v{{.*}} offset:16 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:20 ; 4-byte Folded Reload +; GCN-LABEL: spill_v2f32: +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:16 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:20 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:16 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:20 ; 4-byte Folded Spill +; GCN: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:16 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:20 ; 4-byte Folded Reload +; FLATSCR-DAG: 
scratch_load_dword v{{.*}} offset:16 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:20 ; 4-byte Folded Reload define void @spill_v2f32() { entry: @@ -48,15 +57,21 @@ ret void } -; CHECK-LABEL: spill_v3i32: -; CHECK-DAG: buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte Folded Spill -; CHECK: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload +; GCN-LABEL: spill_v3i32: +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:32 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:36 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:40 ; 4-byte Folded Spill +; GCN: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload define void @spill_v3i32() { entry: @@ -74,15 +89,21 @@ ret void } -; CHECK-LABEL: spill_v3f32: -; CHECK-DAG: buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte 
Folded Spill -; CHECK: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload +; GCN-LABEL: spill_v3f32: +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:32 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:36 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:40 ; 4-byte Folded Spill +; GCN: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload define void @spill_v3f32() { entry: @@ -100,17 +121,25 @@ ret void } -; CHECK-LABEL: spill_v4i32: -; CHECK-DAG: buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:44 ; 4-byte Folded Spill -; CHECK: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:44 ; 4-byte Folded Reload +; 
GCN-LABEL: spill_v4i32: +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:44 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:32 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:36 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:40 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:44 ; 4-byte Folded Spill +; GCN: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:44 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:44 ; 4-byte Folded Reload define void @spill_v4i32() { entry: @@ -128,17 +157,25 @@ ret void } -; CHECK-LABEL: spill_v4f32: -; CHECK-DAG: buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:44 ; 4-byte Folded Spill -; CHECK: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:44 ; 4-byte 
Folded Reload +; GCN-LABEL: spill_v4f32: +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:44 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:32 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:36 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:40 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:44 ; 4-byte Folded Spill +; GCN: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:44 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:44 ; 4-byte Folded Reload define void @spill_v4f32() { entry: @@ -156,17 +193,25 @@ ret void } -; CHECK-LABEL: spill_v5i32: -; CHECK-DAG: buffer_store_dword v{{.*}} offset:64 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:68 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:72 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:76 ; 4-byte Folded Spill -; CHECK: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-DAG: buffer_load_dword v{{.*}} offset:64 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:68 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:72 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} 
offset:76 ; 4-byte Folded Reload +; GCN-LABEL: spill_v5i32: +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:64 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:68 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:72 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:76 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:64 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:68 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:72 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:76 ; 4-byte Folded Spill +; GCN: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:64 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:68 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:72 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:76 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:64 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:68 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:72 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:76 ; 4-byte Folded Reload define void @spill_v5i32() { entry: %alloca = alloca <5 x i32>, i32 2, align 4, addrspace(5) @@ -183,17 +228,25 @@ ret void } -; CHECK-LABEL: spill_v5f32: -; CHECK-DAG: buffer_store_dword v{{.*}} offset:64 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:68 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:72 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:76 ; 4-byte Folded Spill -; CHECK: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-DAG: buffer_load_dword v{{.*}} offset:64 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:68 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} 
offset:72 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:76 ; 4-byte Folded Reload +; GCN-LABEL: spill_v5f32: +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:64 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:68 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:72 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:76 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:64 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:68 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:72 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:76 ; 4-byte Folded Spill +; GCN: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:64 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:68 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:72 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:76 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:64 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:68 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:72 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:76 ; 4-byte Folded Reload define void @spill_v5f32() { entry: %alloca = alloca <5 x i32>, i32 2, align 4, addrspace(5) Index: llvm/test/CodeGen/AMDGPU/sgpr-spill.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/sgpr-spill.mir +++ llvm/test/CodeGen/AMDGPU/sgpr-spill.mir @@ -1,5 +1,6 @@ -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=CHECK -check-prefix=GCN64 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck 
-check-prefix=CHECK -check-prefix=GCN32 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=CHECK,GCN64,MUBUF %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=CHECK,GCN32,MUBUF %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-flat-scratch -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=CHECK,GCN64,FLATSCR %s # CHECK-LABEL: name: check_spill @@ -8,14 +9,16 @@ # CHECK: V_WRITELANE # CHECK: $sgpr12 = S_MOV_B32 $exec_lo # CHECK: $exec_lo = S_MOV_B32 1 -# CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 4 +# MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 4 +# FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr{{[0-9]+}}, $sgpr33, 4 # CHECK: $exec_lo = S_MOV_B32 killed $sgpr12 # S32 without kill # CHECK: V_WRITELANE # CHECK: $sgpr12 = S_MOV_B32 $exec_lo # CHECK: $exec_lo = S_MOV_B32 1 -# CHECK: BUFFER_STORE_DWORD_OFFSET $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 4 +# MUBUF: BUFFER_STORE_DWORD_OFFSET $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 4 +# FLATSCR: SCRATCH_STORE_DWORD_SADDR $vgpr{{[0-9]+}}, $sgpr33, 4 # CHECK: $sgpr12 = V_READLANE # S64 with kill @@ -25,7 +28,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 3 # GCN64: $exec = S_MOV_B64 3 -# CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 8 +# MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 8 +# FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr{{[0-9]+}}, $sgpr33, 8 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 @@ -36,7 +40,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 3 # GCN64: $exec = S_MOV_B64 3 -# CHECK: 
BUFFER_STORE_DWORD_OFFSET $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 8 +# MUBUF: BUFFER_STORE_DWORD_OFFSET $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 8 +# FLATSCR: SCRATCH_STORE_DWORD_SADDR $vgpr{{[0-9]+}}, $sgpr33, 8 # GCN32: $exec_lo = S_MOV_B32 $sgpr12 # GCN64: $exec = S_MOV_B64 $sgpr12_sgpr13 # GCN64: $sgpr13 = V_READLANE @@ -50,7 +55,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 7 # GCN64: $exec = S_MOV_B64 7 -# CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 16 +# MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 16 +# FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr{{[0-9]+}}, $sgpr33, 16 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 @@ -63,7 +69,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 15 # GCN64: $exec = S_MOV_B64 15 -# CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 28 +# MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 28 +# FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr{{[0-9]+}}, $sgpr33, 28 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 @@ -77,7 +84,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 31 # GCN64: $exec = S_MOV_B64 31 -# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 44 +# MUBUF: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 44 +# FLATSCR: SCRATCH_STORE_DWORD_SADDR {{(killed )?}}$vgpr{{[0-9]+}}, $sgpr33, 44 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 @@ -94,7 +102,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 255 # GCN64: $exec = S_MOV_B64 255 -# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, 
$sgpr33, 64 +# MUBUF: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 64 +# FLATSCR: SCRATCH_STORE_DWORD_SADDR {{(killed )?}}$vgpr{{[0-9]+}}, $sgpr33, 64 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 @@ -119,7 +128,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 65535 # GCN64: $exec = S_MOV_B64 65535 -# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 96 +# MUBUF: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 96 +# FLATSCR: SCRATCH_STORE_DWORD_SADDR {{(killed )?}}$vgpr{{[0-9]+}}, $sgpr33, 96 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 @@ -160,7 +170,8 @@ # GCN64: $sgpr64_sgpr65 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 4294967295 # GCN64: $exec = S_MOV_B64 4294967295 -# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 160 +# MUBUF: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 160 +# FLATSCR: SCRATCH_STORE_DWORD_SADDR {{(killed )?}}$vgpr{{[0-9]+}}, $sgpr33, 160 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr64 # GCN64: $exec = S_MOV_B64 killed $sgpr64_sgpr65 @@ -248,7 +259,8 @@ # S32 # CHECK: $sgpr12 = S_MOV_B32 $exec_lo # CHECK: $exec_lo = S_MOV_B32 1 -# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 4 +# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 4 +# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4 # CHECK: $exec_lo = S_MOV_B32 killed $sgpr12 # CHECK: $sgpr12 = V_READLANE @@ -257,7 +269,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 3 # GCN64: $exec = S_MOV_B64 3 -# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 8 +# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 8 +# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8 # GCN32: $exec_lo = S_MOV_B32 
killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 # CHECK: $sgpr12 = V_READLANE @@ -268,7 +281,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 7 # GCN64: $exec = S_MOV_B64 7 -# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 16 +# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 16 +# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 # CHECK: $sgpr12 = V_READLANE @@ -280,7 +294,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 15 # GCN64: $exec = S_MOV_B64 15 -# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 28 +# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 28 +# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 28 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 # CHECK: $sgpr12 = V_READLANE @@ -293,7 +308,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 31 # GCN64: $exec = S_MOV_B64 31 -# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 44 +# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 44 +# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 44 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 # CHECK: $sgpr12 = V_READLANE @@ -307,7 +323,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 255 # GCN64: $exec = S_MOV_B64 255 -# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 64 +# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 64 +# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 64 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 # CHECK: $sgpr12 = V_READLANE @@ -324,7 +341,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 65535 # GCN64: $exec = S_MOV_B64 65535 -# CHECK: BUFFER_LOAD_DWORD_OFFSET 
${{(sgpr[0-9_]+)*}}, $sgpr33, 96 +# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 96 +# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 96 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 # CHECK: $sgpr12 = V_READLANE @@ -349,7 +367,8 @@ # GCN64: $sgpr64_sgpr65 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 4294967295 # GCN64: $exec = S_MOV_B64 4294967295 -# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 160 +# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 160 +# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 160 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr64 # GCN64: $exec = S_MOV_B64 killed $sgpr64_sgpr65 # CHECK: $sgpr64 = V_READLANE Index: llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -1,5 +1,7 @@ ; RUN: llc -march=amdgcn -mcpu=verde -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 < %s | FileCheck -check-prefixes=CHECK,GFX6 %s ; RUN: llc -regalloc=basic -march=amdgcn -mcpu=tonga -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 < %s | FileCheck -check-prefixes=CHECK,GFX7 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=CHECK,GFX9-FLATSCR,FLATSCR %s +; RUN: llc -march=amdgcn -mcpu=gfx1030 -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=CHECK,GFX10-FLATSCR,FLATSCR %s ; ; There is something about Tonga that causes this test to spend a lot of time ; in the default register allocator. @@ -11,6 +13,14 @@ ; Just test that it compiles successfully. 
; CHECK-LABEL: test + +; GFX9-FLATSCR: s_mov_b32 [[SOFF1:s[0-9]+]], 0{{$}} +; GFX9-FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF1]] offset:{{[0-9]+}} ; 4-byte Folded Spill +; GFX9-FLATSCR: s_mov_b32 [[SOFF2:s[0-9]+]], 0{{$}} +; GFX9-FLATSCR: scratch_load_dword v{{[0-9]+}}, off, [[SOFF2]] offset:{{[0-9]+}} ; 4-byte Folded Reload + +; GFX10-FLATSCR: scratch_store_dword off, v{{[0-9]+}}, off offset:{{[0-9]+}} ; 4-byte Folded Spill +; GFX10-FLATSCR: scratch_load_dword v{{[0-9]+}}, off, off offset:{{[0-9]+}} ; 4-byte Folded Reload define amdgpu_kernel void @test(<1280 x i32> addrspace(1)* %out, <1280 x i32> addrspace(1)* %in) { entry: %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) @@ -35,11 +45,17 @@ } ; CHECK-LABEL: test_limited_sgpr -; GFX6: s_add_u32 s32, s32, 0x[[OFFSET:[0-9]+]] +; GFX6: s_add_u32 s32, s32, 0x[[OFFSET:[0-9a-f]+]] ; GFX6-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9:]+}}], s32 -; GFX6-NEXT: s_sub_u32 s32, s32, 0x[[OFFSET:[0-9]+]] +; GFX6-NEXT: s_sub_u32 s32, s32, 0x[[OFFSET]] ; GFX6: NumSgprs: 48 ; GFX6: ScratchSize: 8608 + +; FLATSCR: s_movk_i32 [[SOFF1:s[0-9]+]], 0x +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dword off, v{{[0-9]+}}, [[SOFF1]] ; 4-byte Folded Spill +; FLATSCR: s_movk_i32 [[SOFF2:s[0-9]+]], 0x +; FLATSCR: scratch_load_dword v{{[0-9]+}}, off, [[SOFF2]] ; 4-byte Folded Reload define amdgpu_kernel void @test_limited_sgpr(<64 x i32> addrspace(1)* %out, <64 x i32> addrspace(1)* %in) #0 { entry: %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)