Index: lib/Target/AMDGPU/SIInsertWaits.cpp =================================================================== --- lib/Target/AMDGPU/SIInsertWaits.cpp +++ lib/Target/AMDGPU/SIInsertWaits.cpp @@ -522,6 +522,7 @@ TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); IV = getIsaVersion(ST->getFeatureBits()); + const SIMachineFunctionInfo *MFI = MF.getInfo(); HardwareLimits.Named.VM = getVmcntBitMask(IV); HardwareLimits.Named.EXP = getExpcntBitMask(IV); @@ -532,20 +533,27 @@ LastIssued = ZeroCounts; LastOpcodeType = OTHER; LastInstWritesM0 = false; - ReturnsVoid = MF.getInfo()->returnsVoid(); + ReturnsVoid = MFI->returnsVoid(); memset(&UsedRegs, 0, sizeof(UsedRegs)); memset(&DefinedRegs, 0, sizeof(DefinedRegs)); SmallVector RemoveMI; + SmallVector EndPgmBlocks; + + bool HaveScalarStores = false; for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { MachineBasicBlock &MBB = *BI; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I) { + if (!HaveScalarStores && TII->isScalarStore(*I)) + HaveScalarStores = true; + if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) { // There is a hardware bug on CI/SI where SMRD instruction may corrupt // vccz bit, so when we detect that an instruction may read from a @@ -614,12 +622,45 @@ pushInstruction(MBB, I, Increment); handleSendMsg(MBB, I); + + if (I->getOpcode() == AMDGPU::S_ENDPGM || + I->getOpcode() == AMDGPU::SI_RETURN) + EndPgmBlocks.push_back(&MBB); } // Wait for everything at the end of the MBB Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); } + if (HaveScalarStores) { + // If scalar writes are used, the cache must be flushed or else the next + // wave to reuse the same scratch memory can be clobbered. + // + // Insert s_dcache_wb at wave termination points if there were any scalar + // stores, and only if the cache hasn't already been flushed. This could be + // improved by looking across blocks for flushes in postdominating blocks + // from the stores but an explicitly requested flush is probably very rare. + for (MachineBasicBlock *MBB : EndPgmBlocks) { + bool SeenDCacheWB = false; + + for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); + I != E; ++I) { + + if (I->getOpcode() == AMDGPU::S_DCACHE_WB) + SeenDCacheWB = true; + else if (TII->isScalarStore(*I)) + SeenDCacheWB = false; + + // FIXME: It would be better to insert this before a waitcnt if any. + if ((I->getOpcode() == AMDGPU::S_ENDPGM || + I->getOpcode() == AMDGPU::SI_RETURN) && !SeenDCacheWB) { + Changes = true; + BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB)); + } + } + } + } + for (MachineInstr *I : RemoveMI) I->eraseFromParent(); Index: lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.cpp +++ lib/Target/AMDGPU/SIInstrInfo.cpp @@ -611,7 +611,7 @@ MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass); } - BuildMI(MBB, MI, DL, OpDesc) + MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc) .addReg(SrcReg, getKillRegState(isKill)) // data .addFrameIndex(FrameIndex) // addr .addMemOperand(MMO) @@ -621,6 +621,11 @@ // needing them, and need to ensure that the reserved registers are // correctly handled. + if (ST.hasScalarStores()) { + // m0 is used for offset to scalar stores if used to spill. + Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine); + } + return; } @@ -710,12 +715,17 @@ MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass); } - BuildMI(MBB, MI, DL, OpDesc, DestReg) + MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg) .addFrameIndex(FrameIndex) // addr .addMemOperand(MMO) .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit); + if (ST.hasScalarStores()) { + // m0 is used for offset to scalar stores if used to spill. + Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine); + } + return; } Index: lib/Target/AMDGPU/SIRegisterInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.cpp +++ lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -24,6 +24,12 @@ using namespace llvm; +static cl::opt EnableSpillSGPRToSMEM( + "amdgpu-spill-sgpr-to-smem", + cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"), + cl::init(true)); + + static bool hasPressureSet(const int *PSets, unsigned PSetID) { for (unsigned i = 0; PSets[i] != -1; ++i) { if (PSets[i] == (int)PSetID) @@ -475,18 +481,21 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, RegScavenger *RS) const { - MachineFunction *MF = MI->getParent()->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); MachineBasicBlock *MBB = MI->getParent(); - SIMachineFunctionInfo *MFI = MF->getInfo(); - MachineFrameInfo &FrameInfo = MF->getFrameInfo(); + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); const SISubtarget &ST = MF->getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); - const DebugLoc &DL = MI->getDebugLoc(); unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); unsigned SuperReg = MI->getOperand(0).getReg(); bool IsKill = MI->getOperand(0).isKill(); + const DebugLoc &DL = MI->getDebugLoc(); + + SIMachineFunctionInfo *MFI = MF->getInfo(); + MachineFrameInfo &FrameInfo = MF->getFrameInfo(); + + bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM; // SubReg carries the "Kill" flag when SubReg == SuperReg. unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill); @@ -495,6 +504,55 @@ unsigned SubReg = NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, getSubRegFromChannel(i)); + if (SpillToSMEM) { + if (SuperReg == AMDGPU::M0) { + assert(NumSubRegs == 1); + unsigned CopyM0 + = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), CopyM0) + .addReg(AMDGPU::M0, getKillRegState(IsKill)); + + // The real spill now kills the temp copy. + SubReg = SuperReg = CopyM0; + IsKill = true; + } + + int64_t FrOffset = FrameInfo.getObjectOffset(Index); + unsigned Size = FrameInfo.getObjectSize(Index); + unsigned Align = FrameInfo.getObjectAlignment(Index); + MachinePointerInfo PtrInfo + = MachinePointerInfo::getFixedStack(*MF, Index); + MachineMemOperand *MMO + = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, + Size, Align); + + unsigned OffsetReg = AMDGPU::M0; + // Add i * 4 wave offset. + // + // SMEM instructions only support a single offset, so increment the wave + // offset. + + int64_t Offset = ST.getWavefrontSize() * (FrOffset + 4 * i); + if (Offset != 0) { + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg) + .addReg(MFI->getScratchWaveOffsetReg()) + .addImm(Offset); + } else { + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) + .addReg(MFI->getScratchWaveOffsetReg()); + } + + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BUFFER_STORE_DWORD_SGPR)) + .addReg(SubReg, getKillRegState(IsKill)) // sdata + .addReg(MFI->getScratchRSrcReg()) // sbase + .addReg(OffsetReg) // soff + .addImm(0) // glc + .addMemOperand(MMO); + + continue; + } + struct SIMachineFunctionInfo::SpilledReg Spill = MFI->getSpilledReg(MF, Index, i); if (Spill.hasReg()) { @@ -521,7 +579,8 @@ // it are fixed. } else { // Spill SGPR to a frame index. - // FIXME we should use S_STORE_DWORD here for VI. + // TODO: Should VI try to spill to VGPR and then spill to SMEM? + MachineInstrBuilder Mov = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) .addReg(SubReg, SubKillState); @@ -572,6 +631,7 @@ unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); unsigned SuperReg = MI->getOperand(0).getReg(); + bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM; // m0 is not allowed as with readlane/writelane, so a temporary SGPR and // extra copy is needed. @@ -581,11 +641,45 @@ SuperReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); } + int64_t FrOffset = FrameInfo.getObjectOffset(Index); + for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); unsigned SubReg = NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, getSubRegFromChannel(i)); + if (SpillToSMEM) { + unsigned Size = FrameInfo.getObjectSize(Index); + unsigned Align = FrameInfo.getObjectAlignment(Index); + MachinePointerInfo PtrInfo + = MachinePointerInfo::getFixedStack(*MF, Index); + MachineMemOperand *MMO + = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, + Size, Align); + + unsigned OffsetReg = AMDGPU::M0; + + // Add i * 4 offset + int64_t Offset = ST.getWavefrontSize() * (FrOffset + 4 * i); + if (Offset != 0) { + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg) + .addReg(MFI->getScratchWaveOffsetReg()) + .addImm(Offset); + } else { + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) + .addReg(MFI->getScratchWaveOffsetReg()); + } + + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BUFFER_LOAD_DWORD_SGPR), SubReg) + .addReg(MFI->getScratchRSrcReg()) // sbase + .addReg(OffsetReg) // soff + .addImm(0) // glc + .addMemOperand(MMO) + .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); + + continue; + } + SIMachineFunctionInfo::SpilledReg Spill = MFI->getSpilledReg(MF, Index, i); Index: test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll =================================================================== --- test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll +++ test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll @@ -34,9 +34,9 @@ attributes #2 = {"amdgpu-flat-work-group-size"="128,128"} ; CHECK-LABEL: {{^}}min_1024_max_2048 -; CHECK: SGPRBlocks: 2 +; CHECK: SGPRBlocks: 1 ; CHECK: VGPRBlocks: 7 -; CHECK: NumSGPRsForWavesPerEU: 19 +; CHECK: NumSGPRsForWavesPerEU: 15 ; CHECK: NumVGPRsForWavesPerEU: 32 @var = addrspace(1) global float 0.0 define void @min_1024_max_2048() #3 { Index: test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll =================================================================== --- test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll +++ test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll @@ -1,16 +1,20 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSGPR -check-prefix=ALL %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=ALL %s -; CHECK-LABEL: {{^}}max_14_sgprs: +; If spilling to smem, additional registers are used for the resource +; descriptor. + +; ALL-LABEL: {{^}}max_14_sgprs: ; FIXME: Should be ablo to skip this copying of the private segment ; buffer because all the SGPR spills are to VGPRs. -; CHECK: s_mov_b64 s[6:7], s[2:3] -; CHECK: s_mov_b64 s[4:5], s[0:1] - -; CHECK: SGPRBlocks: 1 -; CHECK: NumSGPRsForWavesPerEU: 14 +; ALL: s_mov_b64 s[6:7], s[2:3] +; ALL: s_mov_b64 s[4:5], s[0:1] +; ALL: SGPRBlocks: 1 +; ALL: NumSGPRsForWavesPerEU: 14 define void @max_14_sgprs(i32 addrspace(1)* %out1, + i32 addrspace(1)* %out2, i32 addrspace(1)* %out3, i32 addrspace(1)* %out4, Index: test/CodeGen/AMDGPU/basic-branch.ll =================================================================== --- test/CodeGen/AMDGPU/basic-branch.ll +++ test/CodeGen/AMDGPU/basic-branch.ll @@ -1,5 +1,5 @@ ; RUN: llc -O0 -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s -; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s +; RUN: llc -O0 -march=amdgcn -mcpu=tonga -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s Index: test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll =================================================================== --- test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll +++ test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll @@ -1,14 +1,42 @@ -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=SGPR %s +; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=SMEM %s ; Make sure this doesn't crash. -; CHECK: {{^}}test: +; ALL-LABEL: {{^}}test: + ; Make sure we are handling hazards correctly. -; CHECK: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_readfirstlane_b32 s[[HI:[0-9]+]], [[VHI]] -; CHECK-NEXT: s_nop 4 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:[[HI]]{{\]}}, 0 -; CHECK: s_endpgm +; SGPR: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:12 +; SGPR-NEXT: s_waitcnt vmcnt(0) +; SGPR-NEXT: v_readfirstlane_b32 s[[HI:[0-9]+]], [[VHI]] +; SGPR-NEXT: s_nop 4 +; SGPR-NEXT: buffer_store_dword v0, off, s[0:[[HI]]{{\]}}, 0 + + +; Make sure scratch wave offset register is correctly incremented and +; then restored. +; SMEM: s_mov_b32 m0, s97{{$}} +; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Spill +; SMEM: s_add_u32 m0, s97, 0x100{{$}} +; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Spill +; SMEM: s_add_u32 m0, s97, 0x200{{$}} +; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Spill +; SMEM: s_add_u32 m0, s97, 0x300{{$}} +; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Spill + + +; SMEM: s_mov_b32 m0, s97{{$}} +; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Reload +; SMEM: s_add_u32 m0, s97, 0x100{{$}} +; SMEM: s_waitcnt lgkmcnt(0) +; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Reload +; SMEM: s_add_u32 m0, s97, 0x200{{$}} +; SMEM: s_waitcnt lgkmcnt(0) +; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Reload +; SMEM: s_add_u32 m0, s97, 0x300{{$}} +; SMEM: s_waitcnt lgkmcnt(0) +; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Reload + +; ALL: s_endpgm define void @test(i32 addrspace(1)* %out, i32 %in) { call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" () call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" () Index: test/CodeGen/AMDGPU/spill-m0.ll =================================================================== --- test/CodeGen/AMDGPU/spill-m0.ll +++ test/CodeGen/AMDGPU/spill-m0.ll @@ -1,7 +1,8 @@ ; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVGPR -check-prefix=GCN %s -; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVGPR -check-prefix=GCN %s +; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -amdgpu-spill-sgpr-to-smem=0 -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVGPR -check-prefix=GCN %s ; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVMEM -check-prefix=GCN %s -; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -march=amdgcn -mattr=+vgpr-spilling -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=TOVMEM -check-prefix=GCN %s +; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-spill-sgpr-to-smem=0 -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVMEM -check-prefix=GCN %s +; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-spill-sgpr-to-smem=1 -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=GCN %s ; XXX - Why does it like to use vcc? @@ -16,6 +17,13 @@ ; TOVMEM: v_mov_b32_e32 [[SPILL_VREG:v[0-9]+]], m0 ; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; 4-byte Folded Spill ; TOVMEM: s_waitcnt vmcnt(0) + +; TOSMEM: s_mov_b32 vcc_hi, m0 +; TOSMEM: s_mov_b32 m0, s3{{$}} +; TOSMEM-NOT: vcc_hi +; TOSMEM: s_buffer_store_dword vcc_hi, s[88:91], m0 ; 4-byte Folded Spill +; TOSMEM: s_waitcnt lgkmcnt(0) + ; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]] ; GCN: [[ENDIF]]: @@ -27,6 +35,11 @@ ; TOVMEM: v_readfirstlane_b32 vcc_hi, [[RELOAD_VREG]] ; TOVMEM: s_mov_b32 m0, vcc_hi +; TOSMEM: s_mov_b32 m0, s3{{$}} +; TOSMEM: s_buffer_load_dword vcc_hi, s[88:91], m0 ; 4-byte Folded Reload +; TOSMEM-NOT: vcc_hi +; TOSMEM: s_mov_b32 m0, vcc_hi + ; GCN: s_add_i32 m0, m0, 1 define void @spill_m0(i32 %cond, i32 addrspace(1)* %out) #0 { entry: @@ -48,6 +61,8 @@ ; GCN-LABEL: {{^}}spill_m0_lds: ; GCN-NOT: v_readlane_b32 m0 +; GCN-NOT: s_buffer_store_dword m0 +; GCN-NOT: s_buffer_load_dword m0 define amdgpu_ps void @spill_m0_lds(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) #0 { main_body: %4 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3) Index: test/CodeGen/MIR/AMDGPU/scalar-store-cache-flush.mir =================================================================== --- /dev/null +++ test/CodeGen/MIR/AMDGPU/scalar-store-cache-flush.mir @@ -0,0 +1,173 @@ +# RUN: llc -march=amdgcn -run-pass si-insert-waits %s -o - | FileCheck %s + +--- | + define void @basic_insert_dcache_wb() { + ret void + } + + define void @explicit_flush_after() { + ret void + } + + define void @explicit_flush_before() { + ret void + } + + define void @no_scalar_store() { + ret void + } + + define void @multi_block_store() { + bb0: + br i1 undef, label %bb1, label %bb2 + + bb1: + ret void + + bb2: + ret void + } + + define void @one_block_store() { + bb0: + br i1 undef, label %bb1, label %bb2 + + bb1: + ret void + + bb2: + ret void + } + + define amdgpu_ps float @si_return() { + ret float undef + } + +... +--- +# CHECK-LABEL: name: basic_insert_dcache_wb +# CHECK: bb.0: +# CHECK-NEXT: S_STORE_DWORD +# CHECK-NEXT: S_DCACHE_WB +# CHECK-NEXT: S_ENDPGM + +name: basic_insert_dcache_wb +tracksRegLiveness: false + +body: | + bb.0: + S_STORE_DWORD_SGPR undef %sgpr2, undef %sgpr0_sgpr1, undef %m0, 0 + S_ENDPGM +... +--- +# Already has an explicitly requested flush after the last store. +# CHECK-LABEL: name: explicit_flush_after +# CHECK: bb.0: +# CHECK-NEXT: S_STORE_DWORD +# CHECK-NEXT: S_DCACHE_WB +# CHECK-NEXT: S_ENDPGM + +name: explicit_flush_after +tracksRegLiveness: false + +body: | + bb.0: + S_STORE_DWORD_SGPR undef %sgpr2, undef %sgpr0_sgpr1, undef %m0, 0 + S_DCACHE_WB + S_ENDPGM +... +--- +# Already has an explicitly requested flush before the last store. +# CHECK-LABEL: name: explicit_flush_before +# CHECK: bb.0: +# CHECK-NEXT: S_DCACHE_WB +# CHECK-NEXT: S_STORE_DWORD +# CHECK-NEXT: S_DCACHE_WB +# CHECK-NEXT: S_ENDPGM + +name: explicit_flush_before +tracksRegLiveness: false + +body: | + bb.0: + S_DCACHE_WB + S_STORE_DWORD_SGPR undef %sgpr2, undef %sgpr0_sgpr1, undef %m0, 0 + S_ENDPGM +... +--- +# CHECK-LABEL: no_scalar_store +# CHECK: bb.0 +# CHECK-NEXT: S_ENDPGM +name: no_scalar_store +tracksRegLiveness: false + +body: | + bb.0: + S_ENDPGM +... + +# CHECK-LABEL: name: multi_block_store +# CHECK: bb.0: +# CHECK-NEXT: S_STORE_DWORD +# CHECK-NEXT: S_DCACHE_WB +# CHECK-NEXT: S_ENDPGM + +# CHECK: bb.1: +# CHECK-NEXT: S_STORE_DWORD +# CHECK-NEXT: S_DCACHE_WB +# CHECK-NEXT: S_ENDPGM + +name: multi_block_store +tracksRegLiveness: false + +body: | + bb.0: + S_STORE_DWORD_SGPR undef %sgpr2, undef %sgpr0_sgpr1, undef %m0, 0 + S_ENDPGM + + bb.1: + S_STORE_DWORD_SGPR undef %sgpr4, undef %sgpr6_sgpr7, undef %m0, 0 + S_ENDPGM +... +... + +# This one should be able to omit the flush in the storeless block but +# this isn't handled now. + +# CHECK-LABEL: name: one_block_store +# CHECK: bb.0: +# CHECK-NEXT: S_DCACHE_WB +# CHECK-NEXT: S_ENDPGM + +# CHECK: bb.1: +# CHECK-NEXT: S_STORE_DWORD +# CHECK-NEXT: S_DCACHE_WB +# CHECK-NEXT: S_ENDPGM + +name: one_block_store +tracksRegLiveness: false + +body: | + bb.0: + S_ENDPGM + + bb.1: + S_STORE_DWORD_SGPR undef %sgpr4, undef %sgpr6_sgpr7, undef %m0, 0 + S_ENDPGM +... +--- +# CHECK-LABEL: name: si_return +# CHECK: bb.0: +# CHECK-NEXT: S_STORE_DWORD +# CHECK-NEXT: S_WAITCNT +# CHECK-NEXT: S_DCACHE_WB +# CHECK-NEXT: SI_RETURN + +name: si_return +tracksRegLiveness: false + +body: | + bb.0: + S_STORE_DWORD_SGPR undef %sgpr2, undef %sgpr0_sgpr1, undef %m0, 0 + SI_RETURN undef %vgpr0 +...