Index: lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.h +++ lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -360,6 +360,11 @@ return FlatAddressSpace; } + /// \returns true for VOLCANIC_ISLANDS and newer generations. + bool hasSoftClauses() const { + return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS; + } + bool isMesaKernel(const MachineFunction &MF) const { return isMesa3DOS() && !AMDGPU::isShader(MF.getFunction()->getCallingConv()); } Index: lib/Target/AMDGPU/SIInsertWaits.cpp =================================================================== --- lib/Target/AMDGPU/SIInsertWaits.cpp +++ lib/Target/AMDGPU/SIInsertWaits.cpp @@ -99,6 +99,9 @@ /// \brief Registers defined by async instructions. RegCounters DefinedRegs; + /// Registers used as inputs in the current soft clause. + SmallSet<unsigned, 8> ClausePtrRegUnits; + /// \brief Different export instruction types seen since last wait. unsigned ExpInstrTypesSeen = 0; @@ -126,6 +129,13 @@ RegInterval getRegInterval(const TargetRegisterClass *RC, const MachineOperand &Reg) const; + bool inSoftClause() const { + return LastOpcodeType == VMEM; + } + + void leaveSoftClause(); + bool needToBreakSoftClause(MachineInstr &MI); + /// \brief Handle instructions async components void pushInstruction(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, @@ -299,6 +309,54 @@ return Result; } +// Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM +// or SMEM clause, respectively. +// +// The temporary workaround is to break the clauses with S_NOP. +// +// The proper solution would be to allocate registers such that all source +// and destination registers don't overlap, e.g. this is illegal: +// r0 = load r2 +// r2 = load r0 +bool SIInsertWaits::needToBreakSoftClause(MachineInstr &MI) { + // TODO: Handle SMEM clauses. + + // TODO: A clause is broken if any of the counters will overflow. Do we need + // to handle this here? + + // Don't need to break if this begins the clause. 
+ bool ClobbersClausePtr = false; + if (inSoftClause()) { + // Does MI overwrite an address register read earlier in this clause? + for (MachineOperand &Dst : MI.defs()) { + for (MCRegUnitIterator R(Dst.getReg(), TRI); R.isValid(); ++R) + ClobbersClausePtr |= ClausePtrRegUnits.count(*R) != 0; + } + } + + // Drop stale pointers when MI starts a clause, whether after a + // non-VMEM instruction or after the nop that breaks the current clause. + if (!inSoftClause() || ClobbersClausePtr) + leaveSoftClause(); + + // Always record MI's own address registers: after a break MI is the first + // instruction of the next clause and later defs must be checked against it. + MachineOperand *VAddr = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr); + if (VAddr) { + // XXX - What about scalar loads? + for (MCRegUnitIterator R(VAddr->getReg(), TRI); R.isValid(); ++R) { + // XXX - What about same register for dest and pointer? + ClausePtrRegUnits.insert(*R); + } + } + + return ClobbersClausePtr; +} + +void SIInsertWaits::leaveSoftClause() { + ClausePtrRegUnits.clear(); +} + void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const Counters &Increment) { @@ -322,29 +380,6 @@ return; } - if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { - // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM - // or SMEM clause, respectively. - // - // The temporary workaround is to break the clauses with S_NOP. - // - // The proper solution would be to allocate registers such that all source - // and destination registers don't overlap, e.g. this is illegal: - // r0 = load r2 - // r2 = load r0 - if (LastOpcodeType == VMEM && Increment.Named.VM) { - // Insert a NOP to break the clause. - BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)) - .addImm(0); - LastInstWritesM0 = false; - } - - if (TII->isSMRD(*I)) - LastOpcodeType = SMEM; - else if (Increment.Named.VM) - LastOpcodeType = VMEM; - } - // Remember which export instructions we have seen if (Increment.Named.EXP) { ExpInstrTypesSeen |= TII->isEXP(*I) ? 
1 : 2; @@ -368,6 +403,23 @@ UsedRegs[j] = Limit; } } + + if (ST->hasSoftClauses()) { + if (TII->isSMRD(*I)) + LastOpcodeType = SMEM; + else if (Increment.Named.VM) { + if (needToBreakSoftClause(*I)) { + // Insert a NOP to break the clause. + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)) + .addImm(0); + LastInstWritesM0 = false; + } + + LastOpcodeType = VMEM; + } else { + LastOpcodeType = OTHER; + } + } } bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, Index: test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir @@ -0,0 +1,623 @@ +# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass si-insert-waits %s -o - | FileCheck -check-prefix=GCN -check-prefix=VI %s + +--- | + define void @trivial_clause_load_flat4_x1() { + ret void + } + + define void @trivial_clause_load_flat4_x2() { + ret void + } + + define void @trivial_clause_load_flat4_x3() { + ret void + } + + define void @trivial_clause_load_flat4_x4() { + ret void + } + + define void @trivial_clause_load_flat4_x2_sameptr() { + ret void + } + + define void @flat_load4_overwrite_ptr_lo() { + ret void + } + + define void @flat_load4_overwrite_ptr_hi() { + ret void + } + + define void @flat_load8_overwrite_ptr() { + ret void + } + + define void @break_clause_at_max_clause_size_flat_load4() { + ret void + } + + define void @break_clause_simple_load_flat4_lo_ptr() { + ret void + } + + define void @break_clause_simple_load_flat4_hi_ptr() { + ret void + } + + define void @break_clause_simple_load_flat8_ptr() { + ret void + } + + define void @break_clause_simple_load_flat16_ptr() { + ret void + } + + define void @break_clause_block_boundary_load_flat8_ptr() { + ret void + } + + define void @break_clause_store_load_into_ptr_flat4() { + ret void + } + + define void @break_clause_store_load_into_data_flat4() { + ret void + } + + define void @valu_inst_breaks_clause() { + ret 
void + } + + define void @salu_inst_breaks_clause() { + ret void + } + + define void @ds_inst_breaks_clause() { + ret void + } + + define void @smrd_inst_breaks_clause() { + ret void + } + + define void @implicit_use_breaks_clause() { + ret void + } + + define void @trivial_clause_load_mubuf4_x2() { + ret void + } + + define void @break_clause_simple_load_mubuf_offen_ptr() { + ret void + } + + define void @mubuf_load4_overwrite_ptr() { + ret void + } + + define void @break_clause_flat_load_mubuf_load() { + ret void + } + + define void @break_clause_mubuf_load_flat_load() { + ret void + } + + define void @break_clause_atomic_rtn_into_ptr_flat4() { + ret void + } + + define void @break_clause_atomic_nortn_ptr_load_flat4() { + ret void + } + + define void @break_clause_atomic_rtn_into_ptr_mubuf4() { + ret void + } + + define void @break_clause_atomic_nortn_ptr_load_mubuf4() { + ret void + } + + define void @no_break_clause_mubuf_load_novaddr() { + ret void + } + +... +--- +# Trivial clause at beginning of program +# GCN-LABEL: name: trivial_clause_load_flat4_x1 +# GCN: bb.0: +# GCN-NEXT: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3 +name: trivial_clause_load_flat4_x1 + +body: | + bb.0: + %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- +# Trivial clause at beginning of program +# GCN-LABEL: name: trivial_clause_load_flat4_x2 +# GCN: bb.0: +# GCN-NEXT: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3 +# GCN-NEXT: %vgpr1 = FLAT_LOAD_DWORD %vgpr4_vgpr5 +name: trivial_clause_load_flat4_x2 + +body: | + bb.0: + %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr1 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... 
+--- +# Trivial clause at beginning of program +# GCN-LABEL: name: trivial_clause_load_flat4_x3 +# GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr3_vgpr4, 0, 0, 0, implicit %exec, implicit %flat_scr +# GCN: %vgpr1 = FLAT_LOAD_DWORD %vgpr5_vgpr6, 0, 0, 0, implicit %exec, implicit %flat_scr +# GCN: %vgpr2 = FLAT_LOAD_DWORD %vgpr7_vgpr8, 0, 0, 0, implicit %exec, implicit %flat_scr +# GCN: S_ENDPGM +name: trivial_clause_load_flat4_x3 + +body: | + bb.0: + %vgpr0 = FLAT_LOAD_DWORD %vgpr3_vgpr4, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr1 = FLAT_LOAD_DWORD %vgpr5_vgpr6, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr2 = FLAT_LOAD_DWORD %vgpr7_vgpr8, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- +# Trivial clause at beginning of program +# GCN-LABEL: name: trivial_clause_load_flat4_x4 +# GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr +# GCN: %vgpr1 = FLAT_LOAD_DWORD %vgpr6_vgpr7, 0, 0, 0, implicit %exec, implicit %flat_scr +# GCN: %vgpr2 = FLAT_LOAD_DWORD %vgpr8_vgpr9, 0, 0, 0, implicit %exec, implicit %flat_scr +# GCN: %vgpr3 = FLAT_LOAD_DWORD %vgpr10_vgpr11, 0, 0, 0, implicit %exec, implicit %flat_scr +# GCN: S_ENDPGM +name: trivial_clause_load_flat4_x4 + +body: | + bb.0: + %vgpr0 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr1 = FLAT_LOAD_DWORD %vgpr6_vgpr7, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr2 = FLAT_LOAD_DWORD %vgpr8_vgpr9, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr3 = FLAT_LOAD_DWORD %vgpr10_vgpr11, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... 
+--- +# Reuse of same input pointer is OK +# GCN-LABEL: name: trivial_clause_load_flat4_x2_sameptr +# GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3 +# GCN-NEXT: %vgpr1 = FLAT_LOAD_DWORD %vgpr2_vgpr3 +# GCN-NEXT: S_ENDPGM + +name: trivial_clause_load_flat4_x2_sameptr +body: | + bb.0: + %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr1 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- +# 32-bit load partially clobbers its own ptr reg +# GCN-LABEL: name: flat_load4_overwrite_ptr_lo +# GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr0_vgpr1 +name: flat_load4_overwrite_ptr_lo + +body: | + bb.0: + %vgpr0 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- +# 32-bit load partially clobbers its own ptr reg +# GCN-LABEL: name: flat_load4_overwrite_ptr_hi +# GCN: %vgpr1 = FLAT_LOAD_DWORD %vgpr0_vgpr1 +name: flat_load4_overwrite_ptr_hi + +body: | + bb.0: + %vgpr1 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- +# 64-bit load clobbers its own ptr reg +# GCN-LABEL: name: flat_load8_overwrite_ptr +# GCN: %vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3 +name: flat_load8_overwrite_ptr + +body: | + bb.0: + %vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- +# vmcnt has 4 bits, so maximum 16 outstanding loads. The waitcnt +# breaks the clause. 
+ +# GCN-LABEL: name: break_clause_at_max_clause_size_flat_load4 +# GCN: %vgpr17 = FLAT_LOAD_DWORD %vgpr0_vgpr1 +# GCN-NEXT: S_WAITCNT 112 +# GCN-NEXT: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3 + +name: break_clause_at_max_clause_size_flat_load4 + +body: | + bb.0: + %vgpr2 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr3 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr4 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr5 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr6 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr7 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr8 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr9 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr10 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr11 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr12 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr13 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr14 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr15 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr16 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr17 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + %sgpr0 = S_MOV_B32 %sgpr0, implicit %vgpr2, implicit %vgpr3, implicit %vgpr4, implicit %vgpr5, implicit %vgpr6, implicit %vgpr7, implicit %vgpr8, implicit %vgpr9, implicit %vgpr10, implicit %vgpr11, implicit %vgpr12, implicit %vgpr13, implicit %vgpr14, implicit %vgpr15, 
implicit %vgpr16, implicit %vgpr17, implicit %vgpr18 + S_ENDPGM +... +--- + +# GCN-LABEL: name: break_clause_simple_load_flat4_lo_ptr +# GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3 +# GCN-NEXT: S_NOP 0 +# GCN-NEXT: %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5 +name: break_clause_simple_load_flat4_lo_ptr + +body: | + bb.0: + %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- + +# GCN-LABEL: name: break_clause_simple_load_flat4_hi_ptr +# GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3 +# GCN-NEXT: S_NOP 0 +# GCN-NEXT: %vgpr3 = FLAT_LOAD_DWORD %vgpr4_vgpr5 +name: break_clause_simple_load_flat4_hi_ptr + +body: | + bb.0: + %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr3 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- + +# GCN-LABEL: name: break_clause_simple_load_flat8_ptr +# GCN: %vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr +# GCN-NEXT: S_NOP 0 +# GCN-NEXT: %vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr +name: break_clause_simple_load_flat8_ptr + +body: | + bb.0: + %vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- + +# GCN-LABEL: name: break_clause_simple_load_flat16_ptr +# GCN: %vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3 +# GCN-NEXT: S_NOP 0 +# GCN-NEXT: %vgpr2_vgpr3_vgpr4_vgpr5 = FLAT_LOAD_DWORDX4 %vgpr6_vgpr7 + +name: break_clause_simple_load_flat16_ptr + +body: | + bb.0: + %vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr2_vgpr3_vgpr4_vgpr5 = FLAT_LOAD_DWORDX4 %vgpr6_vgpr7, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... 
+--- + +# The clause is broken by the waitcnt inserted at the end of the +# block, so no nop is needed. +# GCN-LABEL: name: break_clause_block_boundary_load_flat8_ptr +# GCN: bb.0: +# GCN-NEXT: %vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3 +# GCN-NEXT: S_WAITCNT 112 + +# GCN: bb.1: +# GCN-NEXT: %vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 %vgpr4_vgpr5 +# GCN-NEXT: S_ENDPGM + +name: break_clause_block_boundary_load_flat8_ptr + +body: | + bb.0: + %vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + + bb.1: + %vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- +# The load clobbers the pointer of the store, so it needs to break. + +# GCN-LABEL: name: break_clause_store_load_into_ptr_flat4 +# GCN: FLAT_STORE_DWORD %vgpr2_vgpr3, %vgpr0 +# GCN-NEXT: S_WAITCNT 112 +# GCN-NEXT: %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5 +# GCN-NEXT: S_ENDPGM +name: break_clause_store_load_into_ptr_flat4 + +body: | + bb.0: + FLAT_STORE_DWORD %vgpr2_vgpr3, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- +# The load clobbers the data of the store, so it needs to break. +# FIXME: Would it be better to s_nop and wait later? + +# GCN-LABEL: name: break_clause_store_load_into_data_flat4 +# GCN: FLAT_STORE_DWORD %vgpr2_vgpr3, %vgpr0 +# GCN-NEXT: S_WAITCNT 112 +# GCN-NEXT: %vgpr0 = FLAT_LOAD_DWORD %vgpr4_vgpr5 +# GCN-NEXT: S_ENDPGM +name: break_clause_store_load_into_data_flat4 + +body: | + bb.0: + FLAT_STORE_DWORD %vgpr2_vgpr3, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr0 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... 
+--- +# Regular VALU instruction breaks clause, no nop needed + +# GCN-LABEL: name: valu_inst_breaks_clause +# GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr +# GCN-NEXT: %vgpr8 = V_MOV_B32_e32 0, implicit %exec +# GCN-NEXT: %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr +name: valu_inst_breaks_clause + +body: | + bb.0: + %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr8 = V_MOV_B32_e32 0, implicit %exec + %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- +# Regular SALU instruction breaks clause, no nop needed + +# GCN-LABEL: name: salu_inst_breaks_clause +# GCN: bb.0: +# GCN-NEXT: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr +# GCN-NEXT: %sgpr8 = S_MOV_B32 0 +# GCN-NEXT: %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr +name: salu_inst_breaks_clause + +body: | + bb.0: + %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + %sgpr8 = S_MOV_B32 0 + %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- + +# GCN-LABEL: name: ds_inst_breaks_clause +# GCN: bb.0: +# GCN-NEXT: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr +# GCN-NEXT: %vgpr8 = DS_READ_B32 %vgpr9 +# GCN-NEXT: %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr +name: ds_inst_breaks_clause + +body: | + bb.0: + %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr8 = DS_READ_B32 %vgpr9, 0, 0, implicit %m0, implicit %exec + %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... 
+--- + +# GCN-LABEL: name: smrd_inst_breaks_clause +# GCN: bb.0: +# GCN-NEXT: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr +# GCN-NEXT: %sgpr8 = S_LOAD_DWORD_IMM %sgpr0_sgpr1, 0, 0 +# GCN-NEXT: %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr +name: smrd_inst_breaks_clause + +body: | + bb.0: + %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + %sgpr8 = S_LOAD_DWORD_IMM %sgpr0_sgpr1, 0, 0 + %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- +# FIXME: Should this be handled? +# GCN-LABEL: name: implicit_use_breaks_clause +# GCN: %vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr, implicit %vgpr4_vgpr5 +# GCN-NEXT: %vgpr4_vgpr5 = FLAT_LOAD_DWORDX2 %vgpr6_vgpr7, 0, 0, 0, implicit %exec, implicit %flat_scr +name: implicit_use_breaks_clause + +body: | + bb.0: + %vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr, implicit %vgpr4_vgpr5 + %vgpr4_vgpr5 = FLAT_LOAD_DWORDX2 %vgpr6_vgpr7, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- +# GCN-LABEL: name: trivial_clause_load_mubuf4_x2 +# GCN: bb.0: +# GCN-NEXT: %vgpr1 = BUFFER_LOAD_DWORD_OFFEN %vgpr2, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4 +# GCN-NEXT: %vgpr3 = BUFFER_LOAD_DWORD_OFFEN %vgpr4, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4 +name: trivial_clause_load_mubuf4_x2 + +body: | + bb.0: + %vgpr1 = BUFFER_LOAD_DWORD_OFFEN %vgpr2, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec + %vgpr3 = BUFFER_LOAD_DWORD_OFFEN %vgpr4, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec + S_ENDPGM +... 
+--- +# GCN-LABEL: name: break_clause_simple_load_mubuf_offen_ptr +# GCN: bb.0: +# GCN-NEXT: %vgpr1 = BUFFER_LOAD_DWORD_OFFEN %vgpr2, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4 +# GCN-NEXT: S_NOP 0 +# GCN-NEXT: %vgpr2 = BUFFER_LOAD_DWORD_OFFEN %vgpr3, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4 +name: break_clause_simple_load_mubuf_offen_ptr + +body: | + bb.0: + %vgpr1 = BUFFER_LOAD_DWORD_OFFEN %vgpr2, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec + %vgpr2 = BUFFER_LOAD_DWORD_OFFEN %vgpr3, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec + S_ENDPGM +... +--- +# BUFFER instructions overwriting their own inputs is supposedly OK. + +# GCN-LABEL: name: mubuf_load4_overwrite_ptr +# GCN: bb.0: +# GCN-NEXT: %vgpr0 = BUFFER_LOAD_DWORD_OFFEN %vgpr0, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec +# GCN-NEXT: %vgpr1 = V_MOV_B32_e32 0, implicit %exec +# GCN-NEXT: S_WAITCNT 3952 +# GCN-NEXT: %vgpr2 = V_MOV_B32_e32 %vgpr0, implicit %exec +# GCN-NEXT: S_ENDPGM +name: mubuf_load4_overwrite_ptr + +body: | + bb.0: + %vgpr0 = BUFFER_LOAD_DWORD_OFFEN %vgpr0, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec + %vgpr1 = V_MOV_B32_e32 0, implicit %exec + %vgpr2 = V_MOV_B32_e32 %vgpr0, implicit %exec + S_ENDPGM +... +--- +# Break a clause from interference between mubuf and flat instructions + +# GCN-LABEL: name: break_clause_flat_load_mubuf_load +# GCN: bb.0: +# GCN-NEXT: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr +# GCN-NEXT: S_NOP 0 +# GCN-NEXT: %vgpr2 = BUFFER_LOAD_DWORD_OFFEN %vgpr1, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec +# GCN-NEXT: S_ENDPGM +name: break_clause_flat_load_mubuf_load + +body: | + bb.0: + %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr2 = BUFFER_LOAD_DWORD_OFFEN %vgpr1, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec + S_ENDPGM +... 
+# Break a clause from interference between mubuf and flat instructions + +# GCN-LABEL: name: break_clause_mubuf_load_flat_load +# GCN: bb.0: +# GCN-NEXT: %vgpr0 = BUFFER_LOAD_DWORD_OFFEN %vgpr1, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4 +# GCN-NEXT: S_NOP 0 +# GCN-NEXT: %vgpr1 = FLAT_LOAD_DWORD %vgpr2_vgpr3 +# GCN-NEXT: S_ENDPGM +name: break_clause_mubuf_load_flat_load + +body: | + bb.0: + %vgpr0 = BUFFER_LOAD_DWORD_OFFEN %vgpr1, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec + %vgpr1 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + + S_ENDPGM +... +--- + +# GCN-LABEL: name: break_clause_atomic_rtn_into_ptr_flat4 +# GCN: bb.0: +# GCN-NEXT: %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5 +# GCN-NEXT: S_NOP 0 +# GCN-NEXT: %vgpr4 = FLAT_ATOMIC_ADD_RTN %vgpr5_vgpr6, %vgpr7 +# GCN-NEXT: S_ENDPGM +name: break_clause_atomic_rtn_into_ptr_flat4 + +body: | + bb.0: + %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr4 = FLAT_ATOMIC_ADD_RTN %vgpr5_vgpr6, %vgpr7, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- +# GCN-LABEL: name: break_clause_atomic_nortn_ptr_load_flat4 +# GCN: bb.0: +# GCN-NEXT: FLAT_ATOMIC_ADD %vgpr0_vgpr1, %vgpr2, 0, 0, implicit %exec, implicit %flat_scr +# GCN-NEXT: S_WAITCNT 112 +# GCN-NEXT: %vgpr2 = FLAT_LOAD_DWORD %vgpr3_vgpr4, 0, 0, 0, implicit %exec, implicit %flat_scr +# GCN-NEXT: S_ENDPGM +name: break_clause_atomic_nortn_ptr_load_flat4 + +body: | + bb.0: + FLAT_ATOMIC_ADD %vgpr0_vgpr1, %vgpr2, 0, 0, implicit %exec, implicit %flat_scr + %vgpr2 = FLAT_LOAD_DWORD %vgpr3_vgpr4, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... 
+--- + +# GCN-LABEL: name: break_clause_atomic_rtn_into_ptr_mubuf4 +# GCN: bb.0: +# GCN-NEXT: %vgpr1 = BUFFER_LOAD_DWORD_OFFEN %vgpr2, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4 +# GCN-NEXT: S_NOP 0 +# GCN-NEXT: %vgpr2 = BUFFER_ATOMIC_ADD_RTN_OFFEN %vgpr2, %vgpr5, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4 +# GCN-NEXT: S_ENDPGM +name: break_clause_atomic_rtn_into_ptr_mubuf4 + +body: | + bb.0: + %vgpr1 = BUFFER_LOAD_DWORD_OFFEN %vgpr2, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec + %vgpr2 = BUFFER_ATOMIC_ADD_RTN_OFFEN %vgpr2, %vgpr5, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, implicit %exec + S_ENDPGM +... +--- + +# GCN-LABEL: name: break_clause_atomic_nortn_ptr_load_mubuf4 +# GCN: bb.0: +# GCN-NEXT: BUFFER_ATOMIC_ADD_OFFEN %vgpr0, %vgpr1, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4 +# GCN-NEXT: S_NOP 0 +# GCN-NEXT: %vgpr1 = BUFFER_LOAD_DWORD_OFFEN %vgpr2, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4 +# GCN-NEXT: S_ENDPGM +name: break_clause_atomic_nortn_ptr_load_mubuf4 + +body: | + bb.0: + BUFFER_ATOMIC_ADD_OFFEN %vgpr0, %vgpr1, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, implicit %exec + %vgpr1 = BUFFER_LOAD_DWORD_OFFEN %vgpr2, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec + S_ENDPGM +... +--- +# Make sure there is no assert on mubuf instructions which do not have +# vaddr, and don't add register to track. +# Neither load reads a vaddr, so no S_NOP is expected between them. +# GCN-LABEL: name: no_break_clause_mubuf_load_novaddr +# GCN: bb.0: +# GCN-NEXT: %vgpr1 = BUFFER_LOAD_DWORD_OFFSET %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4 +# GCN-NEXT: %vgpr3 = BUFFER_LOAD_DWORD_OFFSET %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4 +# GCN-NEXT: S_ENDPGM +name: no_break_clause_mubuf_load_novaddr + +body: | + bb.0: + %vgpr1 = BUFFER_LOAD_DWORD_OFFSET %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec + %vgpr3 = BUFFER_LOAD_DWORD_OFFSET %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec + S_ENDPGM +...