Index: lib/Target/AMDGPU/GCNHazardRecognizer.h =================================================================== --- lib/Target/AMDGPU/GCNHazardRecognizer.h +++ lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -14,6 +14,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPUHAZARDRECOGNIZERS_H #define LLVM_LIB_TARGET_AMDGPUHAZARDRECOGNIZERS_H +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/ScheduleHazardRecognizer.h" #include @@ -24,6 +25,7 @@ class MachineInstr; class ScheduleDAG; class SIInstrInfo; +class SIRegisterInfo; class SISubtarget; class GCNHazardRecognizer final : public ScheduleHazardRecognizer { @@ -35,6 +37,20 @@ const MachineFunction &MF; const SISubtarget &ST; const SIInstrInfo &TII; + const SIRegisterInfo &TRI; + + /// RegUnits of uses in the current soft memory clause. + BitVector ClauseUses; + + /// RegUnits of defs in the current soft memory clause. + BitVector ClauseDefs; + + void resetClause() { + ClauseUses.reset(); + ClauseDefs.reset(); + } + + void addClauseInst(const MachineInstr &MI); int getWaitStatesSince(function_ref IsHazard); int getWaitStatesSinceDef(unsigned Reg, Index: lib/Target/AMDGPU/GCNHazardRecognizer.cpp =================================================================== --- lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -40,7 +40,10 @@ CurrCycleInstr(nullptr), MF(MF), ST(MF.getSubtarget()), - TII(*ST.getInstrInfo()) { + TII(*ST.getInstrInfo()), + TRI(TII.getRegisterInfo()), + ClauseUses(TRI.getNumRegUnits()), + ClauseDefs(TRI.getNumRegUnits()) { MaxLookAhead = 5; } @@ -258,19 +261,35 @@ // No-op Hazard Detection //===----------------------------------------------------------------------===// -static void addRegsToSet(iterator_range Ops, - std::set &Set) { +static void addRegUnits(const SIRegisterInfo &TRI, + BitVector &BV, unsigned Reg) { + for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI) + BV.set(*RUI); +} + +static void addRegsToSet(const 
SIRegisterInfo &TRI, + iterator_range Ops, + BitVector &Set) { for (const MachineOperand &Op : Ops) { if (Op.isReg()) - Set.insert(Op.getReg()); + addRegUnits(TRI, Set, Op.getReg()); } } +void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) { + // XXX: Do we need to worry about implicit operands? + addRegsToSet(TRI, MI.defs(), ClauseDefs); + addRegsToSet(TRI, MI.uses(), ClauseUses); +} + int GCNHazardRecognizer::checkSMEMSoftClauseHazards(MachineInstr *SMEM) { - // SMEM soft clause are only present on VI+ - if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) + // SMEM soft clauses are only present on VI+, and only matter if xnack is + // enabled. + if (!ST.isXNACKEnabled()) return 0; + resetClause(); + // A soft-clause is any group of consecutive SMEM instructions. The // instructions in this group may return out of order and/or may be // replayed (i.e. the same instruction issued more than once). @@ -281,21 +300,16 @@ // (including itself). If we encounter this situaion, we need to break the // clause by inserting a non SMEM instruction. - std::set ClauseDefs; - std::set ClauseUses; - for (MachineInstr *MI : EmittedInstrs) { - // When we hit a non-SMEM instruction then we have passed the start of the // clause and we can stop. 
if (!MI || !SIInstrInfo::isSMRD(*MI)) break; - addRegsToSet(MI->defs(), ClauseDefs); - addRegsToSet(MI->uses(), ClauseUses); + addClauseInst(*MI); } - if (ClauseDefs.empty()) + if (ClauseDefs.none()) return 0; // FIXME: When we support stores, we need to make sure not to put loads and @@ -304,21 +318,11 @@ if (SMEM->mayStore()) return 1; - addRegsToSet(SMEM->defs(), ClauseDefs); - addRegsToSet(SMEM->uses(), ClauseUses); - - std::vector Result(std::max(ClauseDefs.size(), ClauseUses.size())); - std::vector::iterator End; - - End = std::set_intersection(ClauseDefs.begin(), ClauseDefs.end(), - ClauseUses.begin(), ClauseUses.end(), Result.begin()); + addClauseInst(*SMEM); // If the set of defs and uses intersect then we cannot add this instruction // to the clause, so we have a hazard. - if (End != Result.begin()) - return 1; - - return 0; + return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0; } int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) { Index: test/CodeGen/AMDGPU/break-smem-soft-clauses.mir =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/break-smem-soft-clauses.mir @@ -0,0 +1,351 @@ +# RUN: llc -march=amdgcn -mcpu=carrizo -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK %s +# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s + +--- +# Trivial clause at beginning of program +name: trivial_smem_clause_load_smrd4_x1 + +body: | + bb.0: + ; GCN-LABEL: name: trivial_smem_clause_load_smrd4_x1 + ; GCN: %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: S_ENDPGM + %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + S_ENDPGM +... 
+--- +# Trivial clause at beginning of program +name: trivial_smem_clause_load_smrd4_x2 + +body: | + bb.0: + ; GCN-LABEL: name: trivial_smem_clause_load_smrd4_x2 + ; GCN: %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr1 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0 + ; GCN-NEXT: S_ENDPGM + %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr1 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0 + S_ENDPGM +... +--- +# Trivial clause at beginning of program +name: trivial_smem_clause_load_smrd4_x3 + +body: | + bb.0: + ; GCN-LABEL: name: trivial_smem_clause_load_smrd4_x3 + ; GCN: %sgpr0 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0 + ; GCN-NEXT: %sgpr1 = S_LOAD_DWORD_IMM %sgpr6_sgpr7, 0, 0 + ; GCN-NEXT: %sgpr2 = S_LOAD_DWORD_IMM %sgpr14_sgpr15, 0, 0 + ; GCN-NEXT: S_ENDPGM + %sgpr0 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0 + %sgpr1 = S_LOAD_DWORD_IMM %sgpr6_sgpr7, 0, 0 + %sgpr2 = S_LOAD_DWORD_IMM %sgpr14_sgpr15, 0, 0 + S_ENDPGM +... +--- +# Trivial clause at beginning of program +name: trivial_smem_clause_load_smrd4_x4 + +body: | + bb.0: + ; GCN-LABEL: name: trivial_smem_clause_load_smrd4_x4 + ; GCN: %sgpr0 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0 + ; GCN-NEXT: %sgpr1 = S_LOAD_DWORD_IMM %sgpr8_sgpr9, 0, 0 + ; GCN-NEXT: %sgpr2 = S_LOAD_DWORD_IMM %sgpr14_sgpr15, 0, 0 + ; GCN-NEXT: %sgpr3 = S_LOAD_DWORD_IMM %sgpr16_sgpr17, 0, 0 + ; GCN-NEXT: S_ENDPGM + %sgpr0 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0 + %sgpr1 = S_LOAD_DWORD_IMM %sgpr8_sgpr9, 0, 0 + %sgpr2 = S_LOAD_DWORD_IMM %sgpr14_sgpr15, 0, 0 + %sgpr3 = S_LOAD_DWORD_IMM %sgpr16_sgpr17, 0, 0 + S_ENDPGM +... +--- +# Reuse of same input pointer is OK +name: trivial_smem_clause_load_smrd4_x2_sameptr +body: | + bb.0: + ; GCN-LABEL: name: trivial_smem_clause_load_smrd4_x2_sameptr + ; GCN: %sgpr12 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr13 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: S_ENDPGM + %sgpr12 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr13 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + S_ENDPGM +... 
+--- +# 32-bit load partially clobbers its own ptr reg +name: smrd_load4_overwrite_ptr_lo + +body: | + bb.0: + ; GCN-LABEL: name: smrd_load4_overwrite_ptr_lo + ; GCN: %sgpr10 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: S_ENDPGM + %sgpr10 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + S_ENDPGM +... +--- +# 32-bit load partially clobbers its own ptr reg +name: smrd_load4_overwrite_ptr_hi + +body: | + bb.0: + ; GCN-LABEL: name: smrd_load4_overwrite_ptr_hi + ; GCN: %sgpr11 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: S_ENDPGM + %sgpr11 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + S_ENDPGM +... +--- +# 64-bit load clobbers its own ptr reg +name: smrd_load8_overwrite_ptr + +body: | + bb.0: + ; GCN-LABEL: name: smrd_load8_overwrite_ptr + ; GCN: %sgpr10_sgpr11 = S_LOAD_DWORDX2_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: S_ENDPGM + %sgpr10_sgpr11 = S_LOAD_DWORDX2_IMM %sgpr10_sgpr11, 0, 0 + S_ENDPGM +... +--- +# vmcnt has 4 bits, so maximum 16 outstanding loads. The waitcnt +# breaks the clause. 
+ +name: break_smem_clause_at_max_smem_clause_size_smrd_load4 + +body: | + bb.0: + ; GCN-LABEL: name: break_smem_clause_at_max_smem_clause_size_smrd_load4 + ; GCN: %sgpr13 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr14 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr15 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr16 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr17 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr18 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr19 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr20 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr21 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr22 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr23 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr24 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr25 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr26 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr27 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr28 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr0 = S_LOAD_DWORD_IMM %sgpr30_sgpr31, 0, 0 + ; GCN-NEXT: %sgpr0 = S_MOV_B32 %sgpr0, implicit %sgpr13, implicit %sgpr14, implicit %sgpr15, implicit %sgpr16, implicit %sgpr17, implicit %sgpr18, implicit %sgpr19, implicit %sgpr20, implicit %sgpr21, implicit %sgpr22, implicit %sgpr23, implicit %sgpr24, implicit %sgpr25, implicit %sgpr26, implicit %sgpr27, implicit %sgpr28 + ; GCN-NEXT: S_ENDPGM + %sgpr13 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr14 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr15 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr16 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + + %sgpr17 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr18 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr19 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr20 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + + %sgpr21 = S_LOAD_DWORD_IMM 
%sgpr10_sgpr11, 0, 0 + %sgpr22 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr23 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr24 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + + %sgpr25 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr26 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr27 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr28 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + + %sgpr0 = S_LOAD_DWORD_IMM %sgpr30_sgpr31, 0, 0 + %sgpr0 = S_MOV_B32 %sgpr0, implicit %sgpr13, implicit %sgpr14, implicit %sgpr15, implicit %sgpr16, implicit %sgpr17, implicit %sgpr18, implicit %sgpr19, implicit %sgpr20, implicit %sgpr21, implicit %sgpr22, implicit %sgpr23, implicit %sgpr24, implicit %sgpr25, implicit %sgpr26, implicit %sgpr27, implicit %sgpr28 + S_ENDPGM +... +--- + +name: break_smem_clause_simple_load_smrd4_lo_ptr + +body: | + bb.0: + ; GCN-LABEL: name: break_smem_clause_simple_load_smrd4_lo_ptr + ; GCN: %sgpr10 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; XNACK-NEXT: S_NOP 0 + ; GCN-NEXT: %sgpr12 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0 + ; GCN-NEXT: S_ENDPGM + %sgpr10 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr12 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0 + S_ENDPGM +... +--- + +name: break_smem_clause_simple_load_smrd4_hi_ptr + +body: | + bb.0: + ; GCN-LABEL: name: break_smem_clause_simple_load_smrd4_hi_ptr + ; GCN: %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr3 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0 + ; GCN-NEXT: S_ENDPGM + %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr3 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0 + S_ENDPGM +... 
+--- + +name: break_smem_clause_simple_load_smrd8_ptr + +body: | + bb.0: + ; GCN-LABEL: name: break_smem_clause_simple_load_smrd8_ptr + ; GCN: %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM %sgpr10_sgpr11, 0, 0 + ; XNACK-NEXT: S_NOP 0 + ; GCN-NEXT: %sgpr10_sgpr11 = S_LOAD_DWORDX2_IMM %sgpr12_sgpr13, 0, 0 + ; GCN-NEXT: S_ENDPGM + %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM %sgpr10_sgpr11, 0, 0 + %sgpr10_sgpr11 = S_LOAD_DWORDX2_IMM %sgpr12_sgpr13, 0, 0 + S_ENDPGM +... +--- + +name: break_smem_clause_simple_load_smrd16_ptr + +body: | + bb.0: + ; GCN-LABEL: name: break_smem_clause_simple_load_smrd16_ptr + ; GCN: %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX4_IMM %sgpr6_sgpr7, 0, 0 + ; GCN-NEXT: S_ENDPGM + %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM %sgpr10_sgpr11, 0, 0 + %sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX4_IMM %sgpr6_sgpr7, 0, 0 + S_ENDPGM +... +--- + +name: break_smem_clause_block_boundary_load_smrd8_ptr + +body: | + ; GCN-LABEL: name: break_smem_clause_block_boundary_load_smrd8_ptr + ; GCN: bb.0: + ; GCN: successors: %bb.1(0x80000000) + ; GCN: %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM %sgpr10_sgpr11, 0, 0 + ; GCN: bb.1: + ; XNACK-NEXT: S_NOP 0 + ; GCN-NEXT: %sgpr10_sgpr11 = S_LOAD_DWORDX2_IMM %sgpr12_sgpr13, 0, 0 + ; GCN-NEXT: S_ENDPGM + bb.0: + %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM %sgpr10_sgpr11, 0, 0 + + bb.1: + %sgpr10_sgpr11 = S_LOAD_DWORDX2_IMM %sgpr12_sgpr13, 0, 0 + S_ENDPGM +... +--- +# The load clobbers the pointer of the store, so it needs to break. + +name: break_smem_clause_store_load_into_ptr_smrd4 + +body: | + bb.0: + ; GCN-LABEL: name: break_smem_clause_store_load_into_ptr_smrd4 + ; GCN: S_STORE_DWORD_IMM %sgpr16, %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr12 = S_LOAD_DWORD_IMM %sgpr14_sgpr15, 0, 0 + ; GCN-NEXT: S_ENDPGM + S_STORE_DWORD_IMM %sgpr16, %sgpr10_sgpr11, 0, 0 + %sgpr12 = S_LOAD_DWORD_IMM %sgpr14_sgpr15, 0, 0 + S_ENDPGM +... +--- +# The load clobbers the data of the store, so it needs to break. 
+# FIXME: Would it be better to s_nop and wait later? + +name: break_smem_clause_store_load_into_data_smrd4 + +body: | + bb.0: + ; GCN-LABEL: name: break_smem_clause_store_load_into_data_smrd4 + ; GCN: S_STORE_DWORD_IMM %sgpr8, %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr8 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0 + ; GCN-NEXT: S_ENDPGM + S_STORE_DWORD_IMM %sgpr8, %sgpr10_sgpr11, 0, 0 + %sgpr8 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0 + S_ENDPGM +... +--- +# Regular VALU instruction breaks clause, no nop needed +name: valu_inst_breaks_smem_clause + +body: | + bb.0: + ; GCN-LABEL: name: valu_inst_breaks_smem_clause + ; GCN: %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %vgpr8 = V_MOV_B32_e32 0, implicit %exec + ; GCN-NEXT: %sgpr2 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0 + ; GCN-NEXT: S_ENDPGM + %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %vgpr8 = V_MOV_B32_e32 0, implicit %exec + %sgpr2 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0 + S_ENDPGM +... +--- +# Regular SALU instruction breaks clause, no nop needed +name: salu_inst_breaks_smem_clause + +body: | + bb.0: + ; GCN-LABEL: name: salu_inst_breaks_smem_clause + ; GCN: %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr8 = S_MOV_B32 0 + ; GCN-NEXT: %sgpr2 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0 + ; GCN-NEXT: S_ENDPGM + %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr8 = S_MOV_B32 0 + %sgpr2 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0 + S_ENDPGM +... +--- +name: ds_inst_breaks_smem_clause + +body: | + bb.0: + ; GCN-LABEL: name: ds_inst_breaks_smem_clause + ; GCN: %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %vgpr8 = DS_READ_B32 %vgpr9, 0, 0, implicit %m0, implicit %exec + ; GCN-NEXT: %sgpr2 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0 + ; GCN-NEXT: S_ENDPGM + %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %vgpr8 = DS_READ_B32 %vgpr9, 0, 0, implicit %m0, implicit %exec + %sgpr2 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0 + S_ENDPGM +... 
+--- + +name: flat_inst_breaks_smem_clause + +body: | + bb.0: + ; GCN-LABEL: name: flat_inst_breaks_smem_clause + ; GCN: %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %vgpr0 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %sgpr2 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0 + ; GCN-NEXT: S_ENDPGM + %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %vgpr0 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %sgpr2 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0 + S_ENDPGM +... +--- +# FIXME: Should this be handled? +name: implicit_use_breaks_smem_clause + +body: | + bb.0: + ; GCN-LABEL: name: implicit_use_breaks_smem_clause + ; GCN: %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM %sgpr10_sgpr11, 0, 0, implicit %sgpr12_sgpr13 + ; XNACK-NEXT: S_NOP 0 + ; GCN-NEXT: %sgpr12_sgpr13 = S_LOAD_DWORDX2_IMM %sgpr6_sgpr7, 0, 0 + ; GCN-NEXT: S_ENDPGM + %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM %sgpr10_sgpr11, 0, 0, implicit %sgpr12_sgpr13 + %sgpr12_sgpr13 = S_LOAD_DWORDX2_IMM %sgpr6_sgpr7, 0, 0 + S_ENDPGM +... 
Index: test/CodeGen/AMDGPU/immv216.ll =================================================================== --- test/CodeGen/AMDGPU/immv216.ll +++ test/CodeGen/AMDGPU/immv216.ll @@ -282,9 +282,9 @@ ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5 ; GFX9: buffer_store_dword [[REG]] -; VI: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800 ; VI: buffer_load_dword ; VI-NOT: and +; VI: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}} ; VI: v_or_b32 Index: test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -1,4 +1,4 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=gfx901 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s ; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CIVI -check-prefix=VI -check-prefix=GFX89 %s ; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CIVI -check-prefix=CI %s @@ -428,10 +428,12 @@ ; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr: ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7 +; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7 + ; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] ; GCN: 
{{flat|global}}_load_dword [[VEC:v[0-9]+]] +; GFX89-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]] ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] @@ -455,11 +457,12 @@ ; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr: ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234 +; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234 ; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] +; GFX89-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]] ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] Index: test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll @@ -81,8 +81,8 @@ } ; GCN-LABEL: {{^}}div_fixup_f16_imm_a_imm_b -; VI: v_mov_b32_e32 v[[AB_F16:[0-9]+]], 0x4200{{$}} -; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] +; VI-DAG: v_mov_b32_e32 v[[AB_F16:[0-9]+]], 0x4200{{$}} +; GCN-DAG: buffer_load_ushort v[[C_F16:[0-9]+]] ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[AB_F16]], v[[AB_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm @@ -97,8 +97,8 @@ } ; GCN-LABEL: {{^}}div_fixup_f16_imm_b_imm_c -; VI: v_mov_b32_e32 v[[BC_F16:[0-9]+]], 0x4200{{$}} -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; VI-DAG: v_mov_b32_e32 v[[BC_F16:[0-9]+]], 0x4200{{$}} +; GCN-DAG: buffer_load_ushort v[[A_F16:[0-9]+]] ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[BC_F16]], v[[BC_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm @@ -113,8 +113,8 @@ } ; GCN-LABEL: {{^}}div_fixup_f16_imm_a_imm_c -; VI: v_mov_b32_e32 v[[AC_F16:[0-9]+]], 0x4200{{$}} -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; VI-DAG: v_mov_b32_e32 v[[AC_F16:[0-9]+]], 0x4200{{$}} 
+; GCN-DAG: buffer_load_ushort v[[B_F16:[0-9]+]] ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[AC_F16]], v[[B_F16]], v[[AC_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm Index: test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll =================================================================== --- test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll +++ test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll @@ -45,14 +45,16 @@ ; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} ; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} ; SI: buffer_load_dword [[VA0:v[0-9]+]] -; SI: buffer_load_dword [[VA1:v[0-9]+]] +; SI-NEXT: buffer_load_dword [[VA1:v[0-9]+]] ; GCN-NOT: v_mov_b32 -; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; GCN-NOT: v_mov_b32 ; VI: buffer_load_dword [[VA0:v[0-9]+]] -; VI: buffer_load_dword [[VA1:v[0-9]+]] +; VI-NEXT: buffer_load_dword [[VA1:v[0-9]+]] + +; GCN-NOT: v_mov_b32 +; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; GCN-NOT: v_mov_b32 ; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SA]], [[VA0]], [[VB]] ; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SA]], [[VA1]], [[VB]]