Index: lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.h +++ lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -547,6 +547,16 @@ void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b;} bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal;} + unsigned getMaxLgkmcnt() const { + return AMDGPU::getLgkmcntBitMask( + AMDGPU::IsaInfo::getIsaVersion(getFeatureBits())); + } + + unsigned getMaxVmcnt() const { + return AMDGPU::getVmcntBitMask( + AMDGPU::IsaInfo::getIsaVersion(getFeatureBits())); + } + /// \returns Number of execution units per compute unit supported by the /// subtarget. unsigned getEUsPerCU() const { Index: lib/Target/AMDGPU/GCNHazardRecognizer.h =================================================================== --- lib/Target/AMDGPU/GCNHazardRecognizer.h +++ lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -38,6 +38,8 @@ const SISubtarget &ST; const SIInstrInfo &TII; const SIRegisterInfo &TRI; + unsigned MaxSMEMClauseSize; + unsigned MaxVMEMClauseSize; /// RegUnits of uses in the current soft memory clause. BitVector ClauseUses; Index: lib/Target/AMDGPU/GCNHazardRecognizer.cpp =================================================================== --- lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -36,15 +36,35 @@ // Hazard Recoginizer Implementation //===----------------------------------------------------------------------===// +static cl::opt +HazardMaxLookAhead("amdgpu-hazard-max-lookahead", cl::Hidden, cl::init(16), + cl::desc("Maximum instruction count to look for hazards")); + GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) : CurrCycleInstr(nullptr), MF(MF), ST(MF.getSubtarget()), TII(*ST.getInstrInfo()), TRI(TII.getRegisterInfo()), + MaxSMEMClauseSize(ST.getMaxLgkmcnt() + 1), + MaxVMEMClauseSize(ST.getMaxVmcnt() + 1), ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) { - MaxLookAhead = 5; + + // VMCNT is 4 bits, except on GFX9 where it is 6 bits, therefore the maximum + // size of a VMEM clause is 16 or 64. + // + // LGKMCNT is always 4 bits for an SMEM clause. + + // Setting it so high only allows for skipping a nop for extremely large + // clauses, so it should be fine to reduce this if it's a compile time issue. + MaxLookAhead = std::max(MaxVMEMClauseSize, MaxSMEMClauseSize); + + if (HazardMaxLookAhead.getNumOccurrencesFlag()) + MaxLookAhead = HazardMaxLookAhead; + + // The maximum number of wait states for any normal hazard type is 5. + MaxLookAhead = std::max(MaxLookAhead, 5u); } void GCNHazardRecognizer::EmitInstruction(SUnit *SU) { @@ -308,11 +328,13 @@ bool IsSMRD = TII.isSMRD(*MEM); + const unsigned MaxClauseSize = IsSMRD ? MaxSMEMClauseSize : MaxVMEMClauseSize; + resetClause(); - // A soft-clause is any group of consecutive SMEM instructions. The - // instructions in this group may return out of order and/or may be - // replayed (i.e. the same instruction issued more than once). + // A soft-clause is any group of consecutive SMEM or VMEM instructions. The + // instructions in this group may return out of order and/or may be replayed + // (i.e. the same instruction issued more than once). // // In order to handle these situations correctly we need to make sure // that when a clause has more than one instruction, no instruction in the @@ -320,6 +342,8 @@ // (including itself). If we encounter this situaion, we need to break the // clause by inserting a non SMEM instruction. + unsigned ClauseSize = 1; + for (MachineInstr *MI : EmittedInstrs) { // When we hit a non-SMEM instruction then we have passed the start of the // clause and we can stop. @@ -330,6 +354,10 @@ break; addClauseInst(*MI); + + // FIXME: The maximum clause size is actually determined by saturating the + // counter type. This is > 1 for some SMEM operations. + ++ClauseSize; } if (ClauseDefs.none()) @@ -341,6 +369,10 @@ if (MEM->mayStore()) return 1; + // The clause is implicitly broken if it exceeds the maximum. + if (ClauseSize > MaxClauseSize) + return 0; + addClauseInst(*MEM); // If the set of defs and uses intersect then we cannot add this instruction Index: test/CodeGen/AMDGPU/break-smem-soft-clauses.mir =================================================================== --- test/CodeGen/AMDGPU/break-smem-soft-clauses.mir +++ test/CodeGen/AMDGPU/break-smem-soft-clauses.mir @@ -1,5 +1,7 @@ # RUN: llc -march=amdgcn -mcpu=carrizo -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK %s # RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s +# RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s --- # Trivial clause at beginning of program @@ -111,33 +113,58 @@ S_ENDPGM ... --- -# vmcnt has 4 bits, so maximum 16 outstanding loads. The waitcnt -# breaks the clause. +# lgmcnt has 4 bits pre-gfx9, so maximum 16 outstanding loads. name: break_smem_clause_at_max_smem_clause_size_smrd_load4 body: | bb.0: ; GCN-LABEL: name: break_smem_clause_at_max_smem_clause_size_smrd_load4 + ; GCN: %sgpr13 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 ; GCN-NEXT: %sgpr14 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 ; GCN-NEXT: %sgpr15 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 ; GCN-NEXT: %sgpr16 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr17 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 ; GCN-NEXT: %sgpr18 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 ; GCN-NEXT: %sgpr19 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 ; GCN-NEXT: %sgpr20 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr21 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 ; GCN-NEXT: %sgpr22 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 ; GCN-NEXT: %sgpr23 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 ; GCN-NEXT: %sgpr24 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr25 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 ; GCN-NEXT: %sgpr26 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 ; GCN-NEXT: %sgpr27 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 ; GCN-NEXT: %sgpr28 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 - ; GCN-NEXT: %sgpr0 = S_LOAD_DWORD_IMM %sgpr30_sgpr31, 0, 0 - ; GCN-NEXT: %sgpr0 = S_MOV_B32 %sgpr0, implicit %sgpr13, implicit %sgpr14, implicit %sgpr15, implicit %sgpr16, implicit %sgpr17, implicit %sgpr18, implicit %sgpr19, implicit %sgpr20, implicit %sgpr21, implicit %sgpr22, implicit %sgpr23, implicit %sgpr24, implicit %sgpr25, implicit %sgpr26, implicit %sgpr27, implicit %sgpr28 + + ; GCN-NEXT: %sgpr29 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr30 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr31 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr32 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + + ; GCN-NEXT: %sgpr33 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr34 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr35 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr36 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + + ; GCN-NEXT: %sgpr37 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr38 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr39 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr40 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + + ; GCN-NEXT: %sgpr41 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr42 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr43 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr44 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + + ; GCN-NEXT: %sgpr10 = S_LOAD_DWORD_IMM %sgpr30_sgpr31, 0, 0 + ; GCN-NEXT: %sgpr0 = S_MOV_B32 %sgpr0 ; GCN-NEXT: S_ENDPGM + %sgpr13 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 %sgpr14 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 %sgpr15 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 @@ -158,8 +185,83 @@ %sgpr27 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 %sgpr28 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 - %sgpr0 = S_LOAD_DWORD_IMM %sgpr30_sgpr31, 0, 0 - %sgpr0 = S_MOV_B32 %sgpr0, implicit %sgpr13, implicit %sgpr14, implicit %sgpr15, implicit %sgpr16, implicit %sgpr17, implicit %sgpr18, implicit %sgpr19, implicit %sgpr20, implicit %sgpr21, implicit %sgpr22, implicit %sgpr23, implicit %sgpr24, implicit %sgpr25, implicit %sgpr26, implicit %sgpr27, implicit %sgpr28 + %sgpr29 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr30 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr31 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr32 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + + %sgpr33 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr34 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr35 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr36 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + + %sgpr37 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr38 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr39 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr40 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + + %sgpr41 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr42 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr43 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr44 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + + %sgpr10 = S_LOAD_DWORD_IMM %sgpr30_sgpr31, 0, 0 + %sgpr0 = S_MOV_B32 %sgpr0 + S_ENDPGM +... +--- + +name: break_smem_clause_at_max_smem_clause_size_m1_smrd_load4 + +body: | + bb.0: + ; GCN-LABEL: name: break_smem_clause_at_max_smem_clause_size_m1_smrd_load4 + + ; GCN: %sgpr13 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr14 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr15 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr16 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + + ; GCN-NEXT: %sgpr17 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr18 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr19 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr20 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + + ; GCN-NEXT: %sgpr21 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr22 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr23 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr24 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + + ; GCN-NEXT: %sgpr25 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr26 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: %sgpr27 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + + ; XNACK-NEXT: S_NOP 0 + ; GCN-NEXT: %sgpr10 = S_LOAD_DWORD_IMM %sgpr30_sgpr31, 0, 0 + ; GCN-NEXT: %sgpr0 = S_MOV_B32 %sgpr0 + ; GCN-NEXT: S_ENDPGM + + %sgpr13 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr14 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr15 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr16 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + + %sgpr17 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr18 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr19 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr20 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + + %sgpr21 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr22 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr23 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr24 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + + %sgpr25 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr26 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + %sgpr27 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0 + + %sgpr10 = S_LOAD_DWORD_IMM %sgpr30_sgpr31, 0, 0 + %sgpr0 = S_MOV_B32 %sgpr0 S_ENDPGM ... --- Index: test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir =================================================================== --- test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir +++ test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir @@ -1,5 +1,8 @@ -# RUN: llc -march=amdgcn -mcpu=carrizo -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK %s +# RUN: llc -march=amdgcn -mcpu=carrizo -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK,XNACK-VI %s # RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s +# RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK,XNACK-GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s + --- # Trivial clause at beginning of program name: trivial_clause_load_flat4_x1 @@ -119,15 +122,15 @@ S_ENDPGM ... --- -# vmcnt has 4 bits, so maximum 16 outstanding loads. The waitcnt -# breaks the clause. +# vmcnt has 4 bits pre-gfx9, so maximum 16 outstanding loads can be in a +# clause, so no breaking is needed. -name: break_clause_at_max_clause_size_flat_load4 +name: break_clause_at_max_clause16_size_flat_load4 body: | bb.0: - ; GCN-LABEL: name: break_clause_at_max_clause_size_flat_load4 + ; GCN-LABEL: name: break_clause_at_max_clause16_size_flat_load4 ; GCN: %vgpr2 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr ; GCN-NEXT: %vgpr3 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr ; GCN-NEXT: %vgpr4 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr @@ -144,7 +147,7 @@ ; GCN-NEXT: %vgpr15 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr ; GCN-NEXT: %vgpr16 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr ; GCN-NEXT: %vgpr17 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr - ; XNACK-NEXT: S_NOP 0 + ; XNACK-GFX9-NEXT: S_NOP 0 ; GCN-NEXT: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr ; GCN-NEXT: %sgpr0 = S_MOV_B32 %sgpr0, implicit %vgpr2, implicit %vgpr3, implicit %vgpr4, implicit %vgpr5, implicit %vgpr6, implicit %vgpr7, implicit %vgpr8, implicit %vgpr9, implicit %vgpr10, implicit %vgpr11, implicit %vgpr12, implicit %vgpr13, implicit %vgpr14, implicit %vgpr15, implicit %vgpr16, implicit %vgpr17, implicit %vgpr18 ; GCN-NEXT: S_ENDPGM @@ -174,6 +177,414 @@ S_ENDPGM ... --- +# vmcnt has 4 bits pre-gfx9, so maximum 16 outstanding loads can be in a +# clause, so no breaking is needed. + + +name: break_clause_at_max_m1_clause16_size_flat_load4 + +body: | + bb.0: + ; GCN-LABEL: name: break_clause_at_max_m1_clause16_size_flat_load4 + ; GCN: %vgpr2 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr3 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr4 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr5 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr6 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr7 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr8 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr9 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr10 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr11 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr12 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr13 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr14 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr15 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr16 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; XNACK-NEXT: S_NOP 0 + ; GCN-NEXT: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %sgpr0 = S_MOV_B32 %sgpr0, implicit %vgpr2, implicit %vgpr3, implicit %vgpr4, implicit %vgpr5, implicit %vgpr6, implicit %vgpr7, implicit %vgpr8, implicit %vgpr9, implicit %vgpr10, implicit %vgpr11, implicit %vgpr12, implicit %vgpr13, implicit %vgpr14, implicit %vgpr15, implicit %vgpr16, implicit %vgpr17, implicit %vgpr18 + ; GCN-NEXT: S_ENDPGM + + %vgpr2 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr3 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr4 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr5 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr6 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr7 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr8 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr9 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr10 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr11 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr12 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr13 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr14 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr15 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr16 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + %sgpr0 = S_MOV_B32 %sgpr0, implicit %vgpr2, implicit %vgpr3, implicit %vgpr4, implicit %vgpr5, implicit %vgpr6, implicit %vgpr7, implicit %vgpr8, implicit %vgpr9, implicit %vgpr10, implicit %vgpr11, implicit %vgpr12, implicit %vgpr13, implicit %vgpr14, implicit %vgpr15, implicit %vgpr16, implicit %vgpr17, implicit %vgpr18 + S_ENDPGM +... +--- +# vmcnt was increased to 6 bits in gfx9, so maximum 64 outstanding loads can be in a +# clause, so no breaking is needed. + +name: break_clause_at_max_clause64_size_flat_load4 + +body: | + bb.0: + ; GCN-LABEL: name: break_clause_at_max_clause64_size_flat_load4 + + ; GCN: %vgpr2 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr3 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr4 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr5 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + ; GCN-NEXT: %vgpr6 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr7 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr8 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr9 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + ; GCN-NEXT: %vgpr10 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr11 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr12 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr13 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + ; GCN-NEXT: %vgpr14 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr15 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr16 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr17 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + ; GCN-NEXT: %vgpr18 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr19 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr20 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr21 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + ; GCN-NEXT: %vgpr22 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr23 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr24 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr25 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + ; GCN-NEXT: %vgpr26 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr27 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr28 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr29 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + ; GCN-NEXT: %vgpr30 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr31 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr32 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr33 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + ; GCN-NEXT: %vgpr34 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr35 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr36 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr37 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + ; GCN-NEXT: %vgpr38 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr39 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr40 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr41 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + ; GCN-NEXT: %vgpr42 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr43 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr44 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr45 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + ; GCN-NEXT: %vgpr46 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr47 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr48 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr49 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + ; GCN-NEXT: %vgpr50 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr51 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr52 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr53 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + ; GCN-NEXT: %vgpr54 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr55 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr56 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr57 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + ; GCN-NEXT: %vgpr58 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr59 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr60 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr61 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + ; GCN-NEXT: %vgpr62 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr63 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr64 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr65 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + ; GCN-NEXT: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %sgpr0 = S_MOV_B32 %sgpr0 + ; GCN-NEXT: S_ENDPGM + + %vgpr2 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr3 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr4 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr5 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr6 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr7 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr8 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr9 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr10 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr11 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr12 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr13 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr14 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr15 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr16 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr17 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr18 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr19 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr20 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr21 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr22 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr23 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr24 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr25 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr26 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr27 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr28 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr29 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr30 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr31 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr32 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr33 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr34 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr35 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr36 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr37 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr38 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr39 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr40 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr41 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr42 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr43 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr44 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr45 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr46 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr47 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr48 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr49 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr50 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr51 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr52 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr53 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr54 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr55 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr56 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr57 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr58 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr59 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr60 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr61 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr62 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr63 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr64 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr65 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + %sgpr0 = S_MOV_B32 %sgpr0 + S_ENDPGM +... +--- +# vmcnt was increased to 6 bits in gfx9, so maximum 64 outstanding loads can be in a +# clause, so no breaking is needed. + +name: break_clause_at_max_m1_clause64_size_flat_load4 + +body: | + bb.0: + ; GCN-LABEL: name: break_clause_at_max_m1_clause64_size_flat_load4 + + ; GCN: %vgpr2 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr3 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr4 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr5 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + ; GCN-NEXT: %vgpr6 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr7 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr8 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr9 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + ; GCN-NEXT: %vgpr10 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr11 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr12 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr13 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + ; GCN-NEXT: %vgpr14 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr15 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr16 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr17 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + ; GCN-NEXT: %vgpr18 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr19 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr20 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr21 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + ; GCN-NEXT: %vgpr22 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr23 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr24 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr25 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + ; GCN-NEXT: %vgpr26 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr27 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr28 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr29 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + ; GCN-NEXT: %vgpr30 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr31 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr32 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr33 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + ; GCN-NEXT: %vgpr34 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr35 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr36 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr37 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + ; GCN-NEXT: %vgpr38 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr39 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr40 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr41 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + ; GCN-NEXT: %vgpr42 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr43 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr44 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr45 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + ; GCN-NEXT: %vgpr46 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr47 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr48 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr49 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + ; GCN-NEXT: %vgpr50 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr51 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr52 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr53 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + ; GCN-NEXT: %vgpr54 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr55 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr56 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr57 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + ; GCN-NEXT: %vgpr58 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr59 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr60 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr61 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + ; GCN-NEXT: %vgpr62 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr63 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr64 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + ; XNACK-GFX9-NEXT: S_NOP 0 + ; GCN-NEXT: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %sgpr0 = S_MOV_B32 %sgpr0 + ; GCN-NEXT: S_ENDPGM + + %vgpr2 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr3 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr4 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr5 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr6 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr7 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr8 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr9 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr10 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr11 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr12 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr13 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr14 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr15 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr16 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr17 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr18 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr19 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr20 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr21 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr22 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr23 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr24 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr25 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr26 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr27 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr28 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr29 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr30 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr31 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr32 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr33 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr34 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr35 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr36 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr37 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr38 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr39 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr40 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr41 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr42 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr43 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr44 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr45 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr46 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr47 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr48 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr49 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr50 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr51 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr52 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr53 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr54 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr55 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr56 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr57 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr58 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr59 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr60 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr61 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr62 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr63 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr64 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + %sgpr0 = S_MOV_B32 %sgpr0 + S_ENDPGM +... +--- name: break_clause_simple_load_flat4_lo_ptr @@ -239,9 +650,7 @@ ... --- -# The clause is broken by the waitcnt inserted at the end of the -# block, so no nop is needed. - +# Make sure a clause break is inserted even if the next member is in a following basic block. name: break_clause_block_boundary_load_flat8_ptr