Index: llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -285,6 +285,7 @@ // Basic subtarget description. Triple TargetTriple; unsigned Gen; + AMDGPU::IsaVersion ISAVersion; InstrItineraryData InstrItins; int LDSBankCount; unsigned MaxPrivateElementSize; @@ -453,6 +454,10 @@ return (Generation)Gen; } + AMDGPU::IsaVersion getIsaVersion() const { + return ISAVersion; + } + unsigned getWavefrontSizeLog2() const { return Log2_32(WavefrontSize); } @@ -731,6 +736,14 @@ return getGeneration() > GFX9; } + unsigned getMaxLgkmcnt() const { + return AMDGPU::getLgkmcntBitMask(ISAVersion); + } + + unsigned getMaxVmcnt() const { + return AMDGPU::getVmcntBitMask(ISAVersion); + } + bool hasD16LoadStore() const { return getGeneration() >= GFX9; } Index: llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -173,6 +173,7 @@ AMDGPUSubtarget(TT), TargetTriple(TT), Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS), + ISAVersion(AMDGPU::getIsaVersion(GPU)), InstrItins(getInstrItineraryForCPU(GPU)), LDSBankCount(0), MaxPrivateElementSize(0), Index: llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h =================================================================== --- llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -48,6 +48,8 @@ const SIInstrInfo &TII; const SIRegisterInfo &TRI; TargetSchedModel TSchedModel; + unsigned MaxSMEMClauseSize; + unsigned MaxVMEMClauseSize; /// RegUnits of uses in the current soft memory clause. BitVector ClauseUses; Index: llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp =================================================================== --- llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -37,6 +37,10 @@ // Hazard Recoginizer Implementation //===----------------------------------------------------------------------===// +static cl::opt +HazardMaxLookAhead("amdgpu-hazard-max-lookahead", cl::Hidden, cl::init(16), + cl::desc("Maximum instruction count to look for hazards")); + GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), @@ -44,10 +48,30 @@ ST(MF.getSubtarget()), TII(*ST.getInstrInfo()), TRI(TII.getRegisterInfo()), + MaxSMEMClauseSize(ST.getMaxLgkmcnt() + 1), + MaxVMEMClauseSize(ST.getMaxVmcnt() + 1), ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) { - MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5; TSchedModel.init(&ST); + + // VMCNT is 4 bits, except on GFX9 where it is 6 bits, therefore the maximum + // size of a VMEM clause is 16 or 64. + // + // LGKMCNT is always 4 bits for an SMEM clause. + + // Setting it so high only allows for skipping a nop for extremely large + // clauses, so it should be fine to reduce this if it's a compile time issue. + MaxLookAhead = std::max(MaxVMEMClauseSize, MaxSMEMClauseSize); + + if (HazardMaxLookAhead.getNumOccurrencesFlag()) + MaxLookAhead = HazardMaxLookAhead; + + // FIXME: Could be any AGPR. + const unsigned MaxNormalHazardLookAhead + = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5; + + // The maximum number of wait states for any normal hazard type is 5. + MaxLookAhead = std::max(MaxLookAhead, MaxNormalHazardLookAhead); } void GCNHazardRecognizer::EmitInstruction(SUnit *SU) { @@ -494,11 +518,13 @@ bool IsSMRD = TII.isSMRD(*MEM); + const unsigned MaxClauseSize = IsSMRD ? MaxSMEMClauseSize : MaxVMEMClauseSize; + resetClause(); - // A soft-clause is any group of consecutive SMEM instructions. The - // instructions in this group may return out of order and/or may be - // replayed (i.e. the same instruction issued more than once). + // A soft-clause is any group of consecutive SMEM or VMEM instructions. The + // instructions in this group may return out of order and/or may be replayed + // (i.e. the same instruction issued more than once). // // In order to handle these situations correctly we need to make sure that // when a clause has more than one instruction, no instruction in the clause @@ -506,6 +532,8 @@ // (including itself). If we encounter this situaion, we need to break the // clause by inserting a non SMEM instruction. + unsigned ClauseSize = 1; + for (MachineInstr *MI : EmittedInstrs) { // When we hit a non-SMEM instruction then we have passed the start of the // clause and we can stop. @@ -516,6 +544,10 @@ break; addClauseInst(*MI); + + // FIXME: The maximum clause size is actually determined by saturating the + // counter type. This is > 1 for some SMEM operations. + ++ClauseSize; } if (ClauseDefs.none()) @@ -527,6 +559,10 @@ if (MEM->mayStore()) return 1; + // The clause is implicitly broken if it exceeds the maximum. + if (ClauseSize > MaxClauseSize) + return 0; + addClauseInst(*MEM); // If the set of defs and uses intersect then we cannot add this instruction Index: llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -649,7 +649,7 @@ SICacheControl::SICacheControl(const GCNSubtarget &ST) { TII = ST.getInstrInfo(); - IV = getIsaVersion(ST.getCPU()); + IV = ST.getIsaVersion(); } /* static */ Index: llvm/test/CodeGen/AMDGPU/break-smem-soft-clauses.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/break-smem-soft-clauses.mir +++ llvm/test/CodeGen/AMDGPU/break-smem-soft-clauses.mir @@ -1,5 +1,7 @@ # RUN: llc -march=amdgcn -mcpu=carrizo -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK %s # RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s +# RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s --- # Trivial clause at beginning of program @@ -111,33 +113,58 @@ S_ENDPGM 0 ... --- -# vmcnt has 4 bits, so maximum 16 outstanding loads. The waitcnt -# breaks the clause. +# lgmcnt has 4 bits pre-gfx9, so maximum 16 outstanding loads. name: break_smem_clause_at_max_smem_clause_size_smrd_load4 body: | bb.0: ; GCN-LABEL: name: break_smem_clause_at_max_smem_clause_size_smrd_load4 - ; GCN: $sgpr13 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 - ; GCN-NEXT: $sgpr14 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 - ; GCN-NEXT: $sgpr15 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 - ; GCN-NEXT: $sgpr16 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 - ; GCN-NEXT: $sgpr17 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 - ; GCN-NEXT: $sgpr18 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 - ; GCN-NEXT: $sgpr19 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 - ; GCN-NEXT: $sgpr20 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 - ; GCN-NEXT: $sgpr21 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 - ; GCN-NEXT: $sgpr22 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 - ; GCN-NEXT: $sgpr23 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 - ; GCN-NEXT: $sgpr24 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 - ; GCN-NEXT: $sgpr25 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 - ; GCN-NEXT: $sgpr26 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 - ; GCN-NEXT: $sgpr27 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 - ; GCN-NEXT: $sgpr28 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 - ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM $sgpr30_sgpr31, 0, 0, 0 - ; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr0, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28 - ; GCN-NEXT: S_ENDPGM 0 + + ; GCN: $sgpr13 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr14 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr15 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr16 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + + ; GCN-NEXT: $sgpr17 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr18 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr19 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr20 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + + ; GCN-NEXT: $sgpr21 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr22 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr23 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr24 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + + ; GCN-NEXT: $sgpr25 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr26 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr27 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr28 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + + ; GCN-NEXT: $sgpr29 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr30 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr31 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr32 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + + ; GCN-NEXT: $sgpr33 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr34 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr35 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr36 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + + ; GCN-NEXT: $sgpr37 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr38 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr39 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr40 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + + ; GCN-NEXT: $sgpr41 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr42 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr43 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr44 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + + ; GCN-NEXT: $sgpr10 = S_LOAD_DWORD_IMM $sgpr30_sgpr31, 0, 0 + ; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr0 + ; GCN-NEXT: S_ENDPGM + $sgpr13 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 $sgpr14 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 $sgpr15 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 @@ -158,8 +185,83 @@ $sgpr27 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 $sgpr28 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 - $sgpr0 = S_LOAD_DWORD_IMM $sgpr30_sgpr31, 0, 0, 0 - $sgpr0 = S_MOV_B32 $sgpr0, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28 + $sgpr29 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 + $sgpr30 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 + $sgpr31 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 + $sgpr32 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 + + $sgpr33 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 + $sgpr34 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 + $sgpr35 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 + $sgpr36 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 + + $sgpr37 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 + $sgpr38 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 + $sgpr39 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 + $sgpr40 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 + + $sgpr41 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 + $sgpr42 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 + $sgpr43 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 + $sgpr44 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 + + $sgpr10 = S_LOAD_DWORD_IMM $sgpr30_sgpr31, 0, 0, 0 + $sgpr0 = S_MOV_B32 $sgpr0 + S_ENDPGM 0 +... +--- + +name: break_smem_clause_at_max_smem_clause_size_m1_smrd_load4 + +body: | + bb.0: + ; GCN-LABEL: name: break_smem_clause_at_max_smem_clause_size_m1_smrd_load4 + + ; GCN: $sgpr13 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr14 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr15 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr16 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + + ; GCN-NEXT: $sgpr17 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr18 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr19 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr20 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + + ; GCN-NEXT: $sgpr21 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr22 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr23 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr24 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + + ; GCN-NEXT: $sgpr25 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr26 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr27 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + + ; XNACK-NEXT: S_NOP 0 + ; GCN-NEXT: $sgpr10 = S_LOAD_DWORD_IMM $sgpr30_sgpr31, 0, 0 + ; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr0 + ; GCN-NEXT: S_ENDPGM + + $sgpr13 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 + $sgpr14 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 + $sgpr15 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 + $sgpr16 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 + + $sgpr17 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 + $sgpr18 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 + $sgpr19 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 + $sgpr20 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 + + $sgpr21 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 + $sgpr22 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 + $sgpr23 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 + $sgpr24 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 + + $sgpr25 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 + $sgpr26 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 + $sgpr27 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 + + $sgpr10 = S_LOAD_DWORD_IMM $sgpr30_sgpr31, 0, 0, 0 + $sgpr0 = S_MOV_B32 $sgpr0 S_ENDPGM 0 ... --- Index: llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir +++ llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir @@ -3,6 +3,9 @@ # Make sure the default assumption is xnack enabled with no cpu # RUN: llc -march=amdgcn -verify-machineinstrs -mattr=+volcanic-islands -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK %s # RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK,XNACK-GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s + --- # Trivial clause at beginning of program name: trivial_clause_load_flat4_x1 @@ -122,15 +125,15 @@ S_ENDPGM 0 ... --- -# vmcnt has 4 bits, so maximum 16 outstanding loads. The waitcnt -# breaks the clause. +# vmcnt has 4 bits pre-gfx9, so maximum 16 outstanding loads can be in a +# clause, so no breaking is needed. -name: break_clause_at_max_clause_size_flat_load4 +name: break_clause_at_max_clause16_size_flat_load4 body: | bb.0: - ; GCN-LABEL: name: break_clause_at_max_clause_size_flat_load4 + ; GCN-LABEL: name: break_clause_at_max_clause16_size_flat_load4 ; GCN: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; GCN-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; GCN-NEXT: $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr @@ -147,10 +150,10 @@ ; GCN-NEXT: $vgpr15 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; GCN-NEXT: $vgpr16 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; GCN-NEXT: $vgpr17 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - ; XNACK-NEXT: S_NOP 0 + ; XNACK-GFX9-NEXT: S_NOP 0 ; GCN-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr0, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18 - ; GCN-NEXT: S_ENDPGM 0 + ; GCN-NEXT: S_ENDPGM $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr @@ -177,6 +180,414 @@ S_ENDPGM 0 ... --- +# vmcnt has 4 bits pre-gfx9, so maximum 16 outstanding loads can be in a +# clause, so no breaking is needed. + + +name: break_clause_at_max_m1_clause16_size_flat_load4 + +body: | + bb.0: + ; GCN-LABEL: name: break_clause_at_max_m1_clause16_size_flat_load4 + ; GCN: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr5 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr6 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr7 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr8 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr9 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr10 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr11 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr12 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr13 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr14 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr15 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr16 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; XNACK-NEXT: S_NOP 0 + ; GCN-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr0, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18 + ; GCN-NEXT: S_ENDPGM + + $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr5 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr6 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr7 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr8 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr9 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr10 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr11 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr12 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr13 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr14 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr15 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr16 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $sgpr0 = S_MOV_B32 $sgpr0, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18 + S_ENDPGM 0 +... +--- +# vmcnt was increased to 6 bits in gfx9, so maximum 64 outstanding loads can be in a +# clause, so no breaking is needed. + +name: break_clause_at_max_clause64_size_flat_load4 + +body: | + bb.0: + ; GCN-LABEL: name: break_clause_at_max_clause64_size_flat_load4 + + ; GCN: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr5 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + ; GCN-NEXT: $vgpr6 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr7 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr8 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr9 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + ; GCN-NEXT: $vgpr10 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr11 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr12 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr13 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + ; GCN-NEXT: $vgpr14 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr15 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr16 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr17 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + ; GCN-NEXT: $vgpr18 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr19 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr20 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr21 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + ; GCN-NEXT: $vgpr22 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr23 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr24 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr25 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + ; GCN-NEXT: $vgpr26 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr27 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr28 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr29 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + ; GCN-NEXT: $vgpr30 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr31 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr32 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr33 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + ; GCN-NEXT: $vgpr34 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr35 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr36 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr37 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + ; GCN-NEXT: $vgpr38 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr39 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr40 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr41 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + ; GCN-NEXT: $vgpr42 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr43 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr44 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr45 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + ; GCN-NEXT: $vgpr46 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr47 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr48 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr49 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + ; GCN-NEXT: $vgpr50 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr51 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr52 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr53 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + ; GCN-NEXT: $vgpr54 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr55 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr56 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr57 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + ; GCN-NEXT: $vgpr58 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr59 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr60 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr61 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + ; GCN-NEXT: $vgpr62 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr63 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr64 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr65 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + ; GCN-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr0 + ; GCN-NEXT: S_ENDPGM + + $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr5 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr6 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr7 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr8 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr9 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr10 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr11 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr12 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr13 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr14 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr15 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr16 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr17 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr18 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr19 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr20 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr21 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr22 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr23 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr24 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr25 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr26 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr27 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr28 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr29 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr30 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr31 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr32 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr33 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr34 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr35 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr36 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr37 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr38 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr39 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr40 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr41 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr42 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr43 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr44 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr45 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr46 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr47 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr48 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr49 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr50 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr51 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr52 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr53 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr54 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr55 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr56 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr57 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr58 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr59 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr60 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr61 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr62 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr63 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr64 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr65 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $sgpr0 = S_MOV_B32 $sgpr0 + S_ENDPGM 0 +... +--- +# vmcnt was increased to 6 bits in gfx9, so maximum 64 outstanding loads can be in a +# clause, so no breaking is needed. + +name: break_clause_at_max_m1_clause64_size_flat_load4 + +body: | + bb.0: + ; GCN-LABEL: name: break_clause_at_max_m1_clause64_size_flat_load4 + + ; GCN: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr5 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + ; GCN-NEXT: $vgpr6 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr7 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr8 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr9 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + ; GCN-NEXT: $vgpr10 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr11 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr12 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr13 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + ; GCN-NEXT: $vgpr14 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr15 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr16 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr17 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + ; GCN-NEXT: $vgpr18 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr19 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr20 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr21 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + ; GCN-NEXT: $vgpr22 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr23 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr24 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr25 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + ; GCN-NEXT: $vgpr26 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr27 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr28 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr29 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + ; GCN-NEXT: $vgpr30 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr31 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr32 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr33 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + ; GCN-NEXT: $vgpr34 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr35 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr36 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr37 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + ; GCN-NEXT: $vgpr38 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr39 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr40 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr41 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + ; GCN-NEXT: $vgpr42 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr43 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr44 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr45 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + ; GCN-NEXT: $vgpr46 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr47 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr48 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr49 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + ; GCN-NEXT: $vgpr50 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr51 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr52 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr53 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + ; GCN-NEXT: $vgpr54 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr55 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr56 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr57 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + ; GCN-NEXT: $vgpr58 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr59 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr60 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr61 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + ; GCN-NEXT: $vgpr62 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr63 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr64 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + ; XNACK-GFX9-NEXT: S_NOP 0 + ; GCN-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr0 + ; GCN-NEXT: S_ENDPGM + + $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr5 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr6 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr7 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr8 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr9 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr10 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr11 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr12 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr13 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr14 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr15 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr16 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr17 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr18 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr19 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr20 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr21 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr22 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr23 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr24 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr25 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr26 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr27 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr28 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr29 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr30 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr31 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr32 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr33 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr34 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr35 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr36 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr37 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr38 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr39 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr40 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr41 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr42 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr43 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr44 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr45 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr46 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr47 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr48 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr49 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr50 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr51 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr52 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr53 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr54 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr55 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr56 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr57 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr58 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr59 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr60 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr61 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr62 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr63 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr64 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $sgpr0 = S_MOV_B32 $sgpr0 + S_ENDPGM 0 +... +--- name: break_clause_simple_load_flat4_lo_ptr @@ -242,9 +653,7 @@ ... --- -# The clause is broken by the waitcnt inserted at the end of the -# block, so no nop is needed. - +# Make sure a clause break is inserted even if the next member is in a following basic block. name: break_clause_block_boundary_load_flat8_ptr