diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -52,13 +52,18 @@ MachineFunction *MF; public: - // schedule() have seen a clustered memory operation. Set it to false - // before a region scheduling to know if the region had such clusters. - bool HasClusteredNodes; + // schedule() have seen register pressure over the critical limits and had to + // track register pressure for actual scheduling heuristics. + bool HasHighPressure; - // schedule() have seen an excess register pressure and had to track - // register pressure for actual scheduling heuristics. - bool HasExcessPressure; + // An error margin is necessary because of poor performance of the generic RP + // tracker and can be adjusted up for tuning heuristics to try and more + // aggressively reduce register pressure. + const unsigned DefaultErrorMargin = 3; + + const unsigned HighRPErrorMargin = 10; + + unsigned ErrorMargin = DefaultErrorMargin; unsigned SGPRCriticalLimit; @@ -77,7 +82,7 @@ enum class GCNSchedStageID : unsigned { InitialSchedule = 0, - UnclusteredReschedule = 1, + UnclusteredHighRPReschedule = 1, ClusteredLowOccupancyReschedule = 2, PreRARematerialize = 3, LastStage = PreRARematerialize @@ -104,7 +109,7 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive { friend class GCNSchedStage; friend class InitialScheduleStage; - friend class UnclusteredRescheduleStage; + friend class UnclusteredHighRPStage; friend class ClusteredLowOccStage; friend class PreRARematStage; @@ -126,12 +131,13 @@ // or we generally desire to reschedule it. BitVector RescheduleRegions; - // Record regions which use clustered loads/stores. - BitVector RegionsWithClusters; - // Record regions with high register pressure. BitVector RegionsWithHighRP; + // Record regions with excess register pressure over the physical register + // limit. 
Register pressure in these regions usually will result in spilling. + BitVector RegionsWithExcessRP; + // Regions that has the same occupancy as the latest MinOccupancy BitVector RegionsWithMinOcc; @@ -220,7 +226,7 @@ void setupNewBlock(); // Finalize state after scheudling a region. - virtual void finalizeGCNRegion(); + void finalizeGCNRegion(); // Check result of scheduling. void checkScheduling(); @@ -241,18 +247,19 @@ class InitialScheduleStage : public GCNSchedStage { public: - void finalizeGCNRegion() override; - bool shouldRevertScheduling(unsigned WavesAfter) override; InitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) : GCNSchedStage(StageID, DAG) {} }; -class UnclusteredRescheduleStage : public GCNSchedStage { +class UnclusteredHighRPStage : public GCNSchedStage { private: std::vector> SavedMutations; + // Save the initial occupancy before starting this stage. + unsigned InitialOccupancy; + public: bool initGCNSchedStage() override; @@ -262,7 +269,7 @@ bool shouldRevertScheduling(unsigned WavesAfter) override; - UnclusteredRescheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) + UnclusteredHighRPStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) : GCNSchedStage(StageID, DAG) {} }; diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -34,7 +34,7 @@ GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy( const MachineSchedContext *C) : GenericScheduler(C), TargetOccupancy(0), MF(nullptr), - HasClusteredNodes(false), HasExcessPressure(false) {} + HasHighPressure(false) {} void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) { GenericScheduler::initialize(DAG); @@ -43,10 +43,6 @@ const GCNSubtarget &ST = MF->getSubtarget(); - // FIXME: This is also necessary, because some passes that run after - // scheduling and before regalloc increase register 
pressure. - const unsigned ErrorMargin = 3; - SGPRExcessLimit = Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass); VGPRExcessLimit = @@ -121,13 +117,13 @@ // marked as RegExcess in tryCandidate() when they are compared with // instructions that increase the register pressure. if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) { - HasExcessPressure = true; + HasHighPressure = true; Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32); Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit); } if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) { - HasExcessPressure = true; + HasHighPressure = true; Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32); Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit); } @@ -141,7 +137,7 @@ int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit; if (SGPRDelta >= 0 || VGPRDelta >= 0) { - HasExcessPressure = true; + HasHighPressure = true; if (SGPRDelta > VGPRDelta) { Cand.RPDelta.CriticalMax = PressureChange(AMDGPU::RegisterPressureSets::SReg_32); @@ -300,15 +296,6 @@ if (SU->isBottomReady()) Bot.removeReady(SU); - if (!HasClusteredNodes && SU->getInstr()->mayLoadOrStore()) { - for (SDep &Dep : SU->Preds) { - if (Dep.isCluster()) { - HasClusteredNodes = true; - break; - } - } - } - LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr()); return SU; @@ -426,12 +413,12 @@ LiveIns.resize(Regions.size()); Pressure.resize(Regions.size()); RescheduleRegions.resize(Regions.size()); - RegionsWithClusters.resize(Regions.size()); RegionsWithHighRP.resize(Regions.size()); + RegionsWithExcessRP.resize(Regions.size()); RegionsWithMinOcc.resize(Regions.size()); RescheduleRegions.set(); - RegionsWithClusters.reset(); RegionsWithHighRP.reset(); + RegionsWithExcessRP.reset(); RegionsWithMinOcc.reset(); runSchedStages(); @@ -440,7 +427,8 @@ void GCNScheduleDAGMILive::runSchedStages() { LLVM_DEBUG(dbgs() << "All regions 
recorded, starting actual scheduling.\n"); InitialScheduleStage S0(GCNSchedStageID::InitialSchedule, *this); - UnclusteredRescheduleStage S1(GCNSchedStageID::UnclusteredReschedule, *this); + UnclusteredHighRPStage S1(GCNSchedStageID::UnclusteredHighRPReschedule, + *this); ClusteredLowOccStage S2(GCNSchedStageID::ClusteredLowOccupancyReschedule, *this); PreRARematStage S3(GCNSchedStageID::PreRARematerialize, *this); @@ -477,8 +465,8 @@ case GCNSchedStageID::InitialSchedule: OS << "Initial Schedule"; break; - case GCNSchedStageID::UnclusteredReschedule: - OS << "Unclustered Reschedule"; + case GCNSchedStageID::UnclusteredHighRPReschedule: + OS << "Unclustered High Register Pressure Reschedule"; break; case GCNSchedStageID::ClusteredLowOccupancyReschedule: OS << "Clustered Low Occupancy Reschedule"; break; @@ -503,16 +491,27 @@ return true; } -bool UnclusteredRescheduleStage::initGCNSchedStage() { +bool UnclusteredHighRPStage::initGCNSchedStage() { if (!GCNSchedStage::initGCNSchedStage()) return false; - if (DAG.RescheduleRegions.none()) + if (DAG.RegionsWithHighRP.none() || DAG.RegionsWithExcessRP.none()) return false; SavedMutations.swap(DAG.Mutations); + InitialOccupancy = DAG.MinOccupancy; + // Aggressively try to reduce register pressure in the unclustered high RP + // stage. Temporarily increase occupancy target in the region. + S.ErrorMargin = S.HighRPErrorMargin; + if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy) + MFI.increaseOccupancy(MF, ++DAG.MinOccupancy); + + LLVM_DEBUG( + dbgs() + << "Retrying function scheduling without clustering. 
" + "Aggressivly try to reduce register pressure to achieve occupancy " + << DAG.MinOccupancy << ".\n"); - LLVM_DEBUG(dbgs() << "Retrying function scheduling without clustering.\n"); return true; } @@ -565,8 +564,18 @@ LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n"); } -void UnclusteredRescheduleStage::finalizeGCNSchedStage() { +void UnclusteredHighRPStage::finalizeGCNSchedStage() { SavedMutations.swap(DAG.Mutations); + S.ErrorMargin = S.DefaultErrorMargin; + if (DAG.MinOccupancy > InitialOccupancy) { + for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX) + DAG.RegionsWithMinOcc[IDX] = + DAG.Pressure[IDX].getOccupancy(DAG.ST) == DAG.MinOccupancy; + + LLVM_DEBUG(dbgs() << StageID + << " stage successfully increased occupancy to " + << DAG.MinOccupancy << '\n'); + } GCNSchedStage::finalizeGCNSchedStage(); } @@ -606,29 +615,29 @@ llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]).print(dbgs()); dbgs() << "Region register pressure: "; PressureBefore.print(dbgs())); - // Set HasClusteredNodes to true for late stages where we have already - // collected it. That way pickNode() will not scan SDep's when not needed. - S.HasClusteredNodes = StageID > GCNSchedStageID::InitialSchedule; - S.HasExcessPressure = false; + S.HasHighPressure = false; return true; } -bool UnclusteredRescheduleStage::initGCNRegion() { - if (!DAG.RescheduleRegions[RegionIdx]) +bool UnclusteredHighRPStage::initGCNRegion() { + // Only reschedule regions with the minimum occupancy or regions that may have + // spilling (excess register pressure). 
+ if ((!DAG.RegionsWithMinOcc[RegionIdx] || + DAG.MinOccupancy <= InitialOccupancy) && + !DAG.RegionsWithExcessRP[RegionIdx]) return false; return GCNSchedStage::initGCNRegion(); } bool ClusteredLowOccStage::initGCNRegion() { - // We may need to reschedule this region if it doesn't have clusters so it - // wasn't rescheduled in the last stage, or if we found it was testing - // critical register pressure limits in the unclustered reschedule stage. The - // later is because we may not have been able to raise the min occupancy in - // the previous stage so the region may be overly constrained even if it was - // already rescheduled. - if (!DAG.RegionsWithClusters[RegionIdx] && !DAG.RegionsWithHighRP[RegionIdx]) + // We may need to reschedule this region if it wasn't rescheduled in the last + // stage, or if we found it was testing critical register pressure limits in + // the unclustered reschedule stage. The latter is because we may not have been + // able to raise the min occupancy in the previous stage so the region may be + // overly constrained even if it was already rescheduled. + if (!DAG.RegionsWithHighRP[RegionIdx]) return false; return GCNSchedStage::initGCNRegion(); } @@ -656,7 +665,7 @@ void GCNSchedStage::finalizeGCNRegion() { DAG.Regions[RegionIdx] = std::make_pair(DAG.RegionBegin, DAG.RegionEnd); DAG.RescheduleRegions[RegionIdx] = false; - if (S.HasExcessPressure) + if (S.HasHighPressure) DAG.RegionsWithHighRP[RegionIdx] = true; // Revert scheduling if we have dropped occupancy or there is some other @@ -667,16 +676,6 @@ RegionIdx++; } -void InitialScheduleStage::finalizeGCNRegion() { - // Record which regions have clustered nodes for the next unclustered - // reschedule stage. - assert(nextStage(StageID) == GCNSchedStageID::UnclusteredReschedule); - if (S.HasClusteredNodes) - DAG.RegionsWithClusters[RegionIdx] = true; - - GCNSchedStage::finalizeGCNRegion(); -} - void GCNSchedStage::checkScheduling() { // Check the results of scheduling. 
PressureAfter = DAG.getRealRegPressure(RegionIdx); @@ -731,6 +730,7 @@ PressureAfter.getSGPRNum() > MaxSGPRs) { DAG.RescheduleRegions[RegionIdx] = true; DAG.RegionsWithHighRP[RegionIdx] = true; + DAG.RegionsWithExcessRP[RegionIdx] = true; } // Revert if this region's schedule would cause a drop in occupancy or @@ -758,21 +758,15 @@ if (mayCauseSpilling(WavesAfter)) return true; - assert(nextStage(StageID) == GCNSchedStageID::UnclusteredReschedule); - // Don't reschedule the region in the next stage if it doesn't have clusters. - if (!DAG.RegionsWithClusters[RegionIdx]) - DAG.RescheduleRegions[RegionIdx] = false; - return false; } -bool UnclusteredRescheduleStage::shouldRevertScheduling(unsigned WavesAfter) { - if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) - return true; - - // If RP is not reduced in the unclustred reschedule stage, revert to the old - // schedule. - if (!PressureAfter.less(ST, PressureBefore)) { +bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) { + // If RP is not reduced in the unclustered reschedule stage, revert to the + // old schedule. 
+ if ((WavesAfter <= PressureBefore.getOccupancy(ST) && + mayCauseSpilling(WavesAfter)) || + GCNSchedStage::shouldRevertScheduling(WavesAfter)) { LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n"); return true; } @@ -803,7 +797,7 @@ bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) { if (WavesAfter <= MFI.getMinWavesPerEU() && !PressureAfter.less(ST, PressureBefore) && - DAG.RescheduleRegions[RegionIdx]) { + DAG.RegionsWithExcessRP[RegionIdx]) { LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n"); return true; } @@ -816,8 +810,7 @@ PressureBefore.getOccupancy(ST) == DAG.MinOccupancy; LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n"); DAG.RescheduleRegions[RegionIdx] = - DAG.RegionsWithClusters[RegionIdx] || - (nextStage(StageID)) != GCNSchedStageID::UnclusteredReschedule; + (nextStage(StageID)) != GCNSchedStageID::UnclusteredHighRPReschedule; DAG.RegionEnd = DAG.RegionBegin; int SkippedDebugInstr = 0; for (MachineInstr *MI : Unsched) { diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -30,10 +30,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, public AMDGPUSubtarget { - +public: using AMDGPUSubtarget::getMaxWavesPerEU; -public: // Following 2 enums are documented at: // - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi enum class TrapHandlerAbi { diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -525,11 +525,11 @@ ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX908-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 ; GFX908-NEXT: s_load_dword s8, s[4:5], 0x18 -; GFX908-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX908-NEXT: v_mov_b32_e32 v1, 0 +; GFX908-NEXT: s_mov_b32 
s10, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX908-NEXT: s_sub_i32 s4, 0, s1 +; GFX908-NEXT: s_sub_i32 s6, 0, s1 ; GFX908-NEXT: s_lshr_b32 s11, s8, 16 ; GFX908-NEXT: v_cvt_f32_f16_e32 v25, s8 ; GFX908-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -539,11 +539,12 @@ ; GFX908-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX908-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX908-NEXT: v_mov_b32_e32 v7, s3 -; GFX908-NEXT: s_mov_b32 s10, 0 ; GFX908-NEXT: v_mov_b32_e32 v6, s2 -; GFX908-NEXT: v_mul_lo_u32 v2, s4, v0 -; GFX908-NEXT: s_lshl_b64 s[4:5], s[6:7], 5 +; GFX908-NEXT: v_mul_lo_u32 v2, s6, v0 +; GFX908-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX908-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: s_lshl_b64 s[4:5], s[6:7], 5 ; GFX908-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX908-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX908-NEXT: v_mov_b32_e32 v2, s8 diff --git a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler.mir b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler.mir --- a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler.mir +++ b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler.mir @@ -13,14 +13,6 @@ # CHECK-NEXT: From: DBG_VALUE %17:vgpr_32, 0, 0 # CHECK-NEXT: To: S_ENDPGM 0, implicit %69:vgpr_32, implicit %70:vgpr_32 # CHECK-NEXT: RegionInstrs: 46 -# CHECK: Unclustered reschedule did not help. -# CHECK: Attempting to revert scheduling. -# CHECK: Retrying function scheduling with lowest recorded occupancy 3. -# CHECK: ********** MI Scheduling ********** -# CHECK: test_same_num_instrs:%bb.2 -# CHECK-NEXT: From: DBG_VALUE %17:vgpr_32, 0, 0 -# CHECK-NEXT: To: S_ENDPGM 0, implicit %69:vgpr_32, implicit %70:vgpr_32 -# CHECK-NEXT: RegionInstrs: 46 # CHECK: Attempting to revert scheduling. 
--- diff --git a/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir b/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir @@ -0,0 +1,144 @@ +# REQUIRES: asserts +# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=machine-scheduler -verify-misched -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s + +--- | + define amdgpu_kernel void @high-RP-reschedule() { ret void } +... + +# CHECK: Unclustered High Register Pressure Reschedule stage successfully increased occupancy to 4 + +--- +name: high-RP-reschedule +tracksRegLiveness: true +machineFunctionInfo: + occupancy: 4 +body: | + bb.0: + %0:vreg_128 = IMPLICIT_DEF + %1:vreg_128 = IMPLICIT_DEF + %2:vgpr_32 = IMPLICIT_DEF + %3:vreg_128 = IMPLICIT_DEF + %4:vreg_128 = IMPLICIT_DEF + %5:vreg_128 = IMPLICIT_DEF + %6:vreg_128 = IMPLICIT_DEF + %7:vreg_128 = IMPLICIT_DEF + %8:vreg_128 = IMPLICIT_DEF + %9:vreg_128 = IMPLICIT_DEF + %10:vreg_128 = IMPLICIT_DEF + %11:sreg_64_xexec = IMPLICIT_DEF + %12:vreg_64 = IMPLICIT_DEF + + bb.1: + %13:vgpr_32 = V_LSHRREV_B16_e32 1, %12.sub0, implicit $exec + %14:vgpr_32 = V_AND_B32_e32 127, %13, implicit $exec + %15:vgpr_32 = V_MUL_LO_U16_e32 49, %14, implicit $exec + %16:vgpr_32 = V_LSHRREV_B16_e32 10, %15, implicit $exec + %17:vgpr_32 = V_MUL_LO_U16_e32 42, %16, implicit $exec + %18:vgpr_32 = V_SUB_U16_e32 %12.sub0, %17, implicit $exec + %19:vgpr_32 = V_MOV_B32_e32 7, implicit $exec + %20:vgpr_32 = V_MUL_U32_U24_sdwa 0, %18, 0, %19, 0, 6, 0, 0, 6, implicit $exec + %21:vgpr_32 = V_LSHLREV_B32_e32 4, %20, implicit $exec + %22:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %11, %21, 608, 0, implicit $exec :: (load (s128)) + %23:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %11, %21, 576, 0, implicit $exec :: (load (s128)) + %24:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %11, %21, 592, 0, implicit $exec :: (load (s128)) + %25:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %11, %21, 624, 0, implicit $exec :: (load (s128)) + 
%26:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %11, %21, 672, 0, implicit $exec :: (load (s128)) + %27:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %11, %21, 640, 0, implicit $exec :: (load (s128)) + %28:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %11, %21, 656, 0, implicit $exec :: (load (s128)) + %29:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %7.sub2_sub3, 0, %25.sub2_sub3, 0, 0, implicit $mode, implicit $exec + %30:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %7.sub2_sub3, 0, %25.sub0_sub1, 0, 0, implicit $mode, implicit $exec + %31:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %7.sub0_sub1, 0, %25.sub0_sub1, 1, %29, 0, 0, implicit $mode, implicit $exec + %32:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %7.sub0_sub1, 0, %25.sub2_sub3, 0, %30, 0, 0, implicit $mode, implicit $exec + %33:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %5.sub2_sub3, 0, %24.sub2_sub3, 0, 0, implicit $mode, implicit $exec + %34:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %5.sub2_sub3, 0, %24.sub0_sub1, 0, 0, implicit $mode, implicit $exec + %35:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %5.sub0_sub1, 0, %24.sub0_sub1, 1, %33, 0, 0, implicit $mode, implicit $exec + %36:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %5.sub0_sub1, 0, %24.sub2_sub3, 0, %34, 0, 0, implicit $mode, implicit $exec + %37:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %9.sub2_sub3, 0, %28.sub2_sub3, 0, 0, implicit $mode, implicit $exec + %38:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %9.sub2_sub3, 0, %28.sub0_sub1, 0, 0, implicit $mode, implicit $exec + %39:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %9.sub0_sub1, 0, %28.sub0_sub1, 1, %37, 0, 0, implicit $mode, implicit $exec + %40:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %9.sub0_sub1, 0, %28.sub2_sub3, 0, %38, 0, 0, implicit $mode, implicit $exec + %41:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %4.sub2_sub3, 0, %23.sub2_sub3, 0, 0, implicit $mode, implicit $exec + %42:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %4.sub2_sub3, 0, %23.sub0_sub1, 0, 
0, implicit $mode, implicit $exec + %43:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %4.sub0_sub1, 0, %23.sub0_sub1, 1, %41, 0, 0, implicit $mode, implicit $exec + %44:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %4.sub0_sub1, 0, %23.sub2_sub3, 0, %42, 0, 0, implicit $mode, implicit $exec + %45:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %8.sub2_sub3, 0, %27.sub2_sub3, 0, 0, implicit $mode, implicit $exec + %46:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %8.sub2_sub3, 0, %27.sub0_sub1, 0, 0, implicit $mode, implicit $exec + %47:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %6.sub2_sub3, 0, %22.sub2_sub3, 0, 0, implicit $mode, implicit $exec + %48:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %8.sub0_sub1, 0, %27.sub0_sub1, 1, %45, 0, 0, implicit $mode, implicit $exec + %49:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %8.sub0_sub1, 0, %27.sub2_sub3, 0, %46, 0, 0, implicit $mode, implicit $exec + %50:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %10.sub2_sub3, 0, %26.sub2_sub3, 0, 0, implicit $mode, implicit $exec + %51:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %6.sub2_sub3, 0, %22.sub0_sub1, 0, 0, implicit $mode, implicit $exec + %52:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %10.sub2_sub3, 0, %26.sub0_sub1, 0, 0, implicit $mode, implicit $exec + %53:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %6.sub0_sub1, 0, %22.sub0_sub1, 1, %47, 0, 0, implicit $mode, implicit $exec + %54:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %10.sub0_sub1, 0, %26.sub0_sub1, 1, %50, 0, 0, implicit $mode, implicit $exec + %55:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %6.sub0_sub1, 0, %22.sub2_sub3, 0, %51, 0, 0, implicit $mode, implicit $exec + %56:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %10.sub0_sub1, 0, %26.sub2_sub3, 0, %52, 0, 0, implicit $mode, implicit $exec + %57:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %3.sub2_sub3, 1, %32, 0, 0, implicit $mode, implicit $exec + %58:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %35, 1, %39, 0, 0, 
implicit $mode, implicit $exec + %59:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %44, 1, %49, 0, 0, implicit $mode, implicit $exec + %60:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %53, 1, %54, 0, 0, implicit $mode, implicit $exec + %61:sreg_64 = S_MOV_B64_IMM_PSEUDO 4604544271217802189 + %62:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %3.sub0_sub1, 1, %31, 0, 0, implicit $mode, implicit $exec + undef %63.sub1:sreg_64 = S_MOV_B32 -1075404642 + %64:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %36, 1, %40, 0, 0, implicit $mode, implicit $exec + %65:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %43, 1, %48, 0, 0, implicit $mode, implicit $exec + %66:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %55, 1, %56, 0, 0, implicit $mode, implicit $exec + %67:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %57, 0, %58, 0, 0, implicit $mode, implicit $exec + %68:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %59, 0, %60, 0, 0, implicit $mode, implicit $exec + %69:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %3.sub2_sub3, 0, 4611686018427387904, 1, %57, 0, 0, implicit $mode, implicit $exec + %70:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %3.sub0_sub1, 0, 4611686018427387904, 1, %62, 0, 0, implicit $mode, implicit $exec + %71:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %35, 0, 4611686018427387904, 1, %58, 0, 0, implicit $mode, implicit $exec + %72:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %36, 0, 4611686018427387904, 1, %64, 0, 0, implicit $mode, implicit $exec + %73:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %43, 0, 4611686018427387904, 1, %65, 0, 0, implicit $mode, implicit $exec + %74:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %44, 0, 4611686018427387904, 1, %59, 0, 0, implicit $mode, implicit $exec + %75:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %55, 0, 4611686018427387904, 1, %66, 0, 0, implicit $mode, implicit $exec + %76:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %53, 0, 4611686018427387904, 1, %60, 0, 0, implicit $mode, implicit 
$exec + %77:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %62, 1, %64, 0, 0, implicit $mode, implicit $exec + %78:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %65, 1, %66, 0, 0, implicit $mode, implicit $exec + %79:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %70, 1, %71, 0, 0, implicit $mode, implicit $exec + %80:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %57, 0, 4611686018427387904, 1, %67, 0, 0, implicit $mode, implicit $exec + %81:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %59, 0, 4611686018427387904, 1, %68, 0, 0, implicit $mode, implicit $exec + %82:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %69, 1, %72, 0, 0, implicit $mode, implicit $exec + %83:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %74, 1, %75, 0, 0, implicit $mode, implicit $exec + %84:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %73, 1, %76, 0, 0, implicit $mode, implicit $exec + %85:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %62, 0, 4611686018427387904, 1, %77, 0, 0, implicit $mode, implicit $exec + %86:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %65, 0, 4611686018427387904, 1, %78, 0, 0, implicit $mode, implicit $exec + %63.sub0:sreg_64 = COPY %61.sub0 + %87:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %68, 0, %61, 0, %67, 0, 0, implicit $mode, implicit $exec + %88:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %81, 0, %63, 0, %80, 0, 0, implicit $mode, implicit $exec + %89:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %70, 0, 4611686018427387904, 1, %79, 0, 0, implicit $mode, implicit $exec + %90:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %69, 0, 4611686018427387904, 1, %82, 0, 0, implicit $mode, implicit $exec + %91:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %74, 0, 4611686018427387904, 1, %83, 0, 0, implicit $mode, implicit $exec + %92:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %73, 0, 4611686018427387904, 1, %84, 0, 0, implicit $mode, implicit $exec + %93:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %86, 0, %63, 0, %85, 0, 0, implicit $mode, 
implicit $exec + %94:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %78, 0, %61, 0, %77, 0, 0, implicit $mode, implicit $exec + undef %95.sub2_sub3:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %78, 0, %61, 0, %87, 0, 0, implicit $mode, implicit $exec + undef %96.sub2_sub3:vreg_128 = contract nofpexcept V_ADD_F64_e64 0, %82, 0, %84, 0, 0, implicit $mode, implicit $exec + undef %97.sub2_sub3:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %86, 0, %61, 0, %88, 0, 0, implicit $mode, implicit $exec + undef %98.sub2_sub3:vreg_128 = contract nofpexcept V_ADD_F64_e64 0, %90, 1, %91, 0, 0, implicit $mode, implicit $exec + %98.sub0_sub1:vreg_128 = contract nofpexcept V_ADD_F64_e64 0, %89, 1, %92, 0, 0, implicit $mode, implicit $exec + %97.sub0_sub1:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %81, 0, %63, 0, %93, 0, 0, implicit $mode, implicit $exec + %96.sub0_sub1:vreg_128 = contract nofpexcept V_ADD_F64_e64 0, %79, 1, %83, 0, 0, implicit $mode, implicit $exec + %95.sub0_sub1:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %68, 0, %63, 0, %94, 0, 0, implicit $mode, implicit $exec + undef %99.sub2_sub3:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %67, 0, 4611686018427387904, 1, %95.sub2_sub3, 0, 0, implicit $mode, implicit $exec + undef %100.sub2_sub3:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %82, 0, 4611686018427387904, 1, %96.sub2_sub3, 0, 0, implicit $mode, implicit $exec + undef %101.sub2_sub3:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %80, 0, 4611686018427387904, 1, %97.sub2_sub3, 0, 0, implicit $mode, implicit $exec + undef %102.sub2_sub3:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %90, 0, 4611686018427387904, 1, %98.sub2_sub3, 0, 0, implicit $mode, implicit $exec + %102.sub0_sub1:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %89, 0, 4611686018427387904, 1, %98.sub0_sub1, 0, 0, implicit $mode, implicit $exec + %101.sub0_sub1:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %85, 0, 4611686018427387904, 1, %97.sub0_sub1, 0, 0, implicit $mode, 
implicit $exec + %100.sub0_sub1:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %79, 0, 4611686018427387904, 1, %96.sub0_sub1, 0, 0, implicit $mode, implicit $exec + %99.sub0_sub1:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %77, 0, 4611686018427387904, 1, %95.sub0_sub1, 0, 0, implicit $mode, implicit $exec + %103:vgpr_32 = V_ADD_U32_sdwa 0, %2, 0, %18, 0, 6, 0, 6, 0, implicit $exec + %104:vgpr_32 = V_LSHL_ADD_U32_e64 %103, 4, 0, implicit $exec + DS_WRITE_B128_gfx9 %104, %102, 0, 0, implicit $exec + DS_WRITE_B128_gfx9 %104, %101, 672, 0, implicit $exec + DS_WRITE_B128_gfx9 %104, %100, 1344, 0, implicit $exec + DS_WRITE_B128_gfx9 %104, %99, 2016, 0, implicit $exec + DS_WRITE_B128_gfx9 %104, %98, 2688, 0, implicit $exec + DS_WRITE_B128_gfx9 %104, %97, 3360, 0, implicit $exec + DS_WRITE_B128_gfx9 %104, %96, 4032, 0, implicit $exec + DS_WRITE_B128_gfx9 %104, %95, 4704, 0, implicit $exec + + bb.2: + S_ENDPGM 0, implicit %0, implicit %1 +... diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -3484,8 +3484,8 @@ ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v13 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v19 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v18 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v18 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 ; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill @@ -3503,99 +3503,88 @@ ; GCN-NOHSA-SI-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v16 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v19 -; 
GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xffff, v18 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:48 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xffff, v19 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v18 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[18:21], off, s[8:11], 0 offset:32 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, 0xffff, v17 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v16 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[22:25], off, s[8:11], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v27 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v26 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 16, v25 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v24 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v27 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v26 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, 0xffff, v25 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v24 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v28, 0xffff, v21 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, 0xffff, v20 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v32, 0xffff, v19 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v30, 0xffff, v18 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 
v27, 16, v31 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v30 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, 0xffff, v31 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xffff, v30 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:64 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v33, 0xffff, v29 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v31, 0xffff, v28 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v25 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v24 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v25 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v24 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[38:41], off, s[8:11], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v36, 0xffff, v23 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, 0xffff, v22 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v44, 16, v41 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v42, 16, v40 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v48, 16, v39 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v46, 16, v38 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v44, 16, v37 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v50, 16, v36 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v48, 16, v35 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v43, 0xffff, v41 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v41, 0xffff, v40 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v47, 0xffff, v39 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v45, 0xffff, v38 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v43, 0xffff, v37 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v49, 0xffff, v36 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v47, 0xffff, v35 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v38, 16, v42 -; 
GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v36, 16, v41 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v54, 16, v40 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v52, 16, v39 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v37, 0xffff, v42 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v35, 0xffff, v41 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[55:58], off, s[8:11], 0 offset:96 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v53, 0xffff, v40 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v51, 0xffff, v39 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v52, 16, v23 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v56, 16, v21 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v51, 0xffff, v23 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v49, 0xffff, v22 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[22:25], off, s[8:11], 0 offset:96 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v55, 0xffff, v21 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v53, 0xffff, v20 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[57:60], off, s[8:11], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v61, 16, v58 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v59, 16, v57 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v56 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v25 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v24 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v23 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v55 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v60, 0xffff, v58 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v58, 0xffff, v57 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xffff, v56 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v55 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xffff, v25 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v24 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 
v2, 0xffff, v23 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v42 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v41 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v40 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v39 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xffff, v42 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v41 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xffff, v40 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 16, v60 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v59 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v63, 16, v58 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v61, 16, v57 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, 0xffff, v60 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v59 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v62, 0xffff, v58 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v60, 0xffff, v57 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[58:61], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:176 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: 
buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[53:56], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[49:52], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[45:48], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[41:44], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[34:37], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[30:33], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload @@ -3833,102 +3822,93 @@ ; GCN-NOHSA-VI-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded 
Spill ; GCN-NOHSA-VI-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill ; GCN-NOHSA-VI-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v13 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v5, 0xffff, v13 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v3, 0xffff, v12 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v13 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v18 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xffff, v13 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v12 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v16 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xffff, v19 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v18 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xffff, v17 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v16 -; GCN-NOHSA-VI-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:48 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, 0xffff, v19 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v18 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v30, 0xffff, v17 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v16 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: 
buffer_store_dword v6, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v27 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v23 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v22 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v23 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v22 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v34, 0xffff, v21 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xffff, v20 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[36:39], off, s[8:11], 0 offset:80 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v46, 16, v38 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v44, 16, v37 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v50, 16, v36 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v48, 16, v35 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v45, 0xffff, v38 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v43, 0xffff, v37 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v49, 0xffff, v36 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v47, 0xffff, v35 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v43, 16, v23 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v47, 16, v21 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v45, 16, v20 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v42, 0xffff, v23 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v40, 0xffff, v22 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v46, 0xffff, v21 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v44, 0xffff, v20 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v38, 16, v42 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v36, 16, v41 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v54, 16, v40 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v52, 16, v39 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v37, 0xffff, v42 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v35, 0xffff, v41 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v53, 0xffff, v40 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v51, 0xffff, v39 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[55:58], off, s[8:11], 0 offset:112 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v26 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v25 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v24 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v27 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v26 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, 0xffff, v25 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v24 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v31 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v25, 16, v30 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, 0xffff, v31 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v30 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v33, 0xffff, v29 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v31, 0xffff, v28 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v39 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v38 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v51, 16, v37 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v49, 16, v36 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, 0xffff, v39 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v38 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v50, 0xffff, v37 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, 0xffff, v36 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[36:39], off, s[8:11], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[52:55], off, s[8:11], 0 offset:112 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; 
GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v62, 16, v42 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v59, 16, v39 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v56 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v56 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v60, 16, v41 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v40 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v39 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v61, 0xffff, v42 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v59, 0xffff, v41 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xffff, v40 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v39 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v42, 16, v58 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v40, 16, v57 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v41, 0xffff, v58 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v39, 0xffff, v57 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v53 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v52 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v57, 16, v38 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v63, 16, v37 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v61, 16, v36 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v58, 0xffff, v39 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v56, 0xffff, v38 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v62, 0xffff, v37 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v60, 0xffff, v36 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v39, 16, v55 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v37, 16, v54 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v38, 0xffff, v55 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, 0xffff, v54 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 
v[59:62], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: 
buffer_load_dword v3, off, s[88:91], 0 offset:32 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload ; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload ; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload