diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -52,13 +52,18 @@ MachineFunction *MF; public: - // schedule() have seen a clustered memory operation. Set it to false - // before a region scheduling to know if the region had such clusters. - bool HasClusteredNodes; + // schedule() have seen register pressure over the critical limits and had to + // track register pressure for actual scheduling heuristics. + bool HasHighPressure; - // schedule() have seen an excess register pressure and had to track - // register pressure for actual scheduling heuristics. - bool HasExcessPressure; + // An error margin is necessary because of poor performance of the generic RP + // tracker and can be adjusted up for tuning heuristics to try and more + // aggressively reduce register pressure. + const unsigned DefaultErrorMargin = 3; + + const unsigned HighRPErrorMargin = 10; + + unsigned ErrorMargin = DefaultErrorMargin; unsigned SGPRCriticalLimit; @@ -77,7 +82,7 @@ enum class GCNSchedStageID : unsigned { InitialSchedule = 0, - UnclusteredReschedule = 1, + UnclusteredHighRPReschedule = 1, ClusteredLowOccupancyReschedule = 2, PreRARematerialize = 3, LastStage = PreRARematerialize @@ -104,7 +109,7 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive { friend class GCNSchedStage; friend class InitialScheduleStage; - friend class UnclusteredRescheduleStage; + friend class UnclusteredHighRPStage; friend class ClusteredLowOccStage; friend class PreRARematStage; @@ -126,12 +131,13 @@ // or we generally desire to reschedule it. BitVector RescheduleRegions; - // Record regions which use clustered loads/stores. - BitVector RegionsWithClusters; - // Record regions with high register pressure. BitVector RegionsWithHighRP; + // Record regions with excess register pressure over the physical register + // limit. 
Register pressure in these regions usually will result in spilling. + BitVector RegionsWithExcessRP; + // Regions that has the same occupancy as the latest MinOccupancy BitVector RegionsWithMinOcc; @@ -220,7 +226,7 @@ void setupNewBlock(); // Finalize state after scheudling a region. - virtual void finalizeGCNRegion(); + void finalizeGCNRegion(); // Check result of scheduling. void checkScheduling(); @@ -241,18 +247,19 @@ class InitialScheduleStage : public GCNSchedStage { public: - void finalizeGCNRegion() override; - bool shouldRevertScheduling(unsigned WavesAfter) override; InitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) : GCNSchedStage(StageID, DAG) {} }; -class UnclusteredRescheduleStage : public GCNSchedStage { +class UnclusteredHighRPStage : public GCNSchedStage { private: std::vector> SavedMutations; + // Save the initial occupancy before starting this stage. + unsigned InitialOccupancy; + public: bool initGCNSchedStage() override; @@ -262,7 +269,7 @@ bool shouldRevertScheduling(unsigned WavesAfter) override; - UnclusteredRescheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) + UnclusteredHighRPStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) : GCNSchedStage(StageID, DAG) {} }; diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -31,10 +31,17 @@ using namespace llvm; +cl::opt + DisableUnclusterHighRP("amdgpu-disable-unclustred-high-rp-reschedule", + cl::Hidden, + cl::desc("Disable unclustred high register pressure " + "reduction scheduling stage."), + cl::init(false)); + GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy( const MachineSchedContext *C) : GenericScheduler(C), TargetOccupancy(0), MF(nullptr), - HasClusteredNodes(false), HasExcessPressure(false) {} + HasHighPressure(false) {} void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI 
*DAG) { GenericScheduler::initialize(DAG); @@ -43,10 +50,6 @@ const GCNSubtarget &ST = MF->getSubtarget(); - // FIXME: This is also necessary, because some passes that run after - // scheduling and before regalloc increase register pressure. - const unsigned ErrorMargin = 3; - SGPRExcessLimit = Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass); VGPRExcessLimit = @@ -121,13 +124,13 @@ // marked as RegExcess in tryCandidate() when they are compared with // instructions that increase the register pressure. if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) { - HasExcessPressure = true; + HasHighPressure = true; Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32); Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit); } if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) { - HasExcessPressure = true; + HasHighPressure = true; Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32); Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit); } @@ -141,7 +144,7 @@ int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit; if (SGPRDelta >= 0 || VGPRDelta >= 0) { - HasExcessPressure = true; + HasHighPressure = true; if (SGPRDelta > VGPRDelta) { Cand.RPDelta.CriticalMax = PressureChange(AMDGPU::RegisterPressureSets::SReg_32); @@ -300,15 +303,6 @@ if (SU->isBottomReady()) Bot.removeReady(SU); - if (!HasClusteredNodes && SU->getInstr()->mayLoadOrStore()) { - for (SDep &Dep : SU->Preds) { - if (Dep.isCluster()) { - HasClusteredNodes = true; - break; - } - } - } - LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr()); return SU; @@ -426,12 +420,12 @@ LiveIns.resize(Regions.size()); Pressure.resize(Regions.size()); RescheduleRegions.resize(Regions.size()); - RegionsWithClusters.resize(Regions.size()); RegionsWithHighRP.resize(Regions.size()); + RegionsWithExcessRP.resize(Regions.size()); RegionsWithMinOcc.resize(Regions.size()); RescheduleRegions.set(); - 
RegionsWithClusters.reset(); RegionsWithHighRP.reset(); + RegionsWithExcessRP.reset(); RegionsWithMinOcc.reset(); runSchedStages(); @@ -440,7 +434,8 @@ void GCNScheduleDAGMILive::runSchedStages() { LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n"); InitialScheduleStage S0(GCNSchedStageID::InitialSchedule, *this); - UnclusteredRescheduleStage S1(GCNSchedStageID::UnclusteredReschedule, *this); + UnclusteredHighRPStage S1(GCNSchedStageID::UnclusteredHighRPReschedule, + *this); ClusteredLowOccStage S2(GCNSchedStageID::ClusteredLowOccupancyReschedule, *this); PreRARematStage S3(GCNSchedStageID::PreRARematerialize, *this); @@ -477,8 +472,8 @@ case GCNSchedStageID::InitialSchedule: OS << "Initial Schedule"; break; - case GCNSchedStageID::UnclusteredReschedule: - OS << "Unclustered Reschedule"; + case GCNSchedStageID::UnclusteredHighRPReschedule: + OS << "Unclustered High Register Pressure Reschedule"; break; case GCNSchedStageID::ClusteredLowOccupancyReschedule: OS << "Clustered Low Occupancy Reschedule"; @@ -503,16 +498,30 @@ return true; } -bool UnclusteredRescheduleStage::initGCNSchedStage() { +bool UnclusteredHighRPStage::initGCNSchedStage() { + if (DisableUnclusterHighRP) + return false; + if (!GCNSchedStage::initGCNSchedStage()) return false; - if (DAG.RescheduleRegions.none()) + if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none()) return false; SavedMutations.swap(DAG.Mutations); + InitialOccupancy = DAG.MinOccupancy; + // Aggressively try to reduce register pressure in the unclustered high RP + // stage. Temporarily increase occupancy target in the region. + S.ErrorMargin = S.HighRPErrorMargin; + if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy) + MFI.increaseOccupancy(MF, ++DAG.MinOccupancy); + + LLVM_DEBUG( + dbgs() + << "Retrying function scheduling without clustering. 
" + "Aggressivly try to reduce register pressure to achieve occupancy " + << DAG.MinOccupancy << ".\n"); - LLVM_DEBUG(dbgs() << "Retrying function scheduling without clustering.\n"); return true; } @@ -565,8 +574,18 @@ LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n"); } -void UnclusteredRescheduleStage::finalizeGCNSchedStage() { +void UnclusteredHighRPStage::finalizeGCNSchedStage() { SavedMutations.swap(DAG.Mutations); + S.ErrorMargin = S.DefaultErrorMargin; + if (DAG.MinOccupancy > InitialOccupancy) { + for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX) + DAG.RegionsWithMinOcc[IDX] = + DAG.Pressure[IDX].getOccupancy(DAG.ST) == DAG.MinOccupancy; + + LLVM_DEBUG(dbgs() << StageID + << " stage successfully increased occupancy to " + << DAG.MinOccupancy << '\n'); + } GCNSchedStage::finalizeGCNSchedStage(); } @@ -606,29 +625,29 @@ llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]).print(dbgs()); dbgs() << "Region register pressure: "; PressureBefore.print(dbgs())); - // Set HasClusteredNodes to true for late stages where we have already - // collected it. That way pickNode() will not scan SDep's when not needed. - S.HasClusteredNodes = StageID > GCNSchedStageID::InitialSchedule; - S.HasExcessPressure = false; + S.HasHighPressure = false; return true; } -bool UnclusteredRescheduleStage::initGCNRegion() { - if (!DAG.RescheduleRegions[RegionIdx]) +bool UnclusteredHighRPStage::initGCNRegion() { + // Only reschedule regions with the minimum occupancy or regions that may have + // spilling (excess register pressure). 
+ if ((!DAG.RegionsWithMinOcc[RegionIdx] || + DAG.MinOccupancy <= InitialOccupancy) && + !DAG.RegionsWithExcessRP[RegionIdx]) return false; return GCNSchedStage::initGCNRegion(); } bool ClusteredLowOccStage::initGCNRegion() { - // We may need to reschedule this region if it doesn't have clusters so it - // wasn't rescheduled in the last stage, or if we found it was testing - // critical register pressure limits in the unclustered reschedule stage. The - // later is because we may not have been able to raise the min occupancy in - // the previous stage so the region may be overly constrained even if it was - // already rescheduled. - if (!DAG.RegionsWithClusters[RegionIdx] && !DAG.RegionsWithHighRP[RegionIdx]) + // We may need to reschedule this region if it wasn't rescheduled in the last + // stage, or if we found it was testing critical register pressure limits in + // the unclustered reschedule stage. The latter is because we may not have + // been able to raise the min occupancy in the previous stage so the region + // may be overly constrained even if it was already rescheduled. + if (!DAG.RegionsWithHighRP[RegionIdx]) return false; return GCNSchedStage::initGCNRegion(); @@ -656,7 +675,7 @@ void GCNSchedStage::finalizeGCNRegion() { DAG.Regions[RegionIdx] = std::make_pair(DAG.RegionBegin, DAG.RegionEnd); DAG.RescheduleRegions[RegionIdx] = false; - if (S.HasExcessPressure) + if (S.HasHighPressure) DAG.RegionsWithHighRP[RegionIdx] = true; // Revert scheduling if we have dropped occupancy or there is some other @@ -667,16 +686,6 @@ RegionIdx++; } -void InitialScheduleStage::finalizeGCNRegion() { - // Record which regions have clustered nodes for the next unclustered - // reschedule stage. - assert(nextStage(StageID) == GCNSchedStageID::UnclusteredReschedule); - if (S.HasClusteredNodes) - DAG.RegionsWithClusters[RegionIdx] = true; - - GCNSchedStage::finalizeGCNRegion(); -} - void GCNSchedStage::checkScheduling() { // Check the results of scheduling. 
PressureAfter = DAG.getRealRegPressure(RegionIdx); @@ -731,6 +740,7 @@ PressureAfter.getSGPRNum() > MaxSGPRs) { DAG.RescheduleRegions[RegionIdx] = true; DAG.RegionsWithHighRP[RegionIdx] = true; + DAG.RegionsWithExcessRP[RegionIdx] = true; } // Revert if this region's schedule would cause a drop in occupancy or @@ -758,21 +768,15 @@ if (mayCauseSpilling(WavesAfter)) return true; - assert(nextStage(StageID) == GCNSchedStageID::UnclusteredReschedule); - // Don't reschedule the region in the next stage if it doesn't have clusters. - if (!DAG.RegionsWithClusters[RegionIdx]) - DAG.RescheduleRegions[RegionIdx] = false; - return false; } -bool UnclusteredRescheduleStage::shouldRevertScheduling(unsigned WavesAfter) { - if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) - return true; - - // If RP is not reduced in the unclustred reschedule stage, revert to the old - // schedule. - if (!PressureAfter.less(ST, PressureBefore)) { +bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) { + // If RP is not reduced in the unclustered reschedule stage, revert to the + // old schedule. 
+ if ((WavesAfter <= PressureBefore.getOccupancy(ST) && + mayCauseSpilling(WavesAfter)) || + GCNSchedStage::shouldRevertScheduling(WavesAfter)) { LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n"); return true; } @@ -803,7 +807,7 @@ bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) { if (WavesAfter <= MFI.getMinWavesPerEU() && !PressureAfter.less(ST, PressureBefore) && - DAG.RescheduleRegions[RegionIdx]) { + DAG.RegionsWithExcessRP[RegionIdx]) { LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n"); return true; } @@ -816,8 +820,7 @@ PressureBefore.getOccupancy(ST) == DAG.MinOccupancy; LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n"); DAG.RescheduleRegions[RegionIdx] = - DAG.RegionsWithClusters[RegionIdx] || - (nextStage(StageID)) != GCNSchedStageID::UnclusteredReschedule; + (nextStage(StageID)) != GCNSchedStageID::UnclusteredHighRPReschedule; DAG.RegionEnd = DAG.RegionBegin; int SkippedDebugInstr = 0; for (MachineInstr *MI : Unsched) { diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -30,10 +30,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, public AMDGPUSubtarget { - +public: using AMDGPUSubtarget::getMaxWavesPerEU; -public: // Following 2 enums are documented at: // - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi enum class TrapHandlerAbi { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll @@ -501,24 +501,24 @@ ; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; GFX9-CONTRACT-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; GFX9-CONTRACT-NEXT: 
buffer_load_dword v33, off, s[0:3], s32 offset:12 -; GFX9-CONTRACT-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 -; GFX9-CONTRACT-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 -; GFX9-CONTRACT-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 -; GFX9-CONTRACT-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:28 -; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(5) +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX9-CONTRACT-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32] -; GFX9-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-CONTRACT-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 -; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(5) -; GFX9-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[33:34] -; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(3) -; GFX9-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[35:36] +; GFX9-CONTRACT-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12 +; GFX9-CONTRACT-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:16 ; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17] +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX9-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[24:25] +; GFX9-CONTRACT-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20 +; GFX9-CONTRACT-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:24 ; GFX9-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19] +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX9-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[24:25] +; GFX9-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-CONTRACT-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 +; GFX9-CONTRACT-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:32 ; GFX9-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] ; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX9-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[37:38] +; GFX9-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[24:25] ; GFX9-CONTRACT-NEXT: 
v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23] ; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; @@ -527,24 +527,24 @@ ; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; GFX9-DENORM-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; GFX9-DENORM-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; GFX9-DENORM-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 -; GFX9-DENORM-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 -; GFX9-DENORM-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 -; GFX9-DENORM-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:28 -; GFX9-DENORM-NEXT: s_waitcnt vmcnt(5) +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) ; GFX9-DENORM-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32] -; GFX9-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-DENORM-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 -; GFX9-DENORM-NEXT: s_waitcnt vmcnt(5) -; GFX9-DENORM-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[33:34] -; GFX9-DENORM-NEXT: s_waitcnt vmcnt(3) -; GFX9-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[35:36] +; GFX9-DENORM-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12 +; GFX9-DENORM-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:16 ; GFX9-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17] +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[24:25] +; GFX9-DENORM-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20 +; GFX9-DENORM-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:24 ; GFX9-DENORM-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19] +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[24:25] +; GFX9-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-DENORM-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 +; GFX9-DENORM-NEXT: buffer_load_dword v25, off, s[0:3], s32 
offset:32 ; GFX9-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] ; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX9-DENORM-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[37:38] +; GFX9-DENORM-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[24:25] ; GFX9-DENORM-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23] ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; @@ -616,24 +616,24 @@ ; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; GFX9-CONTRACT-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; GFX9-CONTRACT-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; GFX9-CONTRACT-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 -; GFX9-CONTRACT-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 -; GFX9-CONTRACT-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 -; GFX9-CONTRACT-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:28 -; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(5) +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX9-CONTRACT-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32] -; GFX9-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-CONTRACT-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 -; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(5) -; GFX9-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[33:34] -; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(3) -; GFX9-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[35:36] +; GFX9-CONTRACT-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12 +; GFX9-CONTRACT-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:16 ; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17] +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX9-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[24:25] +; GFX9-CONTRACT-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20 +; GFX9-CONTRACT-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:24 ; GFX9-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], 
v[10:11], v[18:19] +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX9-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[24:25] +; GFX9-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-CONTRACT-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 +; GFX9-CONTRACT-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:32 ; GFX9-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] ; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX9-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[37:38] +; GFX9-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[24:25] ; GFX9-CONTRACT-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23] ; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; @@ -642,24 +642,24 @@ ; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; GFX9-DENORM-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; GFX9-DENORM-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; GFX9-DENORM-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 -; GFX9-DENORM-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 -; GFX9-DENORM-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 -; GFX9-DENORM-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:28 -; GFX9-DENORM-NEXT: s_waitcnt vmcnt(5) +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) ; GFX9-DENORM-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32] -; GFX9-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-DENORM-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 -; GFX9-DENORM-NEXT: s_waitcnt vmcnt(5) -; GFX9-DENORM-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[33:34] -; GFX9-DENORM-NEXT: s_waitcnt vmcnt(3) -; GFX9-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[35:36] +; GFX9-DENORM-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12 +; GFX9-DENORM-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:16 ; GFX9-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17] +; 
GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[24:25] +; GFX9-DENORM-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20 +; GFX9-DENORM-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:24 ; GFX9-DENORM-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19] +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[24:25] +; GFX9-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-DENORM-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 +; GFX9-DENORM-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:32 ; GFX9-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] ; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX9-DENORM-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[37:38] +; GFX9-DENORM-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[24:25] ; GFX9-DENORM-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23] ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll @@ -104,7 +104,7 @@ ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x8 ; GFX11-NEXT: global_load_b128 v[32:35], v64, s[0:1] ; GFX11-NEXT: global_load_b128 v[36:39], v64, s[0:1] offset:16 ; GFX11-NEXT: global_load_b128 v[40:43], v64, s[0:1] offset:32 @@ -114,10 +114,10 @@ ; GFX11-NEXT: global_load_b128 v[56:59], v64, s[0:1] offset:96 ; GFX11-NEXT: global_load_b128 v[60:63], v64, s[0:1] offset:112 ; GFX11-NEXT: global_load_b128 v[4:7], v64, s[0:1] offset:144 -; GFX11-NEXT: global_load_b128 v[0:3], v64, s[0:1] offset:128 -; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, 
0x3e7 -; GFX11-NEXT: s_clause 0x5 +; GFX11-NEXT: s_clause 0x6 +; GFX11-NEXT: global_load_b128 v[0:3], v64, s[0:1] offset:128 ; GFX11-NEXT: global_load_b128 v[8:11], v64, s[0:1] offset:160 ; GFX11-NEXT: global_load_b128 v[12:15], v64, s[0:1] offset:176 ; GFX11-NEXT: global_load_b128 v[16:19], v64, s[0:1] offset:192 @@ -131,8 +131,10 @@ ; GFX11-NEXT: s_waitcnt vmcnt(5) ; GFX11-NEXT: global_store_b128 v64, v[8:11], s[2:3] offset:160 ; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: s_clause 0x8 ; GFX11-NEXT: global_store_b128 v64, v[12:15], s[2:3] offset:176 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: s_clause 0x8 +; GFX11-NEXT: global_store_b128 v64, v[16:19], s[2:3] offset:192 ; GFX11-NEXT: global_store_b128 v64, v[32:35], s[2:3] ; GFX11-NEXT: global_store_b128 v64, v[36:39], s[2:3] offset:16 ; GFX11-NEXT: global_store_b128 v64, v[40:43], s[2:3] offset:32 @@ -141,8 +143,6 @@ ; GFX11-NEXT: global_store_b128 v64, v[52:55], s[2:3] offset:80 ; GFX11-NEXT: global_store_b128 v64, v[56:59], s[2:3] offset:96 ; GFX11-NEXT: global_store_b128 v64, v[60:63], s[2:3] offset:112 -; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: global_store_b128 v64, v[16:19], s[2:3] offset:192 ; GFX11-NEXT: s_waitcnt vmcnt(2) ; GFX11-NEXT: global_store_b128 v64, v[20:23], s[2:3] offset:208 ; GFX11-NEXT: s_waitcnt vmcnt(1) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -1645,211 +1645,208 @@ ; GFX7-LABEL: v_mul_i256: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v16, v0 -; GFX7-NEXT: v_mov_b32_e32 v17, v1 -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v14, 0 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v12, 0 -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v13, v[0:1] -; GFX7-NEXT: v_mul_lo_u32 v27, v3, v12 -; GFX7-NEXT: v_mul_lo_u32 v26, v5, v10 -; GFX7-NEXT: v_mad_u64_u32 v[0:1], 
s[4:5], v2, v12, v[0:1] -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[0:1] -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v11, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] +; GFX7-NEXT: v_mul_lo_u32 v28, v4, v11 +; GFX7-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] ; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[16:17] ; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] ; GFX7-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc ; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] -; GFX7-NEXT: v_addc_u32_e32 v22, vcc, 0, v20, vcc +; GFX7-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0 ; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] -; GFX7-NEXT: v_addc_u32_e32 v24, vcc, 0, v22, vcc -; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v16, v10, 0 -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v6, v8, v[0:1] -; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v17, v9, v[22:23] -; GFX7-NEXT: v_mov_b32_e32 v1, v18 +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22] +; GFX7-NEXT: v_addc_u32_e32 v25, vcc, 0, v20, vcc +; GFX7-NEXT: v_mov_b32_e32 v20, v18 ; GFX7-NEXT: v_mov_b32_e32 v18, v19 -; GFX7-NEXT: v_mov_b32_e32 v19, v20 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v16, v13, v[18:19] -; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, 
v8, v[22:23] -; GFX7-NEXT: v_addc_u32_e64 v25, s[4:5], 0, v0, s[4:5] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v12, v[18:19] -; GFX7-NEXT: v_mov_b32_e32 v0, v23 -; GFX7-NEXT: v_mul_lo_u32 v23, v4, v11 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v11, v[0:1] -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19] -; GFX7-NEXT: v_mul_lo_u32 v13, v2, v13 -; GFX7-NEXT: v_mul_lo_u32 v20, v6, v9 -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[10:11], v4, v9, v[11:12] +; GFX7-NEXT: v_mov_b32_e32 v19, v16 +; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] +; GFX7-NEXT: v_mul_lo_u32 v16, v6, v9 +; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22] +; GFX7-NEXT: v_addc_u32_e64 v26, s[4:5], 0, v6, s[4:5] +; GFX7-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19] +; GFX7-NEXT: v_mov_b32_e32 v19, v22 +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20] +; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24] +; GFX7-NEXT: v_mul_lo_u32 v24, v3, v12 +; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[22:23] +; GFX7-NEXT: v_mul_lo_u32 v22, v2, v13 +; GFX7-NEXT: v_mad_u64_u32 v[12:13], s[10:11], v4, v9, v[11:12] ; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v17, v10, v[0:1] +; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19] ; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[0:1] -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[14:15], v16, v8, 0 -; GFX7-NEXT: v_mov_b32_e32 v2, v22 -; GFX7-NEXT: v_addc_u32_e64 v10, s[12:13], 0, v4, s[12:13] -; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[16:17], v16, v9, v[1:2] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11] +; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 +; GFX7-NEXT: v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13] +; 
GFX7-NEXT: v_mov_b32_e32 v20, v11 +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] ; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[11:12] -; GFX7-NEXT: v_addc_u32_e64 v10, s[12:13], 0, v10, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v17, v8, v[1:2] -; GFX7-NEXT: v_addc_u32_e64 v3, s[12:13], v11, v3, s[12:13] -; GFX7-NEXT: v_mul_lo_u32 v11, v16, v15 -; GFX7-NEXT: v_mul_lo_u32 v9, v17, v14 -; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], v25, v4, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v5, s[12:13], v10, v5, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v6, s[12:13], v24, v6, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v10, s[12:13], v21, v11, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v9, s[12:13], v10, v9, s[14:15] -; GFX7-NEXT: v_addc_u32_e64 v9, s[10:11], v9, v13, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v27, s[8:9] -; GFX7-NEXT: v_addc_u32_e64 v9, s[6:7], v9, v23, s[6:7] -; GFX7-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v26, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v9, vcc, v9, v20, vcc -; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10] +; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] +; GFX7-NEXT: v_addc_u32_e64 v11, s[12:13], 0, v2, s[12:13] +; GFX7-NEXT: v_mul_lo_u32 v9, v1, v14 +; GFX7-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v1, v8, v[20:21] +; GFX7-NEXT: v_addc_u32_e64 v3, s[12:13], v12, v3, s[12:13] +; GFX7-NEXT: v_mul_lo_u32 v0, v0, v15 +; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], v26, v4, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v5, s[12:13], v11, v5, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v6, s[12:13], v25, v6, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v0, s[12:13], v17, v0, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v0, s[12:13], v0, v9, s[14:15] +; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v22, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v24, s[8:9] +; 
GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v28, s[6:7] +; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5] +; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc +; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, v10 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_i256: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v16, v0 -; GFX8-NEXT: v_mov_b32_e32 v17, v1 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v14, 0 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v12, 0 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v13, v[0:1] -; GFX8-NEXT: v_mul_lo_u32 v27, v3, v12 -; GFX8-NEXT: v_mul_lo_u32 v26, v5, v10 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v12, v[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v11, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] +; GFX8-NEXT: v_mul_lo_u32 v28, v4, v11 +; GFX8-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] ; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[16:17] ; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] ; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc ; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] -; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v20, vcc +; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0 ; GFX8-NEXT: 
v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] -; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v22, vcc -; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v16, v10, 0 -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v6, v8, v[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v17, v9, v[22:23] -; GFX8-NEXT: v_mov_b32_e32 v1, v18 +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22] +; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v20, vcc +; GFX8-NEXT: v_mov_b32_e32 v20, v18 ; GFX8-NEXT: v_mov_b32_e32 v18, v19 -; GFX8-NEXT: v_mov_b32_e32 v19, v20 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v16, v13, v[18:19] -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[22:23] -; GFX8-NEXT: v_addc_u32_e64 v25, s[4:5], 0, v0, s[4:5] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v12, v[18:19] -; GFX8-NEXT: v_mov_b32_e32 v0, v23 -; GFX8-NEXT: v_mul_lo_u32 v23, v4, v11 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v11, v[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19] -; GFX8-NEXT: v_mul_lo_u32 v13, v2, v13 -; GFX8-NEXT: v_mul_lo_u32 v20, v6, v9 -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[10:11], v4, v9, v[11:12] +; GFX8-NEXT: v_mov_b32_e32 v19, v16 +; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] +; GFX8-NEXT: v_mul_lo_u32 v16, v6, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22] +; GFX8-NEXT: v_addc_u32_e64 v26, s[4:5], 0, v6, s[4:5] +; GFX8-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19] +; GFX8-NEXT: v_mov_b32_e32 v19, v22 +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20] +; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24] +; GFX8-NEXT: v_mul_lo_u32 v24, v3, v12 +; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[22:23] +; GFX8-NEXT: v_mul_lo_u32 v22, v2, v13 +; 
GFX8-NEXT: v_mad_u64_u32 v[12:13], s[10:11], v4, v9, v[11:12] ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v17, v10, v[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19] ; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[14:15], v16, v8, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, v22 -; GFX8-NEXT: v_addc_u32_e64 v10, s[12:13], 0, v4, s[12:13] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[16:17], v16, v9, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11] +; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 +; GFX8-NEXT: v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13] +; GFX8-NEXT: v_mov_b32_e32 v20, v11 +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] ; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[11:12] -; GFX8-NEXT: v_addc_u32_e64 v10, s[12:13], 0, v10, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v17, v8, v[1:2] -; GFX8-NEXT: v_addc_u32_e64 v3, s[12:13], v11, v3, s[12:13] -; GFX8-NEXT: v_mul_lo_u32 v11, v16, v15 -; GFX8-NEXT: v_mul_lo_u32 v9, v17, v14 -; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], v25, v4, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v5, s[12:13], v10, v5, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v6, s[12:13], v24, v6, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v10, s[12:13], v21, v11, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v9, s[12:13], v10, v9, s[14:15] -; GFX8-NEXT: v_addc_u32_e64 v9, s[10:11], v9, v13, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v27, s[8:9] -; GFX8-NEXT: v_addc_u32_e64 v9, s[6:7], v9, v23, s[6:7] -; GFX8-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v26, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v9, v20, vcc -; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10] +; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], 
v5, v8, v[12:13] +; GFX8-NEXT: v_addc_u32_e64 v11, s[12:13], 0, v2, s[12:13] +; GFX8-NEXT: v_mul_lo_u32 v9, v1, v14 +; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v1, v8, v[20:21] +; GFX8-NEXT: v_addc_u32_e64 v3, s[12:13], v12, v3, s[12:13] +; GFX8-NEXT: v_mul_lo_u32 v0, v0, v15 +; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], v26, v4, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v5, s[12:13], v11, v5, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v6, s[12:13], v25, v6, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v0, s[12:13], v17, v0, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v0, s[12:13], v0, v9, s[14:15] +; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v22, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v24, s[8:9] +; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v28, s[6:7] +; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5] +; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc +; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, v10 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i256: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, v0 -; GFX9-NEXT: v_mov_b32_e32 v17, v1 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v14, 0 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v12, 0 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v13, v[0:1] -; GFX9-NEXT: v_mul_lo_u32 v27, v3, v12 -; GFX9-NEXT: v_mul_lo_u32 v26, v5, v10 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v12, v[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v11, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] +; GFX9-NEXT: v_mul_lo_u32 v28, v4, v11 +; GFX9-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX9-NEXT: 
v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] ; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[16:17] ; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] ; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v20, vcc ; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] -; GFX9-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v20, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v20, vcc +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0 ; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] -; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v22, vcc -; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v16, v10, 0 -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v6, v8, v[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v17, v9, v[22:23] -; GFX9-NEXT: v_mov_b32_e32 v1, v18 +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22] +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v20, vcc +; GFX9-NEXT: v_mov_b32_e32 v20, v18 ; GFX9-NEXT: v_mov_b32_e32 v18, v19 -; GFX9-NEXT: v_mov_b32_e32 v19, v20 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v16, v13, v[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[22:23] -; GFX9-NEXT: v_addc_co_u32_e64 v25, s[4:5], 0, v0, s[4:5] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v12, v[18:19] -; GFX9-NEXT: v_mov_b32_e32 v0, v23 -; GFX9-NEXT: v_mul_lo_u32 v23, v4, v11 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v11, v[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19] -; GFX9-NEXT: v_mul_lo_u32 v13, v2, v13 -; GFX9-NEXT: 
v_mul_lo_u32 v20, v6, v9 -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[10:11], v4, v9, v[11:12] +; GFX9-NEXT: v_mov_b32_e32 v19, v16 +; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] +; GFX9-NEXT: v_mul_lo_u32 v16, v6, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22] +; GFX9-NEXT: v_addc_co_u32_e64 v26, s[4:5], 0, v6, s[4:5] +; GFX9-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19] +; GFX9-NEXT: v_mov_b32_e32 v19, v22 +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20] +; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24] +; GFX9-NEXT: v_mul_lo_u32 v24, v3, v12 +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[22:23] +; GFX9-NEXT: v_mul_lo_u32 v22, v2, v13 +; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[10:11], v4, v9, v[11:12] ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v17, v10, v[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19] ; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], 0, v4, s[12:13] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[14:15], v16, v8, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, v22 -; GFX9-NEXT: v_addc_co_u32_e64 v10, s[12:13], 0, v4, s[12:13] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[16:17], v16, v9, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11] +; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 +; GFX9-NEXT: v_addc_co_u32_e64 v2, s[12:13], 0, v4, s[12:13] +; GFX9-NEXT: v_mov_b32_e32 v20, v11 +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[11:12] -; GFX9-NEXT: v_addc_co_u32_e64 v10, s[12:13], 0, v10, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v17, v8, v[1:2] -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[12:13], v11, 
v3, s[12:13] -; GFX9-NEXT: v_mul_lo_u32 v11, v16, v15 -; GFX9-NEXT: v_mul_lo_u32 v9, v17, v14 -; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], v25, v4, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v5, s[12:13], v10, v5, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[12:13], v24, v6, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v10, s[12:13], v21, v11, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[12:13], v10, v9, s[14:15] -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[10:11], v9, v13, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[8:9], v9, v27, s[8:9] -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[6:7], v9, v23, s[6:7] -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], v9, v26, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v20, vcc -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10] +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[12:13], 0, v2, s[12:13] +; GFX9-NEXT: v_mul_lo_u32 v9, v1, v14 +; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v1, v8, v[20:21] +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[12:13], v12, v3, s[12:13] +; GFX9-NEXT: v_mul_lo_u32 v0, v0, v15 +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], v26, v4, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[12:13], v11, v5, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[12:13], v25, v6, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[12:13], v17, v0, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[12:13], v0, v9, s[14:15] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v22, s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v24, s[8:9] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v28, s[6:7] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v27, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v16, vcc +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, v10 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i256: diff --git 
a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -525,11 +525,11 @@ ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX908-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 ; GFX908-NEXT: s_load_dword s8, s[4:5], 0x18 -; GFX908-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX908-NEXT: v_mov_b32_e32 v1, 0 +; GFX908-NEXT: s_mov_b32 s10, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX908-NEXT: s_sub_i32 s4, 0, s1 +; GFX908-NEXT: s_sub_i32 s6, 0, s1 ; GFX908-NEXT: s_lshr_b32 s11, s8, 16 ; GFX908-NEXT: v_cvt_f32_f16_e32 v25, s8 ; GFX908-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -539,11 +539,12 @@ ; GFX908-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX908-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX908-NEXT: v_mov_b32_e32 v7, s3 -; GFX908-NEXT: s_mov_b32 s10, 0 ; GFX908-NEXT: v_mov_b32_e32 v6, s2 -; GFX908-NEXT: v_mul_lo_u32 v2, s4, v0 -; GFX908-NEXT: s_lshl_b64 s[4:5], s[6:7], 5 +; GFX908-NEXT: v_mul_lo_u32 v2, s6, v0 +; GFX908-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX908-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: s_lshl_b64 s[4:5], s[6:7], 5 ; GFX908-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX908-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX908-NEXT: v_mov_b32_e32 v2, s8 diff --git a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler.mir b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler.mir --- a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler.mir +++ b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler.mir @@ -13,14 +13,6 @@ # CHECK-NEXT: From: DBG_VALUE %17:vgpr_32, 0, 0 # CHECK-NEXT: To: S_ENDPGM 0, implicit %69:vgpr_32, implicit %70:vgpr_32 # CHECK-NEXT: RegionInstrs: 46 -# CHECK: Unclustered reschedule did not help. -# CHECK: Attempting to revert scheduling. -# CHECK: Retrying function scheduling with lowest recorded occupancy 3. 
-# CHECK: ********** MI Scheduling ********** -# CHECK: test_same_num_instrs:%bb.2 -# CHECK-NEXT: From: DBG_VALUE %17:vgpr_32, 0, 0 -# CHECK-NEXT: To: S_ENDPGM 0, implicit %69:vgpr_32, implicit %70:vgpr_32 -# CHECK-NEXT: RegionInstrs: 46 # CHECK: Attempting to revert scheduling. --- diff --git a/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir b/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir @@ -0,0 +1,144 @@ +# REQUIRES: asserts +# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=machine-scheduler -verify-misched -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s + +--- | + define amdgpu_kernel void @high-RP-reschedule() { ret void } +... + +# CHECK: Unclustered High Register Pressure Reschedule stage successfully increased occupancy to 4 + +--- +name: high-RP-reschedule +tracksRegLiveness: true +machineFunctionInfo: + occupancy: 4 +body: | + bb.0: + %0:vreg_128 = IMPLICIT_DEF + %1:vreg_128 = IMPLICIT_DEF + %2:vgpr_32 = IMPLICIT_DEF + %3:vreg_128 = IMPLICIT_DEF + %4:vreg_128 = IMPLICIT_DEF + %5:vreg_128 = IMPLICIT_DEF + %6:vreg_128 = IMPLICIT_DEF + %7:vreg_128 = IMPLICIT_DEF + %8:vreg_128 = IMPLICIT_DEF + %9:vreg_128 = IMPLICIT_DEF + %10:vreg_128 = IMPLICIT_DEF + %11:sreg_64_xexec = IMPLICIT_DEF + %12:vreg_64 = IMPLICIT_DEF + + bb.1: + %13:vgpr_32 = V_LSHRREV_B16_e32 1, %12.sub0, implicit $exec + %14:vgpr_32 = V_AND_B32_e32 127, %13, implicit $exec + %15:vgpr_32 = V_MUL_LO_U16_e32 49, %14, implicit $exec + %16:vgpr_32 = V_LSHRREV_B16_e32 10, %15, implicit $exec + %17:vgpr_32 = V_MUL_LO_U16_e32 42, %16, implicit $exec + %18:vgpr_32 = V_SUB_U16_e32 %12.sub0, %17, implicit $exec + %19:vgpr_32 = V_MOV_B32_e32 7, implicit $exec + %20:vgpr_32 = V_MUL_U32_U24_sdwa 0, %18, 0, %19, 0, 6, 0, 0, 6, implicit $exec + %21:vgpr_32 = V_LSHLREV_B32_e32 4, %20, implicit $exec + %22:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %11, %21, 608, 0, implicit $exec :: (load (s128)) + 
%23:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %11, %21, 576, 0, implicit $exec :: (load (s128)) + %24:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %11, %21, 592, 0, implicit $exec :: (load (s128)) + %25:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %11, %21, 624, 0, implicit $exec :: (load (s128)) + %26:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %11, %21, 672, 0, implicit $exec :: (load (s128)) + %27:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %11, %21, 640, 0, implicit $exec :: (load (s128)) + %28:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %11, %21, 656, 0, implicit $exec :: (load (s128)) + %29:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %7.sub2_sub3, 0, %25.sub2_sub3, 0, 0, implicit $mode, implicit $exec + %30:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %7.sub2_sub3, 0, %25.sub0_sub1, 0, 0, implicit $mode, implicit $exec + %31:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %7.sub0_sub1, 0, %25.sub0_sub1, 1, %29, 0, 0, implicit $mode, implicit $exec + %32:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %7.sub0_sub1, 0, %25.sub2_sub3, 0, %30, 0, 0, implicit $mode, implicit $exec + %33:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %5.sub2_sub3, 0, %24.sub2_sub3, 0, 0, implicit $mode, implicit $exec + %34:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %5.sub2_sub3, 0, %24.sub0_sub1, 0, 0, implicit $mode, implicit $exec + %35:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %5.sub0_sub1, 0, %24.sub0_sub1, 1, %33, 0, 0, implicit $mode, implicit $exec + %36:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %5.sub0_sub1, 0, %24.sub2_sub3, 0, %34, 0, 0, implicit $mode, implicit $exec + %37:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %9.sub2_sub3, 0, %28.sub2_sub3, 0, 0, implicit $mode, implicit $exec + %38:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %9.sub2_sub3, 0, %28.sub0_sub1, 0, 0, implicit $mode, implicit $exec + %39:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %9.sub0_sub1, 0, %28.sub0_sub1, 1, %37, 0, 0, implicit $mode, implicit $exec + %40:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, 
%9.sub0_sub1, 0, %28.sub2_sub3, 0, %38, 0, 0, implicit $mode, implicit $exec + %41:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %4.sub2_sub3, 0, %23.sub2_sub3, 0, 0, implicit $mode, implicit $exec + %42:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %4.sub2_sub3, 0, %23.sub0_sub1, 0, 0, implicit $mode, implicit $exec + %43:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %4.sub0_sub1, 0, %23.sub0_sub1, 1, %41, 0, 0, implicit $mode, implicit $exec + %44:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %4.sub0_sub1, 0, %23.sub2_sub3, 0, %42, 0, 0, implicit $mode, implicit $exec + %45:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %8.sub2_sub3, 0, %27.sub2_sub3, 0, 0, implicit $mode, implicit $exec + %46:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %8.sub2_sub3, 0, %27.sub0_sub1, 0, 0, implicit $mode, implicit $exec + %47:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %6.sub2_sub3, 0, %22.sub2_sub3, 0, 0, implicit $mode, implicit $exec + %48:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %8.sub0_sub1, 0, %27.sub0_sub1, 1, %45, 0, 0, implicit $mode, implicit $exec + %49:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %8.sub0_sub1, 0, %27.sub2_sub3, 0, %46, 0, 0, implicit $mode, implicit $exec + %50:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %10.sub2_sub3, 0, %26.sub2_sub3, 0, 0, implicit $mode, implicit $exec + %51:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %6.sub2_sub3, 0, %22.sub0_sub1, 0, 0, implicit $mode, implicit $exec + %52:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %10.sub2_sub3, 0, %26.sub0_sub1, 0, 0, implicit $mode, implicit $exec + %53:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %6.sub0_sub1, 0, %22.sub0_sub1, 1, %47, 0, 0, implicit $mode, implicit $exec + %54:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %10.sub0_sub1, 0, %26.sub0_sub1, 1, %50, 0, 0, implicit $mode, implicit $exec + %55:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %6.sub0_sub1, 0, %22.sub2_sub3, 0, %51, 0, 0, implicit $mode, implicit $exec + %56:vreg_64 = contract 
nofpexcept V_FMA_F64_e64 0, %10.sub0_sub1, 0, %26.sub2_sub3, 0, %52, 0, 0, implicit $mode, implicit $exec + %57:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %3.sub2_sub3, 1, %32, 0, 0, implicit $mode, implicit $exec + %58:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %35, 1, %39, 0, 0, implicit $mode, implicit $exec + %59:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %44, 1, %49, 0, 0, implicit $mode, implicit $exec + %60:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %53, 1, %54, 0, 0, implicit $mode, implicit $exec + %61:sreg_64 = S_MOV_B64_IMM_PSEUDO 4604544271217802189 + %62:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %3.sub0_sub1, 1, %31, 0, 0, implicit $mode, implicit $exec + undef %63.sub1:sreg_64 = S_MOV_B32 -1075404642 + %64:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %36, 1, %40, 0, 0, implicit $mode, implicit $exec + %65:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %43, 1, %48, 0, 0, implicit $mode, implicit $exec + %66:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %55, 1, %56, 0, 0, implicit $mode, implicit $exec + %67:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %57, 0, %58, 0, 0, implicit $mode, implicit $exec + %68:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %59, 0, %60, 0, 0, implicit $mode, implicit $exec + %69:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %3.sub2_sub3, 0, 4611686018427387904, 1, %57, 0, 0, implicit $mode, implicit $exec + %70:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %3.sub0_sub1, 0, 4611686018427387904, 1, %62, 0, 0, implicit $mode, implicit $exec + %71:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %35, 0, 4611686018427387904, 1, %58, 0, 0, implicit $mode, implicit $exec + %72:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %36, 0, 4611686018427387904, 1, %64, 0, 0, implicit $mode, implicit $exec + %73:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %43, 0, 4611686018427387904, 1, %65, 0, 0, implicit $mode, implicit $exec + %74:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %44, 0, 4611686018427387904, 1, 
%59, 0, 0, implicit $mode, implicit $exec + %75:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %55, 0, 4611686018427387904, 1, %66, 0, 0, implicit $mode, implicit $exec + %76:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %53, 0, 4611686018427387904, 1, %60, 0, 0, implicit $mode, implicit $exec + %77:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %62, 1, %64, 0, 0, implicit $mode, implicit $exec + %78:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %65, 1, %66, 0, 0, implicit $mode, implicit $exec + %79:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %70, 1, %71, 0, 0, implicit $mode, implicit $exec + %80:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %57, 0, 4611686018427387904, 1, %67, 0, 0, implicit $mode, implicit $exec + %81:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %59, 0, 4611686018427387904, 1, %68, 0, 0, implicit $mode, implicit $exec + %82:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %69, 1, %72, 0, 0, implicit $mode, implicit $exec + %83:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %74, 1, %75, 0, 0, implicit $mode, implicit $exec + %84:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %73, 1, %76, 0, 0, implicit $mode, implicit $exec + %85:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %62, 0, 4611686018427387904, 1, %77, 0, 0, implicit $mode, implicit $exec + %86:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %65, 0, 4611686018427387904, 1, %78, 0, 0, implicit $mode, implicit $exec + %63.sub0:sreg_64 = COPY %61.sub0 + %87:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %68, 0, %61, 0, %67, 0, 0, implicit $mode, implicit $exec + %88:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %81, 0, %63, 0, %80, 0, 0, implicit $mode, implicit $exec + %89:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %70, 0, 4611686018427387904, 1, %79, 0, 0, implicit $mode, implicit $exec + %90:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %69, 0, 4611686018427387904, 1, %82, 0, 0, implicit $mode, implicit $exec + %91:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %74, 0, 
4611686018427387904, 1, %83, 0, 0, implicit $mode, implicit $exec + %92:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %73, 0, 4611686018427387904, 1, %84, 0, 0, implicit $mode, implicit $exec + %93:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %86, 0, %63, 0, %85, 0, 0, implicit $mode, implicit $exec + %94:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %78, 0, %61, 0, %77, 0, 0, implicit $mode, implicit $exec + undef %95.sub2_sub3:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %78, 0, %61, 0, %87, 0, 0, implicit $mode, implicit $exec + undef %96.sub2_sub3:vreg_128 = contract nofpexcept V_ADD_F64_e64 0, %82, 0, %84, 0, 0, implicit $mode, implicit $exec + undef %97.sub2_sub3:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %86, 0, %61, 0, %88, 0, 0, implicit $mode, implicit $exec + undef %98.sub2_sub3:vreg_128 = contract nofpexcept V_ADD_F64_e64 0, %90, 1, %91, 0, 0, implicit $mode, implicit $exec + %98.sub0_sub1:vreg_128 = contract nofpexcept V_ADD_F64_e64 0, %89, 1, %92, 0, 0, implicit $mode, implicit $exec + %97.sub0_sub1:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %81, 0, %63, 0, %93, 0, 0, implicit $mode, implicit $exec + %96.sub0_sub1:vreg_128 = contract nofpexcept V_ADD_F64_e64 0, %79, 1, %83, 0, 0, implicit $mode, implicit $exec + %95.sub0_sub1:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %68, 0, %63, 0, %94, 0, 0, implicit $mode, implicit $exec + undef %99.sub2_sub3:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %67, 0, 4611686018427387904, 1, %95.sub2_sub3, 0, 0, implicit $mode, implicit $exec + undef %100.sub2_sub3:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %82, 0, 4611686018427387904, 1, %96.sub2_sub3, 0, 0, implicit $mode, implicit $exec + undef %101.sub2_sub3:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %80, 0, 4611686018427387904, 1, %97.sub2_sub3, 0, 0, implicit $mode, implicit $exec + undef %102.sub2_sub3:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %90, 0, 4611686018427387904, 1, %98.sub2_sub3, 0, 0, implicit $mode, implicit 
$exec + %102.sub0_sub1:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %89, 0, 4611686018427387904, 1, %98.sub0_sub1, 0, 0, implicit $mode, implicit $exec + %101.sub0_sub1:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %85, 0, 4611686018427387904, 1, %97.sub0_sub1, 0, 0, implicit $mode, implicit $exec + %100.sub0_sub1:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %79, 0, 4611686018427387904, 1, %96.sub0_sub1, 0, 0, implicit $mode, implicit $exec + %99.sub0_sub1:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %77, 0, 4611686018427387904, 1, %95.sub0_sub1, 0, 0, implicit $mode, implicit $exec + %103:vgpr_32 = V_ADD_U32_sdwa 0, %2, 0, %18, 0, 6, 0, 6, 0, implicit $exec + %104:vgpr_32 = V_LSHL_ADD_U32_e64 %103, 4, 0, implicit $exec + DS_WRITE_B128_gfx9 %104, %102, 0, 0, implicit $exec + DS_WRITE_B128_gfx9 %104, %101, 672, 0, implicit $exec + DS_WRITE_B128_gfx9 %104, %100, 1344, 0, implicit $exec + DS_WRITE_B128_gfx9 %104, %99, 2016, 0, implicit $exec + DS_WRITE_B128_gfx9 %104, %98, 2688, 0, implicit $exec + DS_WRITE_B128_gfx9 %104, %97, 3360, 0, implicit $exec + DS_WRITE_B128_gfx9 %104, %96, 4032, 0, implicit $exec + DS_WRITE_B128_gfx9 %104, %95, 4704, 0, implicit $exec + + bb.2: + S_ENDPGM 0, implicit %0, implicit %1 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -2948,18 +2948,19 @@ ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s42 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s62 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s43 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s40 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s61 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s60 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s41 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s59 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s40 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s61 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s60 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s41 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s59 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5) +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s38 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s58 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s39 @@ -3094,95 +3095,95 @@ ; GCN-HSA-NEXT: s_and_b32 s50, s50, 0xffff ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0xf0 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0xe0 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 -; 
GCN-HSA-NEXT: v_mov_b32_e32 v25, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0xd0 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0xc0 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0xb0 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s64 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s63 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s66 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s49 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s65 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s62 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s61 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s50 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 ; 
GCN-HSA-NEXT: s_add_u32 s6, s16, 0x80 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v35, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v34, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s68 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s67 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x70 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s42 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s60 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s43 +; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s59 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s50 +; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s48 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s68 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s67 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s66 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s49 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s65 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s64 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s47 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s63 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s62 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s45 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s61 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s40 -; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s58 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s38 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s41 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s57 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s56 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s39 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s38 +; GCN-HSA-NEXT: 
flat_store_dwordx4 v[21:22], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s58 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s57 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s56 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s39 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s55 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s55 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x50 -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s37 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s14 -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s12 +; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s52 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s34 -; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s33 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23] +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 @@ -3682,18 +3683,19 @@ ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s22 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s61 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s62 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s23 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 
v21, s60 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s59 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s60 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s59 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5) +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s58 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 @@ -3806,72 +3808,82 @@ ; GCN-HSA-NEXT: s_ashr_i32 s68, s50, 16 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 -; 
GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s3 +; GCN-HSA-NEXT: s_sext_i32_i16 s47, s47 +; GCN-HSA-NEXT: s_sext_i32_i16 s46, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s64 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s63 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 -; GCN-HSA-NEXT: s_sext_i32_i16 s49, s49 -; GCN-HSA-NEXT: s_sext_i32_i16 s48, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] +; GCN-HSA-NEXT: s_sext_i32_i16 s45, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3 +; GCN-HSA-NEXT: s_sext_i32_i16 s44, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s66 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s49 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s65 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s62 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s61 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] +; GCN-HSA-NEXT: s_sext_i32_i16 s51, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v35, s3 +; GCN-HSA-NEXT: s_sext_i32_i16 s50, s50 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCN-HSA-NEXT: s_sext_i32_i16 s43, s43 ; GCN-HSA-NEXT: s_sext_i32_i16 
s42, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v34, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s50 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s68 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s67 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s42 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s60 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s43 +; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s59 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GCN-HSA-NEXT: s_sext_i32_i16 s51, s51 +; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19] +; GCN-HSA-NEXT: s_sext_i32_i16 s49, s49 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_sext_i32_i16 s50, s50 +; GCN-HSA-NEXT: s_sext_i32_i16 s48, s48 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: s_sext_i32_i16 s36, s36 ; GCN-HSA-NEXT: s_sext_i32_i16 s39, s39 ; GCN-HSA-NEXT: s_sext_i32_i16 s38, s38 ; GCN-HSA-NEXT: s_sext_i32_i16 s41, s41 ; GCN-HSA-NEXT: s_sext_i32_i16 s40, s40 -; GCN-HSA-NEXT: s_sext_i32_i16 s45, s45 -; GCN-HSA-NEXT: s_sext_i32_i16 s44, s44 -; GCN-HSA-NEXT: s_sext_i32_i16 s47, s47 -; GCN-HSA-NEXT: s_sext_i32_i16 s46, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s68 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s67 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s66 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s49 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s65 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: s_sext_i32_i16 s29, s29 @@ -3879,43 +3891,33 @@ ; GCN-HSA-NEXT: s_sext_i32_i16 s31, s31 ; GCN-HSA-NEXT: s_sext_i32_i16 s30, s30 ; GCN-HSA-NEXT: s_sext_i32_i16 s37, s37 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 
v9, s64 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s47 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s63 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s62 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s45 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s61 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s40 -; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s58 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s38 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s41 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s57 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s56 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s39 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s38 +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s58 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s57 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s56 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s39 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s55 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s55 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s54 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s37 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s30 -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s28 +; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s52 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s28 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s31 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s34 -; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s29 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s33 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GCN-HSA-NEXT: 
flat_store_dwordx4 v[12:13], v[20:23] +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_sext_i32_i16 s27, s27 ; GCN-HSA-NEXT: s_sext_i32_i16 s26, s26 @@ -6578,16 +6580,16 @@ ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s15 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, s13 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s36, s11 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s40, s9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s44, s7 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s46, s5 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s38, s3 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s42, s1 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s14, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s40, s11 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s42, s9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s46, s7 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s44, s5 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s36, s3 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s38, s1 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s14, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s12, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s8, 16 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[48:49], s[20:21], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[50:51], s[18:19], 0x100000 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s52, s6, 16 @@ -6596,8 +6598,8 @@ ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s58, s0, 16 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[6:7], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[4:5], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[60:61], s[8:9], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[62:63], s[10:11], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 
s[64:65], s[12:13], 0x100000 @@ -6623,50 +6625,50 @@ ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[46:47], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[44:45], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[42:43], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[40:41], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[42:43], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[44:45], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x100000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s37 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x100000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s17 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s10 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s15 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s8 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s13 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s6 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s40 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s41 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: 
s_bfe_i64 s[4:5], s[58:59], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[56:57], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[54:55], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[52:53], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[34:35], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[30:31], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[28:29], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[26:27], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x100000 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s38 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s39 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s70 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s71 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s40 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s41 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s39 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s68 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s69 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 @@ -6679,14 +6681,14 @@ ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s63 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s60 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s61 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s30 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s31 -; GCN-NOHSA-SI-NEXT: 
v_mov_b32_e32 v20, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s25 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, s20 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, s21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s23 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 @@ -6721,21 +6723,21 @@ ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_mov_b32 s42, s15 -; GCN-HSA-NEXT: s_mov_b32 s44, s13 -; GCN-HSA-NEXT: s_mov_b32 s46, s11 -; GCN-HSA-NEXT: s_mov_b32 s48, s9 -; GCN-HSA-NEXT: s_mov_b32 s50, s7 -; GCN-HSA-NEXT: s_mov_b32 s52, s5 -; GCN-HSA-NEXT: s_mov_b32 s54, s3 -; GCN-HSA-NEXT: s_mov_b32 s56, s1 -; GCN-HSA-NEXT: s_lshr_b32 s58, s14, 16 -; GCN-HSA-NEXT: s_lshr_b32 s60, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s62, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s64, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s66, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s68, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s70, s2, 16 -; GCN-HSA-NEXT: s_lshr_b32 s72, s0, 16 +; GCN-HSA-NEXT: s_mov_b32 s48, s13 +; GCN-HSA-NEXT: s_mov_b32 s50, s11 +; GCN-HSA-NEXT: s_mov_b32 s52, s9 +; GCN-HSA-NEXT: s_mov_b32 s54, s7 +; GCN-HSA-NEXT: s_mov_b32 s56, s5 +; GCN-HSA-NEXT: s_mov_b32 s44, s3 +; GCN-HSA-NEXT: s_mov_b32 s58, s1 +; GCN-HSA-NEXT: s_lshr_b32 s60, s14, 16 +; GCN-HSA-NEXT: s_lshr_b32 s62, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s64, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s66, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s68, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s70, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s72, s2, 16 +; GCN-HSA-NEXT: s_lshr_b32 s74, s0, 16 ; GCN-HSA-NEXT: s_bfe_i64 s[18:19], 
s[0:1], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 ; GCN-HSA-NEXT: s_ashr_i64 s[36:37], s[0:1], 48 @@ -6749,7 +6751,7 @@ ; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[12:13], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[14:15], 0x100000 ; GCN-HSA-NEXT: s_ashr_i64 s[40:41], s[4:5], 48 -; GCN-HSA-NEXT: s_ashr_i64 s[74:75], s[6:7], 48 +; GCN-HSA-NEXT: s_ashr_i64 s[46:47], s[6:7], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[76:77], s[8:9], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[78:79], s[10:11], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[80:81], s[12:13], 48 @@ -6757,94 +6759,95 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 -; GCN-HSA-NEXT: s_bfe_i64 s[0:1], s[72:73], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[70:71], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[68:69], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[66:67], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[64:65], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[62:63], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[60:61], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[58:59], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[56:57], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[0:1], s[74:75], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[72:73], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[70:71], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[68:69], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[66:67], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[64:65], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[62:63], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[60:61], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[58:59], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x100000 
-; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 -; GCN-HSA-NEXT: s_add_u32 s56, s16, 0xf0 -; GCN-HSA-NEXT: s_addc_u32 s57, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s44 -; GCN-HSA-NEXT: s_add_u32 s44, s16, 0xd0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s45 -; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s45 -; GCN-HSA-NEXT: s_add_u32 s44, s16, 0xb0 -; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s45 -; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x90 -; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s56 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s45 -; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s57 -; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 +; GCN-HSA-NEXT: s_add_u32 s58, s16, 0xf0 +; GCN-HSA-NEXT: s_addc_u32 s59, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s48 +; GCN-HSA-NEXT: s_add_u32 s48, s16, 0xd0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s49 +; GCN-HSA-NEXT: s_addc_u32 s49, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s49 +; GCN-HSA-NEXT: s_add_u32 s48, s16, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s49, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s58 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s46 +; GCN-HSA-NEXT: s_add_u32 s46, s16, 0x90 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s59 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s47 +; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s40 -; GCN-HSA-NEXT: s_add_u32 s40, s16, 0x50 +; GCN-HSA-NEXT: s_add_u32 s40, s16, 0x70 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s41 ; GCN-HSA-NEXT: s_addc_u32 s41, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s41 +; GCN-HSA-NEXT: s_add_u32 s40, s16, 0x50 +; GCN-HSA-NEXT: s_addc_u32 s41, s17, 0 ; GCN-HSA-NEXT: 
v_mov_b32_e32 v6, s80 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s81 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s48 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38 ; GCN-HSA-NEXT: s_add_u32 s38, s16, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s50 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s78 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s79 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s49 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 ; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s38 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s40 ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s39 ; GCN-HSA-NEXT: s_add_u32 s38, s16, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s47 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s78 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s79 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s56 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s57 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s41 ; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s46 +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23] ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s14 ; GCN-HSA-NEXT: s_add_u32 s14, s16, 0xe0 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s49 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s52 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s53 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s76 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s77 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s47 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s15 ; GCN-HSA-NEXT: s_addc_u32 s15, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s44 ; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s50 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s54 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s12 ; GCN-HSA-NEXT: s_add_u32 s12, s16, 0xc0 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s74 -; 
GCN-HSA-NEXT: v_mov_b32_e32 v19, s75 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s45 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s55 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s13 ; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s52 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s55 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s41 -; GCN-HSA-NEXT: v_mov_b32_e32 v34, s38 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s38 ; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s42 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s43 -; GCN-HSA-NEXT: v_mov_b32_e32 v35, s39 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s39 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s34 @@ -6852,10 +6855,9 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s30 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s31 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s14 -; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s12 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7] ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -3484,8 +3484,8 @@ ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v13 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v19 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, 
v18 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v18 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 ; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill @@ -3503,99 +3503,88 @@ ; GCN-NOHSA-SI-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v16 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v19 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xffff, v18 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:48 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xffff, v19 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v18 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[18:21], off, s[8:11], 0 offset:32 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, 0xffff, v17 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v16 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[22:25], off, s[8:11], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v27 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v26 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 16, v25 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v24 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v27 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v26 -; GCN-NOHSA-SI-NEXT: 
v_and_b32_e32 v22, 0xffff, v25 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v24 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v28, 0xffff, v21 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, 0xffff, v20 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v32, 0xffff, v19 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v30, 0xffff, v18 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v27, 16, v31 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v30 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, 0xffff, v31 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xffff, v30 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:64 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v33, 0xffff, v29 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v31, 0xffff, v28 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v25 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v24 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v25 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v24 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[38:41], off, s[8:11], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v36, 0xffff, v23 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, 0xffff, v22 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v44, 16, v41 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v42, 16, v40 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v48, 16, v39 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v46, 16, v38 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 
v44, 16, v37 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v50, 16, v36 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v48, 16, v35 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v43, 0xffff, v41 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v41, 0xffff, v40 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v47, 0xffff, v39 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v45, 0xffff, v38 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v43, 0xffff, v37 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v49, 0xffff, v36 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v47, 0xffff, v35 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v38, 16, v42 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v36, 16, v41 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v54, 16, v40 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v52, 16, v39 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v37, 0xffff, v42 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v35, 0xffff, v41 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[55:58], off, s[8:11], 0 offset:96 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v53, 0xffff, v40 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v51, 0xffff, v39 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v52, 16, v23 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v56, 16, v21 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v51, 0xffff, v23 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v49, 0xffff, v22 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[22:25], off, s[8:11], 0 offset:96 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v55, 0xffff, v21 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v53, 0xffff, v20 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[57:60], off, s[8:11], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v61, 16, v58 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v59, 16, v57 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v56 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v25 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) +; 
GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v24 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v23 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v55 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v60, 0xffff, v58 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v58, 0xffff, v57 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xffff, v56 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v55 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xffff, v25 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v24 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xffff, v23 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v42 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v41 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v40 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v39 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xffff, v42 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v41 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xffff, v40 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 16, v60 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v59 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v63, 16, v58 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v61, 16, v57 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, 0xffff, v60 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v59 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v62, 0xffff, v58 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v60, 0xffff, v57 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[58:61], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 
offset:160 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:176 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[53:56], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[49:52], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[45:48], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[41:44], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[34:37], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[30:33], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload @@ -3617,132 +3606,132 @@ ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x60 +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x70 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s6, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s8, s2, 48 -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCN-HSA-NEXT: s_addc_u32 s9, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 64 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; 
GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17] -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s8 +; GCN-HSA-NEXT: flat_load_dwordx4 v[32:35], v[32:33] ; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[20:21] -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s4 -; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[28:29] +; GCN-HSA-NEXT: flat_load_dwordx4 v[24:27], v[24:25] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v37, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v36, s0 -; GCN-HSA-NEXT: s_waitcnt vmcnt(6) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 -; GCN-HSA-NEXT: flat_load_dwordx4 v[32:35], v[0:1] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: s_waitcnt vmcnt(7) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; GCN-HSA-NEXT: v_and_b32_e32 v30, 0xffff, v13 +; GCN-HSA-NEXT: v_and_b32_e32 v28, 0xffff, v12 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xf0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[28:31] +; GCN-HSA-NEXT: v_mov_b32_e32 v37, s3 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xc0 +; GCN-HSA-NEXT: v_mov_b32_e32 v36, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 +; GCN-HSA-NEXT: 
s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xd0 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xd0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GCN-HSA-NEXT: v_and_b32_e32 v30, 0xffff, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v28, 0xffff, v14 +; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xa0 ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[24:27] -; GCN-HSA-NEXT: s_add_u32 s10, s0, 0xa0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] -; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[28:31] ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v9 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v7 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s9 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v8 +; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v9 +; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v8 +; 
GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GCN-HSA-NEXT: s_waitcnt vmcnt(9) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v9 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v11 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v11 -; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v10 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v11 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xffff, v11 +; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v10 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[11:14] +; GCN-HSA-NEXT: s_waitcnt vmcnt(9) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v6 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s5 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[7:10] ; GCN-HSA-NEXT: 
s_waitcnt vmcnt(10) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v14 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v13 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v13 -; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v15 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v14 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s4 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s3 ; GCN-HSA-NEXT: s_waitcnt vmcnt(11) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v17 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v16 ; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v17 ; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v16 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v19 -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v19 @@ -3759,23 +3748,24 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-HSA-NEXT: s_waitcnt vmcnt(10) +; GCN-HSA-NEXT: s_waitcnt vmcnt(12) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, 
v33 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v32 ; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v33 ; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v21 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v20 -; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v21 -; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; GCN-HSA-NEXT: s_waitcnt vmcnt(10) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v25 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v25 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v23 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v22 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v23 -; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v22 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v27 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v26 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v27 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v26 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] @@ -3787,18 +3777,18 @@ ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v29 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v28 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v29 -; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v28 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v21 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v20 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v21 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v20 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v31 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v23 ; GCN-HSA-NEXT: 
v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v31 -; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v22 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v23 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v22 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -3833,102 +3823,93 @@ ; GCN-NOHSA-VI-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill ; GCN-NOHSA-VI-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill ; GCN-NOHSA-VI-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v13 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v5, 0xffff, v13 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v3, 0xffff, v12 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v13 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v18 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xffff, v13 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v12 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v16 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xffff, v19 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v18 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xffff, v17 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v16 -; GCN-NOHSA-VI-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:48 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, 0xffff, v19 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v18 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v30, 0xffff, v17 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v16 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v27 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v23 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v22 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v23 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v22 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v34, 0xffff, v21 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xffff, v20 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[36:39], off, s[8:11], 0 offset:80 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v46, 16, v38 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v44, 16, v37 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v50, 16, v36 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v48, 16, v35 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v45, 0xffff, v38 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v43, 0xffff, v37 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v49, 0xffff, v36 -; 
GCN-NOHSA-VI-NEXT: v_and_b32_e32 v47, 0xffff, v35 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v43, 16, v23 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v47, 16, v21 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v45, 16, v20 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v42, 0xffff, v23 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v40, 0xffff, v22 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v46, 0xffff, v21 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v44, 0xffff, v20 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v38, 16, v42 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v36, 16, v41 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v54, 16, v40 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v52, 16, v39 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v37, 0xffff, v42 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v35, 0xffff, v41 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v53, 0xffff, v40 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v51, 0xffff, v39 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[55:58], off, s[8:11], 0 offset:112 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v26 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v25 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v24 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v27 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v26 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, 0xffff, v25 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v24 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v31 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v25, 16, v30 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, 0xffff, v31 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v30 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v33, 0xffff, v29 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v31, 0xffff, v28 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v39 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v38 +; 
GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v51, 16, v37 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v49, 16, v36 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, 0xffff, v39 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v38 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v50, 0xffff, v37 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, 0xffff, v36 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[36:39], off, s[8:11], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[52:55], off, s[8:11], 0 offset:112 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v62, 16, v42 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v59, 16, v39 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v56 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v56 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v60, 16, v41 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v40 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v39 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v61, 0xffff, v42 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v59, 0xffff, v41 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xffff, v40 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v39 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v42, 16, v58 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v40, 16, v57 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v41, 0xffff, v58 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v39, 0xffff, v57 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v53 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v52 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v57, 16, v38 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v63, 16, v37 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v61, 16, v36 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v58, 0xffff, v39 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v56, 0xffff, v38 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v62, 0xffff, v37 +; GCN-NOHSA-VI-NEXT: 
v_and_b32_e32 v60, 0xffff, v36 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v39, 16, v55 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v37, 16, v54 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v38, 0xffff, v55 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, 0xffff, v54 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[59:62], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: 
buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:32 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload ; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload ; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload @@ -4432,8 +4413,8 @@ ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x50 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 @@ -4447,93 +4428,93 @@ ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s6, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 -; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s5 -; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 +; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s7 +; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[20:21] -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 +; 
GCN-HSA-NEXT: v_mov_b32_e32 v28, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 +; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[28:29] +; GCN-HSA-NEXT: flat_load_dwordx4 v[32:35], v[32:33] +; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v37, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v36, s0 -; GCN-HSA-NEXT: s_waitcnt vmcnt(5) +; GCN-HSA-NEXT: s_waitcnt vmcnt(7) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 16, v13 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 16, v12 ; GCN-HSA-NEXT: v_bfe_i32 v26, v13, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v24, v12, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 -; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[12:13] -; GCN-HSA-NEXT: flat_load_dwordx4 v[32:35], v[32:33] -; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[24:27] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[24:27] +; GCN-HSA-NEXT: v_mov_b32_e32 v37, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v36, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 16, v15 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 16, v14 ; GCN-HSA-NEXT: v_bfe_i32 v26, v15, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v24, v14, 0, 16 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[24:27] ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v9 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v8 ; GCN-HSA-NEXT: v_bfe_i32 v14, v9, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v12, v8, 0, 16 -; GCN-HSA-NEXT: 
v_mov_b32_e32 v9, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v37, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v36, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v11 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v10 ; GCN-HSA-NEXT: v_bfe_i32 v13, v11, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v11, v10, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[11:14] -; GCN-HSA-NEXT: s_waitcnt vmcnt(9) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[11:14] +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v39, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v38, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 +; GCN-HSA-NEXT: s_waitcnt vmcnt(9) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v5 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v4 ; GCN-HSA-NEXT: v_bfe_i32 v10, v5, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v8, v4, 0, 16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v6 -; GCN-HSA-NEXT: 
v_bfe_i32 v9, v7, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[7:10] -; GCN-HSA-NEXT: s_waitcnt vmcnt(10) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v6 +; GCN-HSA-NEXT: v_bfe_i32 v14, v7, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v12, v6, 0, 16 +; GCN-HSA-NEXT: s_waitcnt vmcnt(8) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v0 ; GCN-HSA-NEXT: v_bfe_i32 v6, v1, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[12:15] ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v2 ; GCN-HSA-NEXT: v_bfe_i32 v10, v3, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 @@ -4570,20 +4551,20 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] -; GCN-HSA-NEXT: s_waitcnt vmcnt(11) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v33 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v32 -; GCN-HSA-NEXT: v_bfe_i32 v14, v33, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v12, v32, 0, 16 +; GCN-HSA-NEXT: s_waitcnt vmcnt(12) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v29 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v28 +; GCN-HSA-NEXT: v_bfe_i32 v14, v29, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v12, v28, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; 
GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v35 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v31 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v34 -; GCN-HSA-NEXT: v_bfe_i32 v10, v35, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v8, v34, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v30 +; GCN-HSA-NEXT: v_bfe_i32 v10, v31, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v30, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v21 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v20 @@ -4593,18 +4574,19 @@ ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v29 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v28 -; GCN-HSA-NEXT: v_bfe_i32 v6, v29, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v4, v28, 0, 16 +; GCN-HSA-NEXT: s_waitcnt vmcnt(14) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v33 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v32 +; GCN-HSA-NEXT: v_bfe_i32 v6, v33, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v4, v32, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v31 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v35 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v30 -; GCN-HSA-NEXT: v_bfe_i32 v2, v31, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v30, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v34 +; GCN-HSA-NEXT: v_bfe_i32 v2, v35, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v0, v34, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -6527,51 +6509,50 @@ ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-SI-NEXT: 
v_lshrrev_b32_e32 v10, 16, v1 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v2 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v1 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, 0 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v2 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v3 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, 0 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, 0xffff, v6 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xffff, v6 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, 0xffff, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, 0xffff, v5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v29, v20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, v20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v20 +; 
GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v27, 0xffff, v7 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v28, v21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v30, v21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, v21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v21 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, 0 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, 0 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 
v[16:19], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64: @@ -6669,10 +6650,10 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v30, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, 0 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v3 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir --- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir @@ -1,5 +1,5 @@ -# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass machine-scheduler %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck -check-prefix=DEBUG %s -# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass=machine-scheduler -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX908 %s +# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass machine-scheduler -amdgpu-disable-unclustred-high-rp-reschedule %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck -check-prefix=DEBUG %s +# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass=machine-scheduler -amdgpu-disable-unclustred-high-rp-reschedule -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX908 %s # REQUIRES: asserts --- diff --git a/llvm/test/CodeGen/AMDGPU/pr51516.mir b/llvm/test/CodeGen/AMDGPU/pr51516.mir --- a/llvm/test/CodeGen/AMDGPU/pr51516.mir +++ 
b/llvm/test/CodeGen/AMDGPU/pr51516.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs -start-before=machine-scheduler -stop-after=virtregrewriter,1 -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustred-high-rp-reschedule -verify-machineinstrs -start-before=machine-scheduler -stop-after=virtregrewriter,1 -o - %s | FileCheck -check-prefix=GCN %s # Check that %3 was not rematerialized before the last store since its operand %1 # is killed by that store. diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -452,92 +452,101 @@ ; GFX8-NEXT: v_mov_b32_e32 v6, 0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: v_mov_b32_e32 v7, 0 -; GFX8-NEXT: s_movk_i32 s0, 0x7f +; GFX8-NEXT: s_movk_i32 s4, 0x7f ; GFX8-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB1_2 Depth 2 ; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: s_mov_b32 s1, 0 +; GFX8-NEXT: s_mov_b32 s5, 0 ; GFX8-NEXT: .LBB1_2: ; %for.body ; GFX8-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0xffffb000, v4 -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, -1, v5, vcc -; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[8:9] +; GFX8-NEXT: s_mov_b64 s[0:1], vcc ; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0xffffb800, v4 -; GFX8-NEXT: v_addc_u32_e32 v11, vcc, -1, v5, vcc -; GFX8-NEXT: flat_load_dwordx2 v[10:11], v[10:11] +; GFX8-NEXT: s_mov_b64 s[2:3], vcc +; GFX8-NEXT: v_addc_u32_e64 v9, vcc, -1, v5, s[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[8:9] ; GFX8-NEXT: v_add_u32_e32 v12, vcc, 0xffffc000, v4 -; GFX8-NEXT: v_addc_u32_e32 v13, vcc, -1, v5, vcc +; GFX8-NEXT: s_mov_b64 s[0:1], 
vcc +; GFX8-NEXT: v_addc_u32_e64 v11, vcc, -1, v5, s[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[10:11], v[10:11] ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xffffc800, v4 -; GFX8-NEXT: v_addc_u32_e32 v15, vcc, -1, v5, vcc -; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xffffd000, v4 +; GFX8-NEXT: s_mov_b64 s[2:3], vcc +; GFX8-NEXT: v_addc_u32_e64 v13, vcc, -1, v5, s[0:1] +; GFX8-NEXT: s_addk_i32 s5, 0x2000 +; GFX8-NEXT: s_cmp_gt_u32 s5, 0x3fffff +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v8, v6 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc +; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[12:13] +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xffffd000, v4 +; GFX8-NEXT: s_mov_b64 s[0:1], vcc +; GFX8-NEXT: v_addc_u32_e64 v15, vcc, -1, v5, s[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[12:13], v[14:15] +; GFX8-NEXT: s_waitcnt vmcnt(2) +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v10, v16 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v11, v7, vcc +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0xffffd800, v4 +; GFX8-NEXT: s_mov_b64 s[2:3], vcc +; GFX8-NEXT: v_addc_u32_e64 v7, vcc, -1, v5, s[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7] +; GFX8-NEXT: s_waitcnt vmcnt(2) +; GFX8-NEXT: v_add_u32_e32 v14, vcc, v8, v16 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0xffffe000, v4 +; GFX8-NEXT: s_mov_b64 s[0:1], vcc +; GFX8-NEXT: v_addc_u32_e64 v11, vcc, -1, v5, s[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[10:11], v[10:11] +; GFX8-NEXT: s_waitcnt vmcnt(2) +; GFX8-NEXT: v_add_u32_e32 v14, vcc, v12, v14 +; GFX8-NEXT: v_addc_u32_e32 v13, vcc, v13, v9, vcc +; GFX8-NEXT: v_add_u32_e32 v12, vcc, 0xffffe800, v4 +; GFX8-NEXT: s_mov_b64 s[2:3], vcc +; GFX8-NEXT: v_addc_u32_e64 v9, vcc, -1, v5, s[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[8:9] +; GFX8-NEXT: s_waitcnt vmcnt(2) +; GFX8-NEXT: v_add_u32_e32 v14, vcc, v6, v14 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v13, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xfffff000, v4 +; GFX8-NEXT: s_mov_b64 s[0:1], vcc +; GFX8-NEXT: 
v_addc_u32_e64 v13, vcc, -1, v5, s[2:3] ; GFX8-NEXT: flat_load_dwordx2 v[12:13], v[12:13] -; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[14:15] -; GFX8-NEXT: v_addc_u32_e32 v17, vcc, -1, v5, vcc -; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0xffffd800, v4 -; GFX8-NEXT: v_addc_u32_e32 v19, vcc, -1, v5, vcc -; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xffffe000, v4 -; GFX8-NEXT: flat_load_dwordx2 v[16:17], v[16:17] -; GFX8-NEXT: flat_load_dwordx2 v[18:19], v[18:19] -; GFX8-NEXT: v_addc_u32_e32 v21, vcc, -1, v5, vcc -; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xffffe800, v4 -; GFX8-NEXT: v_addc_u32_e32 v23, vcc, -1, v5, vcc -; GFX8-NEXT: v_add_u32_e32 v24, vcc, 0xfffff000, v4 -; GFX8-NEXT: flat_load_dwordx2 v[20:21], v[20:21] -; GFX8-NEXT: flat_load_dwordx2 v[22:23], v[22:23] -; GFX8-NEXT: v_addc_u32_e32 v25, vcc, -1, v5, vcc -; GFX8-NEXT: s_addk_i32 s1, 0x2000 -; GFX8-NEXT: s_cmp_gt_u32 s1, 0x3fffff -; GFX8-NEXT: s_waitcnt vmcnt(7) -; GFX8-NEXT: v_add_u32_e32 v26, vcc, v8, v6 -; GFX8-NEXT: v_addc_u32_e32 v27, vcc, v9, v7, vcc -; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[24:25] -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xfffff800, v4 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v5, vcc +; GFX8-NEXT: s_waitcnt vmcnt(2) +; GFX8-NEXT: v_add_u32_e32 v14, vcc, v10, v14 +; GFX8-NEXT: v_addc_u32_e32 v15, vcc, v11, v7, vcc +; GFX8-NEXT: v_addc_u32_e64 v7, s[0:1], -1, v5, s[0:1] +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0xfffff800, v4 ; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7] -; GFX8-NEXT: flat_load_dwordx2 v[24:25], v[4:5] +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, -1, v5, vcc +; GFX8-NEXT: flat_load_dwordx2 v[10:11], v[10:11] +; GFX8-NEXT: s_waitcnt vmcnt(3) +; GFX8-NEXT: v_add_u32_e32 v14, vcc, v8, v14 +; GFX8-NEXT: v_addc_u32_e32 v15, vcc, v9, v15, vcc +; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[4:5] ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x10000, v4 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GFX8-NEXT: s_waitcnt vmcnt(9) -; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v26 -; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 
v11, v27, vcc -; GFX8-NEXT: s_waitcnt vmcnt(8) -; GFX8-NEXT: v_add_u32_e32 v10, vcc, v12, v10 -; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v13, v11, vcc -; GFX8-NEXT: s_waitcnt vmcnt(7) -; GFX8-NEXT: v_add_u32_e32 v10, vcc, v14, v10 -; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v15, v11, vcc -; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_add_u32_e32 v10, vcc, v16, v10 -; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v17, v11, vcc -; GFX8-NEXT: s_waitcnt vmcnt(5) -; GFX8-NEXT: v_add_u32_e32 v10, vcc, v18, v10 -; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v19, v11, vcc -; GFX8-NEXT: s_waitcnt vmcnt(4) -; GFX8-NEXT: v_add_u32_e32 v10, vcc, v20, v10 -; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v21, v11, vcc ; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_add_u32_e32 v10, vcc, v22, v10 -; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v23, v11, vcc +; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v14 +; GFX8-NEXT: v_addc_u32_e32 v13, vcc, v13, v15, vcc ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v10 -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v12 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v13, vcc ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v10, v6 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v11, v7, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v24, v6 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v25, v7, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc ; GFX8-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX8-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX8-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; GFX8-NEXT: s_add_i32 s1, s0, -1 -; GFX8-NEXT: s_cmp_eq_u32 s0, 0 +; GFX8-NEXT: s_add_i32 s0, s4, -1 +; GFX8-NEXT: s_cmp_eq_u32 s4, 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB1_5 ; GFX8-NEXT: ; %bb.4: ; in Loop: Header=BB1_1 Depth=1 -; GFX8-NEXT: s_mov_b32 s0, s1 +; GFX8-NEXT: s_mov_b32 s4, s0 ; 
GFX8-NEXT: s_branch .LBB1_1 ; GFX8-NEXT: .LBB1_5: ; %while.end ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[6:7] @@ -593,62 +602,63 @@ ; GFX900-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX900-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, 0xffffb000, v4 -; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v5, vcc +; GFX900-NEXT: s_mov_b64 s[0:1], vcc +; GFX900-NEXT: v_addc_co_u32_e64 v9, s[0:1], -1, v5, s[0:1] ; GFX900-NEXT: global_load_dwordx2 v[10:11], v[4:5], off offset:-4096 ; GFX900-NEXT: global_load_dwordx2 v[12:13], v[4:5], off offset:-2048 ; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v4 ; GFX900-NEXT: global_load_dwordx2 v[8:9], v[8:9], off ; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v5, vcc -; GFX900-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048 -; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, s2, v4 -; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, -1, v5, vcc -; GFX900-NEXT: global_load_dwordx2 v[22:23], v[14:15], off -; GFX900-NEXT: global_load_dwordx2 v[24:25], v[16:17], off offset:-2048 -; GFX900-NEXT: v_add_co_u32_e32 v20, vcc, s3, v4 -; GFX900-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v5, vcc -; GFX900-NEXT: global_load_dwordx2 v[16:17], v[20:21], off offset:-4096 -; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, s5, v4 -; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v5, vcc ; GFX900-NEXT: s_addk_i32 s6, 0x2000 ; GFX900-NEXT: s_cmp_gt_u32 s6, 0x3fffff -; GFX900-NEXT: s_waitcnt vmcnt(4) -; GFX900-NEXT: v_add_co_u32_e64 v28, s[0:1], v8, v6 -; GFX900-NEXT: v_addc_co_u32_e64 v29, s[0:1], v9, v7, s[0:1] -; GFX900-NEXT: global_load_dwordx2 v[6:7], v[20:21], off offset:-2048 -; GFX900-NEXT: global_load_dwordx2 v[8:9], v[20:21], off -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: global_load_dwordx2 v[20:21], v[14:15], off offset:-2048 -; GFX900-NEXT: global_load_dwordx2 v[26:27], v[4:5], off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, v8, v6 +; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v7, vcc +; 
GFX900-NEXT: global_load_dwordx2 v[6:7], v[14:15], off offset:-2048 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, v6, v8 +; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, v7, v9, vcc +; GFX900-NEXT: global_load_dwordx2 v[7:8], v[14:15], off +; GFX900-NEXT: v_add_co_u32_e32 v6, vcc, s2, v4 +; GFX900-NEXT: s_mov_b64 s[0:1], vcc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v7, v16 +; GFX900-NEXT: v_addc_co_u32_e64 v7, s[0:1], -1, v5, s[0:1] +; GFX900-NEXT: global_load_dwordx2 v[6:7], v[6:7], off offset:-2048 +; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v8, v9, vcc +; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, s3, v4 +; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v5, vcc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v6, v14 +; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v7, v15, vcc +; GFX900-NEXT: global_load_dwordx2 v[6:7], v[8:9], off offset:-4096 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v6, v14 +; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v7, v15, vcc +; GFX900-NEXT: global_load_dwordx2 v[6:7], v[8:9], off offset:-2048 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v6, v14 +; GFX900-NEXT: global_load_dwordx2 v[8:9], v[8:9], off +; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v7, v15, vcc +; GFX900-NEXT: v_add_co_u32_e32 v6, vcc, s5, v4 +; GFX900-NEXT: v_addc_co_u32_e32 v7, vcc, -1, v5, vcc +; GFX900-NEXT: global_load_dwordx2 v[6:7], v[6:7], off offset:-2048 +; GFX900-NEXT: s_waitcnt vmcnt(1) +; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v8, v14 +; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v9, v15, vcc +; GFX900-NEXT: global_load_dwordx2 v[8:9], v[4:5], off ; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, 0x10000, v4 ; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX900-NEXT: s_waitcnt vmcnt(7) -; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v18, v28 -; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v19, v29, vcc -; GFX900-NEXT: 
s_waitcnt vmcnt(6) -; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v22, v14 -; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v23, v15, vcc -; GFX900-NEXT: s_waitcnt vmcnt(5) -; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v24, v14 -; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v25, v15, vcc -; GFX900-NEXT: s_waitcnt vmcnt(4) -; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v16, v14 -; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v17, v15, vcc -; GFX900-NEXT: s_waitcnt vmcnt(3) +; GFX900-NEXT: s_waitcnt vmcnt(1) ; GFX900-NEXT: v_add_co_u32_e32 v6, vcc, v6, v14 ; GFX900-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v15, vcc -; GFX900-NEXT: s_waitcnt vmcnt(2) -; GFX900-NEXT: v_add_co_u32_e32 v6, vcc, v8, v6 -; GFX900-NEXT: v_addc_co_u32_e32 v7, vcc, v9, v7, vcc -; GFX900-NEXT: s_waitcnt vmcnt(1) -; GFX900-NEXT: v_add_co_u32_e32 v6, vcc, v20, v6 -; GFX900-NEXT: v_addc_co_u32_e32 v7, vcc, v21, v7, vcc ; GFX900-NEXT: v_add_co_u32_e32 v6, vcc, v10, v6 ; GFX900-NEXT: v_addc_co_u32_e32 v7, vcc, v11, v7, vcc ; GFX900-NEXT: v_add_co_u32_e32 v6, vcc, v12, v6 ; GFX900-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v7, vcc ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v6, vcc, v26, v6 -; GFX900-NEXT: v_addc_co_u32_e32 v7, vcc, v27, v7, vcc +; GFX900-NEXT: v_add_co_u32_e32 v6, vcc, v8, v6 +; GFX900-NEXT: v_addc_co_u32_e32 v7, vcc, v9, v7, vcc ; GFX900-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX900-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX900-NEXT: ; in Loop: Header=BB1_1 Depth=1