diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -349,6 +349,12 @@
     cl::desc("Enable rewrite partial reg uses pass"), cl::init(false),
     cl::Hidden);
 
+static cl::opt EnableBalancedSchedStrategy(
+    "amdgpu-enable-balanced-scheduling-strategy",
+    cl::desc(
+        "Enable scheduling strategy to tradeoff between ILP and occupancy."),
+    cl::Hidden, cl::init(true));
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine X(getTheR600Target());
@@ -456,6 +462,22 @@
   return DAG;
 }
 
+/// Factory for the scheduler picked when EnableBalancedSchedStrategy is set.
+/// Store clustering is only added if the subtarget requests it.
+static ScheduleDAGInstrs *
+createGCNBalancedMachineScheduler(MachineSchedContext *C) {
+  const GCNSubtarget &ST = C->MF->getSubtarget();
+  ScheduleDAGMILive *DAG =
+      new GCNScheduleDAGMILive(C, std::make_unique(C));
+  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+  if (ST.shouldClusterStores())
+    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
+  DAG->addMutation(createIGroupLPDAGMutation());
+  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
+  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
+  return DAG;
+}
+
 static ScheduleDAGInstrs *
 createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
   const GCNSubtarget &ST = C->MF->getSubtarget();
@@ -1120,6 +1142,9 @@
   if (EnableMaxIlpSchedStrategy)
     return createGCNMaxILPMachineScheduler(C);
 
+  if (EnableBalancedSchedStrategy)
+    return createGCNBalancedMachineScheduler(C);
+
   return createGCNMaxOccupancyMachineScheduler(C);
 }
 
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -116,6 +116,17 @@
   bool hasNextStage() const;
 
   GCNSchedStageID getNextStage() const;
+
+  /// Hook for strategies that score their schedules. Returns true when the
+  /// schedule just produced for region \p RegionIdx is judged worse than the
+  /// previously recorded one. The default never objects.
+  virtual bool computeScheduleMetric(unsigned RegionIdx, unsigned WavesAfter,
+                                     unsigned WavesBefore) {
+    return false;
+  }
+
+  /// Hook to reset any per-region metric state. No-op by default.
+  virtual void clearMetric() {}
 };
 
 /// The goal of this scheduling strategy is to maximize kernel occupancy (i.e.
@@ -423,6 +434,102 @@
                            bool RemoveKillFlags);
 };
 
+#ifndef NDEBUG
+/// Orders (instruction, ready cycle) pairs by ready cycle only.
+struct EarlierIssuingCycle {
+  bool operator()(std::pair A,
+                  std::pair B) const {
+    return A.second < B.second;
+  }
+};
+#endif
+
+/// The goal of this scheduling strategy is to find a reasonable tradeoff
+/// between the kernel occupancy (i.e. maximum number of waves per SIMD) and
+/// ILP (i.e. minimizing the amount of stall cycles by means of better latency
+/// covering).
+class GCNBalancedSchedStrategy final : public GCNSchedStrategy {
+
+  /// Fixed-point scale used by the integer metric and profit arithmetic.
+  static constexpr unsigned ScaleFactor = 100;
+  /// Stall cycles accumulated by schedNode() since the last clearMetric().
+  unsigned StallTotal = 0;
+  /// Metrics recorded by computeScheduleMetric(), keyed by region index.
+  DenseMap> Metrics;
+
+  /// Reset the per-region stall counter and, in asserts builds, the
+  /// bookkeeping used for printing the schedule.
+  void clearMetric() override {
+    StallTotal = 0;
+#ifndef NDEBUG
+    BottomScheduledSU.clear();
+    PrintableSchedule.clear();
+#endif
+  }
+
+#ifndef NDEBUG
+  // NOTE(review): the comparator looks only at the cycle, so two instructions
+  // with the same ready cycle cannot both be stored here -- confirm that
+  // dropping one of them from the debug dump is acceptable.
+  std::set, EarlierIssuingCycle> PrintableSchedule;
+  // The absolute cycle of a bottom-scheduled node is not known until the
+  // region is finished, so remember those nodes here and add them to
+  // PrintableSchedule afterwards (see makePrintableSchedule).
+  std::vector BottomScheduledSU;
+
+  /// Convert the bottom-up ready cycles of the remembered nodes into
+  /// top-down ones (ScheduleLength - BotReadyCycle) and record them.
+  void makePrintableSchedule(unsigned ScheduleLength) {
+    for (auto SU : BottomScheduledSU) {
+      unsigned BotReadyCycle = ScheduleLength - SU->BotReadyCycle;
+      PrintableSchedule.insert(std::pair(SU->getInstr(), BotReadyCycle));
+    }
+  }
+
+  /// Dump the recorded schedule to dbgs(), flagging gaps between the ready
+  /// cycles of consecutive entries as bubbles.
+  void printSchedule() {
+    if (PrintableSchedule.empty())
+      return;
+
+    unsigned BBNum = PrintableSchedule.begin()->first->getParent()->getNumber();
+    dbgs() << "\n################## Schedule time ReadyCycles for MBB : "
+           << BBNum
+           << " ##################\n# Cycle #\t\t\tInstruction "
+              " "
+              " \n";
+    unsigned IPrev = 1;
+    for (auto &I : PrintableSchedule) {
+      if (I.second > IPrev + 1)
+        dbgs() << "****************************** BUBBLE OF "
+               << I.second - IPrev
+               << " CYCLES DETECTED ******************************\n\n";
+      dbgs() << "[ " << I.second << " ] : " << *I.first << "\n";
+      IPrev = I.second;
+    }
+  }
+#endif
+
+public:
+  /// Registers the initial, unclustered high-RP, clustered low-occupancy and
+  /// pre-RA rematerialization stages, in that order.
+  GCNBalancedSchedStrategy(const MachineSchedContext *C) : GCNSchedStrategy(C) {
+    SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
+    SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
+    SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
+    SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
+  }
+
+  /// Adds the stall cycles of \p SU to StallTotal, then defers to the base.
+  void schedNode(SUnit *SU, bool IsTopNode) override;
+
+  /// Scores the schedule just built for \p RegionIdx and compares it, weighted
+  /// by the occupancy change, against the last recorded score. Returns true if
+  /// the new schedule is the less profitable one.
+  bool computeScheduleMetric(unsigned RegionIdx, unsigned WavesAfter,
+                             unsigned WavesBefore) override;
+};
+
 } // End namespace llvm
 
 #endif // LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -839,6 +839,7 @@
 
   S.HasHighPressure = false;
   S.KnownExcessRP = isRegionWithExcessRP();
+  S.clearMetric();
 
   if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
       StageID != GCNSchedStageID::UnclusteredHighRPReschedule) {
@@ -970,9 +971,10 @@
     DAG.RegionsWithExcessRP[RegionIdx] = true;
   }
 
-  // Revert if this region's schedule would cause a drop in occupancy or
-  // spilling.
-  if (shouldRevertScheduling(WavesAfter)) {
+  if (shouldRevertScheduling(WavesAfter) || // per-stage hard checks first
+      (S.computeScheduleMetric(RegionIdx, WavesAfter, WavesBefore) &&
+       !(DAG.RegionsWithExcessRP[RegionIdx] && S.getCurrentStage() ==
+             GCNSchedStageID::UnclusteredHighRPReschedule))) {
     revertScheduling();
   } else {
     DAG.Pressure[RegionIdx] = PressureAfter;
@@ -998,103 +1000,6 @@
   return ReadyCycle;
 }
 
-#ifndef NDEBUG
-struct EarlierIssuingCycle {
-  bool operator()(std::pair A,
-                  std::pair B) const {
-    return A.second < B.second;
-  }
-};
-
-static void printScheduleModel(std::set,
-                               EarlierIssuingCycle> &ReadyCycles) {
-  if (ReadyCycles.empty())
-    return;
-  unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
-  dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
-         << " ##################\n# Cycle #\t\t\tInstruction "
-            " "
-            " \n";
-  unsigned IPrev = 1;
-  for (auto &I : ReadyCycles) {
-    if (I.second > IPrev + 1)
-      dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
-             << " CYCLES DETECTED ******************************\n\n";
-    dbgs() << "[ " << I.second << " ] : " << *I.first << "\n";
-    IPrev = I.second;
-  }
-}
-#endif
-
-ScheduleMetrics
-GCNSchedStage::getScheduleMetrics(const std::vector &InputSchedule) {
-#ifndef NDEBUG
-  std::set, EarlierIssuingCycle>
-      ReadyCyclesSorted;
-#endif
-  const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
-  unsigned SumBubbles = 0;
-  DenseMap ReadyCycles;
-  unsigned CurrCycle = 0;
-  for (auto &SU : InputSchedule) {
-    unsigned ReadyCycle =
-        computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
-    SumBubbles += ReadyCycle - CurrCycle;
-#ifndef NDEBUG
-    ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
-#endif
-    CurrCycle = ++ReadyCycle;
-  }
-#ifndef NDEBUG
-  LLVM_DEBUG(
-      printScheduleModel(ReadyCyclesSorted);
-      dbgs() << "\n\t"
-             << "Metric: "
-             << (SumBubbles
-                     ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
-                     : 1)
-             << "\n\n");
-#endif
-
-  return ScheduleMetrics(CurrCycle, SumBubbles);
-}
-
-ScheduleMetrics
-GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) {
-#ifndef NDEBUG
-  std::set, EarlierIssuingCycle>
-      ReadyCyclesSorted;
-#endif
-  const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
-  unsigned SumBubbles = 0;
-  DenseMap ReadyCycles;
-  unsigned CurrCycle = 0;
-  for (auto &MI : DAG) {
-    SUnit *SU = DAG.getSUnit(&MI);
-    if (!SU)
-      continue;
-    unsigned ReadyCycle =
-        computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM);
-    SumBubbles += ReadyCycle - CurrCycle;
-#ifndef NDEBUG
-    ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
-#endif
-    CurrCycle = ++ReadyCycle;
-  }
-#ifndef NDEBUG
-  LLVM_DEBUG(
-      printScheduleModel(ReadyCyclesSorted);
-      dbgs() << "\n\t"
-             << "Metric: "
-             << (SumBubbles
-                     ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
-                     : 1)
-             << "\n\n");
-#endif
-
-  return ScheduleMetrics(CurrCycle, SumBubbles);
-}
-
 bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
   if (WavesAfter < DAG.MinOccupancy)
     return true;
@@ -1125,32 +1030,7 @@
     return true;
   }
 
-  // Do not attempt to relax schedule even more if we are already spilling.
-  if (isRegionWithExcessRP())
-    return false;
-
-  LLVM_DEBUG(
-      dbgs()
-      << "\n\t *** In shouldRevertScheduling ***\n"
-      << " *********** BEFORE UnclusteredHighRPStage ***********\n");
-  ScheduleMetrics MBefore =
-      getScheduleMetrics(DAG.SUnits);
-  LLVM_DEBUG(
-      dbgs()
-      << "\n *********** AFTER UnclusteredHighRPStage ***********\n");
-  ScheduleMetrics MAfter = getScheduleMetrics(DAG);
-  unsigned OldMetric = MBefore.getMetric();
-  unsigned NewMetric = MAfter.getMetric();
-  unsigned WavesBefore =
-      std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST));
-  unsigned Profit =
-      ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
-       ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
-       NewMetric) /
-      ScheduleMetrics::ScaleFactor;
-  LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
-                    << MAfter << "Profit: " << Profit << "\n");
-  return Profit < ScheduleMetrics::ScaleFactor;
+  return false;
 }
 
 bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
@@ -1570,3 +1450,63 @@
 
   ScheduleDAGMI::finalizeSchedule();
 }
+
+/// Record how long the picked node had to wait for its zone before handing it
+/// to the base strategy.
+void llvm::GCNBalancedSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
+  if (IsTopNode) {
+#ifndef NDEBUG
+    PrintableSchedule.insert(std::pair(SU->getInstr(), SU->TopReadyCycle));
+#endif
+    StallTotal += SU->TopReadyCycle > Top.getCurrCycle()
+                      ? SU->TopReadyCycle - Top.getCurrCycle()
+                      : 0;
+  } else {
+    // BottomScheduledSU is only declared in asserts builds.
+#ifndef NDEBUG
+    BottomScheduledSU.push_back(SU);
+#endif
+    StallTotal += SU->BotReadyCycle > Bot.getCurrCycle()
+                      ? SU->BotReadyCycle - Bot.getCurrCycle()
+                      : 0;
+  }
+  GCNSchedStrategy::schedNode(SU, IsTopNode);
+}
+
+/// Score the schedule just produced for \p RegionIdx as accumulated stall
+/// cycles, scaled by ScaleFactor, over the schedule length (never below 1) and
+/// compare it with the last score recorded for the region, weighted by
+/// WavesAfter / WavesBefore. Returns true when the new schedule is less
+/// profitable; otherwise the new score is recorded. Counters are cleared.
+bool llvm::GCNBalancedSchedStrategy::computeScheduleMetric(
+    unsigned RegionIdx, unsigned WavesAfter, unsigned WavesBefore) {
+  bool Result = false;
+  unsigned ScheduleLength = Top.getCurrCycle() + Bot.getCurrCycle();
+#ifndef NDEBUG
+  makePrintableSchedule(ScheduleLength);
+#endif
+  unsigned PrevMetric = 0;
+  auto PrevIt = Metrics.find(RegionIdx);
+  if (PrevIt != Metrics.end())
+    PrevMetric = PrevIt->second.back();
+  // An empty schedule has no stalls; avoid dividing by a zero length.
+  unsigned Metric =
+      ScheduleLength ? StallTotal * ScaleFactor / ScheduleLength : 0;
+  Metric = Metric ? Metric : 1;
+#ifndef NDEBUG
+  LLVM_DEBUG(printSchedule());
+#endif
+  // Without a previous score or a non-zero baseline occupancy there is
+  // nothing to compare against, so the schedule is accepted.
+  if (PrevMetric && WavesBefore) {
+    unsigned Profit =
+        ((WavesAfter * ScaleFactor) / WavesBefore *
+         ((PrevMetric + ScheduleMetricBias) * ScaleFactor) / Metric) /
+        ScaleFactor;
+    Result = Profit < ScaleFactor;
+  }
+  if (!Result)
+    Metrics[RegionIdx].push_back(Metric);
+  clearMetric();
+  return Result;
+}
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
@@ -2399,125 +2399,125 @@
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:160
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:152
+; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:148
+; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:144
+; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:140
+; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:136
+; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:132
+; GFX9-NEXT: s_waitcnt vmcnt(7)
 ; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:284
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:156
-; 
GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:280 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:152 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:276 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:272 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:268 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:140 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:264 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:136 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:260 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:256 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:280 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:276 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:272 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:268 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:264 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:260 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:256 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 +; GFX9-NEXT: 
buffer_load_dword v34, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:100 +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:252 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:248 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:244 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:240 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:236 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:232 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:228 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:224 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:248 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:244 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:240 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: 
buffer_store_dword v36, v0, s[0:3], 0 offen offset:236 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:232 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:228 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:224 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68 +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:220 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:216 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:212 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:208 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:204 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:200 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen 
offset:196 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:192 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:216 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:212 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:208 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:204 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:200 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:196 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:192 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:188 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:184 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:180 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword 
v32, v0, s[0:3], 0 offen offset:176 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:172 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:168 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:164 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:160 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:184 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:180 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:176 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:172 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:168 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:164 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:160 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(7) ; 
GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:156 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:152 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:148 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:144 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:140 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:136 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:132 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:128 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:152 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:148 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:144 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:140 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:136 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:132 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:128 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 
GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:124 @@ -2834,8 +2834,8 @@ ; GFX9-NEXT: s_mov_b32 s36, s33 ; GFX9-NEXT: s_add_i32 s33, s32, 0x7fc0 ; GFX9-NEXT: s_and_b32 s33, s33, 0xffff8000 -; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_add_i32 s32, s32, 0x28000 ; GFX9-NEXT: s_getpc_b64 s[34:35] @@ -2843,7 +2843,6 @@ ; GFX9-NEXT: s_addc_u32 s35, s35, return_72xi32@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill @@ -2901,7 +2900,7 @@ ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 -; GFX9-NEXT: v_writelane_b32 v33, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x200, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2934,27 +2933,27 @@ ; GFX9-NEXT: v_mov_b32_e32 v29, 0 ; GFX9-NEXT: v_mov_b32_e32 v30, 0 ; GFX9-NEXT: v_mov_b32_e32 v31, 0 -; GFX9-NEXT: v_writelane_b32 v33, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:636 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:640 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:644 -; 
GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s33 offset:648 -; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s33 offset:652 -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s33 offset:656 -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s33 offset:660 -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s33 offset:664 -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:668 -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s33 offset:672 -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s33 offset:676 -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s33 offset:680 -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s33 offset:684 -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s33 offset:688 -; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s33 offset:692 -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s33 offset:696 -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s33 offset:700 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:704 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:648 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s33 offset:652 +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s33 offset:656 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s33 offset:660 +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s33 offset:664 +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s33 offset:668 +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:672 +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s33 offset:676 +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s33 offset:680 +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s33 offset:684 +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s33 offset:688 +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s33 offset:692 +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s33 offset:696 +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s33 offset:700 +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s33 offset:704 ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 
offset:708 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:712 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:716 @@ -3026,21 +3025,21 @@ ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_store_dword 
v51, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:68 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 @@ -3099,11 +3098,10 @@ ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v33, 1 -; GFX9-NEXT: v_readlane_b32 s30, v33, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:1536 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:1536 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_add_i32 s32, s32, 0xfffd8000 ; GFX9-NEXT: s_mov_b32 s33, s36 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -3571,190 +3571,190 @@ ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; 
GCN-HSA-NEXT: s_add_u32 s4, s2, 0x60 -; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x70 -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] -; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: s_add_u32 s6, s2, 32 +; GCN-HSA-NEXT: s_add_u32 s6, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s6 -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] -; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[28:29] +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s6 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[20:21] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 +; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: s_add_u32 s8, s2, 48 +; GCN-HSA-NEXT: s_addc_u32 s9, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 64 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 +; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17] +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v33, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v32, s4 +; GCN-HSA-NEXT: flat_load_dwordx4 v[24:27], v[24:25] ; GCN-HSA-NEXT: flat_load_dwordx4 v[32:35], v[32:33] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; 
GCN-HSA-NEXT: s_add_u32 s4, s0, 0xe0 ; GCN-HSA-NEXT: v_mov_b32_e32 v37, s1 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v36, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v20 -; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v21 -; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v20 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[24:27] -; GCN-HSA-NEXT: v_mov_b32_e32 v37, s5 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; GCN-HSA-NEXT: v_and_b32_e32 v30, 0xffff, v13 +; GCN-HSA-NEXT: v_and_b32_e32 v28, 0xffff, v12 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v36, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xc0 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xf0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[28:31] +; GCN-HSA-NEXT: v_mov_b32_e32 v37, s3 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v36, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xd0 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GCN-HSA-NEXT: v_and_b32_e32 v30, 0xffff, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v28, 0xffff, v14 ; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xa0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v26, 16, v23 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v24, 16, v22 -; GCN-HSA-NEXT: v_and_b32_e32 v25, 0xffff, v23 -; GCN-HSA-NEXT: v_and_b32_e32 v23, 0xffff, v22 ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[23:26] +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[28:31] ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; 
GCN-HSA-NEXT: v_lshrrev_b32_e32 v21, 16, v16 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; GCN-HSA-NEXT: v_and_b32_e32 v22, 0xffff, v17 -; GCN-HSA-NEXT: v_and_b32_e32 v20, 0xffff, v16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[20:23] -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v9 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 16, v19 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v18 -; GCN-HSA-NEXT: v_and_b32_e32 v21, 0xffff, v19 -; GCN-HSA-NEXT: v_and_b32_e32 v19, 0xffff, v18 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s9 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v8 +; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v9 +; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s4 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[19:22] +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v11 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xffff, v11 +; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v10 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[11:14] ; GCN-HSA-NEXT: s_waitcnt vmcnt(9) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v13 -; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[16:19] -; 
GCN-HSA-NEXT: v_mov_b32_e32 v26, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v15 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v14 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v14 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v6 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[15:18] +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s5 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[7:10] +; GCN-HSA-NEXT: s_waitcnt vmcnt(9) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s4 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s3 ; GCN-HSA-NEXT: s_waitcnt vmcnt(10) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v17 +; GCN-HSA-NEXT: 
v_lshrrev_b32_e32 v1, 16, v16 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v17 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v19 +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v10 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v11 -; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v10 -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[16:19] -; GCN-HSA-NEXT: s_waitcnt vmcnt(9) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v5 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xffff, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v4 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v19 +; GCN-HSA-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; GCN-HSA-NEXT: flat_store_dwordx4 v[5:6], v[1:4] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[11:14] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xffff, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: s_waitcnt vmcnt(9) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v26, 16, v33 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v24, 16, v32 -; GCN-HSA-NEXT: v_and_b32_e32 v25, 0xffff, v33 -; GCN-HSA-NEXT: v_and_b32_e32 v23, 0xffff, v32 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[23:26] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 16, v35 -; GCN-HSA-NEXT: 
v_lshrrev_b32_e32 v20, 16, v34 -; GCN-HSA-NEXT: v_and_b32_e32 v21, 0xffff, v35 -; GCN-HSA-NEXT: v_and_b32_e32 v19, 0xffff, v34 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v7 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v6 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[19:22] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[15:18] -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v29 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v28 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v29 -; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v28 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] +; GCN-HSA-NEXT: s_waitcnt vmcnt(10) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v33 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v32 +; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v33 +; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v31 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v30 -; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v31 -; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v30 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v25 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v25 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v13, 
s3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v27 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v26 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v27 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v26 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v35 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v34 +; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v35 +; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v34 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v21 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v20 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v21 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v20 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10] +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v23 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v22 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v23 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v64i16_to_v64i32: @@ -4237,120 +4237,120 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v64i16_to_v64i32: ; GCN-NOHSA-SI: ; 
%bb.0: -; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, 0xe8f000 -; GCN-NOHSA-SI-NEXT: s_add_u32 s8, s8, s3 -; GCN-NOHSA-SI-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s3 +; GCN-NOHSA-SI-NEXT: s_addc_u32 s13, s13, 0 ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s6 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s7 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, s2 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, s3 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[32:35], off, s[4:7], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[36:39], off, s[4:7], 0 offset:48 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v11 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v10 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v11, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v10, 0, 16 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte 
Folded Spill +; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:112 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(7) +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v35, 16, v3 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v33, 16, v2 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v34, v3, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v32, v2, 0, 16 +; GCN-NOHSA-SI-NEXT: buffer_store_dword v32, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 16, v9 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 16, v8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v9, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v8, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 16, v31 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 16, v30 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v31, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v30, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v43, 16, v29 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v41, 16, v28 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v42, v29, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v40, v28, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v31, 16, 
v35 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v29, 16, v34 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v30, v35, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v28, v34, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v47, 16, v33 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v45, 16, v32 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v46, v33, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v44, v32, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v35, 16, v39 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v33, 16, v38 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v34, v39, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v32, v38, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v51, 16, v37 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v49, 16, v36 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v50, v37, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v48, v36, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v39, 16, v27 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v37, 16, v26 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v38, v27, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v36, v26, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v55, 16, v25 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v53, 16, v24 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v54, v25, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v52, v24, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 16, v23 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v22 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v25, v23, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v22, 0, 16 +; GCN-NOHSA-SI-NEXT: buffer_store_dword v33, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v34, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v35, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v39, 16, v1 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v37, 16, v0 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v38, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v36, v0, 0, 16 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v35, 16, v7 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v33, 16, v6 +; 
GCN-NOHSA-SI-NEXT: v_bfe_i32 v34, v7, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v32, v6, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v43, 16, v5 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v41, 16, v4 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v42, v5, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v40, v4, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 16, v11 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 16, v10 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v11, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v10, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v47, 16, v9 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v45, 16, v8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v46, v9, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v44, v8, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 16, v15 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 16, v14 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v15, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v14, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v51, 16, v13 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v49, 16, v12 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v50, v13, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v48, v12, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 16, v19 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v13, 16, v18 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v14, v19, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v18, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v55, 16, v17 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v53, 16, v16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v54, v17, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v52, v16, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 16, v23 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 16, v22 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v23, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v22, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v59, 16, v21 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v57, 16, v20 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v58, v21, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v56, v20, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 16, v19 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v20, 16, v18 -; 
GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v19, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v19, v18, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v63, 16, v17 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v61, 16, v16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v62, v17, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v60, v16, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 16, v15 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 16, v14 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v15, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v14, 0, 16 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v13 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v12 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v13, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v12, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 16, v27 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v21, 16, v26 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v27, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v26, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v63, 16, v25 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v61, 16, v24 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v62, v25, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v60, v24, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 16, v31 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v30 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v26, v31, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v30, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v29 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v28 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v29, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v28, 0, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 
offset:208 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, 
s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_endpgm @@ -4361,189 +4361,192 @@ ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x70 -; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x60 -; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x50 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: s_add_u32 s8, s2, 64 -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: s_addc_u32 s9, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s2, 64 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s4 +; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s6, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s5 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s6 -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] -; GCN-HSA-NEXT: flat_load_dwordx4 v[24:27], v[24:25] +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; 
GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s4 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[20:21] +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 +; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[28:29] ; GCN-HSA-NEXT: flat_load_dwordx4 v[32:35], v[32:33] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xe0 ; GCN-HSA-NEXT: v_mov_b32_e32 v37, s1 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v36, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 16, v21 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 16, v20 -; GCN-HSA-NEXT: v_bfe_i32 v30, v21, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v28, v20, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 16, v13 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 16, v12 +; GCN-HSA-NEXT: v_bfe_i32 v26, v13, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v24, v12, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[24:27] +; GCN-HSA-NEXT: v_mov_b32_e32 v37, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v36, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[28:31] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 16, v15 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 16, v14 +; GCN-HSA-NEXT: v_bfe_i32 v26, v15, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v24, v14, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 16, v23 
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 16, v22 -; GCN-HSA-NEXT: v_bfe_i32 v30, v23, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v28, v22, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[28:31] -; GCN-HSA-NEXT: v_mov_b32_e32 v37, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[24:27] +; GCN-HSA-NEXT: s_waitcnt vmcnt(8) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v9 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 -; GCN-HSA-NEXT: v_mov_b32_e32 v36, s4 -; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v17 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v16 -; GCN-HSA-NEXT: v_bfe_i32 v22, v17, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v20, v16, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v8 +; GCN-HSA-NEXT: v_bfe_i32 v14, v9, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v12, v8, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[20:23] +; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[12:15] ; GCN-HSA-NEXT: v_mov_b32_e32 v37, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v36, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v19 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 16, v18 -; GCN-HSA-NEXT: v_bfe_i32 v21, v19, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v19, v18, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v11 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v10 +; GCN-HSA-NEXT: v_bfe_i32 v13, v11, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v11, v10, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[19:22] -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 +; GCN-HSA-NEXT: 
flat_store_dwordx4 v[24:25], v[11:14] +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_waitcnt vmcnt(9) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v13 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v12 -; GCN-HSA-NEXT: v_bfe_i32 v18, v13, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v16, v12, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v39, s3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v15 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v14 -; GCN-HSA-NEXT: v_bfe_i32 v22, v15, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v20, v14, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v38, s2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[20:23] -; GCN-HSA-NEXT: s_waitcnt vmcnt(10) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v9 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 16, v8 -; GCN-HSA-NEXT: v_bfe_i32 v17, v9, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v15, v8, 0, 16 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v11 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v10 -; GCN-HSA-NEXT: v_bfe_i32 v13, v11, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v11, v10, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[15:18] -; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[11:14] +; GCN-HSA-NEXT: s_waitcnt vmcnt(9) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v5 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v4 +; GCN-HSA-NEXT: v_bfe_i32 v10, v5, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v4, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v6 +; GCN-HSA-NEXT: v_bfe_i32 v14, v7, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v12, v6, 0, 16 ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 16, v0 -; GCN-HSA-NEXT: v_bfe_i32 v17, v1, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v15, v0, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v1 +; GCN-HSA-NEXT: 
v_ashrrev_i32_e32 v5, 16, v0 +; GCN-HSA-NEXT: v_bfe_i32 v6, v1, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[12:15] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v2 +; GCN-HSA-NEXT: v_bfe_i32 v10, v3, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] +; GCN-HSA-NEXT: s_waitcnt vmcnt(11) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v17 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v16 +; GCN-HSA-NEXT: v_bfe_i32 v2, v17, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v0, v16, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v6 -; GCN-HSA-NEXT: v_bfe_i32 v9, v7, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v5 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v4 -; GCN-HSA-NEXT: v_bfe_i32 v13, v5, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v11, v4, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 16, v2 -; GCN-HSA-NEXT: v_bfe_i32 v5, v3, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v3, v2, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v19 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v18 +; GCN-HSA-NEXT: v_bfe_i32 v2, v19, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v0, v18, 0, 16 ; GCN-HSA-NEXT: 
s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 16, v26 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-HSA-NEXT: v_bfe_i32 v15, v26, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v25 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 16, v24 -; GCN-HSA-NEXT: v_bfe_i32 v21, v25, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v19, v24, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v26, 16, v33 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v24, 16, v32 -; GCN-HSA-NEXT: v_bfe_i32 v25, v33, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v23, v32, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[23:26] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v35 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v34 -; GCN-HSA-NEXT: v_bfe_i32 v2, v35, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v34, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 +; GCN-HSA-NEXT: s_waitcnt vmcnt(12) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v23 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v22 +; GCN-HSA-NEXT: v_bfe_i32 v10, v23, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v22, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[19:22] +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; GCN-HSA-NEXT: s_waitcnt vmcnt(12) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v29 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v28 +; GCN-HSA-NEXT: v_bfe_i32 v14, v29, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v12, v28, 0, 16 ; GCN-HSA-NEXT: 
s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v27 -; GCN-HSA-NEXT: v_bfe_i32 v17, v27, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v31 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v30 +; GCN-HSA-NEXT: v_bfe_i32 v10, v31, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v30, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v21 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v20 +; GCN-HSA-NEXT: v_bfe_i32 v2, v21, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v0, v20, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_waitcnt vmcnt(14) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v33 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v32 +; GCN-HSA-NEXT: v_bfe_i32 v6, v33, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v4, v32, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10] +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v35 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v34 +; GCN-HSA-NEXT: v_bfe_i32 v2, v35, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v0, v34, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v64i16_to_v64i32: diff --git 
a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir --- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir @@ -84,13 +84,11 @@ %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 S_NOP 0 bb.2: - ; predcessors: %bb.1 S_NOP 0, implicit %0, implicit %1 S_NOP 0, implicit %2, implicit %3 @@ -191,14 +189,12 @@ %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 S_NOP 0, implicit %24 bb.2: - ; predcessors: %bb.1 S_NOP 0, implicit %23 S_NOP 0, implicit %0, implicit %1 @@ -300,7 +296,6 @@ %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 @@ -308,7 +303,6 @@ S_NOP 0, implicit %23 bb.2: - ; predcessors: %bb.1 S_NOP 0, implicit %0, implicit %1 S_NOP 0, implicit %2, implicit %3 @@ -408,7 +402,6 @@ %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 @@ -416,7 +409,6 @@ S_NOP 0, implicit %22, implicit %23 bb.2: - ; predcessors: %bb.1 S_NOP 0, implicit %0, implicit %1 S_NOP 0, implicit %2, implicit %3 @@ -529,7 +521,6 @@ %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 @@ -537,14 +528,12 @@ S_NOP 0, implicit %23 bb.2: - ; predcessors: %bb.1 successors: %bb.3 
%25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode S_NOP 0 bb.3: - ; predecessors: %bb.2 successors: %bb.4 %26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0 @@ -552,7 +541,6 @@ S_NOP 0, implicit %25 bb.4: - ; predcessors: %bb.3 S_NOP 0, implicit %0, implicit %1 S_NOP 0, implicit %2, implicit %3 @@ -666,7 +654,6 @@ %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 @@ -674,7 +661,6 @@ S_NOP 0, implicit %23, implicit %22 bb.2: - ; predcessors: %bb.1 successors: %bb.3 %25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode @@ -682,7 +668,6 @@ S_NOP 0 bb.3: - ; predecessors: %bb.2 successors: %bb.4 %27:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0 @@ -690,7 +675,6 @@ S_NOP 0, implicit %25, implicit %26 bb.4: - ; predcessors: %bb.3 S_NOP 0, implicit %0, implicit %1 S_NOP 0, implicit %2, implicit %3 @@ -949,14 +933,12 @@ undef %23.sub0:vreg_64 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 %23.sub1:vreg_64 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 S_NOP 0, implicit %23 bb.2: - ; predcessors: %bb.1 S_NOP 0, implicit %0, implicit %1 S_NOP 0, implicit %2, implicit %3 @@ -1053,7 +1035,6 @@ undef %21.sub0:vreg_128 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 %21.sub1:vreg_128 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 @@ -1062,7 +1043,6 @@ S_NOP 0, implicit %21 bb.2: - ; predcessors: %bb.1 S_NOP 0, implicit %0, implicit %1 S_NOP 0, implicit %2, implicit %3 @@ -1581,7 +1561,6 @@ %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode 
bb.1: - ; predecessors: %bb.0 successors: %bb.2 %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 @@ -1589,7 +1568,6 @@ S_NOP 0, implicit %24, implicit %25 bb.2: - ; predcessors: %bb.1 S_NOP 0, implicit %23 S_NOP 0, implicit %0, implicit %1 @@ -2528,14 +2506,12 @@ %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 S_NOP 0, implicit %24 bb.2: - ; predcessors: %bb.1 successors: %bb.3 S_NOP 0, implicit %23 @@ -2543,7 +2519,6 @@ S_NOP 0 bb.3: - ; predecessors: %bb.2 successors: %bb.4 %26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0 @@ -2551,7 +2526,6 @@ S_NOP 0, implicit %26, implicit %27 bb.4: - ; predcessors: %bb.3 S_NOP 0, implicit %25 S_NOP 0, implicit %0, implicit %1 @@ -2650,7 +2624,6 @@ %21.sub1:vreg_128 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 bb.1: - ; predecessors: %bb.0 successors: %bb.2 %21.sub2:vreg_128 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0 @@ -2658,7 +2631,6 @@ S_NOP 0, implicit %21 bb.2: - ; predcessors: %bb.1 S_NOP 0, implicit %0, implicit %1 S_NOP 0, implicit %2, implicit %3 @@ -2759,7 +2731,6 @@ %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 @@ -2767,7 +2738,6 @@ S_NOP 0, implicit %23 bb.2: - ; predcessors: %bb.1 S_NOP 0, implicit %23 S_NOP 0, implicit %0, implicit %1 @@ -3182,8 +3152,8 @@ ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit 
[[V_CVT_I32_F64_e32_31]] ; GFX908-NEXT: [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_31]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_32]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: @@ -3372,8 +3342,8 @@ ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[V_CVT_I32_F64_e32_35:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 36, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_35]] ; GFX908-NEXT: [[V_CVT_I32_F64_e32_36:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 35, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_35]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_36]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: @@ -3944,8 +3914,8 @@ ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[V_CVT_I32_F64_e32_63:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 64, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_63]] ; GFX908-NEXT: [[V_CVT_I32_F64_e32_64:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 63, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_63]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_64]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: @@ -4246,8 +4216,8 @@ ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[V_CVT_I32_F64_e32_83:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 84, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_83]] ; GFX908-NEXT: [[V_CVT_I32_F64_e32_84:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 83, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_83]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_84]] ; GFX908-NEXT: {{ 
$}} ; GFX908-NEXT: bb.3: @@ -5030,7 +5000,6 @@ %21.sub1:vreg_128 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 %21.sub2:vreg_128 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0 @@ -5038,7 +5007,6 @@ S_NOP 0, implicit %21 bb.2: - ; predcessors: %bb.1 S_NOP 0, implicit %0, implicit %1 S_NOP 0, implicit %2, implicit %3 @@ -5137,14 +5105,12 @@ %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 S_NOP 0, implicit %23, implicit %24 bb.2: - ; predcessors: %bb.1 S_NOP 0, implicit %0, implicit %1 S_NOP 0, implicit %2, implicit %3 @@ -5242,7 +5208,6 @@ %22:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0 @@ -5250,7 +5215,6 @@ S_NOP 0, implicit %22 bb.2: - ; predcessors: %bb.1 S_NOP 0, implicit %0, implicit %1 S_NOP 0, implicit %2, implicit %3 @@ -5348,7 +5312,6 @@ %22:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0 @@ -5357,7 +5320,6 @@ S_NOP 0, implicit %22 bb.2: - ; predcessors: %bb.1 S_NOP 0, implicit %0, implicit %1 S_NOP 0, implicit %2, implicit %3 @@ -5456,7 +5418,6 @@ %22:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0 @@ -5466,7 +5427,6 @@ S_NOP 0, implicit %22 bb.2: - ; predcessors: %bb.1 S_NOP 0, implicit %0, implicit %1 S_NOP 0, implicit %2, implicit %3 @@ -5562,14 +5522,12 @@ %22:vreg_64 = 
nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0 S_NOP 0, implicit %22, implicit %23 bb.2: - ; predcessors: %bb.1 S_NOP 0, implicit %0, implicit %1 S_NOP 0, implicit %2, implicit %3 @@ -5669,14 +5627,12 @@ undef %23.sub1:vreg_64_align2 = V_MOV_B32_e32 23, implicit $exec bb.1: - ; predecessors: %bb.0 successors: %bb.2 %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 S_NOP 0, implicit %24 bb.2: - ; predcessors: %bb.1 S_NOP 0, implicit %23.sub1 S_NOP 0, implicit %0, implicit %1 @@ -5779,14 +5735,12 @@ %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 S_NOP 0, implicit %24 bb.2: - ; predcessors: %bb.1 DBG_VALUE %23, 0, 0 S_NOP 0, implicit %23 @@ -5889,14 +5843,12 @@ %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 S_NOP 0, implicit %24 bb.2: - ; predcessors: %bb.1 S_NOP 0, implicit %23 S_NOP 0, implicit %0, implicit %1 diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll --- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll @@ -74,27 +74,19 @@ ; CHECK-NEXT: v_writelane_b32 v23, s5, 33 ; CHECK-NEXT: v_writelane_b32 v23, s6, 34 ; CHECK-NEXT: v_writelane_b32 v23, s7, 35 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[4:11] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s4, 36 -; CHECK-NEXT: v_writelane_b32 v23, s5, 37 -; CHECK-NEXT: v_writelane_b32 v23, 
s6, 38 -; CHECK-NEXT: v_writelane_b32 v23, s7, 39 -; CHECK-NEXT: v_writelane_b32 v23, s8, 40 -; CHECK-NEXT: v_writelane_b32 v23, s9, 41 -; CHECK-NEXT: v_writelane_b32 v23, s10, 42 -; CHECK-NEXT: v_writelane_b32 v23, s11, 43 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[44:51] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[16:31] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[52:53] +; CHECK-NEXT: ; def s[34:35] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[48:51] +; CHECK-NEXT: ; def s[52:55] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[36:43] @@ -102,62 +94,70 @@ ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 44 -; CHECK-NEXT: v_writelane_b32 v23, s1, 45 -; CHECK-NEXT: v_writelane_b32 v23, s2, 46 -; CHECK-NEXT: v_writelane_b32 v23, s3, 47 -; CHECK-NEXT: v_writelane_b32 v23, s4, 48 -; CHECK-NEXT: v_writelane_b32 v23, s5, 49 -; CHECK-NEXT: v_writelane_b32 v23, s6, 50 -; CHECK-NEXT: v_writelane_b32 v23, s7, 51 -; CHECK-NEXT: v_writelane_b32 v23, s8, 52 -; CHECK-NEXT: v_writelane_b32 v23, s9, 53 -; CHECK-NEXT: v_writelane_b32 v23, s10, 54 -; CHECK-NEXT: v_writelane_b32 v23, s11, 55 -; CHECK-NEXT: v_writelane_b32 v23, s12, 56 -; CHECK-NEXT: v_writelane_b32 v23, s13, 57 -; CHECK-NEXT: v_writelane_b32 v23, s14, 58 -; CHECK-NEXT: ; implicit-def: $vgpr0 -; CHECK-NEXT: v_writelane_b32 v23, s15, 59 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[34:35] -; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v23, s0, 36 +; CHECK-NEXT: v_writelane_b32 v23, s1, 37 +; CHECK-NEXT: v_writelane_b32 v23, s2, 38 +; CHECK-NEXT: v_writelane_b32 v23, s3, 39 +; CHECK-NEXT: v_writelane_b32 v23, s4, 40 +; CHECK-NEXT: v_writelane_b32 v23, s5, 41 +; CHECK-NEXT: v_writelane_b32 v23, s6, 42 +; CHECK-NEXT: v_writelane_b32 v23, s7, 43 +; CHECK-NEXT: 
v_writelane_b32 v23, s8, 44 +; CHECK-NEXT: v_writelane_b32 v23, s9, 45 +; CHECK-NEXT: v_writelane_b32 v23, s10, 46 +; CHECK-NEXT: v_writelane_b32 v23, s11, 47 +; CHECK-NEXT: v_writelane_b32 v23, s12, 48 +; CHECK-NEXT: v_writelane_b32 v23, s13, 49 +; CHECK-NEXT: v_writelane_b32 v23, s14, 50 +; CHECK-NEXT: v_writelane_b32 v23, s15, 51 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v23, s0, 52 +; CHECK-NEXT: v_writelane_b32 v23, s1, 53 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[44:47] +; CHECK-NEXT: ; def s[0:3] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v23, s0, 54 +; CHECK-NEXT: v_writelane_b32 v23, s1, 55 +; CHECK-NEXT: v_writelane_b32 v23, s2, 56 +; CHECK-NEXT: v_writelane_b32 v23, s3, 57 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 60 -; CHECK-NEXT: v_writelane_b32 v0, s4, 0 -; CHECK-NEXT: v_writelane_b32 v23, s1, 61 -; CHECK-NEXT: v_writelane_b32 v0, s5, 1 -; CHECK-NEXT: v_writelane_b32 v23, s2, 62 -; CHECK-NEXT: v_writelane_b32 v0, s6, 2 -; CHECK-NEXT: v_writelane_b32 v23, s3, 63 -; CHECK-NEXT: v_writelane_b32 v0, s7, 3 +; CHECK-NEXT: v_writelane_b32 v23, s0, 58 +; CHECK-NEXT: v_writelane_b32 v23, s1, 59 +; CHECK-NEXT: v_writelane_b32 v23, s2, 60 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: v_writelane_b32 v23, s3, 61 +; CHECK-NEXT: v_writelane_b32 v23, s4, 62 +; CHECK-NEXT: v_writelane_b32 v0, s6, 0 +; CHECK-NEXT: v_writelane_b32 v23, s5, 63 +; CHECK-NEXT: v_writelane_b32 v0, s7, 1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s0, 4 -; CHECK-NEXT: v_writelane_b32 v0, s1, 5 -; CHECK-NEXT: v_writelane_b32 v0, s2, 6 -; CHECK-NEXT: v_writelane_b32 v0, s3, 7 -; CHECK-NEXT: v_writelane_b32 v0, s4, 8 -; CHECK-NEXT: v_writelane_b32 v0, s5, 9 -; CHECK-NEXT: v_writelane_b32 v0, s6, 10 -; CHECK-NEXT: v_writelane_b32 v0, s7, 11 -; CHECK-NEXT: 
v_writelane_b32 v0, s8, 12 -; CHECK-NEXT: v_writelane_b32 v0, s9, 13 -; CHECK-NEXT: v_writelane_b32 v0, s10, 14 -; CHECK-NEXT: v_writelane_b32 v0, s11, 15 -; CHECK-NEXT: v_writelane_b32 v0, s12, 16 -; CHECK-NEXT: v_writelane_b32 v0, s13, 17 -; CHECK-NEXT: v_writelane_b32 v0, s14, 18 -; CHECK-NEXT: v_writelane_b32 v0, s15, 19 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[54:55] -; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v0, s0, 2 +; CHECK-NEXT: v_writelane_b32 v0, s1, 3 +; CHECK-NEXT: v_writelane_b32 v0, s2, 4 +; CHECK-NEXT: v_writelane_b32 v0, s3, 5 +; CHECK-NEXT: v_writelane_b32 v0, s4, 6 +; CHECK-NEXT: v_writelane_b32 v0, s5, 7 +; CHECK-NEXT: v_writelane_b32 v0, s6, 8 +; CHECK-NEXT: v_writelane_b32 v0, s7, 9 +; CHECK-NEXT: v_writelane_b32 v0, s8, 10 +; CHECK-NEXT: v_writelane_b32 v0, s9, 11 +; CHECK-NEXT: v_writelane_b32 v0, s10, 12 +; CHECK-NEXT: v_writelane_b32 v0, s11, 13 +; CHECK-NEXT: v_writelane_b32 v0, s12, 14 +; CHECK-NEXT: v_writelane_b32 v0, s13, 15 +; CHECK-NEXT: v_writelane_b32 v0, s14, 16 +; CHECK-NEXT: v_writelane_b32 v0, s15, 17 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v0, s0, 18 +; CHECK-NEXT: v_writelane_b32 v0, s1, 19 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:3] ; CHECK-NEXT: ;;#ASMEND @@ -257,90 +257,90 @@ ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s0, v23, 36 ; CHECK-NEXT: v_readlane_b32 s1, v23, 37 -; CHECK-NEXT: v_readlane_b32 s2, v23, 38 -; CHECK-NEXT: v_readlane_b32 s3, v23, 39 -; CHECK-NEXT: v_readlane_b32 s4, v23, 40 -; CHECK-NEXT: v_readlane_b32 s5, v23, 41 -; CHECK-NEXT: v_readlane_b32 s6, v23, 42 -; CHECK-NEXT: v_readlane_b32 s7, v23, 43 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:7] +; CHECK-NEXT: ; use s[44:51] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 44 -; CHECK-NEXT: v_readlane_b32 s1, v23, 45 -; CHECK-NEXT: v_readlane_b32 s2, v23, 46 -; CHECK-NEXT: v_readlane_b32 s3, v23, 47 -; CHECK-NEXT: 
v_readlane_b32 s4, v23, 48 -; CHECK-NEXT: v_readlane_b32 s5, v23, 49 -; CHECK-NEXT: v_readlane_b32 s6, v23, 50 -; CHECK-NEXT: v_readlane_b32 s7, v23, 51 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[16:31] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[52:53] +; CHECK-NEXT: ; use s[34:35] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[48:51] +; CHECK-NEXT: ; use s[52:55] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[36:43] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s8, v23, 52 -; CHECK-NEXT: v_readlane_b32 s9, v23, 53 -; CHECK-NEXT: v_readlane_b32 s10, v23, 54 -; CHECK-NEXT: v_readlane_b32 s11, v23, 55 -; CHECK-NEXT: v_readlane_b32 s12, v23, 56 -; CHECK-NEXT: v_readlane_b32 s13, v23, 57 -; CHECK-NEXT: v_readlane_b32 s14, v23, 58 -; CHECK-NEXT: v_readlane_b32 s15, v23, 59 +; CHECK-NEXT: v_readlane_b32 s2, v23, 38 +; CHECK-NEXT: v_readlane_b32 s3, v23, 39 +; CHECK-NEXT: v_readlane_b32 s4, v23, 40 +; CHECK-NEXT: v_readlane_b32 s5, v23, 41 +; CHECK-NEXT: v_readlane_b32 s6, v23, 42 +; CHECK-NEXT: v_readlane_b32 s7, v23, 43 +; CHECK-NEXT: v_readlane_b32 s8, v23, 44 +; CHECK-NEXT: v_readlane_b32 s9, v23, 45 +; CHECK-NEXT: v_readlane_b32 s10, v23, 46 +; CHECK-NEXT: v_readlane_b32 s11, v23, 47 +; CHECK-NEXT: v_readlane_b32 s12, v23, 48 +; CHECK-NEXT: v_readlane_b32 s13, v23, 49 +; CHECK-NEXT: v_readlane_b32 s14, v23, 50 +; CHECK-NEXT: v_readlane_b32 s15, v23, 51 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 60 -; CHECK-NEXT: v_readlane_b32 s1, v23, 61 -; CHECK-NEXT: v_readlane_b32 s2, v23, 62 -; CHECK-NEXT: v_readlane_b32 s3, v23, 63 -; CHECK-NEXT: v_readlane_b32 s4, v0, 0 -; CHECK-NEXT: v_readlane_b32 s5, v0, 1 -; CHECK-NEXT: v_readlane_b32 s6, v0, 2 -; CHECK-NEXT: v_readlane_b32 s7, v0, 3 +; CHECK-NEXT: v_readlane_b32 s0, v23, 52 +; CHECK-NEXT: v_readlane_b32 s1, v23, 53 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; 
use s[34:35] +; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v23, 54 +; CHECK-NEXT: v_readlane_b32 s1, v23, 55 +; CHECK-NEXT: v_readlane_b32 s2, v23, 56 +; CHECK-NEXT: v_readlane_b32 s3, v23, 57 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[44:47] +; CHECK-NEXT: ; use s[0:3] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v23, 58 +; CHECK-NEXT: v_readlane_b32 s1, v23, 59 +; CHECK-NEXT: v_readlane_b32 s2, v23, 60 +; CHECK-NEXT: v_readlane_b32 s3, v23, 61 +; CHECK-NEXT: v_readlane_b32 s4, v23, 62 +; CHECK-NEXT: v_readlane_b32 s5, v23, 63 +; CHECK-NEXT: v_readlane_b32 s6, v0, 0 +; CHECK-NEXT: v_readlane_b32 s7, v0, 1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 4 -; CHECK-NEXT: v_readlane_b32 s1, v0, 5 -; CHECK-NEXT: v_readlane_b32 s2, v0, 6 -; CHECK-NEXT: v_readlane_b32 s3, v0, 7 -; CHECK-NEXT: v_readlane_b32 s4, v0, 8 -; CHECK-NEXT: v_readlane_b32 s5, v0, 9 -; CHECK-NEXT: v_readlane_b32 s6, v0, 10 -; CHECK-NEXT: v_readlane_b32 s7, v0, 11 -; CHECK-NEXT: v_readlane_b32 s8, v0, 12 -; CHECK-NEXT: v_readlane_b32 s9, v0, 13 -; CHECK-NEXT: v_readlane_b32 s10, v0, 14 -; CHECK-NEXT: v_readlane_b32 s11, v0, 15 -; CHECK-NEXT: v_readlane_b32 s12, v0, 16 -; CHECK-NEXT: v_readlane_b32 s13, v0, 17 -; CHECK-NEXT: v_readlane_b32 s14, v0, 18 -; CHECK-NEXT: v_readlane_b32 s15, v0, 19 +; CHECK-NEXT: v_readlane_b32 s0, v0, 2 +; CHECK-NEXT: v_readlane_b32 s1, v0, 3 +; CHECK-NEXT: v_readlane_b32 s2, v0, 4 +; CHECK-NEXT: v_readlane_b32 s3, v0, 5 +; CHECK-NEXT: v_readlane_b32 s4, v0, 6 +; CHECK-NEXT: v_readlane_b32 s5, v0, 7 +; CHECK-NEXT: v_readlane_b32 s6, v0, 8 +; CHECK-NEXT: v_readlane_b32 s7, v0, 9 +; CHECK-NEXT: v_readlane_b32 s8, v0, 10 +; CHECK-NEXT: v_readlane_b32 s9, v0, 11 +; CHECK-NEXT: v_readlane_b32 s10, v0, 12 +; CHECK-NEXT: v_readlane_b32 s11, v0, 13 +; CHECK-NEXT: v_readlane_b32 s12, v0, 14 +; CHECK-NEXT: v_readlane_b32 s13, v0, 15 +; CHECK-NEXT: 
v_readlane_b32 s14, v0, 16 +; CHECK-NEXT: v_readlane_b32 s15, v0, 17 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v0, 18 +; CHECK-NEXT: v_readlane_b32 s1, v0, 19 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s0, v0, 20 ; CHECK-NEXT: v_readlane_b32 s1, v0, 21 ; CHECK-NEXT: v_readlane_b32 s2, v0, 22 ; CHECK-NEXT: v_readlane_b32 s3, v0, 23 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[54:55] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:3] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s0, v0, 24