diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -343,6 +343,12 @@ cl::desc("Enable rewrite partial reg uses pass"), cl::init(false), cl::Hidden); +static cl::opt EnableBalancedSchedStrategy( + "amdgpu-enable-balanced-scheduling-strategy", + cl::desc( + "Enable scheduling strategy to tradeoff between ILP and occupancy."), + cl::Hidden, cl::init(true)); + extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine X(getTheR600Target()); @@ -448,6 +454,20 @@ return DAG; } +static ScheduleDAGInstrs * +createGCNBalancedMachineScheduler(MachineSchedContext *C) { + const GCNSubtarget &ST = C->MF->getSubtarget(); + ScheduleDAGMILive *DAG = + new GCNScheduleDAGMILive(C, std::make_unique(C)); + DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); + if (ST.shouldClusterStores()) + DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); + DAG->addMutation(createIGroupLPDAGMutation()); + DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); + DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); + return DAG; +} + static ScheduleDAGInstrs * createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { const GCNSubtarget &ST = C->MF->getSubtarget(); @@ -1122,6 +1142,9 @@ if (EnableMaxIlpSchedStrategy) return createGCNMaxILPMachineScheduler(C); + if (EnableBalancedSchedStrategy) + return createGCNBalancedMachineScheduler(C); + return createGCNMaxOccupancyMachineScheduler(C); } diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -116,6 +116,12 @@ bool hasNextStage() const; GCNSchedStageID getNextStage() const; + + virtual bool computeScheduleMetric(unsigned RegionIdx, 
unsigned WavesAfter, + unsigned WavesBefore) { + return false; + } + virtual void clearMetric(){}; }; /// The goal of this scheduling strategy is to maximize kernel occupancy (i.e. @@ -136,33 +142,6 @@ GCNMaxILPSchedStrategy(const MachineSchedContext *C); }; -class ScheduleMetrics { - unsigned ScheduleLength; - unsigned BubbleCycles; - -public: - ScheduleMetrics() {} - ScheduleMetrics(unsigned L, unsigned BC) - : ScheduleLength(L), BubbleCycles(BC) {} - unsigned getLength() const { return ScheduleLength; } - unsigned getBubbles() const { return BubbleCycles; } - unsigned getMetric() const { - unsigned Metric = (BubbleCycles * ScaleFactor) / ScheduleLength; - // Metric is zero if the amount of bubbles is less than 1% which is too - // small. So, return 1. - return Metric ? Metric : 1; - } - static const unsigned ScaleFactor; -}; - -inline raw_ostream &operator<<(raw_ostream &OS, const ScheduleMetrics &Sm) { - dbgs() << "\n Schedule Metric (scaled by " - << ScheduleMetrics::ScaleFactor - << " ) is: " << Sm.getMetric() << " [ " << Sm.getBubbles() << "/" - << Sm.getLength() << " ]\n"; - return OS; -} - class GCNScheduleDAGMILive final : public ScheduleDAGMILive { friend class GCNSchedStage; friend class OccInitialScheduleStage; @@ -296,15 +275,9 @@ // Check result of scheduling. void checkScheduling(); - // computes the given schedule virtual execution time in clocks - ScheduleMetrics getScheduleMetrics(const std::vector &InputSchedule); - ScheduleMetrics getScheduleMetrics(const GCNScheduleDAGMILive &DAG); - unsigned computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle, - DenseMap &ReadyCycles, - const TargetSchedModel &SM); - // Returns true if scheduling should be reverted. - virtual bool shouldRevertScheduling(unsigned WavesAfter); + virtual bool shouldRevertScheduling(unsigned WavesAfter, + unsigned WavesBefore); // Returns true if current region has known excess pressure. 
bool isRegionWithExcessRP() const { @@ -324,7 +297,8 @@ class OccInitialScheduleStage : public GCNSchedStage { public: - bool shouldRevertScheduling(unsigned WavesAfter) override; + bool shouldRevertScheduling(unsigned WavesAfter, + unsigned WavesBefore) override; OccInitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) : GCNSchedStage(StageID, DAG) {} @@ -342,7 +316,8 @@ bool initGCNRegion() override; - bool shouldRevertScheduling(unsigned WavesAfter) override; + bool shouldRevertScheduling(unsigned WavesAfter, + unsigned WavesBefore) override; UnclusteredHighRPStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) : GCNSchedStage(StageID, DAG) {} @@ -357,7 +332,8 @@ bool initGCNRegion() override; - bool shouldRevertScheduling(unsigned WavesAfter) override; + bool shouldRevertScheduling(unsigned WavesAfter, + unsigned WavesBefore) override; ClusteredLowOccStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) : GCNSchedStage(StageID, DAG) {} @@ -393,7 +369,8 @@ bool initGCNRegion() override; - bool shouldRevertScheduling(unsigned WavesAfter) override; + bool shouldRevertScheduling(unsigned WavesAfter, + unsigned WavesBefore = 0) override; PreRARematStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) : GCNSchedStage(StageID, DAG) {} @@ -401,7 +378,8 @@ class ILPInitialScheduleStage : public GCNSchedStage { public: - bool shouldRevertScheduling(unsigned WavesAfter) override; + bool shouldRevertScheduling(unsigned WavesAfter, + unsigned WavesBefore) override; ILPInitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) : GCNSchedStage(StageID, DAG) {} @@ -423,6 +401,87 @@ bool RemoveKillFlags); }; +#ifndef NDEBUG +struct EarlierIssuingCycle { + bool operator()(std::pair A, + std::pair B) const { + return A.second < B.second; + } +}; +#endif + +/// The goal of this scheduling strategy is to find a reasonable balance between +/// the kernel occupancy (i.e. maximum number of waves per simd) and ILP (i.e. 
+/// minimize the amount of stall cycles by means of the better latency +/// covering). +class GCNBalancedSchedStrategy final : public GCNSchedStrategy { + + const unsigned ScaleFactor = 100; + unsigned StallTotal = 0; + unsigned CurrCycle = 0; + DenseMap ReadyCycles; + DenseMap> Metrics; + const TargetSchedModel *SM; + unsigned computeSUnitReadyCycle(const SUnit &SU); + + void clearMetric() override { + StallTotal = 0; + CurrCycle = 0; + ReadyCycles.clear(); +#ifndef NDEBUG + PrintableSchedule.clear(); +#endif + } + +#ifndef NDEBUG + std::set, EarlierIssuingCycle> PrintableSchedule; + + void printSchedule() { + if (PrintableSchedule.empty()) + return; + + unsigned BBNum = PrintableSchedule.begin()->first->getParent()->getNumber(); + dbgs() << "\n################## Schedule time ReadyCycles for MBB : " + << BBNum + << " ##################\n# Cycle #\t\t\tInstruction " + " " + " \n"; + unsigned IPrev = 1; + for (auto &I : PrintableSchedule) { + if (I.second > IPrev + 1) + dbgs() << "****************************** BUBBLE OF " + << I.second - IPrev + << " CYCLES DETECTED ******************************\n\n"; + dbgs() << "[ " << I.second << " ] : " << *I.first << "\n"; + IPrev = I.second; + } + dbgs() << "\n\t" + << "Metric: " + << (StallTotal + ? 
(StallTotal * ScaleFactor) / CurrCycle + : 1) + << "\n\n"; + } +#endif + +public: + GCNBalancedSchedStrategy(const MachineSchedContext *C) : GCNSchedStrategy(C) { + SchedStages.push_back(GCNSchedStageID::OccInitialSchedule); + SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule); + SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule); + SchedStages.push_back(GCNSchedStageID::PreRARematerialize); + } + + void initialize(ScheduleDAGMI *DAG) override { + GCNSchedStrategy::initialize(DAG); + const GCNSubtarget &ST = MF->getSubtarget(); + SM = &ST.getInstrInfo()->getSchedModel(); + } + + bool computeScheduleMetric(unsigned RegionIdx, unsigned WavesAfter, + unsigned WavesBefore) override; +}; + } // End namespace llvm #endif // LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -52,8 +52,6 @@ "Wave Limited (amdgpu-limit-wave-threshold)."), cl::init(false)); -const unsigned ScheduleMetrics::ScaleFactor = 100; - GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C) : GenericScheduler(C), TargetOccupancy(0), MF(nullptr), HasHighPressure(false) {} @@ -703,7 +701,7 @@ if (!GCNSchedStage::initGCNSchedStage()) return false; - if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none()) + if (DAG.RegionsWithExcessRP.none()) return false; SavedMutations.swap(DAG.Mutations); @@ -839,6 +837,7 @@ S.HasHighPressure = false; S.KnownExcessRP = isRegionWithExcessRP(); + S.clearMetric(); if (DAG.RegionsWithIGLPInstrs[RegionIdx] && StageID != GCNSchedStageID::UnclusteredHighRPReschedule) { @@ -970,9 +969,7 @@ DAG.RegionsWithExcessRP[RegionIdx] = true; } - // Revert if this region's schedule would cause a drop in occupancy or - // spilling. 
- if (shouldRevertScheduling(WavesAfter)) { + if (shouldRevertScheduling(WavesAfter, WavesBefore)) { revertScheduling(); } else { DAG.Pressure[RegionIdx] = PressureAfter; @@ -981,193 +978,52 @@ } } -unsigned -GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle, - DenseMap &ReadyCycles, - const TargetSchedModel &SM) { - unsigned ReadyCycle = CurrCycle; - for (auto &D : SU.Preds) { - if (D.isAssignedRegDep()) { - MachineInstr *DefMI = D.getSUnit()->getInstr(); - unsigned Latency = SM.computeInstrLatency(DefMI); - unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum]; - ReadyCycle = std::max(ReadyCycle, DefReady + Latency); - } - } - ReadyCycles[SU.NodeNum] = ReadyCycle; - return ReadyCycle; -} - -#ifndef NDEBUG -struct EarlierIssuingCycle { - bool operator()(std::pair A, - std::pair B) const { - return A.second < B.second; - } -}; - -static void printScheduleModel(std::set, - EarlierIssuingCycle> &ReadyCycles) { - if (ReadyCycles.empty()) - return; - unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber(); - dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum - << " ##################\n# Cycle #\t\t\tInstruction " - " " - " \n"; - unsigned IPrev = 1; - for (auto &I : ReadyCycles) { - if (I.second > IPrev + 1) - dbgs() << "****************************** BUBBLE OF " << I.second - IPrev - << " CYCLES DETECTED ******************************\n\n"; - dbgs() << "[ " << I.second << " ] : " << *I.first << "\n"; - IPrev = I.second; - } -} -#endif - -ScheduleMetrics -GCNSchedStage::getScheduleMetrics(const std::vector &InputSchedule) { -#ifndef NDEBUG - std::set, EarlierIssuingCycle> - ReadyCyclesSorted; -#endif - const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel(); - unsigned SumBubbles = 0; - DenseMap ReadyCycles; - unsigned CurrCycle = 0; - for (auto &SU : InputSchedule) { - unsigned ReadyCycle = - computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM); - SumBubbles += ReadyCycle - CurrCycle; 
-#ifndef NDEBUG - ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle)); -#endif - CurrCycle = ++ReadyCycle; - } -#ifndef NDEBUG - LLVM_DEBUG( - printScheduleModel(ReadyCyclesSorted); - dbgs() << "\n\t" - << "Metric: " - << (SumBubbles - ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle - : 1) - << "\n\n"); -#endif - - return ScheduleMetrics(CurrCycle, SumBubbles); -} - -ScheduleMetrics -GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) { -#ifndef NDEBUG - std::set, EarlierIssuingCycle> - ReadyCyclesSorted; -#endif - const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel(); - unsigned SumBubbles = 0; - DenseMap ReadyCycles; - unsigned CurrCycle = 0; - for (auto &MI : DAG) { - SUnit *SU = DAG.getSUnit(&MI); - if (!SU) - continue; - unsigned ReadyCycle = - computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM); - SumBubbles += ReadyCycle - CurrCycle; -#ifndef NDEBUG - ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle)); -#endif - CurrCycle = ++ReadyCycle; - } -#ifndef NDEBUG - LLVM_DEBUG( - printScheduleModel(ReadyCyclesSorted); - dbgs() << "\n\t" - << "Metric: " - << (SumBubbles - ? 
(SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle - : 1) - << "\n\n"); -#endif - - return ScheduleMetrics(CurrCycle, SumBubbles); -} - -bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) { +bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter, + unsigned WavesBefore) { if (WavesAfter < DAG.MinOccupancy) return true; return false; } -bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { +bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter, + unsigned WavesBefore) { if (PressureAfter == PressureBefore) return false; - if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) - return true; - if (mayCauseSpilling(WavesAfter)) return true; - return false; + return S.computeScheduleMetric(RegionIdx, WavesAfter, WavesBefore); } -bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) { - // If RP is not reduced in the unclustred reschedule stage, revert to the - // old schedule. - if ((WavesAfter <= PressureBefore.getOccupancy(ST) && - mayCauseSpilling(WavesAfter)) || - GCNSchedStage::shouldRevertScheduling(WavesAfter)) { - LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n"); - return true; - } - - // Do not attempt to relax schedule even more if we are already spilling. 
- if (isRegionWithExcessRP()) - return false; - - LLVM_DEBUG( - dbgs() - << "\n\t *** In shouldRevertScheduling ***\n" - << " *********** BEFORE UnclusteredHighRPStage ***********\n"); - ScheduleMetrics MBefore = - getScheduleMetrics(DAG.SUnits); - LLVM_DEBUG( - dbgs() - << "\n *********** AFTER UnclusteredHighRPStage ***********\n"); - ScheduleMetrics MAfter = getScheduleMetrics(DAG); - unsigned OldMetric = MBefore.getMetric(); - unsigned NewMetric = MAfter.getMetric(); - unsigned WavesBefore = - std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST)); - unsigned Profit = - ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore * - ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) / - NewMetric) / - ScheduleMetrics::ScaleFactor; - LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after " - << MAfter << "Profit: " << Profit << "\n"); - return Profit < ScheduleMetrics::ScaleFactor; +bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter, + unsigned WavesBefore) { + // Revert if may cause spilling. Otherwise rely on the metric computed by the + // strategy class. Exception: does not make sense to revert the unclustered + // schedule if we are still in excess RP state as it will not become better. 
+ return GCNSchedStage::shouldRevertScheduling(WavesAfter, WavesBefore) || + (S.computeScheduleMetric(RegionIdx, WavesAfter, WavesBefore) && + !isRegionWithExcessRP()); } -bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) { +bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter, + unsigned WavesBefore) { if (PressureAfter == PressureBefore) return false; - if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) + if (GCNSchedStage::shouldRevertScheduling(WavesAfter, WavesBefore)) return true; if (mayCauseSpilling(WavesAfter)) return true; - return false; + return S.computeScheduleMetric(RegionIdx, WavesAfter, WavesBefore); } -bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) { - if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) +bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter, + unsigned WavesBefore) { + if (GCNSchedStage::shouldRevertScheduling(WavesAfter, WavesBefore)) return true; if (mayCauseSpilling(WavesAfter)) @@ -1176,7 +1032,8 @@ return false; } -bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { +bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter, + unsigned WavesBefore) { if (mayCauseSpilling(WavesAfter)) return true; @@ -1570,3 +1427,52 @@ ScheduleDAGMI::finalizeSchedule(); } + +unsigned GCNBalancedSchedStrategy::computeSUnitReadyCycle(const SUnit &SU) { + unsigned ReadyCycle = CurrCycle; + for (auto &D : SU.Preds) { + if (D.isAssignedRegDep()) { + MachineInstr *DefMI = D.getSUnit()->getInstr(); + unsigned Latency = SM->computeInstrLatency(DefMI); + unsigned DefReady = ReadyCycles[DAG->getSUnit(DefMI)->NodeNum]; + ReadyCycle = std::max(ReadyCycle, DefReady + Latency); + } + } + ReadyCycles[SU.NodeNum] = ReadyCycle; + return ReadyCycle; +} + +bool llvm::GCNBalancedSchedStrategy::computeScheduleMetric( + unsigned RegionIdx, unsigned WavesAfter, unsigned WavesBefore) { + bool Result = false; + unsigned PrevMetric = 0; + if 
(Metrics.count(RegionIdx)) { + PrevMetric = Metrics[RegionIdx].back(); + } + for (auto &MI : *DAG) { + SUnit *SU = DAG->getSUnit(&MI); + if (!SU) + continue; + unsigned ReadyCycle = computeSUnitReadyCycle(*SU); + StallTotal += ReadyCycle - CurrCycle; +#ifndef NDEBUG + PrintableSchedule.insert(std::make_pair(SU->getInstr(), ReadyCycle)); +#endif + CurrCycle = ++ReadyCycle; + } + unsigned Metric = StallTotal ? std::max(1u, StallTotal * ScaleFactor / CurrCycle) : 1; /* A stall ratio below 1% truncates to 0; clamp to 1 so Metric is a safe divisor and a stored value is never mistaken for "no previous metric". */ +#ifndef NDEBUG + LLVM_DEBUG(printSchedule()); +#endif + if (PrevMetric) { + unsigned Profit = + ((WavesAfter * ScaleFactor) / WavesBefore * + ((PrevMetric + ScheduleMetricBias) * ScaleFactor) / Metric) / + ScaleFactor; + Result = Profit < ScaleFactor; + } + if (!Result) + Metrics[RegionIdx].push_back(Metric); + clearMetric(); + return Result; +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll @@ -699,24 +699,24 @@ ; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; GFX9-CONTRACT-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX9-CONTRACT-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; GFX9-CONTRACT-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 +; GFX9-CONTRACT-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 +; GFX9-CONTRACT-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 +; GFX9-CONTRACT-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:28 +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(5) ; GFX9-CONTRACT-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32] -; GFX9-CONTRACT-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12 -; GFX9-CONTRACT-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:16 +; GFX9-CONTRACT-NEXT: 
buffer_load_dword v31, off, s[0:3], s32 +; GFX9-CONTRACT-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(5) +; GFX9-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[33:34] +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(3) +; GFX9-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[35:36] ; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17] -; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX9-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[24:25] -; GFX9-CONTRACT-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20 -; GFX9-CONTRACT-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:24 ; GFX9-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19] -; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX9-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[24:25] -; GFX9-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-CONTRACT-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 -; GFX9-CONTRACT-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:32 ; GFX9-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] ; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX9-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[24:25] +; GFX9-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[37:38] ; GFX9-CONTRACT-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23] ; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; @@ -725,24 +725,24 @@ ; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; GFX9-DENORM-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; GFX9-DENORM-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 +; GFX9-DENORM-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 +; GFX9-DENORM-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 +; GFX9-DENORM-NEXT: 
buffer_load_dword v37, off, s[0:3], s32 offset:28 +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(5) ; GFX9-DENORM-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32] -; GFX9-DENORM-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12 -; GFX9-DENORM-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:16 +; GFX9-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-DENORM-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(5) +; GFX9-DENORM-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[33:34] +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(3) +; GFX9-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[35:36] ; GFX9-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17] -; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX9-DENORM-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[24:25] -; GFX9-DENORM-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20 -; GFX9-DENORM-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:24 ; GFX9-DENORM-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19] -; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX9-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[24:25] -; GFX9-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-DENORM-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 -; GFX9-DENORM-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:32 ; GFX9-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] ; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX9-DENORM-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[24:25] +; GFX9-DENORM-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[37:38] ; GFX9-DENORM-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23] ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; @@ -870,24 +870,24 @@ ; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; GFX9-CONTRACT-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX9-CONTRACT-NEXT: 
buffer_load_dword v33, off, s[0:3], s32 offset:12 +; GFX9-CONTRACT-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 +; GFX9-CONTRACT-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 +; GFX9-CONTRACT-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 +; GFX9-CONTRACT-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:28 +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(5) ; GFX9-CONTRACT-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32] -; GFX9-CONTRACT-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12 -; GFX9-CONTRACT-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:16 +; GFX9-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-CONTRACT-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(5) +; GFX9-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[33:34] +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(3) +; GFX9-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[35:36] ; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17] -; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX9-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[24:25] -; GFX9-CONTRACT-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20 -; GFX9-CONTRACT-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:24 ; GFX9-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19] -; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX9-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[24:25] -; GFX9-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-CONTRACT-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 -; GFX9-CONTRACT-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:32 ; GFX9-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] ; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX9-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[24:25] +; GFX9-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[37:38] ; GFX9-CONTRACT-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23] 
; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; @@ -896,24 +896,24 @@ ; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; GFX9-DENORM-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; GFX9-DENORM-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 +; GFX9-DENORM-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 +; GFX9-DENORM-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 +; GFX9-DENORM-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:28 +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(5) ; GFX9-DENORM-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32] -; GFX9-DENORM-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12 -; GFX9-DENORM-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:16 +; GFX9-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-DENORM-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(5) +; GFX9-DENORM-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[33:34] +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(3) +; GFX9-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[35:36] ; GFX9-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17] -; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX9-DENORM-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[24:25] -; GFX9-DENORM-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20 -; GFX9-DENORM-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:24 ; GFX9-DENORM-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19] -; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX9-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[24:25] -; GFX9-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-DENORM-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 -; GFX9-DENORM-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:32 ; GFX9-DENORM-NEXT: v_fma_f64 v[4:5], 
v[4:5], v[12:13], v[20:21] ; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX9-DENORM-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[24:25] +; GFX9-DENORM-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[37:38] ; GFX9-DENORM-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23] ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -365,256 +365,256 @@ ; GISEL-LABEL: v_udiv_v2i64: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v5 +; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v5 ; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v7 -; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v6 -; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v5, vcc -; GISEL-NEXT: v_subb_u32_e64 v15, vcc, 0, v7, s[4:5] -; GISEL-NEXT: v_mac_f32_e32 v9, 0x4f800000, v10 -; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v12 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v9, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v11 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x5f7ffffc, v9 +; GISEL-NEXT: v_subb_u32_e32 v9, vcc, 0, v5, vcc +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v7 +; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 0, v6 +; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v7, vcc +; GISEL-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 +; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v13 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v10 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v11, v12 ; GISEL-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v9 +; GISEL-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11 ; GISEL-NEXT: v_mul_f32_e32 v12, 0x2f800000, v10 -; GISEL-NEXT: v_trunc_f32_e32 v11, v11 +; GISEL-NEXT: 
v_mul_f32_e32 v13, 0x2f800000, v11 ; GISEL-NEXT: v_trunc_f32_e32 v12, v12 -; GISEL-NEXT: v_mac_f32_e32 v9, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GISEL-NEXT: v_trunc_f32_e32 v13, v13 ; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v12 ; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_mul_lo_u32 v16, v8, v11 +; GISEL-NEXT: v_mac_f32_e32 v11, 0xcf800000, v13 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 ; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GISEL-NEXT: v_mul_lo_u32 v17, v13, v12 -; GISEL-NEXT: v_mul_lo_u32 v18, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v19, v15, v10 -; GISEL-NEXT: v_mul_hi_u32 v20, v13, v10 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 +; GISEL-NEXT: v_mul_lo_u32 v16, v8, v12 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GISEL-NEXT: v_mul_lo_u32 v17, v14, v13 +; GISEL-NEXT: v_mul_lo_u32 v18, v8, v10 +; GISEL-NEXT: v_mul_lo_u32 v19, v9, v10 +; GISEL-NEXT: v_mul_hi_u32 v20, v8, v10 +; GISEL-NEXT: v_mul_lo_u32 v21, v14, v11 +; GISEL-NEXT: v_mul_lo_u32 v22, v15, v11 +; GISEL-NEXT: v_mul_hi_u32 v23, v14, v11 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v16 ; GISEL-NEXT: v_mul_lo_u32 v19, v12, v18 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v10, v17 -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, v18 -; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20 -; GISEL-NEXT: v_mul_lo_u32 v19, v8, v9 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v20, v16 -; GISEL-NEXT: v_mul_hi_u32 v20, v8, v9 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v19 -; GISEL-NEXT: v_mul_lo_u32 v21, v9, v16 -; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v21, v9, v19 -; GISEL-NEXT: v_add_i32_e64 v20, s[8:9], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v19, v11, v19 +; GISEL-NEXT: v_mul_hi_u32 v24, v10, v18 ; GISEL-NEXT: v_mul_hi_u32 v18, v12, v18 -; 
GISEL-NEXT: v_mul_lo_u32 v20, v11, v16 -; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v12, v17 -; GISEL-NEXT: v_add_i32_e64 v18, s[12:13], v20, v18 -; GISEL-NEXT: v_mul_hi_u32 v20, v9, v16 -; GISEL-NEXT: v_add_i32_e64 v19, s[14:15], v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, v17 -; GISEL-NEXT: v_add_i32_e64 v18, s[16:17], v18, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[10:11] -; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[14:15] -; GISEL-NEXT: v_add_i32_e64 v21, s[6:7], v21, v22 -; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v22, v17 +; GISEL-NEXT: v_mul_lo_u32 v22, v13, v21 +; GISEL-NEXT: v_mul_hi_u32 v25, v11, v21 +; GISEL-NEXT: v_mul_hi_u32 v21, v13, v21 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v23 +; GISEL-NEXT: v_mul_lo_u32 v20, v10, v16 +; GISEL-NEXT: v_mul_lo_u32 v23, v12, v16 +; GISEL-NEXT: v_mul_hi_u32 v26, v10, v16 +; GISEL-NEXT: v_mul_hi_u32 v16, v12, v16 +; GISEL-NEXT: v_mul_lo_u32 v27, v11, v17 +; GISEL-NEXT: v_add_i32_e32 v22, vcc, v22, v27 +; GISEL-NEXT: v_cndmask_b32_e64 v27, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v22, vcc, v22, v25 +; GISEL-NEXT: v_mul_lo_u32 v22, v13, v17 +; GISEL-NEXT: v_mul_hi_u32 v25, v11, v17 +; GISEL-NEXT: v_mul_hi_u32 v17, v13, v17 +; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v23, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v22, vcc, v22, v23 -; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[16:17] -; GISEL-NEXT: v_add_i32_e32 v23, vcc, v23, v24 -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v22 -; 
GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v19 -; GISEL-NEXT: v_mul_hi_u32 v16, v11, v16 -; GISEL-NEXT: v_mul_hi_u32 v17, v12, v17 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v21, v18 +; GISEL-NEXT: v_add_i32_e64 v21, s[4:5], v22, v21 +; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v24 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v23, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v8, v9 -; GISEL-NEXT: v_mul_lo_u32 v14, v14, v9 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v8, v9 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10 -; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v11, v16, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v16, v13, v10 -; GISEL-NEXT: v_addc_u32_e64 v12, vcc, v12, v17, s[8:9] -; GISEL-NEXT: v_mul_hi_u32 v17, v9, v20 -; GISEL-NEXT: v_mul_lo_u32 v8, v8, v11 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 -; GISEL-NEXT: v_mul_hi_u32 v14, v10, v19 -; GISEL-NEXT: v_mul_lo_u32 v13, v13, v12 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_mul_lo_u32 v15, v11, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v11, v20 +; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v26 +; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v26, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v21, vcc, v21, v25 +; GISEL-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v20, v19 +; GISEL-NEXT: v_add_i32_e32 v20, vcc, v23, v24 +; GISEL-NEXT: v_add_i32_e32 v23, vcc, v27, v26 +; GISEL-NEXT: v_add_i32_e32 v22, vcc, v22, v25 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v18, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v21, vcc, v21, v23 +; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v19, vcc, 
v20, v19 +; GISEL-NEXT: v_add_i32_e32 v20, vcc, v22, v23 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v19 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v18 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v16, vcc +; GISEL-NEXT: v_mul_lo_u32 v16, v8, v10 +; GISEL-NEXT: v_mul_lo_u32 v9, v9, v10 +; GISEL-NEXT: v_mul_hi_u32 v18, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v21 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v17, vcc +; GISEL-NEXT: v_mul_lo_u32 v17, v14, v11 +; GISEL-NEXT: v_mul_lo_u32 v15, v15, v11 +; GISEL-NEXT: v_mul_hi_u32 v19, v14, v11 +; GISEL-NEXT: v_mul_lo_u32 v8, v8, v12 +; GISEL-NEXT: v_mul_lo_u32 v20, v12, v16 +; GISEL-NEXT: v_mul_hi_u32 v21, v10, v16 +; GISEL-NEXT: v_mul_hi_u32 v16, v12, v16 +; GISEL-NEXT: v_mul_lo_u32 v14, v14, v13 +; GISEL-NEXT: v_mul_lo_u32 v22, v13, v17 +; GISEL-NEXT: v_mul_hi_u32 v23, v11, v17 +; GISEL-NEXT: v_mul_hi_u32 v17, v13, v17 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v15, v14 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v12, v19 -; GISEL-NEXT: v_mul_hi_u32 v19, v12, v19 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v11, v8 -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v17 -; GISEL-NEXT: v_mul_hi_u32 v15, v9, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v11, v8 -; GISEL-NEXT: v_mul_lo_u32 v17, v10, v13 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v12, v13 -; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v10, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v12, v13 -; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v19 +; GISEL-NEXT: v_mul_lo_u32 v14, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v15, v12, v8 +; GISEL-NEXT: v_mul_hi_u32 v18, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v12, v8 
+; GISEL-NEXT: v_mul_lo_u32 v19, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v24, v13, v9 +; GISEL-NEXT: v_mul_hi_u32 v25, v11, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v20, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[10:11] -; GISEL-NEXT: v_add_i32_e64 v18, s[8:9], v19, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v20, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 -; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v22, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v24, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v21 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v17 -; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], v14, v18 -; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v19 -; GISEL-NEXT: v_mul_lo_u32 v16, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v17, v0, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 -; GISEL-NEXT: v_mul_lo_u32 v18, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v19, v2, v10 -; 
GISEL-NEXT: v_mul_hi_u32 v10, v3, v10 -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v14 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v15 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v11, v8, vcc -; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v12, v13, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v12, v0, v8 -; GISEL-NEXT: v_mul_lo_u32 v13, v1, v8 -; GISEL-NEXT: v_mul_hi_u32 v14, v0, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 -; GISEL-NEXT: v_mul_lo_u32 v15, v2, v11 -; GISEL-NEXT: v_mul_lo_u32 v20, v3, v11 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v2, v11 +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v23 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v25 +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v20, v14 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v22, v19 +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v24, v21 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v18 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v15 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v16 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v12, v8, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v1, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v0, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v1, v10 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v17 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v13, v9, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v3, v11 +; GISEL-NEXT: v_mul_hi_u32 v15, v2, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v9 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v18, v15 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v20, v10 +; GISEL-NEXT: v_mul_lo_u32 v16, v0, v8 +; GISEL-NEXT: 
v_mul_lo_u32 v17, v1, v8 +; GISEL-NEXT: v_mul_hi_u32 v18, v0, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 +; GISEL-NEXT: v_mul_lo_u32 v19, v2, v9 +; GISEL-NEXT: v_mul_lo_u32 v20, v3, v9 +; GISEL-NEXT: v_mul_hi_u32 v21, v2, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v3, v9 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v17, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v20, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v21 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v19 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v16 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v19, v13 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v15 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; GISEL-NEXT: v_mul_lo_u32 v14, 
v4, v10 +; GISEL-NEXT: v_mul_lo_u32 v16, v5, v10 +; GISEL-NEXT: v_mul_hi_u32 v17, v4, v10 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v16 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 +; GISEL-NEXT: v_mul_lo_u32 v15, v6, v11 +; GISEL-NEXT: v_mul_lo_u32 v18, v7, v11 +; GISEL-NEXT: v_mul_hi_u32 v19, v6, v11 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v16, v4, v9 -; GISEL-NEXT: v_mul_lo_u32 v17, v5, v9 -; GISEL-NEXT: v_mul_hi_u32 v18, v4, v9 -; GISEL-NEXT: v_mul_lo_u32 v19, v6, v10 -; GISEL-NEXT: v_mul_lo_u32 v20, v7, v10 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v6, v10 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v9 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v16 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], 1, v10 -; GISEL-NEXT: v_sub_i32_e64 v2, s[8:9], v2, v19 -; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], 1, v15 -; GISEL-NEXT: v_add_i32_e64 v8, s[12:13], v8, v13 -; GISEL-NEXT: v_add_i32_e64 v13, s[12:13], 1, v16 -; GISEL-NEXT: v_add_i32_e64 v11, s[14:15], v11, v12 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[14:15], v0, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[16:17], v2, v6 -; GISEL-NEXT: v_sub_i32_e64 v0, s[18:19], v0, v4 -; GISEL-NEXT: v_sub_i32_e64 v2, s[20:21], v2, v6 ; GISEL-NEXT: v_mul_lo_u32 v12, v4, v8 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[22:23], v0, v4 -; GISEL-NEXT: v_addc_u32_e32 v0, vcc, 0, v8, vcc -; GISEL-NEXT: v_mul_lo_u32 v4, v6, v11 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; GISEL-NEXT: v_addc_u32_e64 v2, s[6:7], 0, v11, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[14:15] -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v17, v12 -; GISEL-NEXT: v_cndmask_b32_e64 
v17, 0, -1, s[16:17] -; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v20, v4 -; GISEL-NEXT: v_addc_u32_e64 v20, s[6:7], 0, v0, s[10:11] -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v12, v18 -; GISEL-NEXT: v_addc_u32_e64 v18, s[6:7], 0, v2, s[12:13] -; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v4, v14 -; GISEL-NEXT: v_subb_u32_e64 v14, s[6:7], v1, v12, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v12 -; GISEL-NEXT: v_subb_u32_e64 v12, s[6:7], v3, v4, s[8:9] -; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[22:23] +; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v10 +; GISEL-NEXT: v_addc_u32_e32 v20, vcc, 0, v8, vcc +; GISEL-NEXT: v_mul_lo_u32 v21, v6, v9 +; GISEL-NEXT: v_add_i32_e32 v22, vcc, 1, v11 +; GISEL-NEXT: v_addc_u32_e32 v23, vcc, 0, v9, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v13 +; GISEL-NEXT: v_addc_u32_e32 v24, vcc, 0, v20, vcc +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v18, v21 +; GISEL-NEXT: v_add_i32_e32 v21, vcc, 1, v22 +; GISEL-NEXT: v_addc_u32_e32 v25, vcc, 0, v23, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v18, v19 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v14 +; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v1, v12, vcc +; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v12 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v15 +; GISEL-NEXT: v_subb_u32_e64 v15, s[6:7], v3, v17, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v17 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v2, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[6:7] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v14, v5 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[10:11], v14, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc -; GISEL-NEXT: v_subb_u32_e64 v1, vcc, v1, v5, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v12, v7 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v3, v7, s[8:9] -; GISEL-NEXT: 
v_cmp_eq_u32_e64 s[4:5], v12, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v6, v12, v6, s[10:11] -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc -; GISEL-NEXT: v_subbrev_u32_e64 v1, vcc, 0, v1, s[18:19] -; GISEL-NEXT: v_subbrev_u32_e64 v3, vcc, 0, v3, s[20:21] -; GISEL-NEXT: v_cndmask_b32_e64 v12, v12, v17, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, -1, s[6:7] +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v15, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc +; GISEL-NEXT: v_subb_u32_e64 v3, vcc, v3, v7, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v14, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v18, v12, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v19, v17, vcc +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 +; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v7 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v1, v5 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v3, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[8:9] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v1 -; GISEL-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v1, v15, v19, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v16, v13, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v4, v0, v20, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v5, v2, v18, s[8:9] -; GISEL-NEXT: 
v_cndmask_b32_e32 v0, v9, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v3, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v5, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v14, v2, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v13, v16, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v1, v22, v21, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v20, v24, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v5, v23, v25, s[4:5] +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v2, v11, v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v9, v5, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_udiv_v2i64: @@ -1248,258 +1248,258 @@ ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_mov_b64 s[4:5], 0x1000 -; GISEL-NEXT: v_lshl_b64 v[7:8], s[4:5], v4 -; GISEL-NEXT: v_lshl_b64 v[4:5], s[4:5], v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v8 -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, 0, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v5 -; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v4 -; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v8, vcc -; GISEL-NEXT: v_subb_u32_e64 v15, vcc, 0, v5, s[4:5] -; GISEL-NEXT: v_mac_f32_e32 v9, 0x4f800000, v10 -; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v12 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v9, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v11 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x5f7ffffc, v9 +; 
GISEL-NEXT: v_lshl_b64 v[4:5], s[4:5], v4 +; GISEL-NEXT: v_lshl_b64 v[6:7], s[4:5], v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v5 +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 +; GISEL-NEXT: v_subb_u32_e32 v9, vcc, 0, v5, vcc +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v7 +; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 0, v6 +; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v7, vcc +; GISEL-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 +; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v13 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v10 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v11, v12 ; GISEL-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v9 +; GISEL-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11 ; GISEL-NEXT: v_mul_f32_e32 v12, 0x2f800000, v10 -; GISEL-NEXT: v_trunc_f32_e32 v11, v11 +; GISEL-NEXT: v_mul_f32_e32 v13, 0x2f800000, v11 ; GISEL-NEXT: v_trunc_f32_e32 v12, v12 -; GISEL-NEXT: v_mac_f32_e32 v9, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GISEL-NEXT: v_trunc_f32_e32 v13, v13 ; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v12 ; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_mul_lo_u32 v16, v6, v11 +; GISEL-NEXT: v_mac_f32_e32 v11, 0xcf800000, v13 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 ; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GISEL-NEXT: v_mul_lo_u32 v17, v13, v12 -; GISEL-NEXT: v_mul_lo_u32 v18, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v19, v15, v10 -; GISEL-NEXT: v_mul_hi_u32 v20, v13, v10 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 +; GISEL-NEXT: v_mul_lo_u32 v16, v8, v12 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GISEL-NEXT: v_mul_lo_u32 v17, v14, v13 +; GISEL-NEXT: v_mul_lo_u32 v18, v8, v10 +; GISEL-NEXT: v_mul_lo_u32 v19, v9, v10 +; GISEL-NEXT: v_mul_hi_u32 v20, v8, v10 +; GISEL-NEXT: v_mul_lo_u32 v21, v14, v11 +; GISEL-NEXT: v_mul_lo_u32 v22, v15, v11 +; GISEL-NEXT: v_mul_hi_u32 v23, v14, v11 +; GISEL-NEXT: 
v_add_i32_e32 v16, vcc, v19, v16 ; GISEL-NEXT: v_mul_lo_u32 v19, v12, v18 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v10, v17 -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, v18 -; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20 -; GISEL-NEXT: v_mul_lo_u32 v19, v6, v9 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v20, v16 -; GISEL-NEXT: v_mul_hi_u32 v20, v6, v9 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v19 -; GISEL-NEXT: v_mul_lo_u32 v21, v9, v16 -; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v21, v9, v19 -; GISEL-NEXT: v_add_i32_e64 v20, s[8:9], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v19, v11, v19 +; GISEL-NEXT: v_mul_hi_u32 v24, v10, v18 ; GISEL-NEXT: v_mul_hi_u32 v18, v12, v18 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v16 -; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v12, v17 -; GISEL-NEXT: v_add_i32_e64 v18, s[12:13], v20, v18 -; GISEL-NEXT: v_mul_hi_u32 v20, v9, v16 -; GISEL-NEXT: v_add_i32_e64 v19, s[14:15], v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, v17 -; GISEL-NEXT: v_add_i32_e64 v18, s[16:17], v18, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[10:11] -; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[14:15] -; GISEL-NEXT: v_add_i32_e64 v21, s[6:7], v21, v22 -; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v22, v17 +; GISEL-NEXT: v_mul_lo_u32 v22, v13, v21 +; GISEL-NEXT: v_mul_hi_u32 v25, v11, v21 +; GISEL-NEXT: v_mul_hi_u32 v21, v13, v21 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v23 +; GISEL-NEXT: v_mul_lo_u32 v20, v10, v16 +; GISEL-NEXT: v_mul_lo_u32 v23, v12, v16 +; GISEL-NEXT: v_mul_hi_u32 
v26, v10, v16 +; GISEL-NEXT: v_mul_hi_u32 v16, v12, v16 +; GISEL-NEXT: v_mul_lo_u32 v27, v11, v17 +; GISEL-NEXT: v_add_i32_e32 v22, vcc, v22, v27 +; GISEL-NEXT: v_cndmask_b32_e64 v27, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v22, vcc, v22, v25 +; GISEL-NEXT: v_mul_lo_u32 v22, v13, v17 +; GISEL-NEXT: v_mul_hi_u32 v25, v11, v17 +; GISEL-NEXT: v_mul_hi_u32 v17, v13, v17 +; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v23, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v22, vcc, v22, v23 -; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[16:17] -; GISEL-NEXT: v_add_i32_e32 v23, vcc, v23, v24 -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v22 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v19 -; GISEL-NEXT: v_mul_hi_u32 v16, v11, v16 -; GISEL-NEXT: v_mul_hi_u32 v17, v12, v17 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v21, v18 +; GISEL-NEXT: v_add_i32_e64 v21, s[4:5], v22, v21 +; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v24 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v23, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v6, v9 -; GISEL-NEXT: v_mul_lo_u32 v14, v14, v9 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v6, v9 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10 -; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v11, v16, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v16, v13, v10 -; GISEL-NEXT: v_addc_u32_e64 v12, vcc, v12, v17, s[8:9] -; GISEL-NEXT: v_mul_hi_u32 v17, v9, v20 -; GISEL-NEXT: v_mul_lo_u32 v6, v6, v11 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v14, v6 -; 
GISEL-NEXT: v_mul_hi_u32 v14, v10, v19 -; GISEL-NEXT: v_mul_lo_u32 v13, v13, v12 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_mul_lo_u32 v15, v11, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v11, v20 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v12, v19 -; GISEL-NEXT: v_mul_hi_u32 v19, v12, v19 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v9, v6 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v11, v6 -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v17 -; GISEL-NEXT: v_mul_hi_u32 v15, v9, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v11, v6 -; GISEL-NEXT: v_mul_lo_u32 v17, v10, v13 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v12, v13 -; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v10, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v12, v13 -; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[10:11] -; GISEL-NEXT: v_add_i32_e64 v18, s[8:9], v19, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v20, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v26 +; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v26, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v21, vcc, v21, v25 +; GISEL-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v20, v19 +; GISEL-NEXT: v_add_i32_e32 v20, vcc, v23, v24 +; GISEL-NEXT: v_add_i32_e32 v23, vcc, v27, v26 +; GISEL-NEXT: v_add_i32_e32 
v22, vcc, v22, v25 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v18, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v21, vcc, v21, v23 +; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v20, v19 +; GISEL-NEXT: v_add_i32_e32 v20, vcc, v22, v23 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v19 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 -; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v18 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v16, vcc +; GISEL-NEXT: v_mul_lo_u32 v16, v8, v10 +; GISEL-NEXT: v_mul_lo_u32 v9, v9, v10 +; GISEL-NEXT: v_mul_hi_u32 v18, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v21 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v17, vcc +; GISEL-NEXT: v_mul_lo_u32 v17, v14, v11 +; GISEL-NEXT: v_mul_lo_u32 v15, v15, v11 +; GISEL-NEXT: v_mul_hi_u32 v19, v14, v11 +; GISEL-NEXT: v_mul_lo_u32 v8, v8, v12 +; GISEL-NEXT: v_mul_lo_u32 v20, v12, v16 +; GISEL-NEXT: v_mul_hi_u32 v21, v10, v16 +; GISEL-NEXT: v_mul_hi_u32 v16, v12, v16 +; GISEL-NEXT: v_mul_lo_u32 v14, v14, v13 +; GISEL-NEXT: v_mul_lo_u32 v22, v13, v17 +; GISEL-NEXT: v_mul_hi_u32 v23, v11, v17 +; GISEL-NEXT: v_mul_hi_u32 v17, v13, v17 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v15, v14 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v18 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v19 +; GISEL-NEXT: v_mul_lo_u32 v14, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v15, v12, v8 +; GISEL-NEXT: v_mul_hi_u32 v18, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v12, v8 +; GISEL-NEXT: v_mul_lo_u32 v19, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v24, v13, v9 +; GISEL-NEXT: v_mul_hi_u32 v25, v11, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v20, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc +; 
GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v22, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v24, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v21 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v17 -; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], v14, v18 -; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v19 -; GISEL-NEXT: v_mul_lo_u32 v16, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v17, v0, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 -; GISEL-NEXT: v_mul_lo_u32 v18, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v19, v2, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v3, v10 -; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v14 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v15 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v11, v6, vcc -; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v12, v13, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v12, v0, v6 -; GISEL-NEXT: v_mul_lo_u32 v13, v1, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v0, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6 -; GISEL-NEXT: v_mul_lo_u32 v15, v2, v11 -; GISEL-NEXT: v_mul_lo_u32 v20, v3, v11 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v2, v11 +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v23 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v25 +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v20, v14 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v22, v19 +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v24, v21 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; 
GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v18 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v15 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v16 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v12, v8, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v1, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v0, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v1, v10 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v17 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v13, v9, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v3, v11 +; GISEL-NEXT: v_mul_hi_u32 v15, v2, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v9 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v18, v15 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v20, v10 +; GISEL-NEXT: v_mul_lo_u32 v16, v0, v8 +; GISEL-NEXT: v_mul_lo_u32 v17, v1, v8 +; GISEL-NEXT: v_mul_hi_u32 v18, v0, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 +; GISEL-NEXT: v_mul_lo_u32 v19, v2, v9 +; GISEL-NEXT: v_mul_lo_u32 v20, v3, v9 +; GISEL-NEXT: v_mul_hi_u32 v21, v2, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v3, v9 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v17, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v20, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v21 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v18, 
0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v19 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v16 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v19, v13 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v15 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; GISEL-NEXT: v_mul_lo_u32 v14, v4, v10 +; GISEL-NEXT: v_mul_lo_u32 v16, v5, v10 +; GISEL-NEXT: v_mul_hi_u32 v17, v4, v10 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v16 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 +; GISEL-NEXT: v_mul_lo_u32 v15, v6, v11 +; GISEL-NEXT: v_mul_lo_u32 v18, v7, v11 +; GISEL-NEXT: v_mul_hi_u32 v19, v6, v11 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v16, v7, v9 -; GISEL-NEXT: v_mul_lo_u32 v17, v8, v9 -; GISEL-NEXT: v_mul_hi_u32 v18, v7, v9 -; GISEL-NEXT: v_mul_lo_u32 v19, v4, v10 -; GISEL-NEXT: v_mul_lo_u32 v20, v5, v10 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v4, v10 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v9 -; GISEL-NEXT: v_sub_i32_e64 v0, 
s[4:5], v0, v16 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], 1, v10 -; GISEL-NEXT: v_sub_i32_e64 v2, s[8:9], v2, v19 -; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], 1, v15 -; GISEL-NEXT: v_add_i32_e64 v6, s[12:13], v6, v13 -; GISEL-NEXT: v_add_i32_e64 v13, s[12:13], 1, v16 -; GISEL-NEXT: v_add_i32_e64 v11, s[14:15], v11, v12 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[14:15], v0, v7 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[16:17], v2, v4 -; GISEL-NEXT: v_sub_i32_e64 v0, s[18:19], v0, v7 -; GISEL-NEXT: v_sub_i32_e64 v2, s[20:21], v2, v4 -; GISEL-NEXT: v_mul_lo_u32 v12, v7, v6 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[22:23], v0, v7 -; GISEL-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 -; GISEL-NEXT: v_mul_lo_u32 v2, v4, v11 -; GISEL-NEXT: v_add_i32_e64 v4, s[24:25], v17, v12 -; GISEL-NEXT: v_addc_u32_e64 v7, s[6:7], 0, v11, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], v20, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[14:15] -; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v4, v18 -; GISEL-NEXT: v_subb_u32_e64 v17, s[6:7], v1, v4, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[16:17] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v17, v8 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[14:15], v17, v8 -; GISEL-NEXT: v_addc_u32_e64 v17, s[10:11], 0, v0, s[10:11] -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v1, v8, s[4:5] -; GISEL-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[18:19] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v8 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[10:11], v1, v8 -; GISEL-NEXT: v_addc_u32_e64 v1, s[12:13], 0, v7, s[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[22:23] -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v14 -; GISEL-NEXT: v_subb_u32_e64 v14, vcc, v3, v2, s[8:9] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v14, v5 -; GISEL-NEXT: v_subb_u32_e64 v2, s[8:9], v2, v5, s[8:9] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v14, v5 -; 
GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[6:7] +; GISEL-NEXT: v_mul_lo_u32 v12, v4, v8 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v10 +; GISEL-NEXT: v_addc_u32_e32 v20, vcc, 0, v8, vcc +; GISEL-NEXT: v_mul_lo_u32 v21, v6, v9 +; GISEL-NEXT: v_add_i32_e32 v22, vcc, 1, v11 +; GISEL-NEXT: v_addc_u32_e32 v23, vcc, 0, v9, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v13 +; GISEL-NEXT: v_addc_u32_e32 v24, vcc, 0, v20, vcc +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v18, v21 +; GISEL-NEXT: v_add_i32_e32 v21, vcc, 1, v22 +; GISEL-NEXT: v_addc_u32_e32 v25, vcc, 0, v23, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v18, v19 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v14 +; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v1, v12, vcc +; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v12 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v15 +; GISEL-NEXT: v_subb_u32_e64 v15, s[6:7], v3, v17, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v17 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v2, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v14, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, -1, s[6:7] +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v15, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc +; GISEL-NEXT: v_subb_u32_e64 v3, vcc, v3, v7, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v14, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v18, v12, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v19, v17, vcc +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 +; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc +; 
GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc -; GISEL-NEXT: v_subbrev_u32_e64 v2, vcc, 0, v2, s[20:21] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v12, s[14:15] -; GISEL-NEXT: v_cndmask_b32_e64 v4, v14, v4, s[8:9] -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v2, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v14, v2, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v13, v16, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v1, v22, v21, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v20, v24, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v5, v23, v25, s[4:5] ; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[10:11] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v18, s[6:7] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v2 -; GISEL-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v15, v19, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v16, v13, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v4, v0, v17, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v5, v7, v1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v3, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v5, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v2, v11, v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 
v1, v8, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v9, v5, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_udiv_v2i64_pow2_shl_denom: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -359,254 +359,254 @@ ; GISEL-LABEL: v_urem_v2i64: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v5 +; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v5 ; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v7 -; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v6 -; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v5, vcc -; GISEL-NEXT: v_subb_u32_e64 v15, vcc, 0, v7, s[4:5] -; GISEL-NEXT: v_mac_f32_e32 v9, 0x4f800000, v10 -; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v12 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v9, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v11 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x5f7ffffc, v9 +; GISEL-NEXT: v_subb_u32_e32 v9, vcc, 0, v5, vcc +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v7 +; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 0, v6 +; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v7, vcc +; GISEL-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 +; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v13 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v10 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v11, v12 ; GISEL-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v9 +; GISEL-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11 ; GISEL-NEXT: v_mul_f32_e32 v12, 0x2f800000, v10 -; GISEL-NEXT: v_trunc_f32_e32 v11, v11 +; GISEL-NEXT: v_mul_f32_e32 v13, 0x2f800000, v11 ; GISEL-NEXT: v_trunc_f32_e32 v12, v12 -; GISEL-NEXT: v_mac_f32_e32 v9, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 
+; GISEL-NEXT: v_trunc_f32_e32 v13, v13 ; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v12 ; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_mul_lo_u32 v16, v8, v11 +; GISEL-NEXT: v_mac_f32_e32 v11, 0xcf800000, v13 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 ; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GISEL-NEXT: v_mul_lo_u32 v17, v13, v12 -; GISEL-NEXT: v_mul_lo_u32 v18, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v19, v15, v10 -; GISEL-NEXT: v_mul_hi_u32 v20, v13, v10 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 +; GISEL-NEXT: v_mul_lo_u32 v16, v8, v12 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GISEL-NEXT: v_mul_lo_u32 v17, v14, v13 +; GISEL-NEXT: v_mul_lo_u32 v18, v8, v10 +; GISEL-NEXT: v_mul_lo_u32 v19, v9, v10 +; GISEL-NEXT: v_mul_hi_u32 v20, v8, v10 +; GISEL-NEXT: v_mul_lo_u32 v21, v14, v11 +; GISEL-NEXT: v_mul_lo_u32 v22, v15, v11 +; GISEL-NEXT: v_mul_hi_u32 v23, v14, v11 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v16 ; GISEL-NEXT: v_mul_lo_u32 v19, v12, v18 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v10, v17 -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, v18 -; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20 -; GISEL-NEXT: v_mul_lo_u32 v19, v8, v9 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v20, v16 -; GISEL-NEXT: v_mul_hi_u32 v20, v8, v9 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v19 -; GISEL-NEXT: v_mul_lo_u32 v21, v9, v16 -; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v21, v9, v19 -; GISEL-NEXT: v_add_i32_e64 v20, s[8:9], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v19, v11, v19 +; GISEL-NEXT: v_mul_hi_u32 v24, v10, v18 ; GISEL-NEXT: v_mul_hi_u32 v18, v12, v18 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v16 -; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v12, v17 -; GISEL-NEXT: v_add_i32_e64 v18, 
s[12:13], v20, v18 -; GISEL-NEXT: v_mul_hi_u32 v20, v9, v16 -; GISEL-NEXT: v_add_i32_e64 v19, s[14:15], v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, v17 -; GISEL-NEXT: v_add_i32_e64 v18, s[16:17], v18, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[10:11] -; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[14:15] -; GISEL-NEXT: v_add_i32_e64 v21, s[6:7], v21, v22 -; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v22, v17 +; GISEL-NEXT: v_mul_lo_u32 v22, v13, v21 +; GISEL-NEXT: v_mul_hi_u32 v25, v11, v21 +; GISEL-NEXT: v_mul_hi_u32 v21, v13, v21 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v23 +; GISEL-NEXT: v_mul_lo_u32 v20, v10, v16 +; GISEL-NEXT: v_mul_lo_u32 v23, v12, v16 +; GISEL-NEXT: v_mul_hi_u32 v26, v10, v16 +; GISEL-NEXT: v_mul_hi_u32 v16, v12, v16 +; GISEL-NEXT: v_mul_lo_u32 v27, v11, v17 +; GISEL-NEXT: v_add_i32_e32 v22, vcc, v22, v27 +; GISEL-NEXT: v_cndmask_b32_e64 v27, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v22, vcc, v22, v25 +; GISEL-NEXT: v_mul_lo_u32 v22, v13, v17 +; GISEL-NEXT: v_mul_hi_u32 v25, v11, v17 +; GISEL-NEXT: v_mul_hi_u32 v17, v13, v17 +; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v23, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v22, vcc, v22, v23 -; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[16:17] -; GISEL-NEXT: v_add_i32_e32 v23, vcc, v23, v24 -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v22 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v19 -; GISEL-NEXT: v_mul_hi_u32 v16, v11, v16 -; GISEL-NEXT: v_mul_hi_u32 v17, v12, v17 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, 
v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v21, v18 +; GISEL-NEXT: v_add_i32_e64 v21, s[4:5], v22, v21 +; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v24 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v23, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v8, v9 -; GISEL-NEXT: v_mul_lo_u32 v14, v14, v9 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v8, v9 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10 -; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v11, v16, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v16, v13, v10 -; GISEL-NEXT: v_addc_u32_e64 v12, vcc, v12, v17, s[8:9] -; GISEL-NEXT: v_mul_hi_u32 v17, v9, v20 -; GISEL-NEXT: v_mul_lo_u32 v8, v8, v11 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 -; GISEL-NEXT: v_mul_hi_u32 v14, v10, v19 -; GISEL-NEXT: v_mul_lo_u32 v13, v13, v12 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_mul_lo_u32 v15, v11, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v11, v20 +; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v26 +; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v26, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v21, vcc, v21, v25 +; GISEL-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v20, v19 +; GISEL-NEXT: v_add_i32_e32 v20, vcc, v23, v24 +; GISEL-NEXT: v_add_i32_e32 v23, vcc, v27, v26 +; GISEL-NEXT: v_add_i32_e32 v22, vcc, v22, v25 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v18, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v21, vcc, v21, v23 +; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v20, v19 +; GISEL-NEXT: v_add_i32_e32 v20, vcc, v22, v23 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v19 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 +; GISEL-NEXT: 
v_add_i32_e32 v10, vcc, v10, v18 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v16, vcc +; GISEL-NEXT: v_mul_lo_u32 v16, v8, v10 +; GISEL-NEXT: v_mul_lo_u32 v9, v9, v10 +; GISEL-NEXT: v_mul_hi_u32 v18, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v21 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v17, vcc +; GISEL-NEXT: v_mul_lo_u32 v17, v14, v11 +; GISEL-NEXT: v_mul_lo_u32 v15, v15, v11 +; GISEL-NEXT: v_mul_hi_u32 v19, v14, v11 +; GISEL-NEXT: v_mul_lo_u32 v8, v8, v12 +; GISEL-NEXT: v_mul_lo_u32 v20, v12, v16 +; GISEL-NEXT: v_mul_hi_u32 v21, v10, v16 +; GISEL-NEXT: v_mul_hi_u32 v16, v12, v16 +; GISEL-NEXT: v_mul_lo_u32 v14, v14, v13 +; GISEL-NEXT: v_mul_lo_u32 v22, v13, v17 +; GISEL-NEXT: v_mul_hi_u32 v23, v11, v17 +; GISEL-NEXT: v_mul_hi_u32 v17, v13, v17 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v15, v14 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v12, v19 -; GISEL-NEXT: v_mul_hi_u32 v19, v12, v19 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v11, v8 -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v17 -; GISEL-NEXT: v_mul_hi_u32 v15, v9, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v11, v8 -; GISEL-NEXT: v_mul_lo_u32 v17, v10, v13 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v12, v13 -; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v10, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v12, v13 -; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[10:11] -; GISEL-NEXT: v_add_i32_e64 v18, s[8:9], v19, v18 -; GISEL-NEXT: 
v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v20, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v20 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v19 +; GISEL-NEXT: v_mul_lo_u32 v14, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v15, v12, v8 +; GISEL-NEXT: v_mul_hi_u32 v18, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v12, v8 +; GISEL-NEXT: v_mul_lo_u32 v19, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v24, v13, v9 +; GISEL-NEXT: v_mul_hi_u32 v25, v11, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v20, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 -; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v22, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v24, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v21 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v17 -; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], v14, v18 -; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v19 -; GISEL-NEXT: v_mul_lo_u32 v16, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v17, v0, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 -; GISEL-NEXT: v_mul_lo_u32 v18, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v19, v2, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v3, v10 -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v14 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v15 -; GISEL-NEXT: 
v_addc_u32_e32 v8, vcc, v11, v8, vcc -; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v12, v13, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v12, v0, v8 -; GISEL-NEXT: v_mul_lo_u32 v13, v1, v8 -; GISEL-NEXT: v_mul_hi_u32 v14, v0, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 -; GISEL-NEXT: v_mul_lo_u32 v15, v2, v11 -; GISEL-NEXT: v_mul_lo_u32 v20, v3, v11 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v2, v11 +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v23 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v25 +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v20, v14 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v22, v19 +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v24, v21 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v18 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v15 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v16 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v12, v8, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v1, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v0, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v1, v10 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v17 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v13, v9, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v3, v11 +; GISEL-NEXT: v_mul_hi_u32 v15, v2, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v9 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v18, v15 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v20, v10 +; GISEL-NEXT: v_mul_lo_u32 v16, v0, v8 +; GISEL-NEXT: v_mul_lo_u32 v17, v1, v8 +; GISEL-NEXT: v_mul_hi_u32 v18, v0, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 +; GISEL-NEXT: v_mul_lo_u32 v19, v2, v9 +; GISEL-NEXT: 
v_mul_lo_u32 v20, v3, v9 +; GISEL-NEXT: v_mul_hi_u32 v21, v2, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v3, v9 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v17, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v20, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v21 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v19 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v16 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v19, v13 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v15 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; GISEL-NEXT: v_mul_lo_u32 v14, v4, v10 +; GISEL-NEXT: v_mul_lo_u32 v16, v5, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v4, v10 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: 
v_add_i32_e32 v14, vcc, v18, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v16 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 +; GISEL-NEXT: v_mul_lo_u32 v15, v6, v11 +; GISEL-NEXT: v_mul_lo_u32 v17, v7, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v6, v11 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v16, v4, v9 -; GISEL-NEXT: v_mul_lo_u32 v17, v5, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v4, v9 -; GISEL-NEXT: v_mul_lo_u32 v18, v6, v10 -; GISEL-NEXT: v_mul_lo_u32 v19, v7, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v6, v10 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 -; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v18 -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v13 -; GISEL-NEXT: v_add_i32_e64 v11, s[6:7], v11, v12 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v2, v6 -; GISEL-NEXT: v_sub_i32_e64 v12, s[10:11], v0, v4 -; GISEL-NEXT: v_sub_i32_e64 v13, s[12:13], v2, v6 ; GISEL-NEXT: v_mul_lo_u32 v8, v4, v8 -; GISEL-NEXT: v_mul_lo_u32 v11, v6, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v13, v6 -; GISEL-NEXT: v_sub_i32_e64 v4, s[14:15], v12, v4 -; GISEL-NEXT: v_sub_i32_e64 v6, s[16:17], v13, v6 -; GISEL-NEXT: v_add_i32_e64 v8, s[18:19], v17, v8 -; GISEL-NEXT: v_add_i32_e64 v11, s[18:19], v19, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v9 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v11, v10 -; GISEL-NEXT: v_subb_u32_e64 v10, s[6:7], v1, v8, vcc -; GISEL-NEXT: v_sub_i32_e64 v1, 
s[6:7], v1, v8 -; GISEL-NEXT: v_subb_u32_e64 v8, s[6:7], v3, v9, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v9, v6, v9 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v16, v8 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v17, v9 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v14 +; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], v1, v8, vcc +; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v8 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v15 +; GISEL-NEXT: v_subb_u32_e64 v11, s[6:7], v3, v9, s[4:5] ; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v9 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v2, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7] ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v7 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v3, v7, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v5 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v8, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; GISEL-NEXT: v_subbrev_u32_e64 v18, vcc, 0, v1, s[10:11] -; GISEL-NEXT: v_subb_u32_e64 v1, vcc, v1, v5, s[10:11] -; GISEL-NEXT: v_subbrev_u32_e64 v19, vcc, 0, v3, s[12:13] -; GISEL-NEXT: v_subb_u32_e64 v3, vcc, v3, v7, s[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v15, s[8:9] -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[14:15] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v19, v7 -; GISEL-NEXT: v_subbrev_u32_e64 v3, s[6:7], 0, v3, s[16:17] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v5 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v19, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; 
GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v16, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v17, s[8:9] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v5 -; GISEL-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v6, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v1, v18, v1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[8:9] +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; GISEL-NEXT: v_subb_u32_e64 v3, vcc, v3, v7, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v10, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v0, v4 +; GISEL-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v11, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v9, v13, v9, vcc +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v2, v6 +; GISEL-NEXT: v_subbrev_u32_e64 v16, s[4:5], 0, v3, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v14, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v12, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v13, v6 +; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v14, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v18, v15, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v16, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v19, v17, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GISEL-NEXT: v_cndmask_b32_e32 
v4, v12, v4, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v6, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v16, v3, s[4:5] +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_urem_v2i64: @@ -1753,254 +1753,254 @@ ; GISEL-NEXT: s_mov_b64 s[4:5], 0x1000 ; GISEL-NEXT: v_lshl_b64 v[4:5], s[4:5], v4 ; GISEL-NEXT: v_lshl_b64 v[6:7], s[4:5], v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v5 +; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v5 ; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v7 -; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v6 -; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v5, vcc -; GISEL-NEXT: v_subb_u32_e64 v15, vcc, 0, v7, s[4:5] -; GISEL-NEXT: v_mac_f32_e32 v9, 0x4f800000, v10 -; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v12 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v9, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v11 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x5f7ffffc, v9 +; GISEL-NEXT: v_subb_u32_e32 v9, vcc, 0, v5, vcc +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v7 +; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 0, v6 +; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v7, vcc +; GISEL-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 +; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v13 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v10 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v11, v12 ; GISEL-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v9 +; GISEL-NEXT: v_mul_f32_e32 v11, 
0x5f7ffffc, v11 ; GISEL-NEXT: v_mul_f32_e32 v12, 0x2f800000, v10 -; GISEL-NEXT: v_trunc_f32_e32 v11, v11 +; GISEL-NEXT: v_mul_f32_e32 v13, 0x2f800000, v11 ; GISEL-NEXT: v_trunc_f32_e32 v12, v12 -; GISEL-NEXT: v_mac_f32_e32 v9, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GISEL-NEXT: v_trunc_f32_e32 v13, v13 ; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v12 ; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_mul_lo_u32 v16, v8, v11 +; GISEL-NEXT: v_mac_f32_e32 v11, 0xcf800000, v13 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 ; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GISEL-NEXT: v_mul_lo_u32 v17, v13, v12 -; GISEL-NEXT: v_mul_lo_u32 v18, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v19, v15, v10 -; GISEL-NEXT: v_mul_hi_u32 v20, v13, v10 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 +; GISEL-NEXT: v_mul_lo_u32 v16, v8, v12 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GISEL-NEXT: v_mul_lo_u32 v17, v14, v13 +; GISEL-NEXT: v_mul_lo_u32 v18, v8, v10 +; GISEL-NEXT: v_mul_lo_u32 v19, v9, v10 +; GISEL-NEXT: v_mul_hi_u32 v20, v8, v10 +; GISEL-NEXT: v_mul_lo_u32 v21, v14, v11 +; GISEL-NEXT: v_mul_lo_u32 v22, v15, v11 +; GISEL-NEXT: v_mul_hi_u32 v23, v14, v11 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v16 ; GISEL-NEXT: v_mul_lo_u32 v19, v12, v18 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v10, v17 -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, v18 -; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20 -; GISEL-NEXT: v_mul_lo_u32 v19, v8, v9 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v20, v16 -; GISEL-NEXT: v_mul_hi_u32 v20, v8, v9 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v19 -; GISEL-NEXT: v_mul_lo_u32 v21, v9, v16 -; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v21, v9, v19 -; GISEL-NEXT: v_add_i32_e64 v20, s[8:9], v20, v21 -; 
GISEL-NEXT: v_mul_hi_u32 v19, v11, v19 +; GISEL-NEXT: v_mul_hi_u32 v24, v10, v18 ; GISEL-NEXT: v_mul_hi_u32 v18, v12, v18 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v16 -; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v12, v17 -; GISEL-NEXT: v_add_i32_e64 v18, s[12:13], v20, v18 -; GISEL-NEXT: v_mul_hi_u32 v20, v9, v16 -; GISEL-NEXT: v_add_i32_e64 v19, s[14:15], v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, v17 -; GISEL-NEXT: v_add_i32_e64 v18, s[16:17], v18, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[10:11] -; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[14:15] -; GISEL-NEXT: v_add_i32_e64 v21, s[6:7], v21, v22 -; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v22, v17 +; GISEL-NEXT: v_mul_lo_u32 v22, v13, v21 +; GISEL-NEXT: v_mul_hi_u32 v25, v11, v21 +; GISEL-NEXT: v_mul_hi_u32 v21, v13, v21 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v23 +; GISEL-NEXT: v_mul_lo_u32 v20, v10, v16 +; GISEL-NEXT: v_mul_lo_u32 v23, v12, v16 +; GISEL-NEXT: v_mul_hi_u32 v26, v10, v16 +; GISEL-NEXT: v_mul_hi_u32 v16, v12, v16 +; GISEL-NEXT: v_mul_lo_u32 v27, v11, v17 +; GISEL-NEXT: v_add_i32_e32 v22, vcc, v22, v27 +; GISEL-NEXT: v_cndmask_b32_e64 v27, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v22, vcc, v22, v25 +; GISEL-NEXT: v_mul_lo_u32 v22, v13, v17 +; GISEL-NEXT: v_mul_hi_u32 v25, v11, v17 +; GISEL-NEXT: v_mul_hi_u32 v17, v13, v17 +; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v23, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v22, vcc, v22, v23 -; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[16:17] -; GISEL-NEXT: v_add_i32_e32 v23, 
vcc, v23, v24 -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v22 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v19 -; GISEL-NEXT: v_mul_hi_u32 v16, v11, v16 -; GISEL-NEXT: v_mul_hi_u32 v17, v12, v17 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v21, v18 +; GISEL-NEXT: v_add_i32_e64 v21, s[4:5], v22, v21 +; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v24 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v23, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v8, v9 -; GISEL-NEXT: v_mul_lo_u32 v14, v14, v9 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v8, v9 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10 -; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v11, v16, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v16, v13, v10 -; GISEL-NEXT: v_addc_u32_e64 v12, vcc, v12, v17, s[8:9] -; GISEL-NEXT: v_mul_hi_u32 v17, v9, v20 -; GISEL-NEXT: v_mul_lo_u32 v8, v8, v11 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 -; GISEL-NEXT: v_mul_hi_u32 v14, v10, v19 -; GISEL-NEXT: v_mul_lo_u32 v13, v13, v12 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_mul_lo_u32 v15, v11, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v11, v20 +; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v26 +; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v26, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v21, vcc, v21, v25 +; GISEL-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v20, v19 +; GISEL-NEXT: v_add_i32_e32 v20, vcc, v23, v24 +; GISEL-NEXT: v_add_i32_e32 v23, vcc, v27, v26 +; GISEL-NEXT: v_add_i32_e32 v22, vcc, v22, v25 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v18, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; GISEL-NEXT: 
v_add_i32_e32 v21, vcc, v21, v23 +; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v20, v19 +; GISEL-NEXT: v_add_i32_e32 v20, vcc, v22, v23 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v19 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v18 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v16, vcc +; GISEL-NEXT: v_mul_lo_u32 v16, v8, v10 +; GISEL-NEXT: v_mul_lo_u32 v9, v9, v10 +; GISEL-NEXT: v_mul_hi_u32 v18, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v21 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v17, vcc +; GISEL-NEXT: v_mul_lo_u32 v17, v14, v11 +; GISEL-NEXT: v_mul_lo_u32 v15, v15, v11 +; GISEL-NEXT: v_mul_hi_u32 v19, v14, v11 +; GISEL-NEXT: v_mul_lo_u32 v8, v8, v12 +; GISEL-NEXT: v_mul_lo_u32 v20, v12, v16 +; GISEL-NEXT: v_mul_hi_u32 v21, v10, v16 +; GISEL-NEXT: v_mul_hi_u32 v16, v12, v16 +; GISEL-NEXT: v_mul_lo_u32 v14, v14, v13 +; GISEL-NEXT: v_mul_lo_u32 v22, v13, v17 +; GISEL-NEXT: v_mul_hi_u32 v23, v11, v17 +; GISEL-NEXT: v_mul_hi_u32 v17, v13, v17 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v15, v14 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v12, v19 -; GISEL-NEXT: v_mul_hi_u32 v19, v12, v19 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v11, v8 -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v17 -; GISEL-NEXT: v_mul_hi_u32 v15, v9, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v11, v8 -; GISEL-NEXT: v_mul_lo_u32 v17, v10, v13 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v12, v13 -; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v10, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v12, v13 -; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, 
v17, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[10:11] -; GISEL-NEXT: v_add_i32_e64 v18, s[8:9], v19, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v20, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v20 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v19 +; GISEL-NEXT: v_mul_lo_u32 v14, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v15, v12, v8 +; GISEL-NEXT: v_mul_hi_u32 v18, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v12, v8 +; GISEL-NEXT: v_mul_lo_u32 v19, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v24, v13, v9 +; GISEL-NEXT: v_mul_hi_u32 v25, v11, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v20, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 -; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v22, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v24, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v21 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v17 -; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], v14, v18 -; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v19 -; GISEL-NEXT: v_mul_lo_u32 v16, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v17, v0, v9 -; 
GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 -; GISEL-NEXT: v_mul_lo_u32 v18, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v19, v2, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v3, v10 -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v14 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v15 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v11, v8, vcc -; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v12, v13, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v12, v0, v8 -; GISEL-NEXT: v_mul_lo_u32 v13, v1, v8 -; GISEL-NEXT: v_mul_hi_u32 v14, v0, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 -; GISEL-NEXT: v_mul_lo_u32 v15, v2, v11 -; GISEL-NEXT: v_mul_lo_u32 v20, v3, v11 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v2, v11 +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v23 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v25 +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v20, v14 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v22, v19 +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v24, v21 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v18 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v15 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v16 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v12, v8, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v1, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v0, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v1, v10 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v17 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v13, v9, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v3, v11 +; GISEL-NEXT: v_mul_hi_u32 v15, v2, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v9 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], 
v18, v15 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v20, v10 +; GISEL-NEXT: v_mul_lo_u32 v16, v0, v8 +; GISEL-NEXT: v_mul_lo_u32 v17, v1, v8 +; GISEL-NEXT: v_mul_hi_u32 v18, v0, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 +; GISEL-NEXT: v_mul_lo_u32 v19, v2, v9 +; GISEL-NEXT: v_mul_lo_u32 v20, v3, v9 +; GISEL-NEXT: v_mul_hi_u32 v21, v2, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v3, v9 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v17, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v20, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v21 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v19 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v16 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v19, v13 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v15 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] -; GISEL-NEXT: 
v_cndmask_b32_e64 v17, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; GISEL-NEXT: v_mul_lo_u32 v14, v4, v10 +; GISEL-NEXT: v_mul_lo_u32 v16, v5, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v4, v10 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v16 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 +; GISEL-NEXT: v_mul_lo_u32 v15, v6, v11 +; GISEL-NEXT: v_mul_lo_u32 v17, v7, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v6, v11 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v16, v4, v9 -; GISEL-NEXT: v_mul_lo_u32 v17, v5, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v4, v9 -; GISEL-NEXT: v_mul_lo_u32 v18, v6, v10 -; GISEL-NEXT: v_mul_lo_u32 v19, v7, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v6, v10 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 -; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v18 -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v13 -; GISEL-NEXT: v_add_i32_e64 v11, s[6:7], v11, v12 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v2, v6 -; GISEL-NEXT: v_sub_i32_e64 v12, s[10:11], v0, v4 -; GISEL-NEXT: v_sub_i32_e64 v13, s[12:13], v2, v6 ; GISEL-NEXT: v_mul_lo_u32 v8, v4, v8 -; GISEL-NEXT: v_mul_lo_u32 v11, v6, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v13, v6 -; GISEL-NEXT: v_sub_i32_e64 v4, s[14:15], v12, v4 -; GISEL-NEXT: v_sub_i32_e64 v6, s[16:17], v13, v6 -; GISEL-NEXT: v_add_i32_e64 v8, s[18:19], v17, v8 -; GISEL-NEXT: v_add_i32_e64 v11, s[18:19], v19, v11 -; GISEL-NEXT: 
v_cndmask_b32_e64 v16, 0, -1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v9 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v11, v10 -; GISEL-NEXT: v_subb_u32_e64 v10, s[6:7], v1, v8, vcc -; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v8 -; GISEL-NEXT: v_subb_u32_e64 v8, s[6:7], v3, v9, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v9, v6, v9 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v16, v8 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v17, v9 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v14 +; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], v1, v8, vcc +; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v8 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v15 +; GISEL-NEXT: v_subb_u32_e64 v11, s[6:7], v3, v9, s[4:5] ; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v9 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v2, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7] ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v7 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v3, v7, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v5 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v8, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; GISEL-NEXT: v_subbrev_u32_e64 v18, vcc, 0, v1, s[10:11] -; GISEL-NEXT: v_subb_u32_e64 v1, vcc, v1, v5, s[10:11] -; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[4:5] -; GISEL-NEXT: v_subbrev_u32_e64 v14, vcc, 0, v3, s[12:13] -; GISEL-NEXT: v_subb_u32_e64 v3, vcc, v3, v7, s[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v15, s[8:9] -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[14:15] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], 
v14, v7 -; GISEL-NEXT: v_subbrev_u32_e64 v3, s[6:7], 0, v3, s[16:17] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v5 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v14, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v16, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v17, s[8:9] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v5 -; GISEL-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v6, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v1, v18, v1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v14, v3, s[8:9] +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; GISEL-NEXT: v_subb_u32_e64 v3, vcc, v3, v7, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v10, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v0, v4 +; GISEL-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v11, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v9, v13, v9, vcc +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v2, v6 +; GISEL-NEXT: v_subbrev_u32_e64 v16, s[4:5], 0, v3, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v14, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v12, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v13, v6 +; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc +; 
GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v14, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v18, v15, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v16, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v19, v17, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v6, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v16, v3, s[4:5] +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_urem_v2i64_pow2_shl_denom: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -1348,29 +1348,29 @@ ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v16 ; GFX6-NEXT: v_min_u32_e32 v16, v3, v19 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v4, v20 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v5, v21 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v6, v22 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v7, v23 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v8, v24 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v9, v25 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v10, v26 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v16 ; GFX6-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX6-NEXT: v_min_u32_e32 v17, v11, v27 -; GFX6-NEXT: v_min_u32_e32 
v18, v12, v28 -; GFX6-NEXT: v_min_u32_e32 v19, v13, v29 -; GFX6-NEXT: v_min_u32_e32 v20, v14, v30 -; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v17 -; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v18 -; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v19 -; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v20 +; GFX6-NEXT: v_min_u32_e32 v17, v4, v20 +; GFX6-NEXT: v_min_u32_e32 v18, v5, v21 +; GFX6-NEXT: v_min_u32_e32 v19, v6, v22 +; GFX6-NEXT: v_min_u32_e32 v20, v7, v23 +; GFX6-NEXT: v_min_u32_e32 v21, v8, v24 +; GFX6-NEXT: v_min_u32_e32 v22, v9, v25 +; GFX6-NEXT: v_min_u32_e32 v23, v10, v26 +; GFX6-NEXT: v_min_u32_e32 v24, v11, v27 +; GFX6-NEXT: v_min_u32_e32 v25, v12, v28 +; GFX6-NEXT: v_min_u32_e32 v26, v13, v29 +; GFX6-NEXT: v_min_u32_e32 v27, v14, v30 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v17 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v18 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v19 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v20 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v21 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v22 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v23 +; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v24 +; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v25 +; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v26 +; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v27 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_min_u32_e32 v16, v15, v16 ; GFX6-NEXT: v_sub_i32_e32 v15, vcc, v15, v16 diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -2800,6 +2800,7 @@ ; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x6c, v0 ; GCN-NEXT: buffer_store_dword v30, v2, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x64, v0 ; GCN-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen @@ -2814,7 +2815,6 @@ ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x50, v0 ; GCN-NEXT: v_add_i32_e32 v30, 
vcc, 0x4c, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x48, v0 @@ -2897,21 +2897,21 @@ ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; GFX7-NEXT: buffer_store_dword v27, v2, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; GFX7-NEXT: v_add_i32_e32 v27, vcc, 0x5c, v0 ; GFX7-NEXT: buffer_store_dword v26, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; GFX7-NEXT: buffer_store_dword v25, v2, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; GFX7-NEXT: v_add_i32_e32 v26, vcc, 0x54, v0 +; GFX7-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen ; GFX7-NEXT: buffer_store_dword v24, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; GFX7-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; GFX7-NEXT: v_add_i32_e32 v25, vcc, 0x50, v0 +; GFX7-NEXT: v_add_i32_e32 v27, vcc, 0x4c, v0 +; GFX7-NEXT: v_add_i32_e32 v24, vcc, 0x44, v0 +; GFX7-NEXT: buffer_store_dword v23, v26, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v21, v27, s[0:3], 0 offen ; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; GFX7-NEXT: buffer_store_dword v19, v2, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v19, v24, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; GFX7-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 60, v0 @@ -2975,21 +2975,21 @@ ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 ; GFX8-NEXT: buffer_store_dword v27, v2, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 +; GFX8-NEXT: 
v_add_u32_e32 v27, vcc, 0x5c, v0 ; GFX8-NEXT: buffer_store_dword v26, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 -; GFX8-NEXT: buffer_store_dword v25, v2, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, 0x54, v0 +; GFX8-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v24, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 -; GFX8-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 -; GFX8-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 -; GFX8-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 +; GFX8-NEXT: v_add_u32_e32 v25, vcc, 0x50, v0 +; GFX8-NEXT: v_add_u32_e32 v27, vcc, 0x4c, v0 +; GFX8-NEXT: v_add_u32_e32 v24, vcc, 0x44, v0 +; GFX8-NEXT: buffer_store_dword v23, v26, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v21, v27, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 -; GFX8-NEXT: buffer_store_dword v19, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v19, v24, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 64, v0 ; GFX8-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 60, v0 @@ -3035,20 +3035,20 @@ ; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104 ; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100 ; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:8 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: 
buffer_store_dword v24, v0, s[0:3], 0 offen offset:88 ; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:84 ; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:80 ; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:76 ; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:72 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:68 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:8 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:64 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:56 ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 @@ -3065,11 +3065,11 @@ ; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 ; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:124 -; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:116 ; GFX9-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen offset:128 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler.mir b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler.mir --- a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler.mir +++ 
b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler.mir @@ -7,13 +7,11 @@ # CHECK-NEXT: From: DBG_VALUE %17:vgpr_32, 0, 0 # CHECK-NEXT: To: S_ENDPGM 0, implicit %69:vgpr_32, implicit %70:vgpr_32 # CHECK-NEXT: RegionInstrs: 46 -# CHECK: Attempting to revert scheduling. # CHECK: ********** MI Scheduling ********** # CHECK: test_same_num_instrs:%bb.2 # CHECK-NEXT: From: DBG_VALUE %17:vgpr_32, 0, 0 # CHECK-NEXT: To: S_ENDPGM 0, implicit %69:vgpr_32, implicit %70:vgpr_32 # CHECK-NEXT: RegionInstrs: 46 -# CHECK: Attempting to revert scheduling. --- name: test_same_num_instrs diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -3141,49 +3141,49 @@ ; VI-LABEL: v_test_canonicalize_var_v32f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_sdwa v20, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v27, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v0, v0, v0 -; VI-NEXT: v_or_b32_e32 v0, v0, v20 -; VI-NEXT: v_max_f16_sdwa v20, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v0, v0, v27 +; VI-NEXT: v_max_f16_sdwa v27, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v1, v1, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v20 -; VI-NEXT: v_max_f16_sdwa v20, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v1, v1, v27 +; VI-NEXT: v_max_f16_sdwa v27, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v2, v2, v2 -; VI-NEXT: v_or_b32_e32 v2, v2, v20 -; VI-NEXT: v_max_f16_sdwa v20, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v2, v2, 
v27 +; VI-NEXT: v_max_f16_sdwa v27, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v3, v3, v3 -; VI-NEXT: v_or_b32_e32 v3, v3, v20 -; VI-NEXT: v_max_f16_sdwa v20, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v4, v4, v4 -; VI-NEXT: v_or_b32_e32 v4, v4, v20 -; VI-NEXT: v_max_f16_sdwa v20, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v5, v5, v5 -; VI-NEXT: v_or_b32_e32 v5, v5, v20 -; VI-NEXT: v_max_f16_sdwa v20, v6, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v6, v6, v6 -; VI-NEXT: v_or_b32_e32 v6, v6, v20 -; VI-NEXT: v_max_f16_sdwa v20, v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v7, v7, v7 -; VI-NEXT: v_or_b32_e32 v7, v7, v20 -; VI-NEXT: v_max_f16_sdwa v20, v8, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v8, v8, v8 -; VI-NEXT: v_or_b32_e32 v8, v8, v20 -; VI-NEXT: v_max_f16_sdwa v20, v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v9, v9, v9 -; VI-NEXT: v_or_b32_e32 v9, v9, v20 -; VI-NEXT: v_max_f16_sdwa v20, v10, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v10, v10, v10 ; VI-NEXT: v_max_f16_sdwa v16, v15, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_sdwa v17, v14, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_sdwa v18, v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_sdwa v19, v12, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v10, v10, v20 ; VI-NEXT: v_max_f16_sdwa v20, v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v21, v10, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v22, v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v23, v8, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v24, v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v25, v6, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v26, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v3, v3, v27 +; VI-NEXT: v_max_f16_sdwa v27, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v15, v15, v15 ; VI-NEXT: v_max_f16_e32 v14, v14, v14 ; VI-NEXT: v_max_f16_e32 v13, v13, v13 ; VI-NEXT: v_max_f16_e32 v12, v12, v12 ; VI-NEXT: v_max_f16_e32 v11, v11, v11 +; VI-NEXT: v_max_f16_e32 v10, v10, v10 +; VI-NEXT: v_max_f16_e32 v9, v9, v9 +; VI-NEXT: v_max_f16_e32 v8, v8, v8 +; VI-NEXT: v_max_f16_e32 v7, v7, v7 +; VI-NEXT: v_max_f16_e32 v6, v6, v6 +; VI-NEXT: v_max_f16_e32 v5, v5, v5 +; VI-NEXT: v_max_f16_e32 v4, v4, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v27 +; VI-NEXT: v_or_b32_e32 v5, v5, v26 +; VI-NEXT: v_or_b32_e32 v6, v6, v25 +; VI-NEXT: v_or_b32_e32 v7, v7, v24 +; VI-NEXT: v_or_b32_e32 v8, v8, v23 +; VI-NEXT: v_or_b32_e32 v9, v9, v22 +; VI-NEXT: v_or_b32_e32 v10, v10, v21 ; VI-NEXT: v_or_b32_e32 v11, v11, v20 ; VI-NEXT: v_or_b32_e32 v12, v12, v19 ; VI-NEXT: v_or_b32_e32 v13, v13, v18 @@ -3488,9 +3488,9 @@ ; CI-NEXT: v_cvt_f16_f32_e32 v8, v16 ; CI-NEXT: v_or_b32_e32 v7, v9, v7 ; CI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v25 +; CI-NEXT: v_cvt_f16_f32_e32 v22, v28 ; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v25, v29 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v25 ; CI-NEXT: v_or_b32_e32 v8, 
v9, v8 ; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 ; CI-NEXT: v_cvt_f16_f32_e32 v10, v20 @@ -3501,30 +3501,30 @@ ; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 ; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:8 ; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v25, v29 ; CI-NEXT: v_or_b32_e32 v10, v11, v10 ; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; CI-NEXT: v_cvt_f16_f32_e32 v12, v24 ; CI-NEXT: v_or_b32_e32 v11, v13, v11 ; CI-NEXT: v_cvt_f16_f32_e32 v13, v23 -; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 -; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:16 -; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:28 -; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:24 -; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; CI-NEXT: v_cvt_f16_f32_e32 v23, v27 ; CI-NEXT: v_cvt_f16_f32_e32 v24, v30 +; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; CI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; CI-NEXT: v_or_b32_e32 v12, v13, v12 ; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; CI-NEXT: v_or_b32_e32 v22, v23, v22 +; CI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 ; CI-NEXT: v_or_b32_e32 v13, v15, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v28 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v27 -; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:36 -; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:32 -; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:44 -; CI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:40 -; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; CI-NEXT: v_or_b32_e32 v14, v15, v14 -; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v24 -; CI-NEXT: v_or_b32_e32 v15, v25, v15 +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:24 +; CI-NEXT: v_or_b32_e32 v23, v25, v23 +; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:36 
+; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:32 +; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:44 +; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:40 ; CI-NEXT: s_waitcnt vmcnt(11) ; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; CI-NEXT: s_waitcnt vmcnt(10) @@ -3538,58 +3538,55 @@ ; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 ; CI-NEXT: v_or_b32_e32 v17, v19, v17 ; CI-NEXT: s_waitcnt vmcnt(7) -; CI-NEXT: v_cvt_f16_f32_e32 v18, v20 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; CI-NEXT: s_waitcnt vmcnt(6) -; CI-NEXT: v_cvt_f16_f32_e32 v19, v21 -; CI-NEXT: s_waitcnt vmcnt(5) -; CI-NEXT: v_cvt_f16_f32_e32 v20, v22 -; CI-NEXT: s_waitcnt vmcnt(4) -; CI-NEXT: v_cvt_f16_f32_e32 v21, v23 -; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; CI-NEXT: v_or_b32_e32 v18, v19, v18 -; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; CI-NEXT: v_or_b32_e32 v19, v21, v19 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; CI-NEXT: s_waitcnt vmcnt(3) -; CI-NEXT: v_cvt_f16_f32_e32 v20, v26 +; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; CI-NEXT: v_cvt_f16_f32_e32 v18, v20 ; CI-NEXT: s_waitcnt vmcnt(2) -; CI-NEXT: v_cvt_f16_f32_e32 v21, v27 +; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v26, v28 +; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; CI-NEXT: v_cvt_f16_f32_e32 v19, v21 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v27, v29 -; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; CI-NEXT: v_or_b32_e32 v20, v21, v20 -; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 -; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 -; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:48 -; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:60 -; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; CI-NEXT: v_or_b32_e32 v21, v27, v21 +; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; CI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; CI-NEXT: v_or_b32_e32 v14, v15, v14 +; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v18 
+; CI-NEXT: v_or_b32_e32 v24, v25, v24 +; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; CI-NEXT: v_or_b32_e32 v15, v19, v15 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 +; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:48 +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 +; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:56 +; CI-NEXT: v_or_b32_e32 v25, v27, v25 ; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:132 ; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; CI-NEXT: s_waitcnt vmcnt(5) -; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; CI-NEXT: s_waitcnt vmcnt(4) -; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; CI-NEXT: s_waitcnt vmcnt(3) -; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; CI-NEXT: s_waitcnt vmcnt(2) -; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; CI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; CI-NEXT: v_or_b32_e32 v24, v25, v24 +; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; CI-NEXT: v_or_b32_e32 v18, v19, v18 ; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; CI-NEXT: v_or_b32_e32 v26, v27, v26 ; CI-NEXT: v_add_i32_e32 v27, vcc, 0x7c, v0 ; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen ; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 -; CI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; CI-NEXT: v_or_b32_e32 v22, v22, v23 -; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 +; CI-NEXT: v_cvt_f16_f32_e32 v19, v20 +; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; CI-NEXT: v_or_b32_e32 v19, v21, v19 +; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; CI-NEXT: s_waitcnt vmcnt(2) ; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; CI-NEXT: s_waitcnt vmcnt(1) @@ -3601,7 +3598,7 @@ ; CI-NEXT: 
buffer_load_dword v26, off, s[0:3], s32 offset:116 ; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 ; CI-NEXT: s_waitcnt vmcnt(3) -; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3613,77 +3610,77 @@ ; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v25, v26 +; CI-NEXT: v_cvt_f16_f32_e32 v20, v26 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f16_f32_e32 v26, v27 ; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 -; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; CI-NEXT: v_or_b32_e32 v25, v26, v25 +; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; CI-NEXT: v_or_b32_e32 v20, v26, v20 ; CI-NEXT: v_add_i32_e32 v26, vcc, 0x70, v0 -; CI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 +; CI-NEXT: buffer_store_dword v20, v26, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:100 ; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; CI-NEXT: s_waitcnt vmcnt(3) ; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; CI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; CI-NEXT: v_or_b32_e32 v23, v23, v27 +; CI-NEXT: v_or_b32_e32 v21, v21, v27 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; CI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; CI-NEXT: v_add_i32_e32 v27, vcc, 0x68, v0 -; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; CI-NEXT: v_or_b32_e32 v25, v26, v25 +; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; CI-NEXT: v_or_b32_e32 v20, v26, v20 ; CI-NEXT: v_add_i32_e32 v26, vcc, 0x6c, v0 -; CI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 +; CI-NEXT: buffer_store_dword v20, v26, s[0:3], 0 
offen +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64 -; CI-NEXT: buffer_store_dword v23, v27, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76 +; CI-NEXT: buffer_store_dword v21, v27, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; CI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:80 ; CI-NEXT: s_waitcnt vmcnt(3) -; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; CI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; CI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; CI-NEXT: v_or_b32_e32 v25, v26, v25 +; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; CI-NEXT: v_or_b32_e32 v20, v26, v20 ; CI-NEXT: s_waitcnt vmcnt(2) ; CI-NEXT: v_cvt_f16_f32_e32 v26, v27 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f16_f32_e32 v27, v29 -; CI-NEXT: v_or_b32_e32 v23, v26, v23 +; CI-NEXT: v_or_b32_e32 v21, v26, v21 ; CI-NEXT: v_cvt_f16_f32_e32 v26, v28 ; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; CI-NEXT: v_or_b32_e32 v26, v27, v26 ; CI-NEXT: v_add_i32_e32 v27, vcc, 0x64, v0 ; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v26, vcc, 0x60, v0 -; CI-NEXT: buffer_store_dword v23, v26, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v23, vcc, 0x5c, v0 -; CI-NEXT: buffer_store_dword v25, v23, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v23, vcc, 0x58, v0 -; CI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v22, vcc, 0x54, v0 -; CI-NEXT: buffer_store_dword v24, v22, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v22, vcc, 0x50, v0 -; CI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v21, 
vcc, 0x4c, v0 +; CI-NEXT: buffer_store_dword v21, v26, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v21, vcc, 0x5c, v0 ; CI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v20, vcc, 0x48, v0 +; CI-NEXT: v_add_i32_e32 v20, vcc, 0x58, v0 ; CI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v19, vcc, 0x44, v0 +; CI-NEXT: v_add_i32_e32 v19, vcc, 0x54, v0 ; CI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v18, vcc, 64, v0 -; CI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v17, vcc, 60, v0 -; CI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v16, vcc, 56, v0 -; CI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v15, vcc, 52, v0 +; CI-NEXT: v_add_i32_e32 v18, vcc, 0x50, v0 +; CI-NEXT: buffer_store_dword v25, v18, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v18, vcc, 0x4c, v0 +; CI-NEXT: buffer_store_dword v24, v18, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v18, vcc, 0x48, v0 +; CI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0 ; CI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v14, vcc, 64, v0 +; CI-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v14, vcc, 60, v0 +; CI-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v14, vcc, 56, v0 +; CI-NEXT: buffer_store_dword v23, v14, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v14, vcc, 52, v0 +; CI-NEXT: buffer_store_dword v22, v14, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v14, vcc, 48, v0 ; CI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v13, vcc, 44, v0 diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll --- a/llvm/test/CodeGen/AMDGPU/function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args.ll @@ -670,19 +670,19 @@ ; CI-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(5) +; CI-NEXT: s_waitcnt vmcnt(8) ; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(5) -; CI-NEXT: buffer_store_dword v16, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(7) +; CI-NEXT: buffer_store_dword v20, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -691,19 +691,19 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(8) ; 
VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: buffer_store_dword v20, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -712,21 +712,21 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 ; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: buffer_store_dword v16, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v20, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1968,13 +1968,13 @@ ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:8 ; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; CI-NEXT: 
s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 -; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 @@ -1983,9 +1983,9 @@ ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dword v20, off, s[4:7], 0 +; CI-NEXT: buffer_store_dword v26, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx2 v[24:25], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -2000,13 +2000,13 @@ ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 @@ -2015,9 +2015,9 @@ ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v20, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v26, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: 
buffer_store_dwordx2 v[16:17], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx2 v[24:25], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2032,14 +2032,14 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:8 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 @@ -2048,9 +2048,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v20, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dword v26, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[24:25], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2107,14 +2107,14 @@ ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_ubyte v25, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:12 ; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; CI-NEXT: 
s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16 ; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 @@ -2123,15 +2123,15 @@ ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v19, v20 -; CI-NEXT: v_and_b32_e32 v0, 1, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; CI-NEXT: v_and_b32_e32 v0, 1, v25 ; CI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v17, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v26, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v18, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v27, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v19, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v24, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -2146,14 +2146,14 @@ ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ubyte v24, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:8 -; VI-NEXT: 
buffer_load_ushort v17, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 @@ -2162,14 +2162,14 @@ ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, 1, v20 +; VI-NEXT: v_and_b32_e32 v0, 1, v24 ; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v16, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v25, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_short v17, off, s[4:7], 0 +; VI-NEXT: buffer_store_short v26, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_short v18, off, s[4:7], 0 +; VI-NEXT: buffer_store_short v27, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2184,16 +2184,15 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ubyte v24, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:16 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:16 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: 
buffer_store_dwordx4 v[8:11], off, s[4:7], 0 @@ -2202,14 +2201,14 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v20 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v24 ; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v16, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v25, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v17, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_short v26, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v18, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_short v27, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2275,14 +2274,14 @@ ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:12 ; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 @@ -2291,9 +2290,9 @@ ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx2 v[24:25], off, s[4:7], 
0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx2 v[18:19], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx2 v[26:27], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -2308,14 +2307,14 @@ ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 @@ -2324,9 +2323,9 @@ ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx2 v[24:25], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx2 v[18:19], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx2 v[26:27], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2341,15 +2340,15 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_dword v26, 
off, s[0:3], s32 offset:12 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 @@ -2358,9 +2357,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[24:25], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx2 v[18:19], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[26:27], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2417,17 +2416,18 @@ ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:4 +; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; CI-NEXT: s_waitcnt vmcnt(2) +; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12 -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 -; CI-NEXT: buffer_load_dword v17, 
off, s[0:3], s32 offset:8 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4 -; CI-NEXT: s_waitcnt vmcnt(2) -; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; CI-NEXT: v_cvt_f16_f32_e32 v19, v20 ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 @@ -2436,13 +2436,13 @@ ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v17, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v26, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v18, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v27, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v16, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v25, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v19, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v24, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -2457,12 +2457,12 @@ ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 @@ -2471,9 +2471,9 @@ ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v20, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v24, off, s[4:7], 0 ; VI-NEXT: 
s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v25, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2488,14 +2488,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:8 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 @@ -2504,9 +2503,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v20, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dword v24, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v16, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dword v25, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2561,29 +2560,29 @@ ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:4 ; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:32 +; CI-NEXT: 
buffer_load_dword v22, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:24 +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 ; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 -; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 -; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -2598,29 +2597,29 @@ ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_dword v22, off, 
s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2635,31 +2634,31 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:32 +; GFX9-NEXT: 
buffer_load_dword v22, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2722,27 +2721,27 @@ ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:32 +; 
CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:24 +; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20 ; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 -; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 -; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -2759,27 +2758,27 @@ ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 
offset:24 +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2796,29 +2795,28 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20 +; GFX9-NEXT: s_nop 
0 ; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2879,41 +2877,41 @@ ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 +; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:60 +; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:56 +; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 ; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 -; CI-NEXT: 
buffer_load_dword v18, off, s[0:3], s32 offset:60 -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:32 +; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:24 +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:16 -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 -; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 -; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 +; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 +; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 -; CI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 -; CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 -; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:48 -; CI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:44 -; CI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:40 -; CI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; 
CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -2928,41 +2926,41 @@ ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 ; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: 
buffer_load_dword v15, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2977,45 +2975,44 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v27, 
off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 -; GFX9-NEXT: 
buffer_load_dword v10, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3090,66 +3087,66 @@ ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 +; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:60 +; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:56 +; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 ; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48 +; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:44 +; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:40 +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 ; CI-NEXT: 
buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 -; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 -; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 -; CI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 -; CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 -; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 +; CI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 +; CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:104 +; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:100 +; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 +; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 +; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 -; CI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 -; CI-NEXT: buffer_load_dword v5, off, s[0:3], s32 
offset:88 -; CI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 -; CI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 -; CI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 -; CI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 -; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 -; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:16 -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 -; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 -; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 +; CI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 +; CI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 -; CI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 -; CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 -; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 -; CI-NEXT: buffer_store_dwordx4 v[12:15], 
off, s[4:7], 0 +; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:80 +; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 +; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 +; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 +; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: void_func_v32i32_v16i32_v16f32: @@ -3163,66 +3160,66 @@ ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 ; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 
offset:60 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v3, 
off, s[0:3], s32 offset:112 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], 
s32 offset:72 +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: void_func_v32i32_v16i32_v16f32: @@ -3236,74 +3233,72 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_dword v16, off, 
s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt 
vmcnt(0) -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 
offset:80 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v32i32_v16i32_v16f32: @@ -3594,65 +3589,65 @@ ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:60 +; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 +; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:52 +; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 +; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:36 +; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:40 +; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:44 ; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 -; 
CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:48 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:32 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:24 ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 -; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 -; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:4 ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 -; CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20 -; CI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 -; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:16 ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 -; CI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8 -; CI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v16, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v25, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v20, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v24, off, s[4:7], 0 ; CI-NEXT: 
s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v19, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v22, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v18, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v21, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v17, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v20, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v14, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v27, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v13, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v26, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v12, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v23, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v8, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v17, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v15, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v16, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v10, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v19, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v9, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v18, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v11, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v12, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v4, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v13, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v5, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v14, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v6, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v15, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -3667,65 +3662,65 @@ ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; VI-NEXT: s_waitcnt 
vmcnt(0) +; VI-NEXT: buffer_load_ubyte v24, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ubyte v25, off, s[0:3], s32 offset:64 ; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ubyte v21, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ubyte v22, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ubyte v23, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ubyte v26, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ubyte v27, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ubyte v17, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ubyte v18, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ubyte v19, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ubyte v17, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ubyte v18, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ubyte v19, off, s[0:3], s32 offset:24 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ubyte v12, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ubyte v13, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ubyte v14, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ubyte v15, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ubyte v12, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ubyte v13, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ubyte v14, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ubyte v15, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ubyte v8, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ubyte v9, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ubyte v10, 
off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ubyte v11, off, s[0:3], s32 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ubyte v4, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ubyte v5, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ubyte v6, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v16, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v25, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v20, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v24, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v19, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v22, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v18, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v21, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v17, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v20, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v14, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v27, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v13, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v26, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v12, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v23, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v8, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v17, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v15, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v16, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v10, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v19, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v9, off, s[4:7], 0 +; VI-NEXT: 
buffer_store_byte v18, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v11, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v12, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v4, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v13, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v5, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v14, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v6, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v15, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3740,69 +3735,69 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ubyte v24, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ubyte v25, off, s[0:3], s32 offset:64 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ubyte v21, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ubyte v22, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ubyte v23, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ubyte v26, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ubyte v27, off, s[0:3], s32 offset:44 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ubyte v17, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ubyte v18, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ubyte v19, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ubyte v17, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ubyte v18, off, 
s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ubyte v19, off, s[0:3], s32 offset:24 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ubyte v12, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ubyte v13, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ubyte v14, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ubyte v15, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ubyte v12, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ubyte v13, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ubyte v14, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ubyte v15, off, s[0:3], s32 offset:4 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ubyte v8, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ubyte v9, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ubyte v10, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ubyte v11, off, s[0:3], s32 offset:16 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ubyte v4, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ubyte v5, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ubyte v6, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v16, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v25, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v20, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v24, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v19, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v22, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v18, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v21, off, s[4:7], 0 ; 
GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v17, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v20, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v14, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v27, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v13, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v26, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v12, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v23, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v8, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v17, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v15, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v16, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v10, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v19, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v9, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v18, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v11, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v12, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v4, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v13, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v5, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v14, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v6, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v15, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll --- a/llvm/test/CodeGen/AMDGPU/function-returns.ll +++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll @@ -1496,8 +1496,8 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_load_dwordx4 v[1:4], 
off, s[4:7], 0 offset:112 ; GFX9-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96 -; GFX9-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80 ; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 +; GFX9-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80 ; GFX9-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:64 ; GFX9-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:48 ; GFX9-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32 @@ -1514,13 +1514,13 @@ ; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:100 ; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:96 ; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:128 +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:92 ; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:88 ; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:84 ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:80 ; GFX9-NEXT: s_waitcnt vmcnt(17) -; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:128 -; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:76 ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:72 ; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:68 @@ -1789,8 +1789,8 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:112 ; GFX9-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96 -; GFX9-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80 ; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 +; GFX9-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80 ; GFX9-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:64 ; GFX9-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:48 ; GFX9-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32 @@ -1807,13 
+1807,13 @@ ; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:100 ; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:96 ; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:128 +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:92 ; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:88 ; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:84 ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:80 ; GFX9-NEXT: s_waitcnt vmcnt(17) -; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:128 -; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:76 ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:72 ; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:68 @@ -2082,8 +2082,8 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:240 ; GFX9-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:224 -; GFX9-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:208 ; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 +; GFX9-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:208 ; GFX9-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:192 ; GFX9-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:176 ; GFX9-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:160 @@ -2100,13 +2100,13 @@ ; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:228 ; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:224 ; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:220 ; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:216 ; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:212 ; GFX9-NEXT: buffer_store_dword v9, v0, 
s[0:3], 0 offen offset:208 ; GFX9-NEXT: s_waitcnt vmcnt(17) -; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:204 ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:200 ; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:196 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -2399,125 +2399,125 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:160 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:148 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:132 +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:284 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:156 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:280 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:152 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:276 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:272 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 
offen offset:268 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:140 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:264 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:136 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:260 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:256 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:280 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:276 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:272 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:268 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:264 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:260 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:256 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:100 +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:252 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: 
buffer_store_dword v32, v0, s[0:3], 0 offen offset:248 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:244 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:240 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:236 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:232 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:228 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:224 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:248 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:244 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:240 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:236 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:232 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:228 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:224 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:88 +; 
GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68 +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:220 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:216 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:212 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:208 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:204 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:200 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:196 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:192 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:216 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:212 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:208 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:204 +; 
GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:200 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:196 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:192 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:188 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:184 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:180 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:176 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:172 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:168 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:164 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 
offset:36 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:160 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:184 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:180 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:176 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:172 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:168 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:164 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:160 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:156 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:152 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:148 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:144 -; GFX9-NEXT: buffer_load_dword 
v32, off, s[0:3], s32 offset:16 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:140 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:136 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:132 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:128 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:152 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:148 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:144 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:140 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:136 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:132 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:128 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:124 @@ -2834,8 +2834,8 @@ ; GFX9-NEXT: s_mov_b32 s36, s33 ; GFX9-NEXT: s_add_i32 s33, s32, 0x7fc0 ; GFX9-NEXT: s_and_b32 s33, s33, 0xffff8000 -; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_add_i32 s32, s32, 0x28000 ; GFX9-NEXT: 
s_getpc_b64 s[34:35] @@ -2843,7 +2843,6 @@ ; GFX9-NEXT: s_addc_u32 s35, s35, return_72xi32@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill @@ -2901,7 +2900,7 @@ ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 -; GFX9-NEXT: v_writelane_b32 v33, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x200, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2934,27 +2933,27 @@ ; GFX9-NEXT: v_mov_b32_e32 v29, 0 ; GFX9-NEXT: v_mov_b32_e32 v30, 0 ; GFX9-NEXT: v_mov_b32_e32 v31, 0 -; GFX9-NEXT: v_writelane_b32 v33, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:636 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:640 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:644 -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s33 offset:648 -; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s33 offset:652 -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s33 offset:656 -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s33 offset:660 -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s33 offset:664 -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:668 -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s33 offset:672 -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s33 offset:676 -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s33 offset:680 -; GFX9-NEXT: 
buffer_load_dword v51, off, s[0:3], s33 offset:684 -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s33 offset:688 -; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s33 offset:692 -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s33 offset:696 -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s33 offset:700 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:704 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:648 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s33 offset:652 +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s33 offset:656 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s33 offset:660 +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s33 offset:664 +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s33 offset:668 +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:672 +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s33 offset:676 +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s33 offset:680 +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s33 offset:684 +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s33 offset:688 +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s33 offset:692 +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s33 offset:696 +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s33 offset:700 +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s33 offset:704 ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:708 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:712 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:716 @@ -3026,21 +3025,21 @@ ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 
offset:24 -; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:68 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 @@ -3099,11 +3098,10 @@ ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded 
Reload ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v33, 1 -; GFX9-NEXT: v_readlane_b32 s30, v33, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:1536 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:1536 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_add_i32 s32, s32, 0xfffd8000 ; GFX9-NEXT: s_mov_b32 s33, s36 diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -1864,85 +1864,84 @@ ; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; CI-NEXT: s_add_u32 s2, s0, 48 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v15, s3 -; CI-NEXT: v_mov_b32_e32 v14, s2 +; CI-NEXT: v_mov_b32_e32 v9, s3 +; CI-NEXT: v_mov_b32_e32 v8, s2 ; CI-NEXT: s_add_u32 s2, s0, 32 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v17, s3 -; CI-NEXT: v_mov_b32_e32 v16, s2 +; CI-NEXT: v_mov_b32_e32 v13, s3 +; CI-NEXT: v_mov_b32_e32 v12, s2 ; CI-NEXT: s_add_u32 s2, s0, 16 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v19, s3 -; CI-NEXT: v_mov_b32_e32 v18, s2 -; CI-NEXT: s_add_u32 s2, s0, 0x70 -; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v13, s1 -; CI-NEXT: v_mov_b32_e32 v12, s0 -; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v3 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 -; CI-NEXT: 
v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; CI-NEXT: s_nop 0 -; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 -; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v3 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; CI-NEXT: v_mov_b32_e32 v15, s3 -; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: s_waitcnt vmcnt(2) -; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v10 ; CI-NEXT: v_mov_b32_e32 v14, s2 -; CI-NEXT: s_add_u32 s2, s0, 0x60 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; CI-NEXT: s_add_u32 s2, s0, 0x70 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_cvt_f32_f16_e32 v19, v5 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; CI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] ; CI-NEXT: v_mov_b32_e32 v17, s3 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v18 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v5 ; CI-NEXT: v_mov_b32_e32 v16, s2 +; CI-NEXT: s_add_u32 s2, s0, 0x60 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_mov_b32_e32 v19, s3 +; CI-NEXT: v_mov_b32_e32 v11, s1 +; CI-NEXT: v_mov_b32_e32 v18, s2 ; CI-NEXT: s_add_u32 s2, s0, 0x50 +; CI-NEXT: v_mov_b32_e32 v10, s0 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 -; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 ; CI-NEXT: s_add_u32 s0, s0, 64 -; CI-NEXT: flat_store_dwordx4 v[14:15], 
v[0:3] ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v19 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v22, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v20 +; CI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v24, v1 +; CI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v26, v0 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; CI-NEXT: v_cvt_f32_f16_e32 v20, v21 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v28, v7 +; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v29, v6 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v22 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v20 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v25 +; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v30, v5 +; CI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v32, v4 +; CI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v24 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v7 -; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v12 +; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; CI-NEXT: v_cvt_f32_f16_e32 v24, v27 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v26 +; CI-NEXT: v_cvt_f32_f16_e32 v25, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v26, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v27, v31 +; CI-NEXT: flat_store_dwordx4 v[14:15], v[4:7] +; CI-NEXT: v_cvt_f64_f32_e32 v[12:13], v28 +; CI-NEXT: v_cvt_f64_f32_e32 v[14:15], v24 +; CI-NEXT: flat_store_dwordx4 v[10:11], v[0:3] +; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v30 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v29 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v25 +; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v26 ; CI-NEXT: v_mov_b32_e32 v21, s3 -; CI-NEXT: 
v_mov_b32_e32 v13, s1 +; CI-NEXT: v_mov_b32_e32 v23, s1 +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v32 +; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v27 ; CI-NEXT: v_mov_b32_e32 v20, s2 -; CI-NEXT: v_mov_b32_e32 v12, s0 -; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; CI-NEXT: flat_store_dwordx4 v[20:21], v[0:3] -; CI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] +; CI-NEXT: v_mov_b32_e32 v22, s0 +; CI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; CI-NEXT: flat_store_dwordx4 v[18:19], v[0:3] +; CI-NEXT: flat_store_dwordx4 v[20:21], v[8:11] +; CI-NEXT: flat_store_dwordx4 v[22:23], v[4:7] ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_extload_v16f16_to_v16f64: diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -1024,6 +1024,22 @@ ; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: s_load_dword s0, s[0:1], 0x44 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_lshrrev_b16_e64 v4, 1, s4 +; GCN-NEXT: v_lshrrev_b16_e64 v5, 2, s4 +; GCN-NEXT: v_lshrrev_b16_e64 v6, 3, s4 +; GCN-NEXT: v_lshrrev_b16_e64 v7, 4, s4 +; GCN-NEXT: v_lshrrev_b16_e64 v8, 5, s4 +; GCN-NEXT: v_lshrrev_b16_e64 v9, 6, s4 +; GCN-NEXT: v_lshrrev_b16_e64 v10, 7, s4 +; GCN-NEXT: v_lshrrev_b16_e64 v11, 8, s4 +; GCN-NEXT: v_lshrrev_b16_e64 v12, 9, s4 +; GCN-NEXT: v_lshrrev_b16_e64 v13, 10, s4 +; GCN-NEXT: v_lshrrev_b16_e64 v14, 11, s4 +; GCN-NEXT: v_lshrrev_b16_e64 v15, 12, s4 +; GCN-NEXT: v_lshrrev_b16_e64 v16, 13, s4 +; GCN-NEXT: v_lshrrev_b16_e64 v17, 14, s4 +; GCN-NEXT: v_lshrrev_b16_e64 v18, 15, s4 ; GCN-NEXT: s_lshr_b32 s1, s4, 24 ; GCN-NEXT: s_lshr_b32 s8, s4, 16 ; GCN-NEXT: s_lshr_b32 s9, s4, 17 @@ -1032,861 +1048,832 @@ ; GCN-NEXT: s_lshr_b32 s12, s4, 20 ; GCN-NEXT: s_lshr_b32 s13, s4, 21 ; GCN-NEXT: s_lshr_b32 s14, s4, 22 -; GCN-NEXT: s_lshr_b32 s15, s4, 23 -; GCN-NEXT: s_lshr_b32 s16, s5, 24 -; GCN-NEXT: 
s_lshr_b32 s17, s5, 16 -; GCN-NEXT: s_lshr_b32 s18, s5, 17 -; GCN-NEXT: s_lshr_b32 s19, s5, 18 -; GCN-NEXT: s_lshr_b32 s20, s5, 19 -; GCN-NEXT: s_lshr_b32 s21, s5, 20 -; GCN-NEXT: s_lshr_b32 s22, s5, 21 -; GCN-NEXT: s_lshr_b32 s23, s5, 22 -; GCN-NEXT: s_lshr_b32 s24, s5, 23 -; GCN-NEXT: s_lshr_b32 s25, s6, 24 -; GCN-NEXT: s_lshr_b32 s26, s6, 16 -; GCN-NEXT: s_lshr_b32 s27, s6, 17 -; GCN-NEXT: s_lshr_b32 s28, s6, 18 -; GCN-NEXT: s_lshr_b32 s29, s6, 19 -; GCN-NEXT: s_lshr_b32 s30, s6, 20 -; GCN-NEXT: s_lshr_b32 s31, s6, 21 -; GCN-NEXT: s_lshr_b32 s33, s6, 22 -; GCN-NEXT: s_lshr_b32 s34, s6, 23 -; GCN-NEXT: s_lshr_b32 s35, s7, 24 -; GCN-NEXT: s_lshr_b32 s36, s7, 16 -; GCN-NEXT: s_lshr_b32 s37, s7, 17 -; GCN-NEXT: s_lshr_b32 s38, s7, 18 -; GCN-NEXT: s_lshr_b32 s39, s7, 19 -; GCN-NEXT: s_lshr_b32 s40, s7, 20 -; GCN-NEXT: s_lshr_b32 s41, s7, 21 -; GCN-NEXT: s_lshr_b32 s42, s7, 22 -; GCN-NEXT: s_lshr_b32 s43, s7, 23 +; GCN-NEXT: s_lshr_b32 s4, s4, 23 +; GCN-NEXT: s_lshr_b32 s15, s5, 24 +; GCN-NEXT: s_lshr_b32 s16, s5, 16 +; GCN-NEXT: s_lshr_b32 s17, s5, 17 +; GCN-NEXT: s_lshr_b32 s18, s5, 18 +; GCN-NEXT: s_lshr_b32 s19, s5, 19 +; GCN-NEXT: s_lshr_b32 s20, s5, 20 +; GCN-NEXT: s_lshr_b32 s21, s5, 21 +; GCN-NEXT: s_lshr_b32 s22, s5, 22 +; GCN-NEXT: s_lshr_b32 s23, s5, 23 +; GCN-NEXT: s_lshr_b32 s24, s6, 24 +; GCN-NEXT: s_lshr_b32 s25, s6, 16 +; GCN-NEXT: s_lshr_b32 s26, s6, 17 +; GCN-NEXT: s_lshr_b32 s27, s6, 18 +; GCN-NEXT: s_lshr_b32 s28, s6, 19 +; GCN-NEXT: s_lshr_b32 s29, s6, 20 +; GCN-NEXT: s_lshr_b32 s30, s6, 21 +; GCN-NEXT: s_lshr_b32 s31, s6, 22 +; GCN-NEXT: s_lshr_b32 s33, s6, 23 +; GCN-NEXT: s_lshr_b32 s34, s7, 24 +; GCN-NEXT: s_lshr_b32 s35, s7, 16 +; GCN-NEXT: s_lshr_b32 s36, s7, 17 +; GCN-NEXT: s_lshr_b32 s37, s7, 18 +; GCN-NEXT: s_lshr_b32 s38, s7, 19 +; GCN-NEXT: s_lshr_b32 s39, s7, 20 +; GCN-NEXT: s_lshr_b32 s40, s7, 21 +; GCN-NEXT: s_lshr_b32 s41, s7, 22 +; GCN-NEXT: s_lshr_b32 s42, s7, 23 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x77 -; GCN-NEXT: v_mov_b32_e32 v15, 
s43 +; GCN-NEXT: v_mov_b32_e32 v22, s42 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x76 -; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc -; GCN-NEXT: v_mov_b32_e32 v18, s42 +; GCN-NEXT: v_cndmask_b32_e32 v22, 1, v22, vcc +; GCN-NEXT: v_mov_b32_e32 v25, s41 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v15, 3, v15 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 +; GCN-NEXT: v_cndmask_b32_e32 v25, 1, v25, vcc +; GCN-NEXT: v_and_b32_e32 v25, 1, v25 +; GCN-NEXT: v_lshlrev_b16_e32 v22, 3, v22 +; GCN-NEXT: v_lshlrev_b16_e32 v25, 2, v25 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x75 -; GCN-NEXT: v_or_b32_e32 v15, v15, v18 -; GCN-NEXT: v_mov_b32_e32 v18, s41 +; GCN-NEXT: v_or_b32_e32 v22, v22, v25 +; GCN-NEXT: v_mov_b32_e32 v25, s40 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x74 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_mov_b32_e32 v19, s40 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v18, 1, v18 -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_and_b32_e32 v18, 3, v18 +; GCN-NEXT: v_cndmask_b32_e32 v25, 1, v25, vcc +; GCN-NEXT: v_mov_b32_e32 v26, s39 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v26, 1, v26, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v25, 1, v25 +; GCN-NEXT: v_and_b32_e32 v26, 1, v26 +; GCN-NEXT: v_or_b32_e32 v25, v26, v25 +; GCN-NEXT: v_and_b32_e32 v25, 3, v25 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x73 -; GCN-NEXT: v_or_b32_e32 v15, v18, v15 -; GCN-NEXT: v_mov_b32_e32 v18, s39 +; GCN-NEXT: v_or_b32_e32 v22, v25, v22 +; GCN-NEXT: v_mov_b32_e32 v25, s38 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x72 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_mov_b32_e32 v19, s38 +; GCN-NEXT: v_cndmask_b32_e32 v25, 1, v25, vcc +; GCN-NEXT: 
v_mov_b32_e32 v26, s37 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 +; GCN-NEXT: v_cndmask_b32_e32 v26, 1, v26, vcc +; GCN-NEXT: v_and_b32_e32 v26, 1, v26 +; GCN-NEXT: v_lshlrev_b16_e32 v25, 3, v25 +; GCN-NEXT: v_lshlrev_b16_e32 v26, 2, v26 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x71 -; GCN-NEXT: v_or_b32_e32 v18, v18, v19 -; GCN-NEXT: v_mov_b32_e32 v19, s37 +; GCN-NEXT: v_or_b32_e32 v25, v25, v26 +; GCN-NEXT: v_mov_b32_e32 v26, s36 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x70 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_mov_b32_e32 v20, s36 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v19, 1, v19 -; GCN-NEXT: v_and_b32_e32 v20, 1, v20 -; GCN-NEXT: v_or_b32_e32 v19, v20, v19 -; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v15, 4, v15 -; GCN-NEXT: v_and_b32_e32 v18, 15, v18 +; GCN-NEXT: v_cndmask_b32_e32 v26, 1, v26, vcc +; GCN-NEXT: v_mov_b32_e32 v27, s35 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v27, 1, v27, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v26, 1, v26 +; GCN-NEXT: v_and_b32_e32 v27, 1, v27 +; GCN-NEXT: v_or_b32_e32 v26, v27, v26 +; GCN-NEXT: v_and_b32_e32 v26, 3, v26 +; GCN-NEXT: v_or_b32_e32 v25, v26, v25 +; GCN-NEXT: v_lshlrev_b16_e32 v22, 4, v22 +; GCN-NEXT: v_and_b32_e32 v25, 15, v25 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7f -; GCN-NEXT: v_or_b32_e32 v15, v18, v15 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 7, s35 +; GCN-NEXT: v_or_b32_e32 v22, v25, v22 +; GCN-NEXT: v_lshrrev_b16_e64 v25, 7, s34 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7e -; GCN-NEXT: v_lshrrev_b16_e64 v19, 6, s35 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc +; GCN-NEXT: v_lshrrev_b16_e64 v26, 6, s34 +; GCN-NEXT: 
v_cndmask_b32_e32 v25, 1, v25, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 +; GCN-NEXT: v_cndmask_b32_e32 v26, 1, v26, vcc +; GCN-NEXT: v_and_b32_e32 v26, 1, v26 +; GCN-NEXT: v_lshlrev_b16_e32 v25, 3, v25 +; GCN-NEXT: v_lshlrev_b16_e32 v26, 2, v26 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7d -; GCN-NEXT: v_or_b32_e32 v18, v18, v19 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 5, s35 +; GCN-NEXT: v_or_b32_e32 v25, v25, v26 +; GCN-NEXT: v_lshrrev_b16_e64 v26, 5, s34 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7c -; GCN-NEXT: v_lshrrev_b16_e64 v20, 4, s35 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v19, 1, v19 -; GCN-NEXT: v_and_b32_e32 v20, 1, v20 -; GCN-NEXT: v_or_b32_e32 v19, v20, v19 -; GCN-NEXT: v_and_b32_e32 v19, 3, v19 +; GCN-NEXT: v_lshrrev_b16_e64 v27, 4, s34 +; GCN-NEXT: v_cndmask_b32_e32 v26, 1, v26, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v27, 1, v27, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v26, 1, v26 +; GCN-NEXT: v_and_b32_e32 v27, 1, v27 +; GCN-NEXT: v_or_b32_e32 v26, v27, v26 +; GCN-NEXT: v_and_b32_e32 v26, 3, v26 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7b -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 3, s35 +; GCN-NEXT: v_or_b32_e32 v25, v26, v25 +; GCN-NEXT: v_lshrrev_b16_e64 v26, 3, s34 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7a -; GCN-NEXT: v_lshrrev_b16_e64 v20, 2, s35 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc +; GCN-NEXT: v_lshrrev_b16_e64 v27, 2, s34 +; GCN-NEXT: v_cndmask_b32_e32 v26, 1, v26, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc -; GCN-NEXT: v_and_b32_e32 v20, 1, v20 +; GCN-NEXT: v_cndmask_b32_e32 v27, 1, v27, vcc +; 
GCN-NEXT: v_and_b32_e32 v27, 1, v27 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x78 -; GCN-NEXT: v_mov_b32_e32 v13, s35 -; GCN-NEXT: v_lshlrev_b16_e32 v19, 3, v19 -; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 +; GCN-NEXT: v_mov_b32_e32 v20, s34 +; GCN-NEXT: v_lshlrev_b16_e32 v26, 3, v26 +; GCN-NEXT: v_lshlrev_b16_e32 v27, 2, v27 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x79 -; GCN-NEXT: v_or_b32_e32 v19, v19, v20 -; GCN-NEXT: v_lshrrev_b16_e64 v20, 1, s35 -; GCN-NEXT: v_cndmask_b32_e32 v13, 1, v13, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_or_b32_e32 v26, v26, v27 +; GCN-NEXT: v_lshrrev_b16_e64 v27, 1, s34 ; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc -; GCN-NEXT: v_and_b32_e32 v13, 1, v13 -; GCN-NEXT: v_lshlrev_b16_e32 v20, 1, v20 -; GCN-NEXT: v_or_b32_e32 v13, v13, v20 -; GCN-NEXT: v_and_b32_e32 v13, 3, v13 -; GCN-NEXT: v_or_b32_e32 v19, v13, v19 -; GCN-NEXT: v_mov_b32_e32 v13, 15 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 12, v18 -; GCN-NEXT: v_and_b32_sdwa v19, v19, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GCN-NEXT: v_or_b32_e32 v18, v18, v19 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v27, 1, v27, vcc +; GCN-NEXT: v_and_b32_e32 v20, 1, v20 +; GCN-NEXT: v_lshlrev_b16_e32 v27, 1, v27 +; GCN-NEXT: v_or_b32_e32 v20, v20, v27 +; GCN-NEXT: v_and_b32_e32 v20, 3, v20 +; GCN-NEXT: v_or_b32_e32 v26, v20, v26 +; GCN-NEXT: v_mov_b32_e32 v20, 15 +; GCN-NEXT: v_lshlrev_b16_e32 v25, 12, v25 +; GCN-NEXT: v_and_b32_sdwa v26, v26, v20 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GCN-NEXT: v_or_b32_e32 v25, v25, v26 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6f -; GCN-NEXT: v_or_b32_sdwa v15, v15, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_lshrrev_b16_e64 v18, 15, s7 +; GCN-NEXT: v_or_b32_sdwa v22, v22, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GCN-NEXT: v_lshrrev_b16_e64 v25, 15, s7 ; GCN-NEXT: s_cselect_b64 vcc, 
-1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6e -; GCN-NEXT: v_lshrrev_b16_e64 v19, 14, s7 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc +; GCN-NEXT: v_lshrrev_b16_e64 v26, 14, s7 +; GCN-NEXT: v_cndmask_b32_e32 v25, 1, v25, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 +; GCN-NEXT: v_cndmask_b32_e32 v26, 1, v26, vcc +; GCN-NEXT: v_and_b32_e32 v26, 1, v26 +; GCN-NEXT: v_lshlrev_b16_e32 v25, 3, v25 +; GCN-NEXT: v_lshlrev_b16_e32 v26, 2, v26 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6d -; GCN-NEXT: v_or_b32_e32 v18, v18, v19 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 13, s7 +; GCN-NEXT: v_or_b32_e32 v25, v25, v26 +; GCN-NEXT: v_lshrrev_b16_e64 v26, 13, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6c -; GCN-NEXT: v_lshrrev_b16_e64 v20, 12, s7 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v19, 1, v19 -; GCN-NEXT: v_and_b32_e32 v20, 1, v20 -; GCN-NEXT: v_or_b32_e32 v19, v20, v19 -; GCN-NEXT: v_and_b32_e32 v19, 3, v19 +; GCN-NEXT: v_lshrrev_b16_e64 v27, 12, s7 +; GCN-NEXT: v_cndmask_b32_e32 v26, 1, v26, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v27, 1, v27, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v26, 1, v26 +; GCN-NEXT: v_and_b32_e32 v27, 1, v27 +; GCN-NEXT: v_or_b32_e32 v26, v27, v26 +; GCN-NEXT: v_and_b32_e32 v26, 3, v26 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6b -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 11, s7 +; GCN-NEXT: v_or_b32_e32 v25, v26, v25 +; GCN-NEXT: v_lshrrev_b16_e64 v26, 11, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6a -; GCN-NEXT: v_lshrrev_b16_e64 v20, 10, s7 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc +; GCN-NEXT: v_lshrrev_b16_e64 v27, 10, s7 +; GCN-NEXT: v_cndmask_b32_e32 v26, 
1, v26, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc -; GCN-NEXT: v_and_b32_e32 v20, 1, v20 -; GCN-NEXT: v_lshlrev_b16_e32 v19, 3, v19 -; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 +; GCN-NEXT: v_cndmask_b32_e32 v27, 1, v27, vcc +; GCN-NEXT: v_and_b32_e32 v27, 1, v27 +; GCN-NEXT: v_lshlrev_b16_e32 v26, 3, v26 +; GCN-NEXT: v_lshlrev_b16_e32 v27, 2, v27 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x69 -; GCN-NEXT: v_or_b32_e32 v19, v19, v20 -; GCN-NEXT: v_lshrrev_b16_e64 v20, 9, s7 +; GCN-NEXT: v_or_b32_e32 v26, v26, v27 +; GCN-NEXT: v_lshrrev_b16_e64 v27, 9, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x68 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 8, s7 -; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v20, 1, v20 -; GCN-NEXT: v_and_b32_e32 v17, 1, v17 -; GCN-NEXT: v_or_b32_e32 v17, v17, v20 -; GCN-NEXT: v_and_b32_e32 v17, 3, v17 -; GCN-NEXT: v_or_b32_e32 v17, v17, v19 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 12, v18 -; GCN-NEXT: v_and_b32_sdwa v17, v17, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GCN-NEXT: v_lshrrev_b16_e64 v24, 8, s7 +; GCN-NEXT: v_cndmask_b32_e32 v27, 1, v27, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v24, 1, v24, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v27, 1, v27 +; GCN-NEXT: v_and_b32_e32 v24, 1, v24 +; GCN-NEXT: v_or_b32_e32 v24, v24, v27 +; GCN-NEXT: v_and_b32_e32 v24, 3, v24 +; GCN-NEXT: v_or_b32_e32 v24, v24, v26 +; GCN-NEXT: v_lshlrev_b16_e32 v25, 12, v25 +; GCN-NEXT: v_and_b32_sdwa v24, v24, v20 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x67 -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 7, s7 +; GCN-NEXT: v_or_b32_e32 v24, v25, v24 +; GCN-NEXT: v_lshrrev_b16_e64 v25, 7, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x66 -; 
GCN-NEXT: v_lshrrev_b16_e64 v19, 6, s7 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc +; GCN-NEXT: v_lshrrev_b16_e64 v26, 6, s7 +; GCN-NEXT: v_cndmask_b32_e32 v25, 1, v25, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 +; GCN-NEXT: v_cndmask_b32_e32 v26, 1, v26, vcc +; GCN-NEXT: v_and_b32_e32 v26, 1, v26 +; GCN-NEXT: v_lshlrev_b16_e32 v25, 3, v25 +; GCN-NEXT: v_lshlrev_b16_e32 v26, 2, v26 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x65 -; GCN-NEXT: v_or_b32_e32 v18, v18, v19 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 5, s7 +; GCN-NEXT: v_or_b32_e32 v25, v25, v26 +; GCN-NEXT: v_lshrrev_b16_e64 v26, 5, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x64 -; GCN-NEXT: v_lshrrev_b16_e64 v20, 4, s7 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v19, 1, v19 -; GCN-NEXT: v_and_b32_e32 v20, 1, v20 -; GCN-NEXT: v_or_b32_e32 v19, v20, v19 -; GCN-NEXT: v_and_b32_e32 v19, 3, v19 +; GCN-NEXT: v_lshrrev_b16_e64 v27, 4, s7 +; GCN-NEXT: v_cndmask_b32_e32 v26, 1, v26, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v27, 1, v27, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v26, 1, v26 +; GCN-NEXT: v_and_b32_e32 v27, 1, v27 +; GCN-NEXT: v_or_b32_e32 v26, v27, v26 +; GCN-NEXT: v_and_b32_e32 v26, 3, v26 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x63 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 3, s7 +; GCN-NEXT: v_or_b32_e32 v25, v26, v25 +; GCN-NEXT: v_lshrrev_b16_e64 v26, 3, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x62 -; GCN-NEXT: v_lshrrev_b16_e64 v20, 2, s7 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc +; GCN-NEXT: v_lshrrev_b16_e64 v27, 2, s7 +; GCN-NEXT: v_cndmask_b32_e32 v26, 1, v26, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; 
GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc -; GCN-NEXT: v_and_b32_e32 v20, 1, v20 -; GCN-NEXT: v_lshlrev_b16_e32 v19, 3, v19 -; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 +; GCN-NEXT: v_cndmask_b32_e32 v27, 1, v27, vcc +; GCN-NEXT: v_and_b32_e32 v27, 1, v27 +; GCN-NEXT: v_lshlrev_b16_e32 v26, 3, v26 +; GCN-NEXT: v_lshlrev_b16_e32 v27, 2, v27 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x61 -; GCN-NEXT: v_or_b32_e32 v19, v19, v20 -; GCN-NEXT: v_lshrrev_b16_e64 v20, 1, s7 +; GCN-NEXT: v_or_b32_e32 v26, v26, v27 +; GCN-NEXT: v_lshrrev_b16_e64 v27, 1, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x60 -; GCN-NEXT: v_mov_b32_e32 v16, s7 -; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v20, 1, v20 -; GCN-NEXT: v_and_b32_e32 v16, 1, v16 -; GCN-NEXT: v_or_b32_e32 v16, v16, v20 -; GCN-NEXT: v_and_b32_e32 v16, 3, v16 -; GCN-NEXT: v_or_b32_e32 v16, v16, v19 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 4, v18 -; GCN-NEXT: v_and_b32_e32 v16, 15, v16 -; GCN-NEXT: v_or_b32_e32 v16, v16, v18 +; GCN-NEXT: v_mov_b32_e32 v23, s7 +; GCN-NEXT: v_cndmask_b32_e32 v27, 1, v27, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v23, 1, v23, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v27, 1, v27 +; GCN-NEXT: v_and_b32_e32 v23, 1, v23 +; GCN-NEXT: v_or_b32_e32 v23, v23, v27 +; GCN-NEXT: v_and_b32_e32 v23, 3, v23 +; GCN-NEXT: v_or_b32_e32 v23, v23, v26 +; GCN-NEXT: v_lshlrev_b16_e32 v25, 4, v25 +; GCN-NEXT: v_and_b32_e32 v23, 15, v23 +; GCN-NEXT: v_or_b32_e32 v23, v23, v25 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x57 -; GCN-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v17, s34 +; GCN-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GCN-NEXT: v_mov_b32_e32 v24, s33 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x56 -; 
GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_mov_b32_e32 v18, s33 +; GCN-NEXT: v_cndmask_b32_e32 v24, 1, v24, vcc +; GCN-NEXT: v_mov_b32_e32 v25, s31 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 3, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 +; GCN-NEXT: v_cndmask_b32_e32 v25, 1, v25, vcc +; GCN-NEXT: v_and_b32_e32 v25, 1, v25 +; GCN-NEXT: v_lshlrev_b16_e32 v24, 3, v24 +; GCN-NEXT: v_lshlrev_b16_e32 v25, 2, v25 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x55 -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: v_mov_b32_e32 v18, s31 +; GCN-NEXT: v_or_b32_e32 v24, v24, v25 +; GCN-NEXT: v_mov_b32_e32 v25, s30 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x54 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_mov_b32_e32 v19, s30 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v18, 1, v18 -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_and_b32_e32 v18, 3, v18 +; GCN-NEXT: v_cndmask_b32_e32 v25, 1, v25, vcc +; GCN-NEXT: v_mov_b32_e32 v26, s29 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v26, 1, v26, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v25, 1, v25 +; GCN-NEXT: v_and_b32_e32 v26, 1, v26 +; GCN-NEXT: v_or_b32_e32 v25, v26, v25 +; GCN-NEXT: v_and_b32_e32 v25, 3, v25 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x53 -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: v_mov_b32_e32 v18, s29 +; GCN-NEXT: v_or_b32_e32 v24, v25, v24 +; GCN-NEXT: v_mov_b32_e32 v25, s28 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x52 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_mov_b32_e32 v19, s28 +; GCN-NEXT: v_cndmask_b32_e32 v25, 1, v25, vcc +; GCN-NEXT: v_mov_b32_e32 v26, s27 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: 
v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 +; GCN-NEXT: v_cndmask_b32_e32 v26, 1, v26, vcc +; GCN-NEXT: v_and_b32_e32 v26, 1, v26 +; GCN-NEXT: v_lshlrev_b16_e32 v25, 3, v25 +; GCN-NEXT: v_lshlrev_b16_e32 v26, 2, v26 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x51 -; GCN-NEXT: v_or_b32_e32 v18, v18, v19 -; GCN-NEXT: v_mov_b32_e32 v19, s27 +; GCN-NEXT: v_or_b32_e32 v25, v25, v26 +; GCN-NEXT: v_mov_b32_e32 v26, s26 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x50 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_mov_b32_e32 v20, s26 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v19, 1, v19 -; GCN-NEXT: v_and_b32_e32 v20, 1, v20 -; GCN-NEXT: v_or_b32_e32 v19, v20, v19 -; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 4, v17 -; GCN-NEXT: v_and_b32_e32 v18, 15, v18 +; GCN-NEXT: v_cndmask_b32_e32 v26, 1, v26, vcc +; GCN-NEXT: v_mov_b32_e32 v27, s25 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v27, 1, v27, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v26, 1, v26 +; GCN-NEXT: v_and_b32_e32 v27, 1, v27 +; GCN-NEXT: v_or_b32_e32 v26, v27, v26 +; GCN-NEXT: v_and_b32_e32 v26, 3, v26 +; GCN-NEXT: v_or_b32_e32 v25, v26, v25 +; GCN-NEXT: v_lshlrev_b16_e32 v24, 4, v24 +; GCN-NEXT: v_and_b32_e32 v25, 15, v25 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5f -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 7, s25 +; GCN-NEXT: v_or_b32_e32 v24, v25, v24 +; GCN-NEXT: v_lshrrev_b16_e64 v25, 7, s24 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5e -; GCN-NEXT: v_lshrrev_b16_e64 v19, 6, s25 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc +; GCN-NEXT: v_lshrrev_b16_e64 v26, 6, s24 +; GCN-NEXT: v_cndmask_b32_e32 v25, 1, v25, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; 
GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 +; GCN-NEXT: v_cndmask_b32_e32 v26, 1, v26, vcc +; GCN-NEXT: v_and_b32_e32 v26, 1, v26 +; GCN-NEXT: v_lshlrev_b16_e32 v25, 3, v25 +; GCN-NEXT: v_lshlrev_b16_e32 v26, 2, v26 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5d -; GCN-NEXT: v_or_b32_e32 v18, v18, v19 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 5, s25 +; GCN-NEXT: v_or_b32_e32 v25, v25, v26 +; GCN-NEXT: v_lshrrev_b16_e64 v26, 5, s24 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5c -; GCN-NEXT: v_lshrrev_b16_e64 v20, 4, s25 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v19, 1, v19 -; GCN-NEXT: v_and_b32_e32 v20, 1, v20 -; GCN-NEXT: v_or_b32_e32 v19, v20, v19 -; GCN-NEXT: v_and_b32_e32 v19, 3, v19 +; GCN-NEXT: v_lshrrev_b16_e64 v27, 4, s24 +; GCN-NEXT: v_cndmask_b32_e32 v26, 1, v26, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v27, 1, v27, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v26, 1, v26 +; GCN-NEXT: v_and_b32_e32 v27, 1, v27 +; GCN-NEXT: v_or_b32_e32 v26, v27, v26 +; GCN-NEXT: v_and_b32_e32 v26, 3, v26 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5b -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 3, s25 +; GCN-NEXT: v_or_b32_e32 v25, v26, v25 +; GCN-NEXT: v_lshrrev_b16_e64 v26, 3, s24 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5a -; GCN-NEXT: v_lshrrev_b16_e64 v20, 2, s25 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc +; GCN-NEXT: v_lshrrev_b16_e64 v27, 2, s24 +; GCN-NEXT: v_cndmask_b32_e32 v26, 1, v26, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc -; GCN-NEXT: v_and_b32_e32 v20, 1, v20 +; GCN-NEXT: v_cndmask_b32_e32 v27, 1, v27, vcc +; GCN-NEXT: v_and_b32_e32 v27, 1, v27 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x58 -; GCN-NEXT: v_mov_b32_e32 v3, s25 -; GCN-NEXT: 
v_lshlrev_b16_e32 v19, 3, v19 -; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 +; GCN-NEXT: v_mov_b32_e32 v3, s24 +; GCN-NEXT: v_lshlrev_b16_e32 v26, 3, v26 +; GCN-NEXT: v_lshlrev_b16_e32 v27, 2, v27 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x59 -; GCN-NEXT: v_or_b32_e32 v19, v19, v20 -; GCN-NEXT: v_lshrrev_b16_e64 v20, 1, s25 +; GCN-NEXT: v_or_b32_e32 v26, v26, v27 +; GCN-NEXT: v_lshrrev_b16_e64 v27, 1, s24 ; GCN-NEXT: v_cndmask_b32_e32 v3, 1, v3, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc +; GCN-NEXT: v_cndmask_b32_e32 v27, 1, v27, vcc ; GCN-NEXT: v_and_b32_e32 v3, 1, v3 -; GCN-NEXT: v_lshlrev_b16_e32 v20, 1, v20 -; GCN-NEXT: v_or_b32_e32 v3, v3, v20 +; GCN-NEXT: v_lshlrev_b16_e32 v27, 1, v27 +; GCN-NEXT: v_or_b32_e32 v3, v3, v27 ; GCN-NEXT: v_and_b32_e32 v3, 3, v3 -; GCN-NEXT: v_or_b32_e32 v3, v3, v19 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 12, v18 -; GCN-NEXT: v_and_b32_sdwa v3, v3, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GCN-NEXT: v_or_b32_e32 v3, v18, v3 +; GCN-NEXT: v_or_b32_e32 v3, v3, v26 +; GCN-NEXT: v_lshlrev_b16_e32 v25, 12, v25 +; GCN-NEXT: v_and_b32_sdwa v3, v3, v20 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GCN-NEXT: v_or_b32_e32 v3, v25, v3 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4f -; GCN-NEXT: v_or_b32_sdwa v17, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GCN-NEXT: v_or_b32_sdwa v24, v24, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_lshrrev_b16_e64 v3, 15, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4e -; GCN-NEXT: v_lshrrev_b16_e64 v18, 14, s6 +; GCN-NEXT: v_lshrrev_b16_e64 v25, 14, s6 ; GCN-NEXT: v_cndmask_b32_e32 v3, 1, v3, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 +; GCN-NEXT: v_cndmask_b32_e32 v25, 1, v25, vcc +; GCN-NEXT: v_and_b32_e32 v25, 1, 
v25 ; GCN-NEXT: v_lshlrev_b16_e32 v3, 3, v3 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 +; GCN-NEXT: v_lshlrev_b16_e32 v25, 2, v25 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4d -; GCN-NEXT: v_or_b32_e32 v3, v3, v18 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 13, s6 +; GCN-NEXT: v_or_b32_e32 v3, v3, v25 +; GCN-NEXT: v_lshrrev_b16_e64 v25, 13, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4c -; GCN-NEXT: v_lshrrev_b16_e64 v19, 12, s6 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v18, 1, v18 -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_and_b32_e32 v18, 3, v18 +; GCN-NEXT: v_lshrrev_b16_e64 v26, 12, s6 +; GCN-NEXT: v_cndmask_b32_e32 v25, 1, v25, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v26, 1, v26, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v25, 1, v25 +; GCN-NEXT: v_and_b32_e32 v26, 1, v26 +; GCN-NEXT: v_or_b32_e32 v25, v26, v25 +; GCN-NEXT: v_and_b32_e32 v25, 3, v25 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4b -; GCN-NEXT: v_or_b32_e32 v3, v18, v3 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 11, s6 +; GCN-NEXT: v_or_b32_e32 v3, v25, v3 +; GCN-NEXT: v_lshrrev_b16_e64 v25, 11, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4a -; GCN-NEXT: v_lshrrev_b16_e64 v19, 10, s6 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc +; GCN-NEXT: v_lshrrev_b16_e64 v26, 10, s6 +; GCN-NEXT: v_cndmask_b32_e32 v25, 1, v25, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 +; GCN-NEXT: v_cndmask_b32_e32 v26, 1, v26, vcc +; GCN-NEXT: v_and_b32_e32 v26, 1, v26 +; GCN-NEXT: v_lshlrev_b16_e32 v25, 3, v25 +; GCN-NEXT: v_lshlrev_b16_e32 v26, 2, v26 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x49 -; GCN-NEXT: v_or_b32_e32 v18, v18, v19 -; GCN-NEXT: 
v_lshrrev_b16_e64 v19, 9, s6 +; GCN-NEXT: v_or_b32_e32 v25, v25, v26 +; GCN-NEXT: v_lshrrev_b16_e64 v26, 9, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x48 -; GCN-NEXT: v_lshrrev_b16_e64 v20, 8, s6 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v19, 1, v19 -; GCN-NEXT: v_and_b32_e32 v20, 1, v20 -; GCN-NEXT: v_or_b32_e32 v19, v20, v19 -; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 +; GCN-NEXT: v_lshrrev_b16_e64 v27, 8, s6 +; GCN-NEXT: v_cndmask_b32_e32 v26, 1, v26, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v27, 1, v27, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v26, 1, v26 +; GCN-NEXT: v_and_b32_e32 v27, 1, v27 +; GCN-NEXT: v_or_b32_e32 v26, v27, v26 +; GCN-NEXT: v_and_b32_e32 v26, 3, v26 +; GCN-NEXT: v_or_b32_e32 v25, v26, v25 ; GCN-NEXT: v_lshlrev_b16_e32 v3, 12, v3 -; GCN-NEXT: v_and_b32_sdwa v18, v18, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GCN-NEXT: v_and_b32_sdwa v25, v25, v20 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x47 -; GCN-NEXT: v_or_b32_e32 v18, v3, v18 +; GCN-NEXT: v_or_b32_e32 v25, v3, v25 ; GCN-NEXT: v_lshrrev_b16_e64 v3, 7, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x46 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 6, s6 +; GCN-NEXT: v_lshrrev_b16_e64 v26, 6, s6 ; GCN-NEXT: v_cndmask_b32_e32 v3, 1, v3, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 +; GCN-NEXT: v_cndmask_b32_e32 v26, 1, v26, vcc +; GCN-NEXT: v_and_b32_e32 v26, 1, v26 ; GCN-NEXT: v_lshlrev_b16_e32 v3, 3, v3 -; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 +; GCN-NEXT: v_lshlrev_b16_e32 v26, 2, v26 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x45 -; GCN-NEXT: v_or_b32_e32 v3, v3, v19 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 5, 
s6 +; GCN-NEXT: v_or_b32_e32 v3, v3, v26 +; GCN-NEXT: v_lshrrev_b16_e64 v26, 5, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x44 -; GCN-NEXT: v_lshrrev_b16_e64 v20, 4, s6 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v19, 1, v19 -; GCN-NEXT: v_and_b32_e32 v20, 1, v20 -; GCN-NEXT: v_or_b32_e32 v19, v20, v19 -; GCN-NEXT: v_and_b32_e32 v19, 3, v19 +; GCN-NEXT: v_lshrrev_b16_e64 v27, 4, s6 +; GCN-NEXT: v_cndmask_b32_e32 v26, 1, v26, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v27, 1, v27, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v26, 1, v26 +; GCN-NEXT: v_and_b32_e32 v27, 1, v27 +; GCN-NEXT: v_or_b32_e32 v26, v27, v26 +; GCN-NEXT: v_and_b32_e32 v26, 3, v26 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x43 -; GCN-NEXT: v_or_b32_e32 v19, v19, v3 +; GCN-NEXT: v_or_b32_e32 v26, v26, v3 ; GCN-NEXT: v_lshrrev_b16_e64 v3, 3, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x42 -; GCN-NEXT: v_lshrrev_b16_e64 v20, 2, s6 +; GCN-NEXT: v_lshrrev_b16_e64 v27, 2, s6 ; GCN-NEXT: v_cndmask_b32_e32 v3, 1, v3, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc -; GCN-NEXT: v_and_b32_e32 v20, 1, v20 +; GCN-NEXT: v_cndmask_b32_e32 v27, 1, v27, vcc +; GCN-NEXT: v_and_b32_e32 v27, 1, v27 ; GCN-NEXT: v_lshlrev_b16_e32 v3, 3, v3 -; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 +; GCN-NEXT: v_lshlrev_b16_e32 v27, 2, v27 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x41 -; GCN-NEXT: v_or_b32_e32 v3, v3, v20 -; GCN-NEXT: v_lshrrev_b16_e64 v20, 1, s6 +; GCN-NEXT: v_or_b32_e32 v3, v3, v27 +; GCN-NEXT: v_lshrrev_b16_e64 v27, 1, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 64 ; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc +; GCN-NEXT: v_cndmask_b32_e32 v27, 1, v27, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v2, 1, v2, vcc -; 
GCN-NEXT: v_lshlrev_b16_e32 v20, 1, v20 +; GCN-NEXT: v_lshlrev_b16_e32 v27, 1, v27 ; GCN-NEXT: v_and_b32_e32 v2, 1, v2 -; GCN-NEXT: v_or_b32_e32 v2, v2, v20 +; GCN-NEXT: v_or_b32_e32 v2, v2, v27 ; GCN-NEXT: v_and_b32_e32 v2, 3, v2 ; GCN-NEXT: v_or_b32_e32 v2, v2, v3 -; GCN-NEXT: v_or_b32_sdwa v3, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_lshlrev_b16_e32 v15, 4, v19 +; GCN-NEXT: v_or_b32_sdwa v3, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GCN-NEXT: v_lshlrev_b16_e32 v22, 4, v26 ; GCN-NEXT: v_and_b32_e32 v2, 15, v2 ; GCN-NEXT: s_cmp_lg_u32 s0, 55 -; GCN-NEXT: v_or_b32_e32 v2, v2, v15 -; GCN-NEXT: v_mov_b32_e32 v15, s24 +; GCN-NEXT: v_or_b32_e32 v2, v2, v22 +; GCN-NEXT: v_mov_b32_e32 v22, s23 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 54 -; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc -; GCN-NEXT: v_mov_b32_e32 v16, s23 +; GCN-NEXT: v_cndmask_b32_e32 v22, 1, v22, vcc +; GCN-NEXT: v_mov_b32_e32 v23, s22 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc -; GCN-NEXT: v_and_b32_e32 v16, 1, v16 -; GCN-NEXT: v_lshlrev_b16_e32 v15, 3, v15 -; GCN-NEXT: v_lshlrev_b16_e32 v16, 2, v16 +; GCN-NEXT: v_cndmask_b32_e32 v23, 1, v23, vcc +; GCN-NEXT: v_and_b32_e32 v23, 1, v23 +; GCN-NEXT: v_lshlrev_b16_e32 v22, 3, v22 +; GCN-NEXT: v_lshlrev_b16_e32 v23, 2, v23 ; GCN-NEXT: s_cmp_lg_u32 s0, 53 -; GCN-NEXT: v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: v_mov_b32_e32 v16, s22 +; GCN-NEXT: v_or_b32_sdwa v2, v2, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GCN-NEXT: v_or_b32_e32 v22, v22, v23 +; GCN-NEXT: v_mov_b32_e32 v23, s21 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 52 -; GCN-NEXT: v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: 
v_cndmask_b32_e32 v16, 1, v16, vcc -; GCN-NEXT: v_mov_b32_e32 v17, s21 +; GCN-NEXT: v_or_b32_sdwa v2, v2, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GCN-NEXT: v_cndmask_b32_e32 v23, 1, v23, vcc +; GCN-NEXT: v_mov_b32_e32 v24, s20 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v16, 1, v16 -; GCN-NEXT: v_and_b32_e32 v17, 1, v17 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_and_b32_e32 v16, 3, v16 +; GCN-NEXT: v_cndmask_b32_e32 v24, 1, v24, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v23, 1, v23 +; GCN-NEXT: v_and_b32_e32 v24, 1, v24 +; GCN-NEXT: v_or_b32_e32 v23, v24, v23 +; GCN-NEXT: v_and_b32_e32 v23, 3, v23 ; GCN-NEXT: s_cmp_lg_u32 s0, 51 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: v_mov_b32_e32 v16, s20 +; GCN-NEXT: v_or_b32_e32 v22, v23, v22 +; GCN-NEXT: v_mov_b32_e32 v23, s19 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 50 -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc -; GCN-NEXT: v_mov_b32_e32 v17, s19 +; GCN-NEXT: v_cndmask_b32_e32 v23, 1, v23, vcc +; GCN-NEXT: v_mov_b32_e32 v24, s18 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_and_b32_e32 v17, 1, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 +; GCN-NEXT: v_cndmask_b32_e32 v24, 1, v24, vcc +; GCN-NEXT: v_and_b32_e32 v24, 1, v24 +; GCN-NEXT: v_lshlrev_b16_e32 v23, 3, v23 +; GCN-NEXT: v_lshlrev_b16_e32 v24, 2, v24 ; GCN-NEXT: s_cmp_lg_u32 s0, 49 -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_mov_b32_e32 v17, s18 +; GCN-NEXT: v_or_b32_e32 v23, v23, v24 +; GCN-NEXT: v_mov_b32_e32 v24, s17 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 48 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_mov_b32_e32 v18, s17 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v17, 1, v17 -; GCN-NEXT: 
v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: v_and_b32_e32 v17, 3, v17 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_lshlrev_b16_e32 v15, 4, v15 -; GCN-NEXT: v_and_b32_e32 v16, 15, v16 +; GCN-NEXT: v_cndmask_b32_e32 v24, 1, v24, vcc +; GCN-NEXT: v_mov_b32_e32 v25, s16 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v25, 1, v25, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v24, 1, v24 +; GCN-NEXT: v_and_b32_e32 v25, 1, v25 +; GCN-NEXT: v_or_b32_e32 v24, v25, v24 +; GCN-NEXT: v_and_b32_e32 v24, 3, v24 +; GCN-NEXT: v_or_b32_e32 v23, v24, v23 +; GCN-NEXT: v_lshlrev_b16_e32 v22, 4, v22 +; GCN-NEXT: v_and_b32_e32 v23, 15, v23 ; GCN-NEXT: s_cmp_lg_u32 s0, 63 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: v_lshrrev_b16_e64 v16, 7, s16 +; GCN-NEXT: v_or_b32_e32 v22, v23, v22 +; GCN-NEXT: v_lshrrev_b16_e64 v23, 7, s15 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 62 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 6, s16 -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc +; GCN-NEXT: v_lshrrev_b16_e64 v24, 6, s15 +; GCN-NEXT: v_cndmask_b32_e32 v23, 1, v23, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_and_b32_e32 v17, 1, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 +; GCN-NEXT: v_cndmask_b32_e32 v24, 1, v24, vcc +; GCN-NEXT: v_and_b32_e32 v24, 1, v24 +; GCN-NEXT: v_lshlrev_b16_e32 v23, 3, v23 +; GCN-NEXT: v_lshlrev_b16_e32 v24, 2, v24 ; GCN-NEXT: s_cmp_lg_u32 s0, 61 -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 5, s16 +; GCN-NEXT: v_or_b32_e32 v23, v23, v24 +; GCN-NEXT: v_lshrrev_b16_e64 v24, 5, s15 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 60 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 4, s16 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v17, 1, 
v17 -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: v_and_b32_e32 v17, 3, v17 +; GCN-NEXT: v_lshrrev_b16_e64 v25, 4, s15 +; GCN-NEXT: v_cndmask_b32_e32 v24, 1, v24, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v25, 1, v25, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v24, 1, v24 +; GCN-NEXT: v_and_b32_e32 v25, 1, v25 +; GCN-NEXT: v_or_b32_e32 v24, v25, v24 +; GCN-NEXT: v_and_b32_e32 v24, 3, v24 ; GCN-NEXT: s_cmp_lg_u32 s0, 59 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 3, s16 +; GCN-NEXT: v_or_b32_e32 v23, v24, v23 +; GCN-NEXT: v_lshrrev_b16_e64 v24, 3, s15 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 58 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 2, s16 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc +; GCN-NEXT: v_lshrrev_b16_e64 v25, 2, s15 +; GCN-NEXT: v_cndmask_b32_e32 v24, 1, v24, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 +; GCN-NEXT: v_cndmask_b32_e32 v25, 1, v25, vcc +; GCN-NEXT: v_and_b32_e32 v25, 1, v25 ; GCN-NEXT: s_cmp_lg_u32 s0, 56 -; GCN-NEXT: v_mov_b32_e32 v14, s16 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 3, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 +; GCN-NEXT: v_mov_b32_e32 v21, s15 +; GCN-NEXT: v_lshlrev_b16_e32 v24, 3, v24 +; GCN-NEXT: v_lshlrev_b16_e32 v25, 2, v25 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 57 -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 1, s16 -; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_and_b32_e32 v14, 1, v14 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 1, v18 -; GCN-NEXT: v_or_b32_e32 v14, v14, v18 -; GCN-NEXT: v_and_b32_e32 v14, 3, v14 -; GCN-NEXT: v_or_b32_e32 v14, v14, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v16, 12, v16 -; GCN-NEXT: v_and_b32_sdwa v14, v14, v13 dst_sel:BYTE_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GCN-NEXT: v_or_b32_e32 v14, v16, v14 +; GCN-NEXT: v_or_b32_e32 v24, v24, v25 +; GCN-NEXT: v_lshrrev_b16_e64 v25, 1, s15 +; GCN-NEXT: v_cndmask_b32_e32 v21, 1, v21, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v25, 1, v25, vcc +; GCN-NEXT: v_and_b32_e32 v21, 1, v21 +; GCN-NEXT: v_lshlrev_b16_e32 v25, 1, v25 +; GCN-NEXT: v_or_b32_e32 v21, v21, v25 +; GCN-NEXT: v_and_b32_e32 v21, 3, v21 +; GCN-NEXT: v_or_b32_e32 v21, v21, v24 +; GCN-NEXT: v_lshlrev_b16_e32 v23, 12, v23 +; GCN-NEXT: v_and_b32_sdwa v21, v21, v20 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GCN-NEXT: v_or_b32_e32 v21, v23, v21 ; GCN-NEXT: s_cmp_lg_u32 s0, 47 -; GCN-NEXT: v_or_b32_sdwa v15, v15, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_lshrrev_b16_e64 v14, 15, s5 +; GCN-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GCN-NEXT: v_lshrrev_b16_e64 v22, 15, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 46 -; GCN-NEXT: v_lshrrev_b16_e64 v16, 14, s5 -; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc +; GCN-NEXT: v_lshrrev_b16_e64 v23, 14, s5 +; GCN-NEXT: v_cndmask_b32_e32 v22, 1, v22, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc -; GCN-NEXT: v_and_b32_e32 v16, 1, v16 -; GCN-NEXT: v_lshlrev_b16_e32 v14, 3, v14 -; GCN-NEXT: v_lshlrev_b16_e32 v16, 2, v16 +; GCN-NEXT: v_cndmask_b32_e32 v23, 1, v23, vcc +; GCN-NEXT: v_and_b32_e32 v23, 1, v23 +; GCN-NEXT: v_lshlrev_b16_e32 v22, 3, v22 +; GCN-NEXT: v_lshlrev_b16_e32 v23, 2, v23 ; GCN-NEXT: s_cmp_lg_u32 s0, 45 -; GCN-NEXT: v_or_b32_e32 v14, v14, v16 -; GCN-NEXT: v_lshrrev_b16_e64 v16, 13, s5 +; GCN-NEXT: v_or_b32_e32 v22, v22, v23 +; GCN-NEXT: v_lshrrev_b16_e64 v23, 13, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 44 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 12, s5 -; GCN-NEXT: 
v_cndmask_b32_e32 v16, 1, v16, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v16, 1, v16 -; GCN-NEXT: v_and_b32_e32 v17, 1, v17 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_and_b32_e32 v16, 3, v16 +; GCN-NEXT: v_lshrrev_b16_e64 v24, 12, s5 +; GCN-NEXT: v_cndmask_b32_e32 v23, 1, v23, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v24, 1, v24, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v23, 1, v23 +; GCN-NEXT: v_and_b32_e32 v24, 1, v24 +; GCN-NEXT: v_or_b32_e32 v23, v24, v23 +; GCN-NEXT: v_and_b32_e32 v23, 3, v23 ; GCN-NEXT: s_cmp_lg_u32 s0, 43 -; GCN-NEXT: v_or_b32_e32 v14, v16, v14 -; GCN-NEXT: v_lshrrev_b16_e64 v16, 11, s5 +; GCN-NEXT: v_or_b32_e32 v22, v23, v22 +; GCN-NEXT: v_lshrrev_b16_e64 v23, 11, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 42 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 10, s5 -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc +; GCN-NEXT: v_lshrrev_b16_e64 v24, 10, s5 +; GCN-NEXT: v_cndmask_b32_e32 v23, 1, v23, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_and_b32_e32 v17, 1, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 +; GCN-NEXT: v_cndmask_b32_e32 v24, 1, v24, vcc +; GCN-NEXT: v_and_b32_e32 v24, 1, v24 +; GCN-NEXT: v_lshlrev_b16_e32 v23, 3, v23 +; GCN-NEXT: v_lshlrev_b16_e32 v24, 2, v24 ; GCN-NEXT: s_cmp_lg_u32 s0, 41 -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 9, s5 +; GCN-NEXT: v_or_b32_e32 v23, v23, v24 +; GCN-NEXT: v_lshrrev_b16_e64 v24, 9, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 40 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 8, s5 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v17, 1, v17 -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: 
v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: v_and_b32_e32 v17, 3, v17 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_lshlrev_b16_e32 v14, 12, v14 -; GCN-NEXT: v_and_b32_sdwa v16, v16, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GCN-NEXT: v_lshrrev_b16_e64 v25, 8, s5 +; GCN-NEXT: v_cndmask_b32_e32 v24, 1, v24, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v25, 1, v25, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v24, 1, v24 +; GCN-NEXT: v_and_b32_e32 v25, 1, v25 +; GCN-NEXT: v_or_b32_e32 v24, v25, v24 +; GCN-NEXT: v_and_b32_e32 v24, 3, v24 +; GCN-NEXT: v_or_b32_e32 v23, v24, v23 +; GCN-NEXT: v_lshlrev_b16_e32 v22, 12, v22 +; GCN-NEXT: v_and_b32_sdwa v23, v23, v20 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GCN-NEXT: s_cmp_lg_u32 s0, 39 -; GCN-NEXT: v_or_b32_e32 v16, v14, v16 -; GCN-NEXT: v_lshrrev_b16_e64 v14, 7, s5 +; GCN-NEXT: v_or_b32_e32 v22, v22, v23 +; GCN-NEXT: v_lshrrev_b16_e64 v23, 7, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 38 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 6, s5 -; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc +; GCN-NEXT: v_lshrrev_b16_e64 v24, 6, s5 +; GCN-NEXT: v_cndmask_b32_e32 v23, 1, v23, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_and_b32_e32 v17, 1, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v14, 3, v14 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 +; GCN-NEXT: v_cndmask_b32_e32 v24, 1, v24, vcc +; GCN-NEXT: v_and_b32_e32 v24, 1, v24 +; GCN-NEXT: v_lshlrev_b16_e32 v23, 3, v23 +; GCN-NEXT: v_lshlrev_b16_e32 v24, 2, v24 ; GCN-NEXT: s_cmp_lg_u32 s0, 37 -; GCN-NEXT: v_or_b32_e32 v14, v14, v17 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 5, s5 +; GCN-NEXT: v_or_b32_e32 v23, v23, v24 +; GCN-NEXT: v_lshrrev_b16_e64 v24, 5, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 36 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 4, s5 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: s_cselect_b64 
vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v17, 1, v17 -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: v_and_b32_e32 v17, 3, v17 +; GCN-NEXT: v_lshrrev_b16_e64 v25, 4, s5 +; GCN-NEXT: v_cndmask_b32_e32 v24, 1, v24, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v25, 1, v25, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v24, 1, v24 +; GCN-NEXT: v_and_b32_e32 v25, 1, v25 +; GCN-NEXT: v_or_b32_e32 v24, v25, v24 +; GCN-NEXT: v_and_b32_e32 v24, 3, v24 ; GCN-NEXT: s_cmp_lg_u32 s0, 35 -; GCN-NEXT: v_or_b32_e32 v17, v17, v14 -; GCN-NEXT: v_lshrrev_b16_e64 v14, 3, s5 +; GCN-NEXT: v_or_b32_e32 v23, v24, v23 +; GCN-NEXT: v_lshrrev_b16_e64 v24, 3, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 34 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 2, s5 -; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc +; GCN-NEXT: v_lshrrev_b16_e64 v25, 2, s5 +; GCN-NEXT: v_cndmask_b32_e32 v24, 1, v24, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v14, 3, v14 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 +; GCN-NEXT: v_cndmask_b32_e32 v25, 1, v25, vcc +; GCN-NEXT: v_and_b32_e32 v25, 1, v25 +; GCN-NEXT: v_lshlrev_b16_e32 v24, 3, v24 +; GCN-NEXT: v_lshlrev_b16_e32 v25, 2, v25 ; GCN-NEXT: s_cmp_lg_u32 s0, 33 -; GCN-NEXT: v_or_b32_e32 v18, v14, v18 -; GCN-NEXT: v_lshrrev_b16_e64 v14, 1, s5 +; GCN-NEXT: v_or_b32_e32 v24, v24, v25 +; GCN-NEXT: v_lshrrev_b16_e64 v25, 1, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 32 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc +; GCN-NEXT: v_cndmask_b32_e32 v25, 1, v25, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v14, 1, v14 +; GCN-NEXT: v_lshlrev_b16_e32 v25, 1, v25 ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 -; GCN-NEXT: 
v_or_b32_e32 v1, v1, v14 +; GCN-NEXT: v_or_b32_e32 v1, v1, v25 ; GCN-NEXT: v_and_b32_e32 v1, 3, v1 -; GCN-NEXT: v_or_b32_e32 v1, v1, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 4, v17 +; GCN-NEXT: v_or_b32_e32 v1, v1, v24 +; GCN-NEXT: v_lshlrev_b16_e32 v23, 4, v23 ; GCN-NEXT: v_and_b32_e32 v1, 15, v1 -; GCN-NEXT: v_or_b32_e32 v1, v1, v17 -; GCN-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GCN-NEXT: v_or_b32_e32 v1, v1, v23 +; GCN-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: s_cmp_lg_u32 s0, 23 -; GCN-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v15, s15 +; GCN-NEXT: v_or_b32_sdwa v1, v1, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GCN-NEXT: v_mov_b32_e32 v21, s4 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 22 -; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc -; GCN-NEXT: v_mov_b32_e32 v16, s14 +; GCN-NEXT: v_cndmask_b32_e32 v21, 1, v21, vcc +; GCN-NEXT: v_mov_b32_e32 v22, s14 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc -; GCN-NEXT: v_and_b32_e32 v16, 1, v16 -; GCN-NEXT: v_lshlrev_b16_e32 v15, 3, v15 -; GCN-NEXT: v_lshlrev_b16_e32 v16, 2, v16 +; GCN-NEXT: v_cndmask_b32_e32 v22, 1, v22, vcc +; GCN-NEXT: v_and_b32_e32 v22, 1, v22 +; GCN-NEXT: v_lshlrev_b16_e32 v21, 3, v21 +; GCN-NEXT: v_lshlrev_b16_e32 v22, 2, v22 ; GCN-NEXT: s_cmp_lg_u32 s0, 21 -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: v_mov_b32_e32 v16, s13 +; GCN-NEXT: v_or_b32_e32 v21, v21, v22 +; GCN-NEXT: v_mov_b32_e32 v22, s13 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 20 -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc -; GCN-NEXT: v_mov_b32_e32 v17, s12 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v16, 1, v16 -; GCN-NEXT: 
v_and_b32_e32 v17, 1, v17 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_and_b32_e32 v16, 3, v16 +; GCN-NEXT: v_cndmask_b32_e32 v22, 1, v22, vcc +; GCN-NEXT: v_mov_b32_e32 v23, s12 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v23, 1, v23, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v22, 1, v22 +; GCN-NEXT: v_and_b32_e32 v23, 1, v23 +; GCN-NEXT: v_or_b32_e32 v22, v23, v22 +; GCN-NEXT: v_and_b32_e32 v22, 3, v22 ; GCN-NEXT: s_cmp_lg_u32 s0, 19 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: v_mov_b32_e32 v16, s11 +; GCN-NEXT: v_or_b32_e32 v21, v22, v21 +; GCN-NEXT: v_mov_b32_e32 v22, s11 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 18 -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc -; GCN-NEXT: v_mov_b32_e32 v17, s10 +; GCN-NEXT: v_cndmask_b32_e32 v22, 1, v22, vcc +; GCN-NEXT: v_mov_b32_e32 v23, s10 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_and_b32_e32 v17, 1, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 +; GCN-NEXT: v_cndmask_b32_e32 v23, 1, v23, vcc +; GCN-NEXT: v_and_b32_e32 v23, 1, v23 +; GCN-NEXT: v_lshlrev_b16_e32 v22, 3, v22 +; GCN-NEXT: v_lshlrev_b16_e32 v23, 2, v23 ; GCN-NEXT: s_cmp_lg_u32 s0, 17 -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_mov_b32_e32 v17, s9 +; GCN-NEXT: v_or_b32_e32 v22, v22, v23 +; GCN-NEXT: v_mov_b32_e32 v23, s9 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 16 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_mov_b32_e32 v19, s8 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v17, 1, v17 -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: v_or_b32_e32 v17, v19, v17 -; GCN-NEXT: v_and_b32_e32 v17, 3, v17 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_lshlrev_b16_e32 v15, 4, v15 -; GCN-NEXT: v_and_b32_e32 v16, 15, v16 +; GCN-NEXT: v_cndmask_b32_e32 v23, 1, v23, vcc +; 
GCN-NEXT: v_mov_b32_e32 v26, s8 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v26, 1, v26, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v23, 1, v23 +; GCN-NEXT: v_and_b32_e32 v26, 1, v26 +; GCN-NEXT: v_or_b32_e32 v23, v26, v23 +; GCN-NEXT: v_and_b32_e32 v23, 3, v23 +; GCN-NEXT: v_or_b32_e32 v22, v23, v22 +; GCN-NEXT: v_lshlrev_b16_e32 v21, 4, v21 +; GCN-NEXT: v_and_b32_e32 v22, 15, v22 ; GCN-NEXT: s_cmp_lg_u32 s0, 31 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: v_lshrrev_b16_e64 v16, 7, s1 +; GCN-NEXT: v_or_b32_e32 v21, v22, v21 +; GCN-NEXT: v_lshrrev_b16_e64 v22, 7, s1 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 30 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 6, s1 -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc +; GCN-NEXT: v_lshrrev_b16_e64 v23, 6, s1 +; GCN-NEXT: v_cndmask_b32_e32 v22, 1, v22, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_and_b32_e32 v17, 1, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 +; GCN-NEXT: v_cndmask_b32_e32 v23, 1, v23, vcc +; GCN-NEXT: v_and_b32_e32 v23, 1, v23 +; GCN-NEXT: v_lshlrev_b16_e32 v22, 3, v22 +; GCN-NEXT: v_lshlrev_b16_e32 v23, 2, v23 ; GCN-NEXT: s_cmp_lg_u32 s0, 29 -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 5, s1 +; GCN-NEXT: v_or_b32_e32 v22, v22, v23 +; GCN-NEXT: v_lshrrev_b16_e64 v23, 5, s1 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 28 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 4, s1 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc +; GCN-NEXT: v_lshrrev_b16_e64 v26, 4, s1 +; GCN-NEXT: v_cndmask_b32_e32 v23, 1, v23, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v17, 1, v17 -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: v_or_b32_e32 v17, v19, v17 -; GCN-NEXT: v_and_b32_e32 v17, 3, v17 +; GCN-NEXT: v_cndmask_b32_e32 v26, 1, v26, vcc +; GCN-NEXT: v_lshlrev_b16_e32 
v23, 1, v23 +; GCN-NEXT: v_and_b32_e32 v26, 1, v26 +; GCN-NEXT: v_or_b32_e32 v23, v26, v23 ; GCN-NEXT: s_cmp_lg_u32 s0, 27 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 3, s1 +; GCN-NEXT: v_lshrrev_b16_e64 v26, 3, s1 +; GCN-NEXT: v_and_b32_e32 v23, 3, v23 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 26 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 2, s1 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc +; GCN-NEXT: v_lshrrev_b16_e64 v24, 2, s1 +; GCN-NEXT: v_or_b32_e32 v22, v23, v22 +; GCN-NEXT: v_cndmask_b32_e32 v23, 1, v26, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 +; GCN-NEXT: v_cndmask_b32_e32 v24, 1, v24, vcc ; GCN-NEXT: s_cmp_lg_u32 s0, 24 -; GCN-NEXT: v_mov_b32_e32 v18, s1 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 3, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 +; GCN-NEXT: v_mov_b32_e32 v19, s1 +; GCN-NEXT: v_and_b32_e32 v24, 1, v24 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 25 -; GCN-NEXT: v_or_b32_e32 v17, v17, v19 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 1, s1 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_lshrrev_b16_e64 v25, 1, s1 +; GCN-NEXT: v_lshlrev_b16_e32 v23, 3, v23 +; GCN-NEXT: v_lshlrev_b16_e32 v24, 2, v24 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v19, 1, v19 -; GCN-NEXT: v_or_b32_e32 v18, v18, v19 -; GCN-NEXT: v_and_b32_e32 v18, 3, v18 -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v16, 12, v16 -; GCN-NEXT: v_and_b32_sdwa v17, v17, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 15 -; GCN-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_lshrrev_b16_e64 v16, 15, 
s4 +; GCN-NEXT: v_or_b32_e32 v23, v23, v24 +; GCN-NEXT: v_cndmask_b32_e32 v24, 1, v25, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 14 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 14, s4 -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc +; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_and_b32_e32 v17, 1, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 ; GCN-NEXT: s_cmp_lg_u32 s0, 13 -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 13, s4 +; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 12 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 12, s4 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc +; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v17, 1, v17 -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 ; GCN-NEXT: s_cmp_lg_u32 s0, 11 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 11, s4 -; GCN-NEXT: v_and_b32_e32 v17, 3, v17 +; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 10 -; GCN-NEXT: v_lshrrev_b16_e64 v14, 10, s4 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v19, vcc +; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 9 -; GCN-NEXT: v_lshrrev_b16_e64 v12, 9, s4 -; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc +; GCN-NEXT: v_cndmask_b32_e32 v13, 1, v13, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 8 -; GCN-NEXT: v_lshrrev_b16_e64 v11, 8, s4 ; GCN-NEXT: v_cndmask_b32_e32 v12, 1, v12, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 7 -; GCN-NEXT: v_lshrrev_b16_e64 v10, 7, s4 ; GCN-NEXT: v_cndmask_b32_e32 
v11, 1, v11, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 6 -; GCN-NEXT: v_lshrrev_b16_e64 v9, 6, s4 ; GCN-NEXT: v_cndmask_b32_e32 v10, 1, v10, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 5 -; GCN-NEXT: v_lshrrev_b16_e64 v8, 5, s4 ; GCN-NEXT: v_cndmask_b32_e32 v9, 1, v9, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 4 -; GCN-NEXT: v_lshrrev_b16_e64 v7, 4, s4 ; GCN-NEXT: v_cndmask_b32_e32 v8, 1, v8, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 3 -; GCN-NEXT: v_lshrrev_b16_e64 v6, 3, s4 ; GCN-NEXT: v_cndmask_b32_e32 v7, 1, v7, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 2 -; GCN-NEXT: v_lshrrev_b16_e64 v5, 2, s4 ; GCN-NEXT: v_cndmask_b32_e32 v6, 1, v6, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 1 -; GCN-NEXT: v_lshrrev_b16_e64 v4, 1, s4 ; GCN-NEXT: v_cndmask_b32_e32 v5, 1, v5, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_cndmask_b32_e32 v4, 1, v4, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: v_and_b32_e32 v14, 1, v14 +; GCN-NEXT: v_and_b32_e32 v19, 1, v19 +; GCN-NEXT: v_lshlrev_b16_e32 v24, 1, v24 +; GCN-NEXT: v_and_b32_e32 v17, 1, v17 +; GCN-NEXT: v_lshlrev_b16_e32 v16, 1, v16 +; GCN-NEXT: v_and_b32_e32 v15, 1, v15 +; GCN-NEXT: v_and_b32_e32 v13, 1, v13 ; GCN-NEXT: v_lshlrev_b16_e32 v12, 1, v12 ; GCN-NEXT: v_and_b32_e32 v11, 1, v11 ; GCN-NEXT: v_and_b32_e32 v9, 1, v9 @@ -1895,8 +1882,12 @@ ; GCN-NEXT: v_and_b32_e32 v5, 1, v5 ; GCN-NEXT: v_lshlrev_b16_e32 v4, 1, v4 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 3, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v14, 2, v14 +; GCN-NEXT: v_or_b32_e32 v19, v19, v24 +; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 +; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 +; GCN-NEXT: v_or_b32_e32 v15, v15, v16 +; GCN-NEXT: v_lshlrev_b16_e32 v14, 3, v14 
+; GCN-NEXT: v_lshlrev_b16_e32 v13, 2, v13 ; GCN-NEXT: v_or_b32_e32 v11, v11, v12 ; GCN-NEXT: v_lshlrev_b16_e32 v10, 3, v10 ; GCN-NEXT: v_lshlrev_b16_e32 v9, 2, v9 @@ -1904,24 +1895,33 @@ ; GCN-NEXT: v_lshlrev_b16_e32 v6, 3, v6 ; GCN-NEXT: v_lshlrev_b16_e32 v5, 2, v5 ; GCN-NEXT: v_or_b32_e32 v0, v0, v4 -; GCN-NEXT: v_or_b32_e32 v14, v17, v14 +; GCN-NEXT: v_and_b32_e32 v19, 3, v19 +; GCN-NEXT: v_or_b32_e32 v17, v18, v17 +; GCN-NEXT: v_and_b32_e32 v15, 3, v15 +; GCN-NEXT: v_or_b32_e32 v13, v14, v13 ; GCN-NEXT: v_and_b32_e32 v11, 3, v11 ; GCN-NEXT: v_or_b32_e32 v9, v10, v9 ; GCN-NEXT: v_and_b32_e32 v7, 3, v7 ; GCN-NEXT: v_or_b32_e32 v5, v6, v5 ; GCN-NEXT: v_and_b32_e32 v0, 3, v0 -; GCN-NEXT: v_or_b32_e32 v11, v11, v14 +; GCN-NEXT: v_or_b32_e32 v19, v19, v23 +; GCN-NEXT: v_or_b32_e32 v15, v15, v17 +; GCN-NEXT: v_or_b32_e32 v11, v11, v13 ; GCN-NEXT: v_or_b32_e32 v7, v7, v9 ; GCN-NEXT: v_or_b32_e32 v0, v0, v5 -; GCN-NEXT: v_lshlrev_b16_e32 v16, 12, v16 -; GCN-NEXT: v_and_b32_sdwa v11, v11, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GCN-NEXT: v_lshlrev_b16_e32 v22, 12, v22 +; GCN-NEXT: v_and_b32_sdwa v19, v19, v20 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GCN-NEXT: v_lshlrev_b16_e32 v15, 12, v15 +; GCN-NEXT: v_and_b32_sdwa v11, v11, v20 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GCN-NEXT: v_lshlrev_b16_e32 v7, 4, v7 ; GCN-NEXT: v_and_b32_e32 v0, 15, v0 -; GCN-NEXT: v_or_b32_e32 v11, v16, v11 +; GCN-NEXT: v_or_b32_e32 v19, v22, v19 +; GCN-NEXT: v_or_b32_e32 v11, v15, v11 ; GCN-NEXT: v_or_b32_e32 v0, v0, v7 +; GCN-NEXT: v_or_b32_sdwa v19, v21, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_mov_b32_e32 v5, s3 -; GCN-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GCN-NEXT: v_or_b32_sdwa v0, v0, 
v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GCN-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -2317,39 +2317,41 @@ ; GFX6-NEXT: s_bfe_u32 s57, s3, 0x1000a ; GFX6-NEXT: s_bfe_u32 s58, s3, 0x10008 ; GFX6-NEXT: s_bfe_u32 s59, s3, 0x1000e -; GFX6-NEXT: s_bfe_u32 s60, s3, 0x10012 -; GFX6-NEXT: s_bfe_u32 s61, s3, 0x10010 -; GFX6-NEXT: s_bfe_u32 s62, s3, 0x10016 -; GFX6-NEXT: s_bfe_u32 s63, s3, 0x1001a -; GFX6-NEXT: s_bfe_u32 s64, s3, 0x10018 -; GFX6-NEXT: s_bfe_u32 s65, s3, 0x1001e -; GFX6-NEXT: s_bfe_u32 s66, s3, 0x1001c -; GFX6-NEXT: s_bfe_u32 s67, s3, 0x10014 -; GFX6-NEXT: s_bfe_u32 s68, s3, 0x1000c +; GFX6-NEXT: s_bfe_u32 s60, s3, 0x1000c +; GFX6-NEXT: s_bfe_u32 s61, s3, 0x10012 +; GFX6-NEXT: s_bfe_u32 s62, s3, 0x10010 +; GFX6-NEXT: s_bfe_u32 s63, s3, 0x10016 +; GFX6-NEXT: s_bfe_u32 s64, s3, 0x1001a +; GFX6-NEXT: s_bfe_u32 s65, s3, 0x10018 +; GFX6-NEXT: s_bfe_u32 s66, s3, 0x1001e +; GFX6-NEXT: s_bfe_u32 s67, s3, 0x1001c +; GFX6-NEXT: s_bfe_u32 s68, s3, 0x10014 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mov_b32_e32 v0, s66 +; GFX6-NEXT: v_mov_b32_e32 v0, s67 ; GFX6-NEXT: v_mov_b32_e32 v1, s36 -; GFX6-NEXT: v_mov_b32_e32 v2, s65 +; GFX6-NEXT: v_mov_b32_e32 v2, s66 ; GFX6-NEXT: v_mov_b32_e32 v3, s35 -; GFX6-NEXT: v_mov_b32_e32 v4, s64 +; GFX6-NEXT: v_mov_b32_e32 v4, s65 ; GFX6-NEXT: v_mov_b32_e32 v5, s34 -; GFX6-NEXT: v_mov_b32_e32 v6, s63 +; GFX6-NEXT: v_mov_b32_e32 v6, s64 ; GFX6-NEXT: v_mov_b32_e32 v7, s33 -; GFX6-NEXT: v_mov_b32_e32 v8, s67 -; GFX6-NEXT: v_mov_b32_e32 v9, s31 -; GFX6-NEXT: v_mov_b32_e32 v10, s62 -; GFX6-NEXT: v_mov_b32_e32 v11, s30 -; GFX6-NEXT: v_mov_b32_e32 v12, s61 -; GFX6-NEXT: v_mov_b32_e32 v13, s29 -; GFX6-NEXT: v_mov_b32_e32 
v14, s60 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 -; GFX6-NEXT: v_mov_b32_e32 v15, s28 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 -; GFX6-NEXT: s_waitcnt expcnt(3) +; GFX6-NEXT: s_waitcnt expcnt(1) ; GFX6-NEXT: v_mov_b32_e32 v0, s68 +; GFX6-NEXT: v_mov_b32_e32 v1, s31 +; GFX6-NEXT: v_mov_b32_e32 v2, s63 +; GFX6-NEXT: v_mov_b32_e32 v3, s30 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s62 +; GFX6-NEXT: v_mov_b32_e32 v1, s29 +; GFX6-NEXT: v_mov_b32_e32 v2, s61 +; GFX6-NEXT: v_mov_b32_e32 v3, s28 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s60 ; GFX6-NEXT: v_mov_b32_e32 v1, s27 ; GFX6-NEXT: v_mov_b32_e32 v2, s59 ; GFX6-NEXT: v_mov_b32_e32 v3, s26 @@ -2901,45 +2903,47 @@ ; GFX6-NEXT: s_bfe_i32 s50, s3, 0x1000e ; GFX6-NEXT: s_bfe_i32 s51, s3, 0x1000d ; GFX6-NEXT: s_bfe_i32 s52, s3, 0x1000c -; GFX6-NEXT: s_bfe_i32 s53, s3, 0x10012 -; GFX6-NEXT: s_bfe_i32 s54, s3, 0x10011 -; GFX6-NEXT: s_bfe_i32 s55, s3, 0x10010 -; GFX6-NEXT: s_bfe_i32 s56, s3, 0x10017 -; GFX6-NEXT: s_bfe_i32 s57, s3, 0x10016 -; GFX6-NEXT: s_bfe_i32 s58, s3, 0x10015 -; GFX6-NEXT: s_bfe_i32 s59, s3, 0x1001b -; GFX6-NEXT: s_bfe_i32 s60, s3, 0x1001a -; GFX6-NEXT: s_bfe_i32 s61, s3, 0x10019 -; GFX6-NEXT: s_bfe_i32 s62, s3, 0x10018 -; GFX6-NEXT: s_ashr_i32 s63, s3, 31 -; GFX6-NEXT: s_bfe_i32 s64, s3, 0x1001e -; GFX6-NEXT: s_bfe_i32 s65, s3, 0x1001d -; GFX6-NEXT: s_bfe_i32 s66, s3, 0x1001c -; GFX6-NEXT: s_bfe_i32 s67, s3, 0x10014 -; GFX6-NEXT: s_bfe_i32 s68, s3, 0x10013 +; GFX6-NEXT: s_bfe_i32 s53, s3, 0x10013 +; GFX6-NEXT: s_bfe_i32 s54, s3, 0x10012 +; GFX6-NEXT: s_bfe_i32 s55, s3, 0x10011 +; GFX6-NEXT: s_bfe_i32 s56, s3, 0x10010 +; GFX6-NEXT: 
s_bfe_i32 s57, s3, 0x10017 +; GFX6-NEXT: s_bfe_i32 s58, s3, 0x10016 +; GFX6-NEXT: s_bfe_i32 s59, s3, 0x10015 +; GFX6-NEXT: s_bfe_i32 s60, s3, 0x1001b +; GFX6-NEXT: s_bfe_i32 s61, s3, 0x1001a +; GFX6-NEXT: s_bfe_i32 s62, s3, 0x10019 +; GFX6-NEXT: s_bfe_i32 s63, s3, 0x10018 +; GFX6-NEXT: s_ashr_i32 s64, s3, 31 +; GFX6-NEXT: s_bfe_i32 s65, s3, 0x1001e +; GFX6-NEXT: s_bfe_i32 s66, s3, 0x1001d +; GFX6-NEXT: s_bfe_i32 s67, s3, 0x1001c +; GFX6-NEXT: s_bfe_i32 s68, s3, 0x10014 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mov_b32_e32 v0, s66 -; GFX6-NEXT: v_mov_b32_e32 v1, s65 -; GFX6-NEXT: v_mov_b32_e32 v2, s64 -; GFX6-NEXT: v_mov_b32_e32 v3, s63 -; GFX6-NEXT: v_mov_b32_e32 v4, s62 -; GFX6-NEXT: v_mov_b32_e32 v5, s61 -; GFX6-NEXT: v_mov_b32_e32 v6, s60 -; GFX6-NEXT: v_mov_b32_e32 v7, s59 -; GFX6-NEXT: v_mov_b32_e32 v8, s67 -; GFX6-NEXT: v_mov_b32_e32 v9, s58 -; GFX6-NEXT: v_mov_b32_e32 v10, s57 -; GFX6-NEXT: v_mov_b32_e32 v11, s56 -; GFX6-NEXT: v_mov_b32_e32 v12, s55 -; GFX6-NEXT: v_mov_b32_e32 v13, s54 -; GFX6-NEXT: v_mov_b32_e32 v14, s53 +; GFX6-NEXT: v_mov_b32_e32 v0, s67 +; GFX6-NEXT: v_mov_b32_e32 v1, s66 +; GFX6-NEXT: v_mov_b32_e32 v2, s65 +; GFX6-NEXT: v_mov_b32_e32 v3, s64 +; GFX6-NEXT: v_mov_b32_e32 v4, s63 +; GFX6-NEXT: v_mov_b32_e32 v5, s62 +; GFX6-NEXT: v_mov_b32_e32 v6, s61 +; GFX6-NEXT: v_mov_b32_e32 v7, s60 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 -; GFX6-NEXT: v_mov_b32_e32 v15, s68 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 -; GFX6-NEXT: s_waitcnt expcnt(3) +; GFX6-NEXT: s_waitcnt expcnt(1) +; GFX6-NEXT: v_mov_b32_e32 v0, s68 +; GFX6-NEXT: v_mov_b32_e32 v1, s59 +; GFX6-NEXT: v_mov_b32_e32 v2, s58 +; GFX6-NEXT: v_mov_b32_e32 v3, s57 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 +; GFX6-NEXT: s_waitcnt expcnt(0) 
+; GFX6-NEXT: v_mov_b32_e32 v0, s56 +; GFX6-NEXT: v_mov_b32_e32 v1, s55 +; GFX6-NEXT: v_mov_b32_e32 v2, s54 +; GFX6-NEXT: v_mov_b32_e32 v3, s53 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s52 ; GFX6-NEXT: v_mov_b32_e32 v1, s51 ; GFX6-NEXT: v_mov_b32_e32 v2, s50 @@ -4506,13 +4510,12 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s8, s6 ; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: buffer_load_ushort v29, off, s[8:11], 0 +; GFX6-NEXT: buffer_load_ushort v31, off, s[8:11], 0 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v4, v1 ; GFX6-NEXT: v_mov_b32_e32 v6, v1 -; GFX6-NEXT: v_mov_b32_e32 v7, v1 -; GFX6-NEXT: v_mov_b32_e32 v9, v1 +; GFX6-NEXT: v_mov_b32_e32 v8, v1 ; GFX6-NEXT: v_mov_b32_e32 v10, v1 ; GFX6-NEXT: v_mov_b32_e32 v12, v1 ; GFX6-NEXT: v_mov_b32_e32 v14, v1 @@ -4523,36 +4526,35 @@ ; GFX6-NEXT: v_mov_b32_e32 v24, v1 ; GFX6-NEXT: v_mov_b32_e32 v26, v1 ; GFX6-NEXT: v_mov_b32_e32 v28, v1 +; GFX6-NEXT: v_mov_b32_e32 v30, v1 ; GFX6-NEXT: s_mov_b32 s0, s4 ; GFX6-NEXT: s_mov_b32 s1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_bfe_u32 v2, v29, 11, 1 -; GFX6-NEXT: v_bfe_u32 v0, v29, 10, 1 +; GFX6-NEXT: v_bfe_u32 v2, v31, 11, 1 +; GFX6-NEXT: v_bfe_u32 v0, v31, 10, 1 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GFX6-NEXT: v_bfe_u32 v5, v29, 9, 1 +; GFX6-NEXT: v_bfe_u32 v29, v31, 5, 1 +; GFX6-NEXT: v_bfe_u32 v25, v31, 7, 1 +; GFX6-NEXT: v_bfe_u32 v21, v31, 1, 1 +; GFX6-NEXT: v_bfe_u32 v17, v31, 3, 1 +; GFX6-NEXT: v_bfe_u32 v13, v31, 13, 1 +; GFX6-NEXT: v_lshrrev_b32_e32 v9, 15, v31 +; GFX6-NEXT: v_bfe_u32 v5, v31, 9, 1 +; GFX6-NEXT: v_bfe_u32 v27, v31, 4, 1 +; GFX6-NEXT: v_bfe_u32 v23, v31, 6, 1 +; GFX6-NEXT: v_and_b32_e32 v19, 1, v31 +; GFX6-NEXT: v_bfe_u32 v15, v31, 2, 1 +; GFX6-NEXT: v_bfe_u32 v11, v31, 12, 1 +; GFX6-NEXT: v_bfe_u32 v7, v31, 14, 1 ; GFX6-NEXT: s_waitcnt expcnt(0) -; 
GFX6-NEXT: v_bfe_u32 v3, v29, 8, 1 +; GFX6-NEXT: v_bfe_u32 v3, v31, 8, 1 ; GFX6-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:64 -; GFX6-NEXT: v_lshrrev_b32_e32 v8, 15, v29 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_bfe_u32 v6, v29, 14, 1 -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112 -; GFX6-NEXT: v_bfe_u32 v27, v29, 5, 1 -; GFX6-NEXT: v_bfe_u32 v23, v29, 7, 1 -; GFX6-NEXT: v_bfe_u32 v19, v29, 1, 1 -; GFX6-NEXT: v_bfe_u32 v15, v29, 3, 1 -; GFX6-NEXT: v_bfe_u32 v11, v29, 13, 1 -; GFX6-NEXT: v_bfe_u32 v25, v29, 4, 1 -; GFX6-NEXT: v_bfe_u32 v21, v29, 6, 1 -; GFX6-NEXT: v_and_b32_e32 v17, 1, v29 -; GFX6-NEXT: v_bfe_u32 v13, v29, 2, 1 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_bfe_u32 v9, v29, 12, 1 -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:96 -; GFX6-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:16 -; GFX6-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 -; GFX6-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:48 -; GFX6-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:32 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:112 +; GFX6-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:96 +; GFX6-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48 +; GFX6-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:32 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: constant_zextload_v16i1_to_v16i64: @@ -5475,128 +5477,130 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX6-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s52, s4, 30 -; GFX6-NEXT: s_lshr_b32 s40, s4, 31 -; GFX6-NEXT: s_lshr_b32 s42, s4, 28 -; 
GFX6-NEXT: s_lshr_b32 s20, s4, 29 -; GFX6-NEXT: s_lshr_b32 s24, s4, 26 -; GFX6-NEXT: s_lshr_b32 s16, s4, 27 -; GFX6-NEXT: s_lshr_b32 s22, s4, 24 -; GFX6-NEXT: s_lshr_b32 s6, s4, 25 -; GFX6-NEXT: s_lshr_b32 s8, s4, 22 -; GFX6-NEXT: s_lshr_b32 s10, s4, 23 -; GFX6-NEXT: s_lshr_b32 s12, s4, 20 -; GFX6-NEXT: s_lshr_b32 s14, s4, 21 -; GFX6-NEXT: s_lshr_b32 s18, s4, 18 -; GFX6-NEXT: s_lshr_b32 s26, s4, 19 -; GFX6-NEXT: s_lshr_b32 s28, s4, 16 -; GFX6-NEXT: s_lshr_b32 s30, s4, 17 -; GFX6-NEXT: s_lshr_b32 s34, s4, 14 -; GFX6-NEXT: s_lshr_b32 s36, s4, 15 -; GFX6-NEXT: s_lshr_b32 s38, s4, 12 -; GFX6-NEXT: s_lshr_b32 s44, s4, 13 -; GFX6-NEXT: s_lshr_b32 s46, s4, 10 -; GFX6-NEXT: s_lshr_b32 s48, s4, 11 -; GFX6-NEXT: s_bfe_i64 s[50:51], s[4:5], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v0, s50 -; GFX6-NEXT: v_mov_b32_e32 v1, s51 -; GFX6-NEXT: s_lshr_b32 s50, s4, 8 -; GFX6-NEXT: v_mov_b32_e32 v2, s52 -; GFX6-NEXT: v_mov_b32_e32 v3, s53 -; GFX6-NEXT: s_lshr_b32 s52, s4, 9 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v4, s40 -; GFX6-NEXT: v_mov_b32_e32 v5, s41 -; GFX6-NEXT: s_lshr_b32 s40, s4, 6 -; GFX6-NEXT: v_mov_b32_e32 v6, s42 -; GFX6-NEXT: v_mov_b32_e32 v7, s43 -; GFX6-NEXT: s_lshr_b32 s42, s4, 7 -; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v8, s20 -; GFX6-NEXT: v_mov_b32_e32 v9, s21 -; GFX6-NEXT: s_lshr_b32 s20, s4, 4 -; GFX6-NEXT: v_mov_b32_e32 v10, s24 -; GFX6-NEXT: v_mov_b32_e32 v11, s25 -; GFX6-NEXT: s_lshr_b32 s24, s4, 5 -; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 +; GFX6-NEXT: s_lshr_b32 s60, s10, 30 +; GFX6-NEXT: s_lshr_b32 s18, s10, 31 +; GFX6-NEXT: s_lshr_b32 s22, s10, 28 +; GFX6-NEXT: s_lshr_b32 s4, s10, 29 +; GFX6-NEXT: s_lshr_b32 s6, s10, 26 +; GFX6-NEXT: s_lshr_b32 s8, s10, 27 +; GFX6-NEXT: s_lshr_b32 s12, s10, 24 +; GFX6-NEXT: s_lshr_b32 
s14, s10, 25 +; GFX6-NEXT: s_lshr_b32 s16, s10, 22 +; GFX6-NEXT: s_lshr_b32 s20, s10, 23 +; GFX6-NEXT: s_lshr_b32 s24, s10, 20 +; GFX6-NEXT: s_lshr_b32 s26, s10, 21 +; GFX6-NEXT: s_lshr_b32 s28, s10, 18 +; GFX6-NEXT: s_lshr_b32 s30, s10, 19 +; GFX6-NEXT: s_lshr_b32 s34, s10, 16 +; GFX6-NEXT: s_lshr_b32 s36, s10, 17 +; GFX6-NEXT: s_lshr_b32 s38, s10, 14 +; GFX6-NEXT: s_lshr_b32 s40, s10, 15 +; GFX6-NEXT: s_lshr_b32 s42, s10, 12 +; GFX6-NEXT: s_lshr_b32 s44, s10, 13 +; GFX6-NEXT: s_lshr_b32 s46, s10, 10 +; GFX6-NEXT: s_lshr_b32 s48, s10, 11 +; GFX6-NEXT: s_lshr_b32 s50, s10, 8 +; GFX6-NEXT: s_lshr_b32 s52, s10, 9 +; GFX6-NEXT: s_lshr_b32 s54, s10, 6 +; GFX6-NEXT: s_lshr_b32 s56, s10, 7 +; GFX6-NEXT: s_bfe_i64 s[58:59], s[10:11], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v0, s58 +; GFX6-NEXT: v_mov_b32_e32 v1, s59 +; GFX6-NEXT: s_lshr_b32 s58, s10, 4 +; GFX6-NEXT: v_mov_b32_e32 v2, s60 +; GFX6-NEXT: v_mov_b32_e32 v3, s61 +; GFX6-NEXT: s_lshr_b32 s60, s10, 5 +; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v12, s16 -; GFX6-NEXT: v_mov_b32_e32 v13, s17 -; GFX6-NEXT: s_lshr_b32 s16, s4, 2 -; GFX6-NEXT: v_mov_b32_e32 v14, s22 -; GFX6-NEXT: v_mov_b32_e32 v15, s23 -; GFX6-NEXT: s_lshr_b32 s22, s4, 3 -; GFX6-NEXT: s_lshr_b32 s4, s4, 1 -; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v4, s18 +; GFX6-NEXT: v_mov_b32_e32 v5, s19 +; GFX6-NEXT: s_lshr_b32 s18, s10, 2 +; GFX6-NEXT: v_mov_b32_e32 v6, s22 +; GFX6-NEXT: v_mov_b32_e32 v7, s23 +; GFX6-NEXT: s_lshr_b32 s22, s10, 3 +; GFX6-NEXT: s_lshr_b32 s10, s10, 1 +; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; 
GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:240 +; GFX6-NEXT: v_mov_b32_e32 v8, s4 +; GFX6-NEXT: v_mov_b32_e32 v9, s5 ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:224 -; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:208 -; GFX6-NEXT: v_mov_b32_e32 v16, s6 -; GFX6-NEXT: v_mov_b32_e32 v17, s7 -; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:192 -; GFX6-NEXT: s_waitcnt expcnt(3) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: v_mov_b32_e32 v3, 
s9 -; GFX6-NEXT: v_mov_b32_e32 v4, s10 -; GFX6-NEXT: v_mov_b32_e32 v5, s11 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:176 +; GFX6-NEXT: s_waitcnt expcnt(1) +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: v_mov_b32_e32 v4, s8 +; GFX6-NEXT: v_mov_b32_e32 v5, s9 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:208 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s12 ; GFX6-NEXT: v_mov_b32_e32 v3, s13 ; GFX6-NEXT: v_mov_b32_e32 v4, s14 ; GFX6-NEXT: v_mov_b32_e32 v5, s15 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:160 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:192 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: v_mov_b32_e32 v3, s19 +; GFX6-NEXT: v_mov_b32_e32 v2, s16 +; GFX6-NEXT: v_mov_b32_e32 v3, s17 +; GFX6-NEXT: v_mov_b32_e32 v4, s20 +; GFX6-NEXT: v_mov_b32_e32 v5, s21 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:176 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s24 +; GFX6-NEXT: v_mov_b32_e32 v3, s25 ; GFX6-NEXT: v_mov_b32_e32 v4, s26 ; GFX6-NEXT: v_mov_b32_e32 v5, s27 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:144 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:160 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s28 ; GFX6-NEXT: v_mov_b32_e32 v3, s29 ; GFX6-NEXT: v_mov_b32_e32 v4, s30 ; GFX6-NEXT: v_mov_b32_e32 v5, s31 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:128 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s34 ; GFX6-NEXT: v_mov_b32_e32 v3, s35 ; GFX6-NEXT: v_mov_b32_e32 v4, s36 ; GFX6-NEXT: v_mov_b32_e32 v5, s37 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:112 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: 
v_mov_b32_e32 v2, s38 ; GFX6-NEXT: v_mov_b32_e32 v3, s39 +; GFX6-NEXT: v_mov_b32_e32 v4, s40 +; GFX6-NEXT: v_mov_b32_e32 v5, s41 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:112 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s42 +; GFX6-NEXT: v_mov_b32_e32 v3, s43 ; GFX6-NEXT: v_mov_b32_e32 v4, s44 ; GFX6-NEXT: v_mov_b32_e32 v5, s45 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:96 @@ -5613,26 +5617,26 @@ ; GFX6-NEXT: v_mov_b32_e32 v5, s53 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s40 -; GFX6-NEXT: v_mov_b32_e32 v3, s41 -; GFX6-NEXT: v_mov_b32_e32 v4, s42 -; GFX6-NEXT: v_mov_b32_e32 v5, s43 +; GFX6-NEXT: v_mov_b32_e32 v2, s54 +; GFX6-NEXT: v_mov_b32_e32 v3, s55 +; GFX6-NEXT: v_mov_b32_e32 v4, s56 +; GFX6-NEXT: v_mov_b32_e32 v5, s57 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:48 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: v_mov_b32_e32 v3, s21 -; GFX6-NEXT: v_mov_b32_e32 v4, s24 -; GFX6-NEXT: v_mov_b32_e32 v5, s25 +; GFX6-NEXT: v_mov_b32_e32 v2, s58 +; GFX6-NEXT: v_mov_b32_e32 v3, s59 +; GFX6-NEXT: v_mov_b32_e32 v4, s60 +; GFX6-NEXT: v_mov_b32_e32 v5, s61 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:32 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s16 -; GFX6-NEXT: v_mov_b32_e32 v3, s17 +; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: v_mov_b32_e32 v3, s19 ; GFX6-NEXT: v_mov_b32_e32 v4, s22 ; GFX6-NEXT: v_mov_b32_e32 v5, s23 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: v_mov_b32_e32 v3, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s10 +; GFX6-NEXT: v_mov_b32_e32 v3, s11 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -6894,319 +6898,317 @@ ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, 
-1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s48, s5, 30 -; GFX6-NEXT: s_lshr_b32 s46, s5, 28 -; GFX6-NEXT: s_lshr_b32 s42, s5, 29 -; GFX6-NEXT: s_lshr_b32 s38, s5, 26 -; GFX6-NEXT: s_lshr_b32 s44, s5, 27 +; GFX6-NEXT: s_lshr_b32 s58, s5, 30 +; GFX6-NEXT: s_lshr_b32 s30, s5, 28 +; GFX6-NEXT: s_lshr_b32 s46, s5, 29 +; GFX6-NEXT: s_lshr_b32 s34, s5, 26 +; GFX6-NEXT: s_lshr_b32 s48, s5, 27 ; GFX6-NEXT: s_lshr_b32 s36, s5, 24 -; GFX6-NEXT: s_lshr_b32 s40, s5, 25 -; GFX6-NEXT: s_lshr_b32 s30, s5, 22 -; GFX6-NEXT: s_lshr_b32 s34, s5, 23 +; GFX6-NEXT: s_lshr_b32 s50, s5, 25 +; GFX6-NEXT: s_lshr_b32 s38, s5, 22 +; GFX6-NEXT: s_lshr_b32 s52, s5, 23 ; GFX6-NEXT: s_lshr_b32 s26, s5, 20 -; GFX6-NEXT: s_lshr_b32 s28, s5, 21 -; GFX6-NEXT: s_lshr_b32 s22, s5, 18 -; GFX6-NEXT: s_lshr_b32 s24, s5, 19 -; GFX6-NEXT: s_lshr_b32 s18, s5, 16 -; GFX6-NEXT: s_lshr_b32 s20, s5, 17 +; GFX6-NEXT: s_lshr_b32 s54, s5, 21 +; GFX6-NEXT: s_lshr_b32 s20, s5, 18 +; GFX6-NEXT: s_lshr_b32 s44, s5, 19 +; GFX6-NEXT: s_lshr_b32 s16, s5, 16 +; GFX6-NEXT: s_lshr_b32 s42, s5, 17 ; GFX6-NEXT: s_lshr_b32 s14, s5, 14 -; GFX6-NEXT: s_lshr_b32 s16, s5, 15 -; GFX6-NEXT: s_lshr_b32 s10, s5, 12 -; GFX6-NEXT: s_lshr_b32 s12, s5, 13 -; GFX6-NEXT: s_lshr_b32 s6, s5, 10 -; GFX6-NEXT: s_lshr_b32 s8, s5, 11 -; GFX6-NEXT: s_mov_b32 s50, s5 -; GFX6-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[52:53], s[4:5], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v0, s50 -; GFX6-NEXT: v_mov_b32_e32 v1, s51 -; GFX6-NEXT: s_lshr_b32 s50, s5, 8 -; GFX6-NEXT: v_mov_b32_e32 v4, s52 -; GFX6-NEXT: v_mov_b32_e32 v5, s53 -; GFX6-NEXT: s_lshr_b32 s52, s5, 9 -; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v6, s48 -; GFX6-NEXT: v_mov_b32_e32 v7, s49 -; GFX6-NEXT: s_lshr_b32 s48, s5, 6 -; GFX6-NEXT: v_mov_b32_e32 v10, s46 -; GFX6-NEXT: v_mov_b32_e32 v11, s47 -; GFX6-NEXT: s_lshr_b32 s46, s5, 7 -; GFX6-NEXT: s_bfe_i64 s[42:43], 
s[42:43], 0x10000 +; GFX6-NEXT: s_lshr_b32 s40, s5, 15 +; GFX6-NEXT: s_lshr_b32 s12, s5, 12 +; GFX6-NEXT: s_lshr_b32 s28, s5, 13 +; GFX6-NEXT: s_lshr_b32 s10, s5, 10 +; GFX6-NEXT: s_lshr_b32 s24, s5, 11 +; GFX6-NEXT: s_lshr_b32 s8, s5, 8 +; GFX6-NEXT: s_lshr_b32 s22, s5, 9 +; GFX6-NEXT: s_lshr_b32 s6, s5, 6 +; GFX6-NEXT: s_lshr_b32 s18, s5, 7 +; GFX6-NEXT: s_mov_b32 s56, s5 ; GFX6-NEXT: s_ashr_i32 s7, s5, 31 -; GFX6-NEXT: v_mov_b32_e32 v12, s42 -; GFX6-NEXT: v_mov_b32_e32 v13, s43 -; GFX6-NEXT: s_lshr_b32 s42, s5, 4 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v14, s38 -; GFX6-NEXT: v_mov_b32_e32 v15, s39 -; GFX6-NEXT: s_lshr_b32 s54, s5, 5 -; GFX6-NEXT: v_mov_b32_e32 v16, s44 -; GFX6-NEXT: v_mov_b32_e32 v17, s45 -; GFX6-NEXT: s_lshr_b32 s38, s5, 2 -; GFX6-NEXT: v_mov_b32_e32 v8, s7 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v9, s7 -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:496 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, s36 -; GFX6-NEXT: v_mov_b32_e32 v7, s37 -; GFX6-NEXT: s_lshr_b32 s36, s5, 3 -; GFX6-NEXT: v_mov_b32_e32 v8, s40 -; GFX6-NEXT: v_mov_b32_e32 v9, s41 -; GFX6-NEXT: s_lshr_b32 s40, s5, 1 +; GFX6-NEXT: s_bfe_i64 s[60:61], s[56:57], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v2, s7 +; GFX6-NEXT: s_lshr_b32 s56, s5, 4 +; GFX6-NEXT: v_mov_b32_e32 v4, s60 +; GFX6-NEXT: v_mov_b32_e32 v5, s61 +; GFX6-NEXT: s_bfe_i64 s[60:61], s[4:5], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v8, s60 +; GFX6-NEXT: v_mov_b32_e32 v9, s61 +; GFX6-NEXT: s_lshr_b32 s60, s5, 5 +; GFX6-NEXT: v_mov_b32_e32 v0, s58 +; GFX6-NEXT: v_mov_b32_e32 v1, s59 +; GFX6-NEXT: s_lshr_b32 s58, s5, 2 ; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[10:13], 
off, s[0:3], 0 offset:480 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v10, s30 ; GFX6-NEXT: v_mov_b32_e32 v11, s31 -; GFX6-NEXT: s_lshr_b32 s30, s4, 30 -; GFX6-NEXT: v_mov_b32_e32 v12, s34 -; GFX6-NEXT: v_mov_b32_e32 v13, s35 -; GFX6-NEXT: s_lshr_b32 s34, s4, 31 -; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 +; GFX6-NEXT: s_lshr_b32 s30, s5, 3 +; GFX6-NEXT: v_mov_b32_e32 v12, s46 +; GFX6-NEXT: v_mov_b32_e32 v13, s47 +; GFX6-NEXT: s_lshr_b32 s46, s5, 1 +; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v14, s34 +; GFX6-NEXT: v_mov_b32_e32 v15, s35 +; GFX6-NEXT: s_lshr_b32 s34, s4, 30 +; GFX6-NEXT: v_mov_b32_e32 v16, s48 +; GFX6-NEXT: v_mov_b32_e32 v17, s49 +; GFX6-NEXT: s_lshr_b32 s48, s4, 31 +; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v18, s36 +; GFX6-NEXT: v_mov_b32_e32 v19, s37 +; GFX6-NEXT: s_lshr_b32 s36, s4, 28 +; GFX6-NEXT: v_mov_b32_e32 v20, s50 +; GFX6-NEXT: v_mov_b32_e32 v21, s51 +; GFX6-NEXT: s_lshr_b32 s50, s4, 29 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v22, s38 +; GFX6-NEXT: v_mov_b32_e32 v23, s39 +; GFX6-NEXT: s_lshr_b32 s38, s4, 26 +; GFX6-NEXT: v_mov_b32_e32 v24, s52 +; GFX6-NEXT: v_mov_b32_e32 v25, s53 +; GFX6-NEXT: s_lshr_b32 s52, s4, 27 +; GFX6-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:464 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v14, s26 -; GFX6-NEXT: v_mov_b32_e32 v15, s27 -; GFX6-NEXT: s_lshr_b32 s26, s4, 28 -; GFX6-NEXT: v_mov_b32_e32 v16, s28 -; GFX6-NEXT: v_mov_b32_e32 v17, s29 -; GFX6-NEXT: s_lshr_b32 s28, s4, 29 -; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; 
GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:448 +; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:496 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, s22 -; GFX6-NEXT: v_mov_b32_e32 v7, s23 -; GFX6-NEXT: s_lshr_b32 s22, s4, 26 -; GFX6-NEXT: v_mov_b32_e32 v8, s24 -; GFX6-NEXT: v_mov_b32_e32 v9, s25 -; GFX6-NEXT: s_lshr_b32 s24, s4, 27 +; GFX6-NEXT: v_mov_b32_e32 v0, s26 +; GFX6-NEXT: v_mov_b32_e32 v1, s27 +; GFX6-NEXT: s_lshr_b32 s26, s4, 24 +; GFX6-NEXT: v_mov_b32_e32 v2, s54 +; GFX6-NEXT: v_mov_b32_e32 v3, s55 +; GFX6-NEXT: s_lshr_b32 s54, s4, 25 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:432 +; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:480 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, s18 -; GFX6-NEXT: v_mov_b32_e32 v11, s19 -; GFX6-NEXT: s_lshr_b32 s44, s4, 24 -; GFX6-NEXT: v_mov_b32_e32 v12, s20 -; GFX6-NEXT: v_mov_b32_e32 v13, s21 -; GFX6-NEXT: s_lshr_b32 s18, s4, 25 +; GFX6-NEXT: v_mov_b32_e32 v10, s20 +; GFX6-NEXT: v_mov_b32_e32 v11, s21 +; GFX6-NEXT: s_lshr_b32 s20, s4, 22 +; GFX6-NEXT: v_mov_b32_e32 v12, s44 +; GFX6-NEXT: v_mov_b32_e32 v13, s45 +; GFX6-NEXT: s_lshr_b32 s44, s4, 23 +; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:464 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v14, s16 +; GFX6-NEXT: v_mov_b32_e32 v15, s17 +; GFX6-NEXT: s_lshr_b32 s16, s4, 20 +; GFX6-NEXT: v_mov_b32_e32 v16, s42 +; GFX6-NEXT: v_mov_b32_e32 v17, s43 +; GFX6-NEXT: s_lshr_b32 s42, s4, 21 +; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 
-; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:416 +; GFX6-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:448 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v14, s14 -; GFX6-NEXT: v_mov_b32_e32 v15, s15 -; GFX6-NEXT: s_lshr_b32 s14, s4, 22 -; GFX6-NEXT: v_mov_b32_e32 v16, s16 -; GFX6-NEXT: v_mov_b32_e32 v17, s17 -; GFX6-NEXT: s_lshr_b32 s16, s4, 23 +; GFX6-NEXT: v_mov_b32_e32 v18, s14 +; GFX6-NEXT: v_mov_b32_e32 v19, s15 +; GFX6-NEXT: s_lshr_b32 s14, s4, 18 +; GFX6-NEXT: v_mov_b32_e32 v20, s40 +; GFX6-NEXT: v_mov_b32_e32 v21, s41 +; GFX6-NEXT: s_lshr_b32 s40, s4, 19 +; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:432 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v22, s12 +; GFX6-NEXT: v_mov_b32_e32 v23, s13 +; GFX6-NEXT: s_lshr_b32 s12, s4, 16 +; GFX6-NEXT: v_mov_b32_e32 v24, s28 +; GFX6-NEXT: v_mov_b32_e32 v25, s29 +; GFX6-NEXT: s_lshr_b32 s28, s4, 17 +; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:400 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:416 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, s10 -; GFX6-NEXT: v_mov_b32_e32 v7, s11 -; GFX6-NEXT: s_lshr_b32 s10, s4, 20 -; GFX6-NEXT: v_mov_b32_e32 v8, s12 -; GFX6-NEXT: v_mov_b32_e32 v9, s13 -; GFX6-NEXT: s_lshr_b32 s12, s4, 21 +; GFX6-NEXT: v_mov_b32_e32 v0, s10 +; GFX6-NEXT: v_mov_b32_e32 v1, s11 +; GFX6-NEXT: s_lshr_b32 s10, s4, 14 +; GFX6-NEXT: v_mov_b32_e32 v2, s24 +; GFX6-NEXT: v_mov_b32_e32 v3, s25 +; GFX6-NEXT: s_lshr_b32 s24, s4, 15 +; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:400 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v10, s8 
+; GFX6-NEXT: v_mov_b32_e32 v11, s9 +; GFX6-NEXT: s_lshr_b32 s8, s4, 12 +; GFX6-NEXT: v_mov_b32_e32 v12, s22 +; GFX6-NEXT: v_mov_b32_e32 v13, s23 +; GFX6-NEXT: s_lshr_b32 s22, s4, 13 +; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:384 +; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:384 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, s6 -; GFX6-NEXT: v_mov_b32_e32 v11, s7 -; GFX6-NEXT: s_lshr_b32 s6, s4, 18 -; GFX6-NEXT: v_mov_b32_e32 v12, s8 -; GFX6-NEXT: v_mov_b32_e32 v13, s9 -; GFX6-NEXT: s_lshr_b32 s8, s4, 19 -; GFX6-NEXT: s_bfe_i64 s[20:21], s[52:53], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:368 +; GFX6-NEXT: v_mov_b32_e32 v14, s6 +; GFX6-NEXT: v_mov_b32_e32 v15, s7 +; GFX6-NEXT: s_lshr_b32 s6, s4, 10 +; GFX6-NEXT: v_mov_b32_e32 v16, s18 +; GFX6-NEXT: v_mov_b32_e32 v17, s19 +; GFX6-NEXT: s_lshr_b32 s18, s4, 11 +; GFX6-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:368 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v18, s56 +; GFX6-NEXT: v_mov_b32_e32 v19, s57 +; GFX6-NEXT: s_lshr_b32 s56, s4, 8 +; GFX6-NEXT: v_mov_b32_e32 v20, s60 +; GFX6-NEXT: v_mov_b32_e32 v21, s61 +; GFX6-NEXT: s_lshr_b32 s60, s4, 9 +; GFX6-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:352 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v14, s50 -; GFX6-NEXT: v_mov_b32_e32 v15, s51 -; GFX6-NEXT: s_lshr_b32 s50, s4, 16 -; GFX6-NEXT: v_mov_b32_e32 v16, s20 -; GFX6-NEXT: v_mov_b32_e32 v17, s21 -; GFX6-NEXT: s_lshr_b32 s20, s4, 17 +; GFX6-NEXT: v_mov_b32_e32 v22, s58 +; GFX6-NEXT: v_mov_b32_e32 v23, s59 +; GFX6-NEXT: s_lshr_b32 s58, s4, 6 +; GFX6-NEXT: 
s_bfe_i64 s[30:31], s[30:31], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v24, s30 +; GFX6-NEXT: v_mov_b32_e32 v25, s31 +; GFX6-NEXT: s_lshr_b32 s30, s4, 7 +; GFX6-NEXT: v_mov_b32_e32 v6, s46 +; GFX6-NEXT: v_mov_b32_e32 v7, s47 +; GFX6-NEXT: s_lshr_b32 s46, s4, 4 ; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:352 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, s48 -; GFX6-NEXT: v_mov_b32_e32 v7, s49 -; GFX6-NEXT: s_lshr_b32 s48, s4, 14 -; GFX6-NEXT: v_mov_b32_e32 v8, s46 -; GFX6-NEXT: v_mov_b32_e32 v9, s47 -; GFX6-NEXT: s_lshr_b32 s46, s4, 15 -; GFX6-NEXT: s_bfe_i64 s[52:53], s[54:55], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:336 +; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:336 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, s42 -; GFX6-NEXT: v_mov_b32_e32 v11, s43 -; GFX6-NEXT: s_lshr_b32 s42, s4, 12 -; GFX6-NEXT: v_mov_b32_e32 v12, s52 -; GFX6-NEXT: v_mov_b32_e32 v13, s53 -; GFX6-NEXT: s_lshr_b32 s52, s4, 13 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:320 +; GFX6-NEXT: v_mov_b32_e32 v0, s34 +; GFX6-NEXT: v_mov_b32_e32 v1, s35 +; GFX6-NEXT: s_lshr_b32 s34, s4, 5 +; GFX6-NEXT: v_mov_b32_e32 v2, s48 +; GFX6-NEXT: v_mov_b32_e32 v3, s49 +; GFX6-NEXT: s_lshr_b32 s48, s4, 2 +; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:320 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v14, s38 -; GFX6-NEXT: v_mov_b32_e32 v15, s39 -; GFX6-NEXT: s_lshr_b32 s38, s4, 10 +; GFX6-NEXT: v_mov_b32_e32 v10, s36 +; GFX6-NEXT: v_mov_b32_e32 v11, s37 +; GFX6-NEXT: s_lshr_b32 s36, s4, 3 +; GFX6-NEXT: s_lshr_b32 s4, s4, 1 +; GFX6-NEXT: 
s_bfe_i64 s[4:5], s[4:5], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v16, s36 -; GFX6-NEXT: v_mov_b32_e32 v17, s37 -; GFX6-NEXT: s_lshr_b32 s36, s4, 11 -; GFX6-NEXT: v_mov_b32_e32 v2, s40 -; GFX6-NEXT: v_mov_b32_e32 v3, s41 -; GFX6-NEXT: s_lshr_b32 s40, s4, 8 +; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:304 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, s30 -; GFX6-NEXT: v_mov_b32_e32 v7, s31 -; GFX6-NEXT: s_lshr_b32 s30, s4, 9 -; GFX6-NEXT: v_mov_b32_e32 v8, s34 -; GFX6-NEXT: v_mov_b32_e32 v9, s35 -; GFX6-NEXT: s_lshr_b32 s34, s4, 6 -; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:288 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, s26 -; GFX6-NEXT: v_mov_b32_e32 v11, s27 -; GFX6-NEXT: s_lshr_b32 s26, s4, 7 -; GFX6-NEXT: v_mov_b32_e32 v12, s28 -; GFX6-NEXT: v_mov_b32_e32 v13, s29 -; GFX6-NEXT: s_lshr_b32 s28, s4, 4 -; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:272 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v14, s22 -; GFX6-NEXT: v_mov_b32_e32 v15, s23 -; GFX6-NEXT: s_lshr_b32 s22, s4, 5 -; GFX6-NEXT: v_mov_b32_e32 v16, s24 -; GFX6-NEXT: v_mov_b32_e32 v17, s25 -; GFX6-NEXT: s_lshr_b32 s24, s4, 2 -; 
GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:256 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s44 -; GFX6-NEXT: v_mov_b32_e32 v1, s45 -; GFX6-NEXT: s_lshr_b32 s44, s4, 3 -; GFX6-NEXT: s_lshr_b32 s4, s4, 1 -; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 -; 
GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:240 +; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:304 +; GFX6-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:288 +; GFX6-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:272 +; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:256 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 +; GFX6-NEXT: v_mov_b32_e32 v12, s50 +; GFX6-NEXT: v_mov_b32_e32 v13, s51 ; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:224 -; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:208 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: v_mov_b32_e32 v3, s19 +; GFX6-NEXT: s_waitcnt expcnt(1) +; GFX6-NEXT: v_mov_b32_e32 v0, s38 +; GFX6-NEXT: v_mov_b32_e32 v1, s39 +; GFX6-NEXT: v_mov_b32_e32 v2, s52 +; GFX6-NEXT: v_mov_b32_e32 v3, s53 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s26 +; GFX6-NEXT: v_mov_b32_e32 v1, s27 +; GFX6-NEXT: v_mov_b32_e32 v2, s54 +; GFX6-NEXT: v_mov_b32_e32 v3, s55 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s14 -; GFX6-NEXT: v_mov_b32_e32 v1, s15 -; GFX6-NEXT: v_mov_b32_e32 v2, s16 -; GFX6-NEXT: v_mov_b32_e32 v3, s17 +; GFX6-NEXT: v_mov_b32_e32 v0, s20 +; GFX6-NEXT: v_mov_b32_e32 v1, s21 +; GFX6-NEXT: v_mov_b32_e32 v2, s44 +; GFX6-NEXT: v_mov_b32_e32 v3, s45 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NEXT: v_mov_b32_e32 v1, s11 -; GFX6-NEXT: v_mov_b32_e32 v2, s12 -; GFX6-NEXT: v_mov_b32_e32 v3, s13 +; GFX6-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NEXT: v_mov_b32_e32 v1, s17 +; GFX6-NEXT: v_mov_b32_e32 v2, s42 +; GFX6-NEXT: v_mov_b32_e32 v3, s43 ; GFX6-NEXT: buffer_store_dwordx4 
v[0:3], off, s[0:3], 0 offset:160 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: v_mov_b32_e32 v3, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s14 +; GFX6-NEXT: v_mov_b32_e32 v1, s15 +; GFX6-NEXT: v_mov_b32_e32 v2, s40 +; GFX6-NEXT: v_mov_b32_e32 v3, s41 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s50 -; GFX6-NEXT: v_mov_b32_e32 v1, s51 -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: v_mov_b32_e32 v3, s21 +; GFX6-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NEXT: v_mov_b32_e32 v1, s13 +; GFX6-NEXT: v_mov_b32_e32 v2, s28 +; GFX6-NEXT: v_mov_b32_e32 v3, s29 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s48 -; GFX6-NEXT: v_mov_b32_e32 v1, s49 -; GFX6-NEXT: v_mov_b32_e32 v2, s46 -; GFX6-NEXT: v_mov_b32_e32 v3, s47 +; GFX6-NEXT: v_mov_b32_e32 v0, s10 +; GFX6-NEXT: v_mov_b32_e32 v1, s11 +; GFX6-NEXT: v_mov_b32_e32 v2, s24 +; GFX6-NEXT: v_mov_b32_e32 v3, s25 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s42 -; GFX6-NEXT: v_mov_b32_e32 v1, s43 -; GFX6-NEXT: v_mov_b32_e32 v2, s52 -; GFX6-NEXT: v_mov_b32_e32 v3, s53 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: v_mov_b32_e32 v1, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s22 +; GFX6-NEXT: v_mov_b32_e32 v3, s23 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s38 -; GFX6-NEXT: v_mov_b32_e32 v1, s39 -; GFX6-NEXT: v_mov_b32_e32 v2, s36 -; GFX6-NEXT: v_mov_b32_e32 v3, s37 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: v_mov_b32_e32 v3, s19 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GFX6-NEXT: s_waitcnt 
expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s40 -; GFX6-NEXT: v_mov_b32_e32 v1, s41 -; GFX6-NEXT: v_mov_b32_e32 v2, s30 -; GFX6-NEXT: v_mov_b32_e32 v3, s31 +; GFX6-NEXT: v_mov_b32_e32 v0, s56 +; GFX6-NEXT: v_mov_b32_e32 v1, s57 +; GFX6-NEXT: v_mov_b32_e32 v2, s60 +; GFX6-NEXT: v_mov_b32_e32 v3, s61 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s34 -; GFX6-NEXT: v_mov_b32_e32 v1, s35 -; GFX6-NEXT: v_mov_b32_e32 v2, s26 -; GFX6-NEXT: v_mov_b32_e32 v3, s27 +; GFX6-NEXT: v_mov_b32_e32 v0, s58 +; GFX6-NEXT: v_mov_b32_e32 v1, s59 +; GFX6-NEXT: v_mov_b32_e32 v2, s30 +; GFX6-NEXT: v_mov_b32_e32 v3, s31 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s28 -; GFX6-NEXT: v_mov_b32_e32 v1, s29 -; GFX6-NEXT: v_mov_b32_e32 v2, s22 -; GFX6-NEXT: v_mov_b32_e32 v3, s23 +; GFX6-NEXT: v_mov_b32_e32 v0, s46 +; GFX6-NEXT: v_mov_b32_e32 v1, s47 +; GFX6-NEXT: v_mov_b32_e32 v2, s34 +; GFX6-NEXT: v_mov_b32_e32 v3, s35 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s24 -; GFX6-NEXT: v_mov_b32_e32 v1, s25 -; GFX6-NEXT: v_mov_b32_e32 v2, s44 -; GFX6-NEXT: v_mov_b32_e32 v3, s45 +; GFX6-NEXT: v_mov_b32_e32 v0, s48 +; GFX6-NEXT: v_mov_b32_e32 v1, s49 +; GFX6-NEXT: v_mov_b32_e32 v2, s36 +; GFX6-NEXT: v_mov_b32_e32 v3, s37 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GFX6-NEXT: v_mov_b32_e32 v6, s4 -; GFX6-NEXT: v_mov_b32_e32 v7, s5 -; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v10, s4 +; GFX6-NEXT: v_mov_b32_e32 v11, s5 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: constant_sextload_v64i1_to_v64i64: diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll --- 
a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -3027,6 +3027,7 @@ ; GCN-NOHSA-SI-NEXT: s_and_b32 s21, s21, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s20, s20, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s23, s23, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s22, s22, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s25, s25, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s24, s24, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s27, s27, 0xffff @@ -3035,7 +3036,6 @@ ; GCN-NOHSA-SI-NEXT: s_and_b32 s31, s31, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s30, s30, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s26, s26, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s22, s22, 0xffff ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s36 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s37 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -3048,19 +3048,21 @@ ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s68 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s29 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s67 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s26 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s66 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s27 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s65 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s25 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s63 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(3) +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s66 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s65 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; 
GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s63 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s22 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s62 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 @@ -3222,32 +3224,25 @@ ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v29, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v28, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s66 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s65 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xb0 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xa0 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s64 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s63 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x80 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v35, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v34, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s66 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s65 ; 
GCN-HSA-NEXT: v_mov_b32_e32 v8, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s62 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s11 @@ -3259,31 +3254,38 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s58 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s57 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s68 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s67 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s57 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s56 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s55 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s54 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[20:23] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x80 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s53 -; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s52 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s68 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s67 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, 
s18 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s52 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s43 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s51 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s42 @@ -3813,8 +3815,9 @@ ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s62, s22, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s23, s23 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s22, s22 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s63, s24, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s64, s25 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s63, s25, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s64, s24, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s25, s25 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s24, s24 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s65, s27, 16 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s66, s26, 16 @@ -3828,7 +3831,6 @@ ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s31, s31 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s30, s30 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s26, s26 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s25, 16 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s36 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s37 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -3841,19 +3843,21 @@ ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s68 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s29 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s67 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s26 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s66 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s27 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s65 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s63 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s64 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s25 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], 
off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(3) +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s66 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s65 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s63 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s22 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s62 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 @@ -3999,48 +4003,39 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xc0 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_sext_i32_i16 s15, s15 +; GCN-HSA-NEXT: s_sext_i32_i16 s14, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v29, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v28, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s68 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s67 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xb0 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] +; GCN-HSA-NEXT: s_sext_i32_i16 s13, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xa0 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s1 -; GCN-HSA-NEXT: s_sext_i32_i16 s13, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GCN-HSA-NEXT: s_sext_i32_i16 s12, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, 
s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s66 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s65 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x80 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v35, s1 +; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 +; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 ; GCN-HSA-NEXT: s_sext_i32_i16 s7, s7 ; GCN-HSA-NEXT: s_sext_i32_i16 s6, s6 ; GCN-HSA-NEXT: s_sext_i32_i16 s9, s9 ; GCN-HSA-NEXT: s_sext_i32_i16 s8, s8 ; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11 ; GCN-HSA-NEXT: s_sext_i32_i16 s10, s10 -; GCN-HSA-NEXT: s_sext_i32_i16 s15, s15 -; GCN-HSA-NEXT: s_sext_i32_i16 s14, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v34, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 -; GCN-HSA-NEXT: s_sext_i32_i16 s3, s3 -; GCN-HSA-NEXT: s_sext_i32_i16 s2, s2 -; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 -; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s68 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s67 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s66 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s65 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s64 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s11 @@ -4052,31 +4047,40 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s60 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s59 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s59 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s58 -; 
GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s57 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s56 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[20:23] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: s_sext_i32_i16 s3, s3 +; GCN-HSA-NEXT: s_sext_i32_i16 s2, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x80 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s56 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s55 -; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s52 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s18 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s52 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s50 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s51 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s49 @@ -6917,99 +6921,101 @@ ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) 
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s15 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, s13 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s11 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s28, s9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s24, s7 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s30, s5 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s34, s3 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s36, s1 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s14, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s44, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s46, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s48, s2, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s50, s0, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[52:53], s[20:21], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[54:55], s[18:19], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[56:57], s[22:23], 0x100000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s26, s15 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s28, s13 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s30, s11 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s34, s9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s36, s7 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s38, s5 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s40, s3 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s42, s1 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s44, s14, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s46, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s48, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s50, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s52, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s54, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s56, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s58, s0, 16 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[58:59], s[6:7], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[60:61], s[8:9], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 
s[60:61], s[28:29], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[8:9], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[62:63], s[10:11], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[64:65], s[12:13], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[66:67], s[14:15], 0x100000 ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[68:69], s[0:1], 48 ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[70:71], s[2:3], 48 ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[72:73], s[4:5], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[2:3], s[8:9], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[4:5], s[12:13], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[8:9], s[14:15], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[10:11], s[10:11], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[12:13], s[6:7], 48 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[74:75], s[6:7], 48 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[76:77], s[8:9], 48 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[2:3], s[12:13], 48 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[4:5], s[14:15], 48 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[78:79], s[10:11], 48 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s54 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s55 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s52 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s53 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s56 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s57 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s29 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s2 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s3 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5 +; 
GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s60 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s61 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s2 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s3 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[36:37], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[34:35], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[30:31], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[50:51], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[48:49], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[46:47], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[44:45], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[42:43], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[40:41], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[38:39], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[58:59], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[56:57], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[54:55], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[52:53], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[50:51], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[48:49], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[46:47], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[44:45], 0x100000 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: 
buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(3) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s25 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s30 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s78 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s79 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s76 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s77 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s74 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s75 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s39 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s72 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s73 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s40 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s41 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s70 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s71 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; 
GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s42 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s43 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s68 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s69 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 @@ -7020,43 +7026,41 @@ ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s65 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s62 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s63 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s60 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s61 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s58 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s59 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, s21 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s27 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s17 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s14 +; GCN-NOHSA-SI-NEXT: 
v_mov_b32_e32 v11, s15 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s30 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s13 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s11 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s7 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s9 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, s7 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i64: @@ -7130,32 +7134,26 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s44 ; GCN-HSA-NEXT: v_mov_b32_e32 v27, s45 ; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x90 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s56 ; GCN-HSA-NEXT: 
s_addc_u32 s45, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v28, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s57 ; GCN-HSA-NEXT: v_mov_b32_e32 v29, s45 ; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s56 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s45 -; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s57 -; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s80 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38 -; GCN-HSA-NEXT: s_add_u32 s38, s16, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s45 +; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s81 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 -; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s44 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s46 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s47 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s78 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s79 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s36 -; GCN-HSA-NEXT: s_add_u32 s36, s16, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s49 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s76 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s77 @@ -7163,28 +7161,36 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s51 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s74 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s75 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s44 -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s38 -; GCN-HSA-NEXT: s_addc_u32 s37, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s52 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s45 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s40 ; GCN-HSA-NEXT: 
v_mov_b32_e32 v23, s41 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[20:23] +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38 +; GCN-HSA-NEXT: s_add_u32 s38, s16, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 +; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s38 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s54 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s55 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s43 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s39 -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s37 -; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s39 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s36 +; GCN-HSA-NEXT: s_add_u32 s36, s16, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s37 +; GCN-HSA-NEXT: s_addc_u32 s37, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s37 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_nop 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 ; GCN-HSA-NEXT: s_add_u32 s14, s16, 0xe0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll --- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll @@ -3110,27 +3110,26 @@ ; GFX6-NOHSA-NEXT: s_ashr_i32 s42, s4, 31 ; GFX6-NOHSA-NEXT: s_ashr_i32 s43, s7, 31 ; GFX6-NOHSA-NEXT: s_ashr_i32 s44, s6, 31 -; GFX6-NOHSA-NEXT: 
s_ashr_i32 s45, s17, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s46, s16, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s47, s19, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s48, s18, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s49, s21, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s50, s20, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s51, s23, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s52, s30, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s53, s31, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s52 -; GFX6-NOHSA-NEXT: s_ashr_i32 s52, s28, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s53 -; GFX6-NOHSA-NEXT: s_ashr_i32 s53, s29, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s52 -; GFX6-NOHSA-NEXT: s_ashr_i32 s52, s26, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s53 -; GFX6-NOHSA-NEXT: s_ashr_i32 s53, s27, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s52 -; GFX6-NOHSA-NEXT: s_ashr_i32 s52, s22, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s53 -; GFX6-NOHSA-NEXT: s_ashr_i32 s53, s25, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s45, s9, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s46, s8, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s47, s11, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s48, s17, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s49, s16, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s50, s19, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s51, s18, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s52, s21, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s53, s20, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s54, s23, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s55, s22, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s56, s25, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s57, s24, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s58, s27, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s59, s30, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s60, s31, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s59 +; GFX6-NOHSA-NEXT: s_ashr_i32 s59, s26, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s60 +; GFX6-NOHSA-NEXT: s_ashr_i32 s60, s29, 31 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s30 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s31 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s28 @@ -3141,90 +3140,89 @@ ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s25 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s22 ; 
GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s23 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s20 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s21 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v24, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v26, s19 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:240 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s21 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s17 +; GFX6-NOHSA-NEXT: s_ashr_i32 s16, s28, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s17, s10, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s18, s13, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s19, s12, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s20, s15, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s21, s14, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s60 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:224 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s18 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s19 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s14 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s15 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s59 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s58 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:208 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s17 -; GFX6-NOHSA-NEXT: s_ashr_i32 s16, s24, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s17, s9, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s18, s8, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s19, s11, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s20, s10, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s21, s13, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s22, s12, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s23, s15, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s24, s14, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s53 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s13 +; GFX6-NOHSA-NEXT: 
v_mov_b32_e32 v13, s57 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s56 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[36:39], 0 offset:192 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s14 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s15 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s52 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s51 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s10 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s11 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s55 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s54 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[36:39], 0 offset:176 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s12 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s13 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s50 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s49 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:160 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s11 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s47 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:144 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s8 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s9 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s53 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s52 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[36:39], 0 offset:160 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s9 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s46 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s45 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:128 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v25, s51 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v27, s50 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[36:39], 0 offset:144 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; 
GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s7 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s23 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[36:39], 0 offset:112 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s5 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s22 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s21 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[36:39], 0 offset:96 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v24, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v26, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s49 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s48 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:128 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s2 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s3 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s19 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:80 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s3 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s21 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s20 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:112 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s18 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s17 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:64 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s44 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s43 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s42 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s41 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[36:39], 0 offset:32 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s40 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s35 -; 
GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[36:39], 0 offset:16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s34 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s33 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s1 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s19 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s18 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:96 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s17 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s47 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[36:39], 0 offset:80 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s46 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s45 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[36:39], 0 offset:64 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s44 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s43 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[36:39], 0 offset:48 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v25, s42 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v27, s41 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[36:39], 0 offset:32 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s40 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s35 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s34 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s33 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[36:39], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_sextload_v32i32_to_v32i64: diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -2856,6 +2856,7 @@ ; GFX6-NOHSA-NEXT: s_and_b32 s65, s10, 0xff ; GFX6-NOHSA-NEXT: s_bfe_u32 s10, s10, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s66, s11, 0xff +; GFX6-NOHSA-NEXT: s_bfe_u32 s11, s11, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s67, s12, 0xff ; GFX6-NOHSA-NEXT: 
s_bfe_u32 s12, s12, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s68, s13, 0xff @@ -2864,7 +2865,6 @@ ; GFX6-NOHSA-NEXT: s_and_b32 s70, s15, 0xff ; GFX6-NOHSA-NEXT: s_bfe_u32 s15, s15, 0x80010 ; GFX6-NOHSA-NEXT: s_bfe_u32 s13, s13, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_u32 s11, s11, 0x80010 ; GFX6-NOHSA-NEXT: s_mov_b32 s0, s16 ; GFX6-NOHSA-NEXT: s_mov_b32 s1, s17 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -2877,19 +2877,21 @@ ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s48 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s14 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s47 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s68 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s46 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s13 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s45 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s67 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s44 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s12 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s43 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(3) +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s68 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s46 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s13 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s45 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s67 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s44 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s43 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s66 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s42 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s11 @@ -3049,30 +3051,24 @@ ; GFX7-HSA-NEXT: 
s_addc_u32 s5, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s68 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s53 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s52 ; GFX7-HSA-NEXT: s_add_u32 s4, s16, 0xb0 ; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s4 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s67 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-HSA-NEXT: s_add_u32 s4, s16, 0xa0 ; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s4 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 0x90 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s67 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s51 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s50 -; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s4 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 0x80 -; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v35, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s68 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s53 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s52 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-HSA-NEXT: s_add_u32 s4, s16, 0x90 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s66 ; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s49 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s13 @@ -3084,28 +3080,34 @@ ; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s64 ; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s45 ; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s11 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s44 -; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s61 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s39 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s8 -; 
GFX7-HSA-NEXT: v_mov_b32_e32 v7, s38 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 0x70 ; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s63 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s44 ; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s43 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s62 ; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s42 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s41 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s9 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[2:3], v[20:23] +; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s62 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s41 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s40 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7] +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-HSA-NEXT: s_add_u32 s4, s16, 0x80 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s61 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s39 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s38 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-HSA-NEXT: s_add_u32 s4, s16, 0x70 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s60 @@ -3626,9 +3628,10 @@ ; GFX6-NOHSA-NEXT: s_bfe_i32 s57, s11, 0x80010 ; GFX6-NOHSA-NEXT: s_bfe_i32 s58, s11, 0x80008 ; GFX6-NOHSA-NEXT: s_sext_i32_i8 s11, s11 -; GFX6-NOHSA-NEXT: s_bfe_i32 s59, s12, 
0x80010 -; GFX6-NOHSA-NEXT: s_bfe_i32 s60, s12, 0x80008 -; GFX6-NOHSA-NEXT: s_sext_i32_i8 s61, s12 +; GFX6-NOHSA-NEXT: s_ashr_i32 s59, s12, 24 +; GFX6-NOHSA-NEXT: s_bfe_i32 s60, s12, 0x80010 +; GFX6-NOHSA-NEXT: s_bfe_i32 s61, s12, 0x80008 +; GFX6-NOHSA-NEXT: s_sext_i32_i8 s12, s12 ; GFX6-NOHSA-NEXT: s_ashr_i32 s62, s13, 24 ; GFX6-NOHSA-NEXT: s_bfe_i32 s63, s13, 0x80010 ; GFX6-NOHSA-NEXT: s_bfe_i32 s64, s13, 0x80008 @@ -3641,7 +3644,6 @@ ; GFX6-NOHSA-NEXT: s_bfe_i32 s70, s15, 0x80008 ; GFX6-NOHSA-NEXT: s_sext_i32_i8 s15, s15 ; GFX6-NOHSA-NEXT: s_sext_i32_i8 s13, s13 -; GFX6-NOHSA-NEXT: s_ashr_i32 s12, s12, 24 ; GFX6-NOHSA-NEXT: s_mov_b32 s0, s16 ; GFX6-NOHSA-NEXT: s_mov_b32 s1, s17 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -3654,19 +3656,21 @@ ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s67 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s66 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s65 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s13 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s64 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s63 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s62 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s61 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s60 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s59 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s12 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(3) +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s13 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s64 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s63 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s62 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s61 +; GFX6-NOHSA-NEXT: 
v_mov_b32_e32 v2, s60 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s59 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s11 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s58 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s57 @@ -3810,40 +3814,33 @@ ; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s6 ; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xc0 ; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX7-HSA-NEXT: s_sext_i32_i8 s15, s15 ; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s68 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s67 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s66 ; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xb0 ; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s6 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] +; GFX7-HSA-NEXT: s_sext_i32_i8 s14, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xa0 ; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s7 -; GFX7-HSA-NEXT: s_sext_i32_i8 s14, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x90 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-HSA-NEXT: s_sext_i32_i8 s10, s10 +; GFX7-HSA-NEXT: s_sext_i32_i8 s11, s11 +; GFX7-HSA-NEXT: s_sext_i32_i8 s12, s12 +; GFX7-HSA-NEXT: s_sext_i32_i8 s13, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s65 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s64 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s63 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x80 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: s_sext_i32_i8 s8, s8 -; GFX7-HSA-NEXT: s_sext_i32_i8 s11, s11 -; 
GFX7-HSA-NEXT: s_sext_i32_i8 s12, s12 -; GFX7-HSA-NEXT: s_sext_i32_i8 s13, s13 -; GFX7-HSA-NEXT: s_sext_i32_i8 s15, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v35, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x90 ; GFX7-HSA-NEXT: s_sext_i32_i8 s9, s9 -; GFX7-HSA-NEXT: s_sext_i32_i8 s10, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s68 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s67 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s66 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s62 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s61 @@ -3855,28 +3852,35 @@ ; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s11 ; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s56 ; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s55 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s54 -; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s47 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s46 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s45 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x70 ; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s54 ; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s53 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s52 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s51 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s50 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s49 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[2:3], v[20:23] +; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s49 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s48 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GFX7-HSA-NEXT: flat_store_dwordx4 
v[32:33], v[20:23] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7] +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x80 +; GFX7-HSA-NEXT: s_sext_i32_i8 s8, s8 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s47 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s46 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s45 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x70 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s44 @@ -6820,153 +6824,156 @@ ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s7, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s7, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s50, s7 -; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s6, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s6, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s6, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s5, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s5, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s48, s5 -; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s4, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s4, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s4, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s3, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s3, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s40, s3 -; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s2, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s38, s2, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s2, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s44, s1, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s46, s1, 8 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[52:53], s[48:49], 0x80000 -; GFX6-NOHSA-NEXT: s_mov_b32 s54, s1 -; GFX6-NOHSA-NEXT: s_lshr_b32 s56, s0, 16 -; 
GFX6-NOHSA-NEXT: s_lshr_b32 s58, s0, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s60, s0, 8 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[48:49], s[0:1], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[0:1], s[0:1], 56 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[62:63], s[2:3], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[64:65], s[4:5], 56 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[66:67], s[6:7], 0x80000 +; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s7, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s7, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s10, s7 +; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s6, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s6, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s6, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s5, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s5, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s38, s5 +; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s4, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s4, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s4, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s3, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s40, s3, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s44, s3 +; GFX6-NOHSA-NEXT: s_lshr_b32 s46, s2, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s50, s2, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s48, s2, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s52, s1, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s54, s1, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s56, s1 +; GFX6-NOHSA-NEXT: s_lshr_b32 s58, s0, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s60, s0, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s62, s0, 8 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[64:65], s[10:11], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i64 s[26:27], s[0:1], 56 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[42:43], s[2:3], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i64 s[66:67], s[2:3], 56 +; GFX6-NOHSA-NEXT: s_ashr_i64 s[2:3], s[4:5], 56 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[68:69], s[6:7], 0x80000 ; GFX6-NOHSA-NEXT: s_ashr_i64 s[6:7], s[6:7], 56 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[2:3], s[2:3], 56 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[70:71], s[4:5], 0x80000 +; GFX6-NOHSA-NEXT: s_mov_b32 s0, s8 +; 
GFX6-NOHSA-NEXT: s_mov_b32 s1, s9 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s7 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s50 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s51 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s66 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s67 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s64 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s65 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s52 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s53 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s5 -; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[10:11], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[34:35], 0x80000 -; GFX6-NOHSA-NEXT: s_mov_b32 s10, -1 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s7 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:240 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[54:55], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[40:41], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[60:61], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[58:59], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[50:51], s[56:57], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s64 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s65 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s68 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s69 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s2 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s3 +; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[64:65], s[44:45], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[68:69], s[38:39], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[62:63], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[60:61], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[58:59], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[54:55], 0x80000 +; 
GFX6-NOHSA-NEXT: s_bfe_i64 s[44:45], s[52:53], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:224 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1) +; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s68 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s69 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s70 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s71 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s66 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s67 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s12 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s13 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s14 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s15 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:208 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NOHSA-NEXT: 
v_mov_b32_e32 v3, s3 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s17 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:192 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s64 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s65 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s14 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s15 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s17 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s18 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s19 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[8:11], 0 offset:176 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s62 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s63 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s21 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[8:11], 0 offset:160 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s19 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:208 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s42 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s43 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s20 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s21 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:192 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s26 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s27 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s22 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s23 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s25 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[8:11], 0 offset:144 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:176 ; GFX6-NOHSA-NEXT: 
s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s0 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s1 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s26 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s27 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[8:11], 0 offset:128 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s5 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s28 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s29 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:112 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s56 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s57 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s25 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:160 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s28 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s29 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s49 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s30 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s31 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:96 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s36 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s37 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s30 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s31 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s10 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s11 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s34 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s35 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:128 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s38 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s39 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[8:11], 0 offset:80 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s42 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s43 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], 
off, s[8:11], 0 offset:64 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s36 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s37 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:112 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s40 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s41 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s44 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s45 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[8:11], 0 offset:48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s46 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s47 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[8:11], 0 offset:32 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s46 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s47 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s50 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s51 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s40 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s41 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[8:11], 0 offset:16 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s48 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s49 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s34 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s35 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s44 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s45 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s38 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s39 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, 
s[0:3], 0 offset:16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s5 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i64: @@ -7048,61 +7055,63 @@ ; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s40 ; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s41 ; GFX7-HSA-NEXT: s_add_u32 s40, s8, 0xc0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s62 ; GFX7-HSA-NEXT: s_addc_u32 s41, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s63 ; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s38 ; GFX7-HSA-NEXT: s_add_u32 s38, s8, 0xb0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s39 ; GFX7-HSA-NEXT: s_addc_u32 s39, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s38 -; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s39 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s42 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s38 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s39 ; GFX7-HSA-NEXT: s_add_u32 s38, s8, 0xa0 ; GFX7-HSA-NEXT: s_addc_u32 s39, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s38 -; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s39 -; GFX7-HSA-NEXT: s_add_u32 s38, s8, 0x90 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s62 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s42 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s38 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s43 -; GFX7-HSA-NEXT: s_addc_u32 s39, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s63 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s40 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s39 +; GFX7-HSA-NEXT: s_add_u32 s38, s8, 0x90 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s44 ; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s45 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s46 ; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s47 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s40 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] ; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s70 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s24 -; GFX7-HSA-NEXT: s_add_u32 s24, s8, 0x80 ; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s71 ; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s49 ; 
GFX7-HSA-NEXT: v_mov_b32_e32 v29, s41 ; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s50 ; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s51 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s58 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s38 -; GFX7-HSA-NEXT: s_addc_u32 s25, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s24 ; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s54 ; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s55 ; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s52 ; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s53 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[2:3], v[20:23] +; GFX7-HSA-NEXT: s_addc_u32 s39, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s38 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s56 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s57 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s58 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s59 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s39 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s61 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s25 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s39 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s24 +; GFX7-HSA-NEXT: s_add_u32 s24, s8, 0x80 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s25 +; GFX7-HSA-NEXT: s_addc_u32 s25, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s61 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s25 +; GFX7-HSA-NEXT: 
flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-HSA-NEXT: s_add_u32 s20, s8, 0x70 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -3592,7 +3592,7 @@ ; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v29, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v28, s6 -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[28:29] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 @@ -3600,50 +3600,48 @@ ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 64 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s4 -; GCN-HSA-NEXT: flat_load_dwordx4 v[32:35], v[32:33] +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xe0 ; GCN-HSA-NEXT: v_mov_b32_e32 v37, s1 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v36, s0 -; GCN-HSA-NEXT: s_waitcnt vmcnt(7) +; GCN-HSA-NEXT: s_waitcnt vmcnt(6) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v21 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v20 ; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v21 ; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v20 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4 +; GCN-HSA-NEXT: flat_load_dwordx4 v[32:35], v[20:21] ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCN-HSA-NEXT: 
v_mov_b32_e32 v20, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[24:27] -; GCN-HSA-NEXT: v_mov_b32_e32 v37, s5 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v36, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xc0 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xf0 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xd0 +; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xc0 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xa0 +; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xd0 +; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 +; GCN-HSA-NEXT: s_add_u32 s10, s0, 0xa0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[24:27] +; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v26, 16, v23 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v24, 16, v22 ; GCN-HSA-NEXT: v_and_b32_e32 v25, 0xffff, v23 ; GCN-HSA-NEXT: v_and_b32_e32 v23, 0xffff, v22 -; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[23:26] ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v21, 16, v16 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s8 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v23, 16, v17 ; GCN-HSA-NEXT: v_and_b32_e32 v22, 0xffff, v17 ; GCN-HSA-NEXT: v_and_b32_e32 v20, 0xffff, v16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[20:23] +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s10 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[20:23] ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 @@ -3653,54 +3651,58 @@ ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v18 ; GCN-HSA-NEXT: v_and_b32_e32 v21, 0xffff, v19 ; GCN-HSA-NEXT: v_and_b32_e32 v19, 0xffff, v18 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s4 ; GCN-HSA-NEXT: flat_store_dwordx4 
v[16:17], v[19:22] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s6 ; GCN-HSA-NEXT: s_waitcnt vmcnt(9) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v13 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v12 ; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v13 ; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s8 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v15 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v14 ; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v15 ; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v14 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[15:18] +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s5 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[15:18] ; GCN-HSA-NEXT: s_waitcnt vmcnt(10) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v4 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] +; GCN-HSA-NEXT: s_waitcnt vmcnt(9) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v8 +; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v9 ; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v9 -; GCN-HSA-NEXT: 
v_and_b32_e32 v12, 0xffff, v8 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v10 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v11 -; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v10 -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[16:19] -; GCN-HSA-NEXT: s_waitcnt vmcnt(9) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v5 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xffff, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[11:14] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v11 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v11 ; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xffff, v1 ; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_waitcnt vmcnt(9) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v26, 16, v33 @@ -3718,14 +3720,12 @@ ; GCN-HSA-NEXT: v_and_b32_e32 v19, 0xffff, v34 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v7 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v6 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v10 +; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v10 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[19:22] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[15:18] +; GCN-HSA-NEXT: flat_store_dwordx4 v[7:8], v[15:18] ; 
GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v29 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v28 @@ -3735,15 +3735,13 @@ ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v31 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v30 -; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v31 -; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v30 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v31 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v30 +; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v31 +; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v30 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -3751,10 +3749,12 @@ ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v64i16_to_v64i32: @@ -4237,120 +4237,120 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v64i16_to_v64i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, 
SCRATCH_RSRC_DWORD1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, 0xe8f000 -; GCN-NOHSA-SI-NEXT: s_add_u32 s8, s8, s3 -; GCN-NOHSA-SI-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s3 +; GCN-NOHSA-SI-NEXT: s_addc_u32 s13, s13, 0 ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s6 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s7 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, s2 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, s3 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[32:35], off, s[4:7], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[36:39], off, s[4:7], 0 offset:48 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v11 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v10 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v11, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v10, 0, 16 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 +; 
GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:112 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(5) +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v35, 16, v11 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v33, 16, v10 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v34, v11, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v32, v10, 0, 16 +; GCN-NOHSA-SI-NEXT: buffer_store_dword v32, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 16, v9 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 16, v8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v9, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v8, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 16, v31 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 16, v30 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v31, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v30, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v43, 16, v29 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v41, 16, v28 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v42, v29, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v40, v28, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v31, 16, v35 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v29, 16, v34 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 
v30, v35, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v28, v34, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v47, 16, v33 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v45, 16, v32 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v46, v33, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v44, v32, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v35, 16, v39 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v33, 16, v38 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v34, v39, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v32, v38, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v51, 16, v37 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v49, 16, v36 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v50, v37, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v48, v36, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v39, 16, v27 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v37, 16, v26 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v38, v27, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v36, v26, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v55, 16, v25 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v53, 16, v24 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v54, v25, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v52, v24, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 16, v23 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v22 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v25, v23, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v22, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v59, 16, v21 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v57, 16, v20 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v58, v21, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v56, v20, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 16, v19 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v20, 16, v18 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v19, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v19, v18, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v63, 16, v17 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v61, 16, v16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v62, v17, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v60, v16, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 16, v15 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 16, v14 -; 
GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v15, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v14, 0, 16 +; GCN-NOHSA-SI-NEXT: buffer_store_dword v33, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v34, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v35, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v39, 16, v9 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v37, 16, v8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v38, v9, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v36, v8, 0, 16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v13 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v12 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v13, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v12, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v35, 16, v15 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v33, 16, v14 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v34, v15, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v32, v14, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v43, 16, v13 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v41, 16, v12 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v42, v13, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v40, v12, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 16, v19 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v13, 16, v18 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v14, v19, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v18, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v47, 16, v17 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v45, 16, v16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v46, v17, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v44, v16, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 16, v23 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 16, v22 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v23, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v22, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v51, 16, v21 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v49, 16, v20 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v50, v21, 0, 16 +; GCN-NOHSA-SI-NEXT: 
v_bfe_i32 v48, v20, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 16, v7 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v21, 16, v6 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v7, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v6, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v55, 16, v5 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v53, 16, v4 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v54, v5, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v52, v4, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 16, v3 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 16, v2 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v3, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v2, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v59, 16, v1 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v57, 16, v0 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v58, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v56, v0, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v63, 16, v27 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v61, 16, v26 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v62, v27, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v60, v26, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 16, v25 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 16, v24 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v25, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v24, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 16, v31 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v30 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v26, v31, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v30, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v29 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v28 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v29, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v28, 0, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 
offset:208 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte 
Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_endpgm @@ -4361,189 +4361,192 @@ ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x70 -; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x60 -; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s8, s2, 64 -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: s_addc_u32 s9, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s8 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[20:21] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s6, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s7 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: 
flat_load_dwordx4 v[8:11], v[8:9] +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s6 -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[24:27], v[24:25] -; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v33, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v32, s4 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[32:35], v[32:33] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xe0 -; GCN-HSA-NEXT: v_mov_b32_e32 v37, s1 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v36, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 16, v21 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 16, v20 -; GCN-HSA-NEXT: v_bfe_i32 v30, v21, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v28, v20, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 16, v17 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 16, v16 +; GCN-HSA-NEXT: v_bfe_i32 v30, v17, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v28, v16, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[28:31] +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 16, v19 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 16, v18 +; GCN-HSA-NEXT: v_bfe_i32 v30, v19, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v28, v18, 0, 16 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[28:31] +; GCN-HSA-NEXT: s_waitcnt vmcnt(8) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v13 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v12 
+; GCN-HSA-NEXT: v_bfe_i32 v18, v13, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v16, v12, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[28:31] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 16, v23 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 16, v22 -; GCN-HSA-NEXT: v_bfe_i32 v30, v23, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v28, v22, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[28:31] -; GCN-HSA-NEXT: v_mov_b32_e32 v37, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 -; GCN-HSA-NEXT: v_mov_b32_e32 v36, s4 -; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v17 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v16 -; GCN-HSA-NEXT: v_bfe_i32 v22, v17, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v20, v16, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v15 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 16, v14 +; GCN-HSA-NEXT: v_bfe_i32 v17, v15, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v15, v14, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[20:23] -; GCN-HSA-NEXT: v_mov_b32_e32 v37, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v36, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[15:18] +; GCN-HSA-NEXT: s_waitcnt vmcnt(9) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v19 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 
v20, 16, v18 -; GCN-HSA-NEXT: v_bfe_i32 v21, v19, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v19, v18, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[19:22] -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v5 +; GCN-HSA-NEXT: v_bfe_i32 v14, v5, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v12, v4, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_waitcnt vmcnt(9) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v13 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v12 -; GCN-HSA-NEXT: v_bfe_i32 v18, v13, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v16, v12, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v39, s3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v15 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v14 -; GCN-HSA-NEXT: v_bfe_i32 v22, v15, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v20, v14, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v38, s2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[20:23] -; GCN-HSA-NEXT: s_waitcnt vmcnt(10) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v9 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 16, v8 -; GCN-HSA-NEXT: v_bfe_i32 v17, v9, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v15, v8, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v6 +; GCN-HSA-NEXT: v_bfe_i32 v14, v7, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v12, v6, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v11 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v10 -; GCN-HSA-NEXT: v_bfe_i32 v13, v11, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v11, v10, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[15:18] -; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[11:14] 
-; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 16, v0 -; GCN-HSA-NEXT: v_bfe_i32 v17, v1, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v15, v0, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v6 -; GCN-HSA-NEXT: v_bfe_i32 v9, v7, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v5 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v4 -; GCN-HSA-NEXT: v_bfe_i32 v13, v5, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v11, v4, 0, 16 +; GCN-HSA-NEXT: s_waitcnt vmcnt(9) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v0 +; GCN-HSA-NEXT: v_bfe_i32 v14, v1, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v12, v0, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 16, v2 ; GCN-HSA-NEXT: v_bfe_i32 v5, v3, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v3, v2, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[3:6] +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 16, v26 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_waitcnt vmcnt(10) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v11 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v10 +; GCN-HSA-NEXT: v_bfe_i32 v2, v11, 0, 16 +; GCN-HSA-NEXT: 
v_bfe_i32 v0, v10, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v9 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v8 +; GCN-HSA-NEXT: v_bfe_i32 v6, v9, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v4, v8, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v21 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v20 +; GCN-HSA-NEXT: v_bfe_i32 v10, v21, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v20, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; GCN-HSA-NEXT: s_waitcnt vmcnt(10) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v25 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v24 +; GCN-HSA-NEXT: v_bfe_i32 v14, v25, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v12, v24, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v23 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v22 +; GCN-HSA-NEXT: v_bfe_i32 v18, v23, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v16, v22, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-HSA-NEXT: v_bfe_i32 v15, v26, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v25 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 16, v24 -; GCN-HSA-NEXT: v_bfe_i32 v21, v25, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v19, v24, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v26, 16, v33 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v24, 16, v32 -; GCN-HSA-NEXT: v_bfe_i32 v25, v33, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v23, v32, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] +; GCN-HSA-NEXT: s_waitcnt vmcnt(10) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v33 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v32 +; GCN-HSA-NEXT: v_bfe_i32 v22, v33, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v20, v32, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[23:26] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v35 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 
v1, 16, v34 -; GCN-HSA-NEXT: v_bfe_i32 v2, v35, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v34, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[20:23] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v35 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v34 +; GCN-HSA-NEXT: v_bfe_i32 v18, v35, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v16, v34, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v27 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[19:22] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v27 -; GCN-HSA-NEXT: v_bfe_i32 v17, v27, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v26 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] +; GCN-HSA-NEXT: v_bfe_i32 v10, v27, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v26, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10] +; GCN-HSA-NEXT: 
flat_store_dwordx4 v[8:9], v[4:7] +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v64i16_to_v64i32: diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir --- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir @@ -84,13 +84,11 @@ %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 S_NOP 0 bb.2: - ; predcessors: %bb.1 S_NOP 0, implicit %0, implicit %1 S_NOP 0, implicit %2, implicit %3 @@ -191,14 +189,12 @@ %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 S_NOP 0, implicit %24 bb.2: - ; predcessors: %bb.1 S_NOP 0, implicit %23 S_NOP 0, implicit %0, implicit %1 @@ -300,7 +296,6 @@ %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 @@ -308,7 +303,6 @@ S_NOP 0, implicit %23 bb.2: - ; predcessors: %bb.1 S_NOP 0, implicit %0, implicit %1 S_NOP 0, implicit %2, implicit %3 @@ -408,7 +402,6 @@ %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 @@ -416,7 +409,6 @@ S_NOP 0, implicit %22, implicit %23 bb.2: - ; predcessors: %bb.1 S_NOP 0, implicit %0, implicit %1 S_NOP 0, implicit %2, implicit %3 @@ -529,7 +521,6 @@ %23:vgpr_32 = nofpexcept 
V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 @@ -537,14 +528,12 @@ S_NOP 0, implicit %23 bb.2: - ; predcessors: %bb.1 successors: %bb.3 %25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode S_NOP 0 bb.3: - ; predecessors: %bb.2 successors: %bb.4 %26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0 @@ -552,7 +541,6 @@ S_NOP 0, implicit %25 bb.4: - ; predcessors: %bb.3 S_NOP 0, implicit %0, implicit %1 S_NOP 0, implicit %2, implicit %3 @@ -666,7 +654,6 @@ %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 @@ -674,7 +661,6 @@ S_NOP 0, implicit %23, implicit %22 bb.2: - ; predcessors: %bb.1 successors: %bb.3 %25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode @@ -682,7 +668,6 @@ S_NOP 0 bb.3: - ; predecessors: %bb.2 successors: %bb.4 %27:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0 @@ -690,7 +675,6 @@ S_NOP 0, implicit %25, implicit %26 bb.4: - ; predcessors: %bb.3 S_NOP 0, implicit %0, implicit %1 S_NOP 0, implicit %2, implicit %3 @@ -949,14 +933,12 @@ undef %23.sub0:vreg_64 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 %23.sub1:vreg_64 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 S_NOP 0, implicit %23 bb.2: - ; predcessors: %bb.1 S_NOP 0, implicit %0, implicit %1 S_NOP 0, implicit %2, implicit %3 @@ -1053,7 +1035,6 @@ undef %21.sub0:vreg_128 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 %21.sub1:vreg_128 = nofpexcept V_CVT_I32_F64_e32 22, 
implicit $exec, implicit $mode, implicit-def $m0 @@ -1062,7 +1043,6 @@ S_NOP 0, implicit %21 bb.2: - ; predcessors: %bb.1 S_NOP 0, implicit %0, implicit %1 S_NOP 0, implicit %2, implicit %3 @@ -1581,7 +1561,6 @@ %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 @@ -1589,7 +1568,6 @@ S_NOP 0, implicit %24, implicit %25 bb.2: - ; predcessors: %bb.1 S_NOP 0, implicit %23 S_NOP 0, implicit %0, implicit %1 @@ -2528,14 +2506,12 @@ %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 S_NOP 0, implicit %24 bb.2: - ; predcessors: %bb.1 successors: %bb.3 S_NOP 0, implicit %23 @@ -2543,7 +2519,6 @@ S_NOP 0 bb.3: - ; predecessors: %bb.2 successors: %bb.4 %26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0 @@ -2551,7 +2526,6 @@ S_NOP 0, implicit %26, implicit %27 bb.4: - ; predcessors: %bb.3 S_NOP 0, implicit %25 S_NOP 0, implicit %0, implicit %1 @@ -2650,7 +2624,6 @@ %21.sub1:vreg_128 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 bb.1: - ; predecessors: %bb.0 successors: %bb.2 %21.sub2:vreg_128 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0 @@ -2658,7 +2631,6 @@ S_NOP 0, implicit %21 bb.2: - ; predcessors: %bb.1 S_NOP 0, implicit %0, implicit %1 S_NOP 0, implicit %2, implicit %3 @@ -2759,7 +2731,6 @@ %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 @@ -2767,7 +2738,6 @@ S_NOP 0, implicit %23 bb.2: - ; predcessors: %bb.1 S_NOP 0, implicit %23 S_NOP 0, 
implicit %0, implicit %1 @@ -5030,7 +5000,6 @@ %21.sub1:vreg_128 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 %21.sub2:vreg_128 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0 @@ -5038,7 +5007,6 @@ S_NOP 0, implicit %21 bb.2: - ; predcessors: %bb.1 S_NOP 0, implicit %0, implicit %1 S_NOP 0, implicit %2, implicit %3 @@ -5137,14 +5105,12 @@ %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 S_NOP 0, implicit %23, implicit %24 bb.2: - ; predcessors: %bb.1 S_NOP 0, implicit %0, implicit %1 S_NOP 0, implicit %2, implicit %3 @@ -5242,7 +5208,6 @@ %22:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0 @@ -5250,7 +5215,6 @@ S_NOP 0, implicit %22 bb.2: - ; predcessors: %bb.1 S_NOP 0, implicit %0, implicit %1 S_NOP 0, implicit %2, implicit %3 @@ -5348,7 +5312,6 @@ %22:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0 @@ -5357,7 +5320,6 @@ S_NOP 0, implicit %22 bb.2: - ; predcessors: %bb.1 S_NOP 0, implicit %0, implicit %1 S_NOP 0, implicit %2, implicit %3 @@ -5456,7 +5418,6 @@ %22:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0 @@ -5466,7 +5427,6 @@ S_NOP 0, implicit %22 bb.2: - ; predcessors: %bb.1 S_NOP 0, implicit %0, implicit %1 S_NOP 0, implicit %2, implicit %3 @@ -5562,14 +5522,12 @@ %22:vreg_64 = 
nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0 S_NOP 0, implicit %22, implicit %23 bb.2: - ; predcessors: %bb.1 S_NOP 0, implicit %0, implicit %1 S_NOP 0, implicit %2, implicit %3 @@ -5669,14 +5627,12 @@ undef %23.sub1:vreg_64_align2 = V_MOV_B32_e32 23, implicit $exec bb.1: - ; predecessors: %bb.0 successors: %bb.2 %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 S_NOP 0, implicit %24 bb.2: - ; predcessors: %bb.1 S_NOP 0, implicit %23.sub1 S_NOP 0, implicit %0, implicit %1 @@ -5779,14 +5735,12 @@ %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 S_NOP 0, implicit %24 bb.2: - ; predcessors: %bb.1 DBG_VALUE %23, 0, 0 S_NOP 0, implicit %23 @@ -5889,14 +5843,12 @@ %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode bb.1: - ; predecessors: %bb.0 successors: %bb.2 %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 S_NOP 0, implicit %24 bb.2: - ; predcessors: %bb.1 S_NOP 0, implicit %23 S_NOP 0, implicit %0, implicit %1 diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll --- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll @@ -74,27 +74,19 @@ ; CHECK-NEXT: v_writelane_b32 v23, s5, 33 ; CHECK-NEXT: v_writelane_b32 v23, s6, 34 ; CHECK-NEXT: v_writelane_b32 v23, s7, 35 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[4:11] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s4, 36 -; CHECK-NEXT: v_writelane_b32 v23, s5, 37 -; CHECK-NEXT: v_writelane_b32 v23, 
s6, 38 -; CHECK-NEXT: v_writelane_b32 v23, s7, 39 -; CHECK-NEXT: v_writelane_b32 v23, s8, 40 -; CHECK-NEXT: v_writelane_b32 v23, s9, 41 -; CHECK-NEXT: v_writelane_b32 v23, s10, 42 -; CHECK-NEXT: v_writelane_b32 v23, s11, 43 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[44:51] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[16:31] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[52:53] +; CHECK-NEXT: ; def s[34:35] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[48:51] +; CHECK-NEXT: ; def s[52:55] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[36:43] @@ -102,62 +94,70 @@ ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 44 -; CHECK-NEXT: v_writelane_b32 v23, s1, 45 -; CHECK-NEXT: v_writelane_b32 v23, s2, 46 -; CHECK-NEXT: v_writelane_b32 v23, s3, 47 -; CHECK-NEXT: v_writelane_b32 v23, s4, 48 -; CHECK-NEXT: v_writelane_b32 v23, s5, 49 -; CHECK-NEXT: v_writelane_b32 v23, s6, 50 -; CHECK-NEXT: v_writelane_b32 v23, s7, 51 -; CHECK-NEXT: v_writelane_b32 v23, s8, 52 -; CHECK-NEXT: v_writelane_b32 v23, s9, 53 -; CHECK-NEXT: v_writelane_b32 v23, s10, 54 -; CHECK-NEXT: v_writelane_b32 v23, s11, 55 -; CHECK-NEXT: v_writelane_b32 v23, s12, 56 -; CHECK-NEXT: v_writelane_b32 v23, s13, 57 -; CHECK-NEXT: v_writelane_b32 v23, s14, 58 -; CHECK-NEXT: ; implicit-def: $vgpr0 -; CHECK-NEXT: v_writelane_b32 v23, s15, 59 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[34:35] -; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v23, s0, 36 +; CHECK-NEXT: v_writelane_b32 v23, s1, 37 +; CHECK-NEXT: v_writelane_b32 v23, s2, 38 +; CHECK-NEXT: v_writelane_b32 v23, s3, 39 +; CHECK-NEXT: v_writelane_b32 v23, s4, 40 +; CHECK-NEXT: v_writelane_b32 v23, s5, 41 +; CHECK-NEXT: v_writelane_b32 v23, s6, 42 +; CHECK-NEXT: v_writelane_b32 v23, s7, 43 +; CHECK-NEXT: 
v_writelane_b32 v23, s8, 44 +; CHECK-NEXT: v_writelane_b32 v23, s9, 45 +; CHECK-NEXT: v_writelane_b32 v23, s10, 46 +; CHECK-NEXT: v_writelane_b32 v23, s11, 47 +; CHECK-NEXT: v_writelane_b32 v23, s12, 48 +; CHECK-NEXT: v_writelane_b32 v23, s13, 49 +; CHECK-NEXT: v_writelane_b32 v23, s14, 50 +; CHECK-NEXT: v_writelane_b32 v23, s15, 51 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v23, s0, 52 +; CHECK-NEXT: v_writelane_b32 v23, s1, 53 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[44:47] +; CHECK-NEXT: ; def s[0:3] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v23, s0, 54 +; CHECK-NEXT: v_writelane_b32 v23, s1, 55 +; CHECK-NEXT: v_writelane_b32 v23, s2, 56 +; CHECK-NEXT: v_writelane_b32 v23, s3, 57 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 60 -; CHECK-NEXT: v_writelane_b32 v0, s4, 0 -; CHECK-NEXT: v_writelane_b32 v23, s1, 61 -; CHECK-NEXT: v_writelane_b32 v0, s5, 1 -; CHECK-NEXT: v_writelane_b32 v23, s2, 62 -; CHECK-NEXT: v_writelane_b32 v0, s6, 2 -; CHECK-NEXT: v_writelane_b32 v23, s3, 63 -; CHECK-NEXT: v_writelane_b32 v0, s7, 3 +; CHECK-NEXT: v_writelane_b32 v23, s0, 58 +; CHECK-NEXT: v_writelane_b32 v23, s1, 59 +; CHECK-NEXT: v_writelane_b32 v23, s2, 60 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: v_writelane_b32 v23, s3, 61 +; CHECK-NEXT: v_writelane_b32 v23, s4, 62 +; CHECK-NEXT: v_writelane_b32 v0, s6, 0 +; CHECK-NEXT: v_writelane_b32 v23, s5, 63 +; CHECK-NEXT: v_writelane_b32 v0, s7, 1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s0, 4 -; CHECK-NEXT: v_writelane_b32 v0, s1, 5 -; CHECK-NEXT: v_writelane_b32 v0, s2, 6 -; CHECK-NEXT: v_writelane_b32 v0, s3, 7 -; CHECK-NEXT: v_writelane_b32 v0, s4, 8 -; CHECK-NEXT: v_writelane_b32 v0, s5, 9 -; CHECK-NEXT: v_writelane_b32 v0, s6, 10 -; CHECK-NEXT: v_writelane_b32 v0, s7, 11 -; CHECK-NEXT: 
v_writelane_b32 v0, s8, 12 -; CHECK-NEXT: v_writelane_b32 v0, s9, 13 -; CHECK-NEXT: v_writelane_b32 v0, s10, 14 -; CHECK-NEXT: v_writelane_b32 v0, s11, 15 -; CHECK-NEXT: v_writelane_b32 v0, s12, 16 -; CHECK-NEXT: v_writelane_b32 v0, s13, 17 -; CHECK-NEXT: v_writelane_b32 v0, s14, 18 -; CHECK-NEXT: v_writelane_b32 v0, s15, 19 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[54:55] -; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v0, s0, 2 +; CHECK-NEXT: v_writelane_b32 v0, s1, 3 +; CHECK-NEXT: v_writelane_b32 v0, s2, 4 +; CHECK-NEXT: v_writelane_b32 v0, s3, 5 +; CHECK-NEXT: v_writelane_b32 v0, s4, 6 +; CHECK-NEXT: v_writelane_b32 v0, s5, 7 +; CHECK-NEXT: v_writelane_b32 v0, s6, 8 +; CHECK-NEXT: v_writelane_b32 v0, s7, 9 +; CHECK-NEXT: v_writelane_b32 v0, s8, 10 +; CHECK-NEXT: v_writelane_b32 v0, s9, 11 +; CHECK-NEXT: v_writelane_b32 v0, s10, 12 +; CHECK-NEXT: v_writelane_b32 v0, s11, 13 +; CHECK-NEXT: v_writelane_b32 v0, s12, 14 +; CHECK-NEXT: v_writelane_b32 v0, s13, 15 +; CHECK-NEXT: v_writelane_b32 v0, s14, 16 +; CHECK-NEXT: v_writelane_b32 v0, s15, 17 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v0, s0, 18 +; CHECK-NEXT: v_writelane_b32 v0, s1, 19 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:3] ; CHECK-NEXT: ;;#ASMEND @@ -257,90 +257,90 @@ ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s0, v23, 36 ; CHECK-NEXT: v_readlane_b32 s1, v23, 37 -; CHECK-NEXT: v_readlane_b32 s2, v23, 38 -; CHECK-NEXT: v_readlane_b32 s3, v23, 39 -; CHECK-NEXT: v_readlane_b32 s4, v23, 40 -; CHECK-NEXT: v_readlane_b32 s5, v23, 41 -; CHECK-NEXT: v_readlane_b32 s6, v23, 42 -; CHECK-NEXT: v_readlane_b32 s7, v23, 43 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:7] +; CHECK-NEXT: ; use s[44:51] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 44 -; CHECK-NEXT: v_readlane_b32 s1, v23, 45 -; CHECK-NEXT: v_readlane_b32 s2, v23, 46 -; CHECK-NEXT: v_readlane_b32 s3, v23, 47 -; CHECK-NEXT: 
v_readlane_b32 s4, v23, 48 -; CHECK-NEXT: v_readlane_b32 s5, v23, 49 -; CHECK-NEXT: v_readlane_b32 s6, v23, 50 -; CHECK-NEXT: v_readlane_b32 s7, v23, 51 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[16:31] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[52:53] +; CHECK-NEXT: ; use s[34:35] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[48:51] +; CHECK-NEXT: ; use s[52:55] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[36:43] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s8, v23, 52 -; CHECK-NEXT: v_readlane_b32 s9, v23, 53 -; CHECK-NEXT: v_readlane_b32 s10, v23, 54 -; CHECK-NEXT: v_readlane_b32 s11, v23, 55 -; CHECK-NEXT: v_readlane_b32 s12, v23, 56 -; CHECK-NEXT: v_readlane_b32 s13, v23, 57 -; CHECK-NEXT: v_readlane_b32 s14, v23, 58 -; CHECK-NEXT: v_readlane_b32 s15, v23, 59 +; CHECK-NEXT: v_readlane_b32 s2, v23, 38 +; CHECK-NEXT: v_readlane_b32 s3, v23, 39 +; CHECK-NEXT: v_readlane_b32 s4, v23, 40 +; CHECK-NEXT: v_readlane_b32 s5, v23, 41 +; CHECK-NEXT: v_readlane_b32 s6, v23, 42 +; CHECK-NEXT: v_readlane_b32 s7, v23, 43 +; CHECK-NEXT: v_readlane_b32 s8, v23, 44 +; CHECK-NEXT: v_readlane_b32 s9, v23, 45 +; CHECK-NEXT: v_readlane_b32 s10, v23, 46 +; CHECK-NEXT: v_readlane_b32 s11, v23, 47 +; CHECK-NEXT: v_readlane_b32 s12, v23, 48 +; CHECK-NEXT: v_readlane_b32 s13, v23, 49 +; CHECK-NEXT: v_readlane_b32 s14, v23, 50 +; CHECK-NEXT: v_readlane_b32 s15, v23, 51 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 60 -; CHECK-NEXT: v_readlane_b32 s1, v23, 61 -; CHECK-NEXT: v_readlane_b32 s2, v23, 62 -; CHECK-NEXT: v_readlane_b32 s3, v23, 63 -; CHECK-NEXT: v_readlane_b32 s4, v0, 0 -; CHECK-NEXT: v_readlane_b32 s5, v0, 1 -; CHECK-NEXT: v_readlane_b32 s6, v0, 2 -; CHECK-NEXT: v_readlane_b32 s7, v0, 3 +; CHECK-NEXT: v_readlane_b32 s0, v23, 52 +; CHECK-NEXT: v_readlane_b32 s1, v23, 53 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; 
use s[34:35] +; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v23, 54 +; CHECK-NEXT: v_readlane_b32 s1, v23, 55 +; CHECK-NEXT: v_readlane_b32 s2, v23, 56 +; CHECK-NEXT: v_readlane_b32 s3, v23, 57 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[44:47] +; CHECK-NEXT: ; use s[0:3] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v23, 58 +; CHECK-NEXT: v_readlane_b32 s1, v23, 59 +; CHECK-NEXT: v_readlane_b32 s2, v23, 60 +; CHECK-NEXT: v_readlane_b32 s3, v23, 61 +; CHECK-NEXT: v_readlane_b32 s4, v23, 62 +; CHECK-NEXT: v_readlane_b32 s5, v23, 63 +; CHECK-NEXT: v_readlane_b32 s6, v0, 0 +; CHECK-NEXT: v_readlane_b32 s7, v0, 1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 4 -; CHECK-NEXT: v_readlane_b32 s1, v0, 5 -; CHECK-NEXT: v_readlane_b32 s2, v0, 6 -; CHECK-NEXT: v_readlane_b32 s3, v0, 7 -; CHECK-NEXT: v_readlane_b32 s4, v0, 8 -; CHECK-NEXT: v_readlane_b32 s5, v0, 9 -; CHECK-NEXT: v_readlane_b32 s6, v0, 10 -; CHECK-NEXT: v_readlane_b32 s7, v0, 11 -; CHECK-NEXT: v_readlane_b32 s8, v0, 12 -; CHECK-NEXT: v_readlane_b32 s9, v0, 13 -; CHECK-NEXT: v_readlane_b32 s10, v0, 14 -; CHECK-NEXT: v_readlane_b32 s11, v0, 15 -; CHECK-NEXT: v_readlane_b32 s12, v0, 16 -; CHECK-NEXT: v_readlane_b32 s13, v0, 17 -; CHECK-NEXT: v_readlane_b32 s14, v0, 18 -; CHECK-NEXT: v_readlane_b32 s15, v0, 19 +; CHECK-NEXT: v_readlane_b32 s0, v0, 2 +; CHECK-NEXT: v_readlane_b32 s1, v0, 3 +; CHECK-NEXT: v_readlane_b32 s2, v0, 4 +; CHECK-NEXT: v_readlane_b32 s3, v0, 5 +; CHECK-NEXT: v_readlane_b32 s4, v0, 6 +; CHECK-NEXT: v_readlane_b32 s5, v0, 7 +; CHECK-NEXT: v_readlane_b32 s6, v0, 8 +; CHECK-NEXT: v_readlane_b32 s7, v0, 9 +; CHECK-NEXT: v_readlane_b32 s8, v0, 10 +; CHECK-NEXT: v_readlane_b32 s9, v0, 11 +; CHECK-NEXT: v_readlane_b32 s10, v0, 12 +; CHECK-NEXT: v_readlane_b32 s11, v0, 13 +; CHECK-NEXT: v_readlane_b32 s12, v0, 14 +; CHECK-NEXT: v_readlane_b32 s13, v0, 15 +; CHECK-NEXT: 
v_readlane_b32 s14, v0, 16 +; CHECK-NEXT: v_readlane_b32 s15, v0, 17 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v0, 18 +; CHECK-NEXT: v_readlane_b32 s1, v0, 19 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s0, v0, 20 ; CHECK-NEXT: v_readlane_b32 s1, v0, 21 ; CHECK-NEXT: v_readlane_b32 s2, v0, 22 ; CHECK-NEXT: v_readlane_b32 s3, v0, 23 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[54:55] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:3] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s0, v0, 24 diff --git a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir --- a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir @@ -24,6 +24,7 @@ ; CHECK-NEXT: undef %0.sub3:vreg_512 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 0, [[V_MOV_B32_e32_]], implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_512 = COPY %0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: @@ -38,11 +39,10 @@ ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_HI16 */, def dead [[COPY1]], 851978 /* regdef:VGPR_HI16 */, def dead [[COPY]].sub1, 2147483657 /* reguse tiedto:$0 */, [[COPY1]], 2147549193 /* reguse tiedto:$1 */, [[COPY]].sub1 ; CHECK-NEXT: %11.sub0:vreg_512 = COPY [[COPY]].sub0 ; CHECK-NEXT: %11.sub3:vreg_512 = COPY [[COPY]].sub3 - ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: dead [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 
4, [[V_MOV_B32_e32_1]], implicit-def dead $vcc, implicit $exec ; CHECK-NEXT: %11.sub2:vreg_512 = COPY undef [[V_MOV_B32_e32_]] ; CHECK-NEXT: %11.sub5:vreg_512 = COPY undef [[V_MOV_B32_e32_]] ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_512 = COPY %11 + ; CHECK-NEXT: dead [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 4, [[V_MOV_B32_e32_1]], implicit-def dead $vcc, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.1 bb.0: liveins: $sgpr6_sgpr7