diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -16,6 +16,7 @@
 #include "GCNRegPressure.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/Support/NativeFormatting.h"
 
 namespace llvm {
 
@@ -126,6 +127,37 @@
   GCNMaxILPSchedStrategy(const MachineSchedContext *C);
 };
 
+/// Result of modeling a schedule: its length in cycles and how many of those
+/// cycles were bubbles.
+class ScheduleMetrics {
+  // Zero-initialized so that a default-constructed object is safe to query.
+  unsigned ScheduleLength = 0;
+  unsigned BubbleCycles = 0;
+
+public:
+  ScheduleMetrics() = default;
+  ScheduleMetrics(unsigned L, unsigned BC)
+      : ScheduleLength(L), BubbleCycles(BC) {}
+  unsigned getLength() const { return ScheduleLength; }
+  unsigned getBubbles() const { return BubbleCycles; }
+  /// Returns the ratio of bubble cycles to schedule length. A schedule with no
+  /// bubbles yields the smallest positive normalized float rather than zero,
+  /// so that the result can be used as a divisor.
+  float getMetric() const {
+    if (!BubbleCycles)
+      return std::numeric_limits<float>::min();
+    return static_cast<float>(BubbleCycles) / ScheduleLength;
+  }
+};
+
+/// Writes the metric of \p Sm and its bubbles/length counts to \p OS.
+inline raw_ostream &operator<<(raw_ostream &OS, const ScheduleMetrics &Sm) {
+  OS << "\n Schedule Metric is: ";
+  llvm::write_double(OS, Sm.getMetric(), FloatStyle::Fixed);
+  OS << " [ " << Sm.getBubbles() << "/" << Sm.getLength() << " ]\n";
+  return OS;
+}
+
 class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   friend class GCNSchedStage;
   friend class OccInitialScheduleStage;
@@ -165,6 +197,9 @@
   // Regions that have IGLP instructions (SCHED_GROUP_BARRIER or IGLP_OPT).
   BitVector RegionsWithIGLPInstrs;
 
+  // We only compute ScheduleMetrics for Min Occupancy regions.
+  DenseMap<unsigned, ScheduleMetrics> MinOccupancyRegionsMetrics;
+
   // Region live-in cache.
   SmallVector LiveIns;
 
@@ -259,6 +294,9 @@
   // Check result of scheduling.
   void checkScheduling();
 
+  // Computes the given schedule's virtual execution time in clocks.
+  ScheduleMetrics getScheduleMetrics();
+
   // Returns true if scheduling should be reverted.
virtual bool shouldRevertScheduling(unsigned WavesAfter); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -27,6 +27,7 @@ #include "AMDGPUIGroupLP.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/Support/NativeFormatting.h" #define DEBUG_TYPE "machine-scheduler" @@ -845,11 +846,15 @@ DAG.RescheduleRegions[RegionIdx] = false; if (S.HasHighPressure) DAG.RegionsWithHighRP[RegionIdx] = true; - // Revert scheduling if we have dropped occupancy or there is some other // reason that the original schedule is better. checkScheduling(); + if (DAG.RegionsWithMinOcc[RegionIdx] && + StageID == GCNSchedStageID::OccInitialSchedule) { + DAG.MinOccupancyRegionsMetrics[RegionIdx] = getScheduleMetrics(); + } + if (DAG.RegionsWithIGLPInstrs[RegionIdx] && StageID != GCNSchedStageID::UnclusteredHighRPReschedule) SavedMutations.swap(DAG.Mutations); @@ -862,6 +867,7 @@ // Check the results of scheduling. 
PressureAfter = DAG.getRealRegPressure(RegionIdx);
   LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
+  LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
 
   if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
       PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
@@ -925,6 +931,87 @@
   }
 }
 
+/// Estimates the execution time of the schedule with a simple model: the
+/// instructions of the DAG are visited in order and issue one per cycle, but
+/// an instruction cannot issue before the results of its assigned-register
+/// predecessors are available. Cycles spent waiting are counted as bubbles.
+///
+/// \returns the modeled schedule length in cycles and the total number of
+/// bubble cycles.
+ScheduleMetrics GCNSchedStage::getScheduleMetrics() {
+  // Orders (instruction, issue cycle) pairs by cycle; only the debug dump
+  // below uses it.
+  struct comp {
+    bool operator()(std::pair<MachineInstr *, unsigned> A,
+                    std::pair<MachineInstr *, unsigned> B) const {
+      return A.second < B.second;
+    }
+  };
+  const SIInstrInfo *SI = ST.getInstrInfo();
+#ifndef NDEBUG
+  int BBNum = -1;
+#endif
+  unsigned SumBubbles = 0;
+  // Maps each modeled instruction to the cycle it issues in.
+  DenseMap<MachineInstr *, unsigned> Model;
+  unsigned CurrCycle = 0;
+  for (auto &MI : DAG) {
+#ifndef NDEBUG
+    if (BBNum == -1)
+      BBNum = MI.getParent()->getNumber();
+#endif
+    SUnit *SU = DAG.getSUnit(&MI);
+    // Instructions without a scheduling unit are not modeled.
+    if (!SU)
+      continue;
+    unsigned ReadyCycle = CurrCycle;
+    for (auto &D : SU->Preds) {
+      if (!D.isAssignedRegDep())
+        continue;
+      MachineInstr *DefMI = D.getSUnit()->getInstr();
+      unsigned Latency =
+          SI->getInstrLatency(ST.getInstrItineraryData(), *DefMI);
+      // Read-only lookup: a def that has not been modeled counts as cycle 0
+      // and is not inserted into the model as a side effect.
+      unsigned DefIdx = Model.lookup(DefMI);
+      ReadyCycle = std::max(ReadyCycle, DefIdx + Latency);
+    }
+    SumBubbles += ReadyCycle - CurrCycle;
+    Model[&MI] = ReadyCycle;
+    CurrCycle = ReadyCycle + 1;
+  }
+#ifndef NDEBUG
+  if (!Model.empty()) {
+    std::set<std::pair<MachineInstr *, unsigned>, comp> ModelSorted(
+        Model.begin(), Model.end());
+    // NOTE(review): the reported bubble size is I.second - IPrev, which is one
+    // more than the number of idle cycles between the two entries; confirm
+    // this is intended.
+    LLVM_DEBUG(
+        dbgs() << "\n################## Schedule time model for MBB : " << BBNum
+               << " for stage: " << StageID
+               << " ##################\n# Cycle #\t\t\tInstruction "
+                  " "
+                  " \n";
+        unsigned IPrev = 1;
+        for (auto I : ModelSorted) {
+          if (I.second > IPrev + 1)
+            dbgs() << "****************************** BUBBLE OF "
+                   << I.second - IPrev
+                   << " CYCLES DETECTED ******************************\n\n";
+          dbgs() << "[ " << I.second << " ] : " << *I.first << "\n";
+          IPrev = I.second;
+        }
+        dbgs() << "\n\t" << "Metric: ";
+        write_double(dbgs(), static_cast<float>(SumBubbles) / CurrCycle,
+                     FloatStyle::Fixed);
+        dbgs() << "\n\n");
+  }
+#endif
+
+  return ScheduleMetrics(CurrCycle, SumBubbles);
+}
+
 bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
   if (WavesAfter < DAG.MinOccupancy)
     return true;
@@ -952,6 +1039,26 @@
     return true;
   }
 
+  // If a schedule metric was recorded for this region, keep the new schedule
+  // only if the occupancy change weighted by the metric change is not a loss.
+  auto ISBefore = DAG.MinOccupancyRegionsMetrics.find(RegionIdx);
+  if (ISBefore != DAG.MinOccupancyRegionsMetrics.end()) {
+    ScheduleMetrics MBefore = ISBefore->getSecond();
+    ScheduleMetrics MAfter = getScheduleMetrics();
+    float OldMetric = MBefore.getMetric();
+    float NewMetric = MAfter.getMetric();
+    unsigned WavesBefore =
+        std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST));
+    float Profit =
+        (static_cast<float>(WavesAfter) / WavesBefore * OldMetric / NewMetric);
+    LLVM_DEBUG(dbgs() << "\t*** In shouldRevertScheduling ***\n"
+                      << "\tMetric before " << MBefore << "\tMetric after "
+                      << MAfter << "Profit: ";
+               llvm::write_double(dbgs(), Profit, llvm::FloatStyle::Fixed);
+               dbgs() << "\n");
+    return Profit < 1;
+  }
+
   return false;
 }
 
@@ -1055,6 +1162,8 @@
     }
   }
 
+  LLVM_DEBUG(dbgs() << getScheduleMetrics() << "\n");
+
   // Then move the debug instructions back into their correct place and set
   // RegionBegin and RegionEnd if needed.
DAG.placeDebugValues(); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -1645,208 +1645,211 @@ ; GFX7-LABEL: v_mul_i256: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] -; GFX7-NEXT: v_mul_lo_u32 v28, v4, v11 -; GFX7-NEXT: v_mul_lo_u32 v27, v5, v10 -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] +; GFX7-NEXT: v_mov_b32_e32 v16, v0 +; GFX7-NEXT: v_mov_b32_e32 v17, v1 +; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v14, 0 +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v12, 0 +; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v13, v[0:1] +; GFX7-NEXT: v_mul_lo_u32 v27, v3, v12 +; GFX7-NEXT: v_mul_lo_u32 v26, v5, v10 +; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v12, v[0:1] +; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[0:1] +; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v11, v[18:19] ; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1] ; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] ; GFX7-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc ; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] -; GFX7-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0 +; GFX7-NEXT: v_addc_u32_e32 v22, vcc, 0, v20, vcc ; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] -; 
GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22] -; GFX7-NEXT: v_addc_u32_e32 v25, vcc, 0, v20, vcc -; GFX7-NEXT: v_mov_b32_e32 v20, v18 +; GFX7-NEXT: v_addc_u32_e32 v24, vcc, 0, v22, vcc +; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v16, v10, 0 +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v6, v8, v[0:1] +; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v17, v9, v[22:23] +; GFX7-NEXT: v_mov_b32_e32 v1, v18 ; GFX7-NEXT: v_mov_b32_e32 v18, v19 -; GFX7-NEXT: v_mov_b32_e32 v19, v16 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] -; GFX7-NEXT: v_mul_lo_u32 v16, v6, v9 -; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22] -; GFX7-NEXT: v_addc_u32_e64 v26, s[4:5], 0, v6, s[4:5] -; GFX7-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19] -; GFX7-NEXT: v_mov_b32_e32 v19, v22 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20] -; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24] -; GFX7-NEXT: v_mul_lo_u32 v24, v3, v12 -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[22:23] -; GFX7-NEXT: v_mul_lo_u32 v22, v2, v13 -; GFX7-NEXT: v_mad_u64_u32 v[12:13], s[10:11], v4, v9, v[11:12] +; GFX7-NEXT: v_mov_b32_e32 v19, v20 +; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v16, v13, v[18:19] +; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[22:23] +; GFX7-NEXT: v_addc_u32_e64 v25, s[4:5], 0, v0, s[4:5] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v12, v[18:19] +; GFX7-NEXT: v_mov_b32_e32 v0, v23 +; GFX7-NEXT: v_mul_lo_u32 v23, v4, v11 +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v11, v[0:1] +; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19] +; GFX7-NEXT: v_mul_lo_u32 v13, v2, v13 +; GFX7-NEXT: v_mul_lo_u32 v20, v6, v9 +; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[10:11], v4, v9, 
v[11:12] ; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] -; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v17, v10, v[0:1] ; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX7-NEXT: v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13] -; GFX7-NEXT: v_mov_b32_e32 v20, v11 -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[0:1] +; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[14:15], v16, v8, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, v22 +; GFX7-NEXT: v_addc_u32_e64 v10, s[12:13], 0, v4, s[12:13] +; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[16:17], v16, v9, v[1:2] ; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] -; GFX7-NEXT: v_addc_u32_e64 v11, s[12:13], 0, v2, s[12:13] -; GFX7-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX7-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v1, v8, v[20:21] -; GFX7-NEXT: v_addc_u32_e64 v3, s[12:13], v12, v3, s[12:13] -; GFX7-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], v26, v4, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v5, s[12:13], v11, v5, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v6, s[12:13], v25, v6, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v0, s[12:13], v17, v0, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v0, s[12:13], v0, v9, s[14:15] -; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v22, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v24, s[8:9] -; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v28, s[6:7] -; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc -; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, v10 +; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[11:12] +; 
GFX7-NEXT: v_addc_u32_e64 v10, s[12:13], 0, v10, s[12:13] +; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v17, v8, v[1:2] +; GFX7-NEXT: v_addc_u32_e64 v3, s[12:13], v11, v3, s[12:13] +; GFX7-NEXT: v_mul_lo_u32 v11, v16, v15 +; GFX7-NEXT: v_mul_lo_u32 v9, v17, v14 +; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], v25, v4, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v5, s[12:13], v10, v5, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v6, s[12:13], v24, v6, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v10, s[12:13], v21, v11, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v9, s[12:13], v10, v9, s[14:15] +; GFX7-NEXT: v_addc_u32_e64 v9, s[10:11], v9, v13, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v27, s[8:9] +; GFX7-NEXT: v_addc_u32_e64 v9, s[6:7], v9, v23, s[6:7] +; GFX7-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v26, s[4:5] +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, v9, v20, vcc +; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_i256: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] -; GFX8-NEXT: v_mul_lo_u32 v28, v4, v11 -; GFX8-NEXT: v_mul_lo_u32 v27, v5, v10 -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] +; GFX8-NEXT: v_mov_b32_e32 v16, v0 +; GFX8-NEXT: v_mov_b32_e32 v17, v1 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v14, 0 +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v12, 0 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v13, v[0:1] +; GFX8-NEXT: v_mul_lo_u32 v27, v3, v12 +; GFX8-NEXT: v_mul_lo_u32 v26, v5, v10 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v12, v[0:1] +; 
GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v11, v[18:19] ; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1] ; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] ; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc ; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] -; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0 +; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v20, vcc ; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22] -; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v20, vcc -; GFX8-NEXT: v_mov_b32_e32 v20, v18 +; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v22, vcc +; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v16, v10, 0 +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v6, v8, v[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v17, v9, v[22:23] +; GFX8-NEXT: v_mov_b32_e32 v1, v18 ; GFX8-NEXT: v_mov_b32_e32 v18, v19 -; GFX8-NEXT: v_mov_b32_e32 v19, v16 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] -; GFX8-NEXT: v_mul_lo_u32 v16, v6, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22] -; GFX8-NEXT: v_addc_u32_e64 v26, s[4:5], 0, v6, s[4:5] -; GFX8-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19] -; GFX8-NEXT: v_mov_b32_e32 v19, v22 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20] -; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24] -; GFX8-NEXT: v_mul_lo_u32 v24, v3, v12 -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[22:23] -; GFX8-NEXT: v_mul_lo_u32 v22, v2, v13 -; GFX8-NEXT: v_mad_u64_u32 v[12:13], s[10:11], v4, v9, 
v[11:12] +; GFX8-NEXT: v_mov_b32_e32 v19, v20 +; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v16, v13, v[18:19] +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[22:23] +; GFX8-NEXT: v_addc_u32_e64 v25, s[4:5], 0, v0, s[4:5] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v12, v[18:19] +; GFX8-NEXT: v_mov_b32_e32 v0, v23 +; GFX8-NEXT: v_mul_lo_u32 v23, v4, v11 +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v11, v[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19] +; GFX8-NEXT: v_mul_lo_u32 v13, v2, v13 +; GFX8-NEXT: v_mul_lo_u32 v20, v6, v9 +; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[10:11], v4, v9, v[11:12] ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] -; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v17, v10, v[0:1] ; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX8-NEXT: v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13] -; GFX8-NEXT: v_mov_b32_e32 v20, v11 -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[14:15], v16, v8, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, v22 +; GFX8-NEXT: v_addc_u32_e64 v10, s[12:13], 0, v4, s[12:13] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[16:17], v16, v9, v[1:2] ; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] -; GFX8-NEXT: v_addc_u32_e64 v11, s[12:13], 0, v2, s[12:13] -; GFX8-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v1, v8, v[20:21] -; GFX8-NEXT: v_addc_u32_e64 v3, s[12:13], v12, v3, s[12:13] -; GFX8-NEXT: v_mul_lo_u32 v0, v0, v15 -; 
GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], v26, v4, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v5, s[12:13], v11, v5, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v6, s[12:13], v25, v6, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v0, s[12:13], v17, v0, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v0, s[12:13], v0, v9, s[14:15] -; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v22, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v24, s[8:9] -; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v28, s[6:7] -; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc -; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, v10 +; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[11:12] +; GFX8-NEXT: v_addc_u32_e64 v10, s[12:13], 0, v10, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v17, v8, v[1:2] +; GFX8-NEXT: v_addc_u32_e64 v3, s[12:13], v11, v3, s[12:13] +; GFX8-NEXT: v_mul_lo_u32 v11, v16, v15 +; GFX8-NEXT: v_mul_lo_u32 v9, v17, v14 +; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], v25, v4, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v5, s[12:13], v10, v5, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v6, s[12:13], v24, v6, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v10, s[12:13], v21, v11, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v9, s[12:13], v10, v9, s[14:15] +; GFX8-NEXT: v_addc_u32_e64 v9, s[10:11], v9, v13, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v27, s[8:9] +; GFX8-NEXT: v_addc_u32_e64 v9, s[6:7], v9, v23, s[6:7] +; GFX8-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v26, s[4:5] +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v9, v20, vcc +; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i256: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 -; GFX9-NEXT: v_mad_u64_u32 v[16:17], 
s[4:5], v1, v13, v[16:17] -; GFX9-NEXT: v_mul_lo_u32 v28, v4, v11 -; GFX9-NEXT: v_mul_lo_u32 v27, v5, v10 -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] +; GFX9-NEXT: v_mov_b32_e32 v16, v0 +; GFX9-NEXT: v_mov_b32_e32 v17, v1 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v14, 0 +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v12, 0 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v13, v[0:1] +; GFX9-NEXT: v_mul_lo_u32 v27, v3, v12 +; GFX9-NEXT: v_mul_lo_u32 v26, v5, v10 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v12, v[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v11, v[18:19] ; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1] ; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] ; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v20, vcc ; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] -; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v20, vcc -; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v20, vcc ; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22] -; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v20, vcc -; GFX9-NEXT: v_mov_b32_e32 v20, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v22, vcc +; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v16, v10, 0 +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v6, v8, v[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v17, v9, v[22:23] +; GFX9-NEXT: v_mov_b32_e32 v1, v18 ; 
GFX9-NEXT: v_mov_b32_e32 v18, v19 -; GFX9-NEXT: v_mov_b32_e32 v19, v16 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] -; GFX9-NEXT: v_mul_lo_u32 v16, v6, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22] -; GFX9-NEXT: v_addc_co_u32_e64 v26, s[4:5], 0, v6, s[4:5] -; GFX9-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19] -; GFX9-NEXT: v_mov_b32_e32 v19, v22 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20] -; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24] -; GFX9-NEXT: v_mul_lo_u32 v24, v3, v12 -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[22:23] -; GFX9-NEXT: v_mul_lo_u32 v22, v2, v13 -; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[10:11], v4, v9, v[11:12] +; GFX9-NEXT: v_mov_b32_e32 v19, v20 +; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v16, v13, v[18:19] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[22:23] +; GFX9-NEXT: v_addc_co_u32_e64 v25, s[4:5], 0, v0, s[4:5] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v12, v[18:19] +; GFX9-NEXT: v_mov_b32_e32 v0, v23 +; GFX9-NEXT: v_mul_lo_u32 v23, v4, v11 +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v11, v[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19] +; GFX9-NEXT: v_mul_lo_u32 v13, v2, v13 +; GFX9-NEXT: v_mul_lo_u32 v20, v6, v9 +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[10:11], v4, v9, v[11:12] ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] -; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v17, v10, v[0:1] ; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], 0, v4, s[12:13] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX9-NEXT: v_addc_co_u32_e64 v2, s[12:13], 0, v4, s[12:13] -; GFX9-NEXT: v_mov_b32_e32 v20, v11 -; 
GFX9-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[14:15], v16, v8, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, v22 +; GFX9-NEXT: v_addc_co_u32_e64 v10, s[12:13], 0, v4, s[12:13] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[16:17], v16, v9, v[1:2] ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[12:13], 0, v2, s[12:13] -; GFX9-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v1, v8, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[12:13], v12, v3, s[12:13] -; GFX9-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], v26, v4, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v5, s[12:13], v11, v5, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[12:13], v25, v6, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[12:13], v17, v0, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[12:13], v0, v9, s[14:15] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v22, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v24, s[8:9] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v28, s[6:7] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v27, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v16, vcc -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, v10 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[11:12] +; GFX9-NEXT: v_addc_co_u32_e64 v10, s[12:13], 0, v10, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v17, v8, v[1:2] +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[12:13], v11, v3, s[12:13] +; GFX9-NEXT: v_mul_lo_u32 v11, v16, v15 +; GFX9-NEXT: v_mul_lo_u32 v9, v17, v14 +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], v25, v4, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[12:13], 
v10, v5, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[12:13], v24, v6, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v10, s[12:13], v21, v11, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v9, s[12:13], v10, v9, s[14:15] +; GFX9-NEXT: v_addc_co_u32_e64 v9, s[10:11], v9, v13, s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v9, s[8:9], v9, v27, s[8:9] +; GFX9-NEXT: v_addc_co_u32_e64 v9, s[6:7], v9, v23, s[6:7] +; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], v9, v26, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v20, vcc +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i256: diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -3606,136 +3606,136 @@ ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x60 -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x70 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: s_add_u32 s6, s2, 16 +; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s6 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[20:21] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 +; GCN-HSA-NEXT: 
s_add_u32 s4, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: s_add_u32 s6, s2, 32 -; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s8, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s9, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 64 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17] -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s8 -; GCN-HSA-NEXT: flat_load_dwordx4 v[32:35], v[32:33] -; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[20:21] +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s4 ; GCN-HSA-NEXT: flat_load_dwordx4 v[24:27], v[24:25] +; GCN-HSA-NEXT: flat_load_dwordx4 v[32:35], v[32:33] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xe0 ; GCN-HSA-NEXT: v_mov_b32_e32 v37, s1 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v36, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v29, 16, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v30, 0xffff, v13 -; GCN-HSA-NEXT: v_and_b32_e32 v28, 0xffff, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xf0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; GCN-HSA-NEXT: v_and_b32_e32 v30, 0xffff, v17 +; GCN-HSA-NEXT: v_and_b32_e32 v28, 0xffff, v16 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 +; 
GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[28:31] -; GCN-HSA-NEXT: v_mov_b32_e32 v37, s3 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v36, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 +; GCN-HSA-NEXT: v_mov_b32_e32 v37, s5 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v36, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xc0 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xd0 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GCN-HSA-NEXT: v_and_b32_e32 v30, 0xffff, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v28, 0xffff, v14 ; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xa0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GCN-HSA-NEXT: v_and_b32_e32 v30, 0xffff, v19 +; GCN-HSA-NEXT: v_and_b32_e32 v28, 0xffff, v18 ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[28:31] +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[28:31] ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v9 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v13 ; GCN-HSA-NEXT: v_mov_b32_e32 v29, s9 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v8 -; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v28, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v12 +; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v13 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v12 +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 +; GCN-HSA-NEXT: 
s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v14 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v14 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v11 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v10 -; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xffff, v11 -; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v10 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[11:14] +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s5 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[15:18] +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s7 ; GCN-HSA-NEXT: s_waitcnt vmcnt(9) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 -; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v6 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s5 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[7:10] -; GCN-HSA-NEXT: s_waitcnt vmcnt(10) +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6 +; GCN-HSA-NEXT: 
flat_store_dwordx4 v[30:31], v[12:15] +; GCN-HSA-NEXT: s_waitcnt vmcnt(8) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v6 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v1 ; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s4 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; GCN-HSA-NEXT: s_waitcnt vmcnt(8) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s3 -; GCN-HSA-NEXT: s_waitcnt vmcnt(11) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v17 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v16 -; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v17 -; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v9 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v19 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[0:3] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v11 +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v19 -; 
GCN-HSA-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v11 +; GCN-HSA-NEXT: v_and_b32_e32 v1, 0xffff, v10 ; GCN-HSA-NEXT: flat_store_dwordx4 v[5:6], v[1:4] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 @@ -3748,13 +3748,12 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-HSA-NEXT: s_waitcnt vmcnt(12) +; GCN-HSA-NEXT: s_waitcnt vmcnt(10) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v33 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v32 ; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v33 ; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_waitcnt vmcnt(10) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v25 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v24 ; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v25 @@ -4403,67 +4402,64 @@ ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x50 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 64 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s5 -; GCN-HSA-NEXT: 
v_mov_b32_e32 v16, s4 -; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s6, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s5 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s5 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[20:21] -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[28:29] ; GCN-HSA-NEXT: flat_load_dwordx4 v[32:35], v[32:33] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xe0 ; GCN-HSA-NEXT: v_mov_b32_e32 v37, s1 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v36, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 16, v13 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 16, v12 -; GCN-HSA-NEXT: v_bfe_i32 v26, v13, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v24, v12, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[24:27] -; GCN-HSA-NEXT: v_mov_b32_e32 v37, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v36, s2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 16, v17 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 16, v16 +; GCN-HSA-NEXT: v_bfe_i32 v26, v17, 0, 16 +; GCN-HSA-NEXT: 
v_bfe_i32 v24, v16, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 16, v15 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 16, v14 -; GCN-HSA-NEXT: v_bfe_i32 v26, v15, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v24, v14, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[24:27] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[24:27] -; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v9 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 16, v19 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 16, v18 +; GCN-HSA-NEXT: v_bfe_i32 v26, v19, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v24, v18, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[24:27] +; GCN-HSA-NEXT: v_mov_b32_e32 v37, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 @@ -4471,20 +4467,23 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v8 -; GCN-HSA-NEXT: v_bfe_i32 v14, v9, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v12, v8, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v36, s4 +; GCN-HSA-NEXT: s_waitcnt vmcnt(8) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v13 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v12 +; GCN-HSA-NEXT: v_bfe_i32 v18, v13, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v16, v12, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[16:19] ; GCN-HSA-NEXT: v_mov_b32_e32 v37, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v36, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v11 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v10 -; GCN-HSA-NEXT: v_bfe_i32 v13, v11, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v11, v10, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v15 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 
v16, 16, v14 +; GCN-HSA-NEXT: v_bfe_i32 v17, v15, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v15, v14, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[11:14] +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[15:18] ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 @@ -4493,45 +4492,45 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v38, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: s_waitcnt vmcnt(9) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v5 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v4 -; GCN-HSA-NEXT: v_bfe_i32 v10, v5, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v8, v4, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v6 -; GCN-HSA-NEXT: v_bfe_i32 v14, v7, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v12, v6, 0, 16 -; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v0 -; GCN-HSA-NEXT: v_bfe_i32 v6, v1, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v9 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v8 +; GCN-HSA-NEXT: v_bfe_i32 v14, v9, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v12, v8, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[12:15] -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v2 -; GCN-HSA-NEXT: v_bfe_i32 v10, v3, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[8:11] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v11 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v10 +; GCN-HSA-NEXT: v_bfe_i32 v18, v11, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v16, v10, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[16:19] +; GCN-HSA-NEXT: s_waitcnt vmcnt(10) +; 
GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v0 +; GCN-HSA-NEXT: v_bfe_i32 v10, v1, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v0, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v2 +; GCN-HSA-NEXT: v_bfe_i32 v14, v3, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 16 +; GCN-HSA-NEXT: s_waitcnt vmcnt(9) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v5 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v4 +; GCN-HSA-NEXT: v_bfe_i32 v2, v5, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v0, v4, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: s_waitcnt vmcnt(11) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v17 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v16 -; GCN-HSA-NEXT: v_bfe_i32 v2, v17, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v16, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[12:15] ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v19 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v18 -; GCN-HSA-NEXT: v_bfe_i32 v2, v19, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v18, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v6 +; GCN-HSA-NEXT: v_bfe_i32 v2, v7, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v0, v6, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -4551,20 +4550,20 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] -; GCN-HSA-NEXT: s_waitcnt vmcnt(12) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v29 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v28 -; GCN-HSA-NEXT: v_bfe_i32 v14, v29, 
0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v12, v28, 0, 16 +; GCN-HSA-NEXT: s_waitcnt vmcnt(11) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v33 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v32 +; GCN-HSA-NEXT: v_bfe_i32 v14, v33, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v12, v32, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v31 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v35 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v30 -; GCN-HSA-NEXT: v_bfe_i32 v10, v31, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v8, v30, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v34 +; GCN-HSA-NEXT: v_bfe_i32 v10, v35, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v34, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v21 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v20 @@ -4574,19 +4573,18 @@ ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_waitcnt vmcnt(14) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v33 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v32 -; GCN-HSA-NEXT: v_bfe_i32 v6, v33, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v4, v32, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v29 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v28 +; GCN-HSA-NEXT: v_bfe_i32 v6, v29, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v4, v28, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v35 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v31 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v34 -; GCN-HSA-NEXT: v_bfe_i32 v2, v35, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v34, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v30 +; GCN-HSA-NEXT: v_bfe_i32 v2, v31, 0, 16 +; GCN-HSA-NEXT: 
v_bfe_i32 v0, v30, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -6509,50 +6507,51 @@ ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v2 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v3 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v2 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v1 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, 0 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xffff, v6 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, 0xffff, v6 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v27, 0xffff, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v28, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v30, v21 -; 
GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v21 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, 0xffff, v7 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, 0xffff, v5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v29, v20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, v20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v20 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, 0 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, 0 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, 0 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 
offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64: @@ -6650,10 +6649,10 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v30, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, 0 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v3 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -450,101 +450,92 @@ ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GFX8-NEXT: v_mov_b32_e32 v6, 0 -; GFX8-NEXT: s_movk_i32 s4, 0x7f +; GFX8-NEXT: s_movk_i32 s0, 0x7f ; GFX8-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB1_2 Depth 2 ; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_mov_b32 s5, 0 +; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: .LBB1_2: ; %for.body ; GFX8-NEXT: ; Parent Loop BB1_1 
Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0xffffb000, v3 -; GFX8-NEXT: s_mov_b64 s[0:1], vcc -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0xffffb800, v3 -; GFX8-NEXT: s_mov_b64 s[2:3], vcc -; GFX8-NEXT: v_addc_u32_e64 v8, vcc, -1, v4, s[0:1] +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, -1, v4, vcc ; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[7:8] -; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0xffffc000, v3 -; GFX8-NEXT: s_mov_b64 s[0:1], vcc -; GFX8-NEXT: v_addc_u32_e64 v10, vcc, -1, v4, s[2:3] +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0xffffb800, v3 +; GFX8-NEXT: v_addc_u32_e32 v10, vcc, -1, v4, vcc ; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10] +; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0xffffc000, v3 +; GFX8-NEXT: v_addc_u32_e32 v12, vcc, -1, v4, vcc +; GFX8-NEXT: flat_load_dwordx2 v[11:12], v[11:12] ; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0xffffc800, v3 -; GFX8-NEXT: s_mov_b64 s[2:3], vcc -; GFX8-NEXT: v_addc_u32_e64 v12, vcc, -1, v4, s[0:1] -; GFX8-NEXT: s_addk_i32 s5, 0x2000 -; GFX8-NEXT: s_cmp_gt_u32 s5, 0x3fffff -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u32_e32 v15, vcc, v7, v5 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v8, v6, vcc -; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[11:12] -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0xffffd000, v3 -; GFX8-NEXT: s_mov_b64 s[0:1], vcc -; GFX8-NEXT: v_addc_u32_e64 v14, vcc, -1, v4, s[2:3] -; GFX8-NEXT: flat_load_dwordx2 v[11:12], v[13:14] -; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v15, vcc, v9, v15 -; GFX8-NEXT: v_addc_u32_e32 v10, vcc, v10, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0xffffd800, v3 -; GFX8-NEXT: s_mov_b64 s[2:3], vcc -; GFX8-NEXT: v_addc_u32_e64 v6, vcc, -1, v4, s[0:1] +; GFX8-NEXT: v_addc_u32_e32 v14, vcc, -1, v4, vcc +; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[13:14] +; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0xffffd000, v3 +; GFX8-NEXT: v_addc_u32_e32 v16, vcc, -1, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xffffd800, v3 +; GFX8-NEXT: v_addc_u32_e32 v18, vcc, -1, v4, 
vcc +; GFX8-NEXT: v_add_u32_e32 v19, vcc, 0xffffe000, v3 +; GFX8-NEXT: v_addc_u32_e32 v20, vcc, -1, v4, vcc +; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[15:16] +; GFX8-NEXT: flat_load_dwordx2 v[17:18], v[17:18] +; GFX8-NEXT: s_addk_i32 s1, 0x2000 +; GFX8-NEXT: s_cmp_gt_u32 s1, 0x3fffff +; GFX8-NEXT: s_waitcnt vmcnt(5) +; GFX8-NEXT: v_add_u32_e32 v21, vcc, v7, v5 +; GFX8-NEXT: v_addc_u32_e32 v22, vcc, v8, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0xffffe800, v3 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, -1, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0xfffff000, v3 +; GFX8-NEXT: flat_load_dwordx2 v[19:20], v[19:20] ; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6] -; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v13, vcc, v7, v15 -; GFX8-NEXT: v_addc_u32_e32 v8, vcc, v8, v10, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0xffffe000, v3 -; GFX8-NEXT: s_mov_b64 s[0:1], vcc -; GFX8-NEXT: v_addc_u32_e64 v10, vcc, -1, v4, s[2:3] -; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10] -; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v13, vcc, v11, v13 -; GFX8-NEXT: v_addc_u32_e32 v12, vcc, v12, v8, vcc -; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0xffffe800, v3 -; GFX8-NEXT: s_mov_b64 s[2:3], vcc -; GFX8-NEXT: v_addc_u32_e64 v8, vcc, -1, v4, s[0:1] -; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[7:8] -; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v13, vcc, v5, v13 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v6, v12, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0xfffff000, v3 -; GFX8-NEXT: s_mov_b64 s[0:1], vcc -; GFX8-NEXT: v_addc_u32_e64 v12, vcc, -1, v4, s[2:3] -; GFX8-NEXT: flat_load_dwordx2 v[11:12], v[11:12] -; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v13, vcc, v9, v13 -; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v10, v6, vcc -; GFX8-NEXT: v_addc_u32_e64 v6, s[0:1], -1, v4, s[0:1] +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, -1, v4, vcc +; GFX8-NEXT: s_waitcnt vmcnt(6) +; GFX8-NEXT: v_add_u32_e32 v21, vcc, v9, v21 +; GFX8-NEXT: v_addc_u32_e32 v22, vcc, v10, v22, vcc 
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0xfffff800, v3 -; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6] +; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[7:8] ; GFX8-NEXT: v_addc_u32_e32 v10, vcc, -1, v4, vcc ; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10] -; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_add_u32_e32 v13, vcc, v7, v13 -; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v8, v14, vcc -; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[3:4] +; GFX8-NEXT: s_waitcnt vmcnt(7) +; GFX8-NEXT: v_add_u32_e32 v21, vcc, v11, v21 +; GFX8-NEXT: v_addc_u32_e32 v22, vcc, v12, v22, vcc +; GFX8-NEXT: flat_load_dwordx2 v[11:12], v[3:4] ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x10000, v3 ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GFX8-NEXT: s_waitcnt vmcnt(7) +; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v21 +; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v14, v22, vcc +; GFX8-NEXT: s_waitcnt vmcnt(6) +; GFX8-NEXT: v_add_u32_e32 v13, vcc, v15, v13 +; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v16, v14, vcc +; GFX8-NEXT: s_waitcnt vmcnt(5) +; GFX8-NEXT: v_add_u32_e32 v13, vcc, v17, v13 +; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v18, v14, vcc +; GFX8-NEXT: s_waitcnt vmcnt(4) +; GFX8-NEXT: v_add_u32_e32 v13, vcc, v19, v13 +; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v20, v14, vcc ; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v13 -; GFX8-NEXT: v_addc_u32_e32 v12, vcc, v12, v14, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v13 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v6, v14, vcc ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v11 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v6, v12, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v8, v6, vcc ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v9, v5 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v10, v6, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v8, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v11, v5 +; GFX8-NEXT: 
v_addc_u32_e32 v6, vcc, v12, v6, vcc ; GFX8-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX8-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX8-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; GFX8-NEXT: s_add_i32 s0, s4, -1 -; GFX8-NEXT: s_cmp_eq_u32 s4, 0 +; GFX8-NEXT: s_add_i32 s1, s0, -1 +; GFX8-NEXT: s_cmp_eq_u32 s0, 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB1_5 ; GFX8-NEXT: ; %bb.4: ; in Loop: Header=BB1_1 Depth=1 -; GFX8-NEXT: s_mov_b32 s4, s0 +; GFX8-NEXT: s_mov_b32 s0, s1 ; GFX8-NEXT: s_branch .LBB1_1 ; GFX8-NEXT: .LBB1_5: ; %while.end ; GFX8-NEXT: v_mov_b32_e32 v1, s35 @@ -601,63 +592,61 @@ ; GFX900-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX900-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, 0xffffb000, v3 -; GFX900-NEXT: s_mov_b64 s[0:1], vcc -; GFX900-NEXT: v_addc_co_u32_e64 v8, s[0:1], -1, v4, s[0:1] +; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, -1, v4, vcc ; GFX900-NEXT: global_load_dwordx2 v[9:10], v[3:4], off offset:-4096 ; GFX900-NEXT: global_load_dwordx2 v[11:12], v[3:4], off offset:-2048 ; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, 0xffffc000, v3 ; GFX900-NEXT: global_load_dwordx2 v[7:8], v[7:8], off ; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, -1, v4, vcc +; GFX900-NEXT: global_load_dwordx2 v[17:18], v[13:14], off offset:-2048 +; GFX900-NEXT: global_load_dwordx2 v[19:20], v[13:14], off +; GFX900-NEXT: v_add_co_u32_e32 v15, vcc, s2, v3 +; GFX900-NEXT: v_addc_co_u32_e32 v16, vcc, -1, v4, vcc +; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, s3, v3 +; GFX900-NEXT: global_load_dwordx2 v[15:16], v[15:16], off offset:-2048 +; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, -1, v4, vcc ; GFX900-NEXT: s_addk_i32 s6, 0x2000 ; GFX900-NEXT: s_cmp_gt_u32 s6, 0x3fffff -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, v7, v5 -; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v6, vcc -; GFX900-NEXT: global_load_dwordx2 v[5:6], v[13:14], off offset:-2048 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v15, vcc, v5, v7 -; GFX900-NEXT: 
v_addc_co_u32_e32 v8, vcc, v6, v8, vcc -; GFX900-NEXT: global_load_dwordx2 v[6:7], v[13:14], off -; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, s2, v3 -; GFX900-NEXT: s_mov_b64 s[0:1], vcc -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, v6, v15 -; GFX900-NEXT: v_addc_co_u32_e64 v6, s[0:1], -1, v4, s[0:1] -; GFX900-NEXT: global_load_dwordx2 v[5:6], v[5:6], off offset:-2048 -; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, v7, v8, vcc -; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, s3, v3 -; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, -1, v4, vcc -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, v5, v13 -; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, v6, v14, vcc -; GFX900-NEXT: global_load_dwordx2 v[5:6], v[7:8], off offset:-4096 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, v5, v13 -; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, v6, v14, vcc -; GFX900-NEXT: global_load_dwordx2 v[5:6], v[7:8], off offset:-2048 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, v5, v13 -; GFX900-NEXT: global_load_dwordx2 v[7:8], v[7:8], off -; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, v6, v14, vcc +; GFX900-NEXT: s_waitcnt vmcnt(3) +; GFX900-NEXT: v_add_co_u32_e32 v21, vcc, v7, v5 +; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, v8, v6, vcc +; GFX900-NEXT: global_load_dwordx2 v[7:8], v[13:14], off offset:-4096 +; GFX900-NEXT: s_waitcnt vmcnt(3) +; GFX900-NEXT: v_add_co_u32_e64 v23, s[0:1], v17, v21 +; GFX900-NEXT: v_addc_co_u32_e64 v24, s[0:1], v18, v6, s[0:1] +; GFX900-NEXT: global_load_dwordx2 v[17:18], v[13:14], off offset:-2048 +; GFX900-NEXT: global_load_dwordx2 v[21:22], v[13:14], off ; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, s5, v3 ; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v4, vcc ; GFX900-NEXT: global_load_dwordx2 v[5:6], v[5:6], off offset:-2048 -; GFX900-NEXT: s_waitcnt vmcnt(1) -; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, v7, v13 -; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, v8, v14, vcc -; 
GFX900-NEXT: global_load_dwordx2 v[7:8], v[3:4], off +; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: v_add_co_u32_e32 v19, vcc, v19, v23 +; GFX900-NEXT: global_load_dwordx2 v[13:14], v[3:4], off +; GFX900-NEXT: v_addc_co_u32_e32 v20, vcc, v20, v24, vcc ; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, 0x10000, v3 ; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: v_add_co_u32_e32 v15, vcc, v15, v19 +; GFX900-NEXT: v_addc_co_u32_e32 v16, vcc, v16, v20, vcc +; GFX900-NEXT: s_waitcnt vmcnt(4) +; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, v7, v15 +; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v16, vcc +; GFX900-NEXT: s_waitcnt vmcnt(3) +; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, v17, v7 +; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, v18, v8, vcc +; GFX900-NEXT: s_waitcnt vmcnt(2) +; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, v21, v7 +; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, v22, v8, vcc ; GFX900-NEXT: s_waitcnt vmcnt(1) -; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, v5, v13 -; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v14, vcc +; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 +; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v8, vcc ; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, v9, v5 ; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, v10, v6, vcc ; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, v11, v5 ; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, v12, v6, vcc ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 -; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, v8, v6, vcc +; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, v13, v5 +; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, v14, v6, vcc ; GFX900-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX900-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX900-NEXT: ; in Loop: Header=BB1_1 Depth=1