diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -26,32 +26,36 @@ void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) { GenericScheduler::initialize(DAG); - const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI); - MF = &DAG->MF; const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); // FIXME: This is also necessary, because some passes that run after // scheduling and before regalloc increase register pressure. - const int ErrorMargin = 3; - - SGPRExcessLimit = Context->RegClassInfo - ->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass) - ErrorMargin; - VGPRExcessLimit = Context->RegClassInfo - ->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass) - ErrorMargin; - if (TargetOccupancy) { - SGPRCriticalLimit = ST.getMaxNumSGPRs(TargetOccupancy, true); - VGPRCriticalLimit = ST.getMaxNumVGPRs(TargetOccupancy); - } else { - SGPRCriticalLimit = SRI->getRegPressureSetLimit(DAG->MF, - AMDGPU::RegisterPressureSets::SReg_32); - VGPRCriticalLimit = SRI->getRegPressureSetLimit(DAG->MF, - AMDGPU::RegisterPressureSets::VGPR_32); - } - - SGPRCriticalLimit -= ErrorMargin; - VGPRCriticalLimit -= ErrorMargin; + const unsigned ErrorMargin = 3; + + SGPRExcessLimit = + Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass); + VGPRExcessLimit = + Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass); + + SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); + // Set the initial TargetOccupancy to the maximum occupancy that we can + // achieve for this function. This effectively sets a lower bound on the + // 'Critical' register limits in the scheduler. 
+ TargetOccupancy = MFI.getOccupancy(); + SGPRCriticalLimit = + std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit); + VGPRCriticalLimit = + std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit); + + // Subtract error margin from register limits and avoid overflow. + SGPRCriticalLimit = + std::min(SGPRCriticalLimit - ErrorMargin, SGPRCriticalLimit); + VGPRCriticalLimit = + std::min(VGPRCriticalLimit - ErrorMargin, VGPRCriticalLimit); + SGPRExcessLimit = std::min(SGPRExcessLimit - ErrorMargin, SGPRExcessLimit); + VGPRExcessLimit = std::min(VGPRExcessLimit - ErrorMargin, VGPRExcessLimit); } void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, @@ -361,14 +365,18 @@ LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n"); return; } - unsigned Occ = MFI.getOccupancy(); - unsigned WavesAfter = std::min(Occ, PressureAfter.getOccupancy(ST)); - unsigned WavesBefore = std::min(Occ, PressureBefore.getOccupancy(ST)); + + unsigned WavesAfter = + std::min(S.TargetOccupancy, PressureAfter.getOccupancy(ST)); + unsigned WavesBefore = + std::min(S.TargetOccupancy, PressureBefore.getOccupancy(ST)); LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore << ", after " << WavesAfter << ".\n"); - // We could not keep current target occupancy because of the just scheduled - // region. Record new occupancy for next scheduling cycle. + // We may not be able to keep the current target occupancy because of the just + // scheduled region. We might still be able to revert scheduling if the + // occupancy before was higher, or if the current schedule has register + // pressure higher than the excess limits which could lead to more spilling. unsigned NewOccupancy = std::max(WavesAfter, WavesBefore); // Allow memory bound functions to drop to 4 waves if not limited by an // attribute. 
@@ -378,6 +386,7 @@ << MFI.getMinAllowedOccupancy() << " waves\n"); NewOccupancy = WavesAfter; } + if (NewOccupancy < MinOccupancy) { MinOccupancy = NewOccupancy; MFI.limitOccupancy(MinOccupancy); @@ -394,6 +403,11 @@ RegionsWithHighRP[RegionIdx] = true; } + // If this condition is true, then either the occupancy before and after + // scheduling is the same, or we are allowing the occupancy to drop because + // the function is memory bound. Even if we are OK with the current occupancy, + // we still need to verify that we will not introduce any extra chance of + // spilling. if (WavesAfter >= MinOccupancy) { if (Stage == UnclusteredReschedule && !PressureAfter.less(ST, PressureBefore)) { @@ -540,7 +554,6 @@ } void GCNScheduleDAGMILive::finalizeSchedule() { - GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl; LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n"); LiveIns.resize(Regions.size()); @@ -586,8 +599,6 @@ dbgs() << "Retrying function scheduling with lowest recorded occupancy " << MinOccupancy << ".\n"); - - S.setTargetOccupancy(MinOccupancy); } } diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -3477,109 +3477,139 @@ ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, 0xffff ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, 
s[8:11], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:112 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(7) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(6) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v43, 16, v7 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, s0, v3 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v32, s0, v2 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v32, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, 0xffff +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v15 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v13 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v19 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v18 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, s0, v15 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, s0, v14 +; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dword v33, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v34, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v35, off, s[12:15], 0 offset:16 ; 
4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v38, s0, v1 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v36, s0, v0 +; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, s0, v13 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v35, 16, v5 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v42, s0, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v40, s0, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, s0, v5 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v32, s0, v4 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v11 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v10 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, s0, v11 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, s0, v10 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v46, s0, v9 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v44, s0, v8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, s0, v15 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, s0, v14 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v50, s0, v13 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v48, s0, v12 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 16, v19 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v18 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v55, 16, v17 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, s0, v19 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, s0, v18 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v54, s0, v17 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v52, s0, v16 -; 
GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v23 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v22 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v59, 16, v21 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v57, 16, v20 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, s0, v23 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, s0, v22 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v58, s0, v21 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v56, s0, v20 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 16, v27 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v26 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v63, 16, v25 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v61, 16, v24 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, s0, v27 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, s0, v26 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v62, s0, v25 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v60, s0, v24 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, s0, v12 +; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-SI-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v16 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, s0, v19 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, s0, v18 +; GCN-NOHSA-SI-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-SI-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:48 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, 
s[8:11], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, s0, v17 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, s0, v16 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v27 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v26 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 16, v25 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v24 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, s0, v27 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, s0, v26 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, s0, v25 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, s0, v24 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v27, 16, v31 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v30 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v29 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v32, 16, v28 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, s0, v31 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, s0, v30 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, s0, v29 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, s0, v28 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v33, s0, v29 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v31, s0, v28 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:80 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v46, 16, v38 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v44, 16, v37 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v50, 16, v36 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v48, 16, v35 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v45, s0, v38 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v43, s0, v37 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v49, s0, v36 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v47, s0, v35 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v38, 16, v42 +; 
GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v36, 16, v41 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v54, 16, v40 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v52, 16, v39 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v37, s0, v42 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v35, s0, v41 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[55:58], off, s[8:11], 0 offset:96 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v53, s0, v40 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v51, s0, v39 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:112 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v61, 16, v58 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v59, 16, v57 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v56 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v55 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v60, s0, v58 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v58, s0, v57 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, s0, v56 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, s0, v55 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v42 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v41 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v40 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v39 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, s0, v42 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, s0, v41 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, s0, v40 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, s0, v39 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 -; 
GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[58:61], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:48 ; 4-byte Folded 
Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload @@ -3785,122 +3815,109 @@ ; ; GCN-NOHSA-VI-LABEL: global_zextload_v64i16_to_v64i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s90, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s91, 0xe80000 -; GCN-NOHSA-VI-NEXT: s_add_u32 s88, s88, s3 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 
v[20:23], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, 0xffff -; GCN-NOHSA-VI-NEXT: s_addc_u32 s89, s89, 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[32:35], off, s[4:7], 0 offset:48 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, 0xffff +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(7) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v63, 16, v7 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v62, s4, v7 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v61, 16, v6 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v60, s4, v6 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, s4, v5 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s4, v4 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v19 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, s0, v19 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v18 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s0, v18 -; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: 
buffer_store_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[56:59], off, s[8:11], 0 offset:112 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, s0, v17 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, s0, v16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v35, 16, v23 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v34, s0, v23 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, s0, v22 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v23 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, s4, v23 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s4, v22 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, s0, v21 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, s4, v21 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, s0, v20 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, s4, v20 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(2) ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v39, 16, v27 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v38, s0, v27 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v38, s4, v27 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, s0, v26 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, s4, v26 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, s0, v25 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, s4, v25 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, s0, v24 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, s4, v24 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v43, 16, v31 -; 
GCN-NOHSA-VI-NEXT: v_and_b32_e32 v42, s0, v31 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v42, s4, v31 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v41, 16, v30 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v40, s0, v30 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v40, s4, v30 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v30, s0, v29 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v30, s4, v29 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, s0, v28 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v47, 16, v15 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v46, s0, v15 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v44, s0, v14 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, s4, v28 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v47, 16, v35 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v46, s4, v35 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v45, 16, v34 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v44, s4, v34 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v35, 16, v33 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v34, s4, v33 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, s4, v32 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v50, s4, v19 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, s4, v18 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, s4, v17 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, s4, v16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v54, s4, v15 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v52, s4, v14 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, s0, v13 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, s4, v13 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 
v13, 16, v12 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, s0, v12 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v50, s0, v11 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, s0, v10 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, s4, v12 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v59, 16, v11 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v58, s4, v11 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v57, 16, v10 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v56, s4, v10 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, s0, v9 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, s4, v9 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, s0, v8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v54, s0, v7 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v53, 16, v6 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v52, s0, v6 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, s0, v5 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s0, v4 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v61, 16, v59 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v60, s0, v59 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v59, 16, v58 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v58, s0, v58 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v57 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, s0, v57 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s0, v56 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[58:61], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 
v[8:11], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, s4, v8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: 
buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -4265,38 +4282,40 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v64i16_to_v64i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s3 -; GCN-NOHSA-SI-NEXT: s_addc_u32 s13, s13, 0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, 0xe8f000 +; GCN-NOHSA-SI-NEXT: s_add_u32 s8, s8, s3 +; GCN-NOHSA-SI-NEXT: s_addc_u32 s9, s9, 0 ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:64 
-; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[36:39], off, s[8:11], 0 offset:48 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s6 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s7 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, s2 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, s3 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[32:35], off, s[4:7], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[36:39], off, s[4:7], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v11 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v10 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v11, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v10, 0, 16 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: 
buffer_store_dword v2, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 16, v9 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 16, v8 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v9, 0, 16 @@ -4358,8 +4377,6 @@ ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v12 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v13, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v12, 0, 16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:192 @@ -4375,10 +4392,10 @@ ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_endpgm @@ -4589,27 +4606,28 @@ ; GCN-NOHSA-VI-NEXT: s_add_u32 s88, s88, s3 ; 
GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s89, s89, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[32:35], off, s[4:7], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[36:39], off, s[4:7], 0 offset:48 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s89, s89, 0 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(6) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v59, 16, v13 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v63, 16, v17 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(5) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v55, 16, v17 +; 
GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v59, 16, v21 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(4) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v51, 16, v21 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v55, 16, v25 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v11 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v10 @@ -4620,22 +4638,18 @@ ; GCN-NOHSA-VI-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill ; GCN-NOHSA-VI-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill ; GCN-NOHSA-VI-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v9 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v9, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v8, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 16, v27 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 16, v26 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v27, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v26, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v39, 16, v25 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v37, 16, v24 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v38, v25, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v36, v24, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 16, v31 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v25, 16, v30 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v31, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v30, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v13 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v12 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v13, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v12, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 16, v9 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 16, v8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v9, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 16, v31 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 16, v30 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v31, 0, 16 +; 
GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v30, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v43, 16, v29 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v41, 16, v28 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v42, v29, 0, 16 @@ -4648,62 +4662,54 @@ ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v45, 16, v32 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v46, v33, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v44, v32, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v35, 16, v23 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v33, 16, v22 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v34, v23, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v32, v22, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v49, 16, v20 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v50, v21, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v48, v20, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v35, 16, v39 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v33, 16, v38 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v34, v39, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v32, v38, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v51, 16, v37 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v49, 16, v36 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v50, v37, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v48, v36, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v39, 16, v27 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v37, 16, v26 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v38, v27, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v36, v26, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v53, 16, v24 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v54, v25, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v52, v24, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 16, v23 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v24, 16, v22 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v23, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v22, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v57, 16, v20 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v58, v21, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v56, v20, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 16, v19 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 16, v18 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v19, 0, 16 ; GCN-NOHSA-VI-NEXT: 
v_bfe_i32 v19, v18, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v53, 16, v16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v54, v17, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v52, v16, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v61, 16, v16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v62, v17, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v60, v16, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 16, v15 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 16, v14 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v15, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v14, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v57, 16, v12 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v58, v13, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v56, v12, 0, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[60:63], off, s[8:11], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v61 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v60 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v61, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v60, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 16, v63 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 16, v62 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v63, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v62, 0, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:160 -; 
GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:32 ; 
4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload ; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload ; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll --- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll @@ -1,8 +1,17 @@ -; RUN: llc -march=amdgcn -mcpu=tonga -enable-amdgpu-aa=0 -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -enable-amdgpu-aa=0 -misched=gcn-ilp -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=MISCHED %s +; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-ilp -verify-machineinstrs < %s | FileCheck --check-prefix=GCN-ILP %s + +; Test the scheduler when only one wave is requested. The result should be high register usage and max ILP. ; We expect a three digit VGPR usage here since only one wave requested. -; CHECK: NumVgprs: {{[0-9][0-9][0-9]$}} +; +; GCN-ILP: NumVgprs: {{[0-9][0-9][0-9]$}} + +; FIXME: The machine scheduler is doing a poor job at maximizing ILP here. +; However, if we had not requested only one wave, register usage would indeed +; be much lower, which is what this test is meant to demonstrate. 
+; +; MISCHED: NumVgprs: {{[7-9][0-9]$}} define amdgpu_kernel void @load_fma_store(float addrspace(3)* nocapture readonly %arg, float addrspace(1)* nocapture %arg1) #1 { bb: diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-misched-max-waves.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-misched-max-waves.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-misched-max-waves.ll @@ -0,0 +1,110 @@ +; REQUIRES: asserts + +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -debug-only=machine-scheduler -o /dev/null < %s 2>&1 | FileCheck %s + +; We are only targeting one wave. Check that the machine scheduler doesn't use +; register pressure heuristics to prioritize any candidate instruction. + +; CHECK-NOT: REG-CRIT +; CHECK-NOT: REG-EXCESS + +define amdgpu_kernel void @load_fma_store(float addrspace(3)* nocapture readonly %arg, float addrspace(3)* nocapture %arg1) #1 { +bb: + %tmp0 = getelementptr inbounds float, float addrspace(3)* %arg, i32 1 + %tmp1 = load float, float addrspace(3)* %tmp0, align 4 + %tmp2 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2 + %tmp3 = load float, float addrspace(3)* %tmp2, align 4 + %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 3 + %tmp5 = load float, float addrspace(3)* %tmp4, align 4 + %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 4 + %tmp7 = load float, float addrspace(3)* %tmp6, align 4 + %tmp8 = getelementptr inbounds float, float addrspace(3)* %arg, i32 5 + %tmp9 = load float, float addrspace(3)* %tmp8, align 4 + %tmp10 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6 + %tmp11 = load float, float addrspace(3)* %tmp10, align 4 + %tmp12 = getelementptr inbounds float, float addrspace(3)* %arg, i32 7 + %tmp13 = load float, float addrspace(3)* %tmp12, align 4 + %tmp14 = getelementptr inbounds float, float addrspace(3)* %arg, i32 8 + %tmp15 = load float, float addrspace(3)* %tmp14, align 4 + %tmp16 = 
getelementptr inbounds float, float addrspace(3)* %arg, i32 9 + %tmp17 = load float, float addrspace(3)* %tmp16, align 4 + %tmp18 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10 + %tmp19 = load float, float addrspace(3)* %tmp18, align 4 + %tmp20 = getelementptr inbounds float, float addrspace(3)* %arg, i32 11 + %tmp21 = load float, float addrspace(3)* %tmp20, align 4 + %tmp22 = getelementptr inbounds float, float addrspace(3)* %arg, i32 12 + %tmp23 = load float, float addrspace(3)* %tmp22, align 4 + %tmp24 = getelementptr inbounds float, float addrspace(3)* %arg, i32 13 + %tmp25 = load float, float addrspace(3)* %tmp24, align 4 + %tmp26 = getelementptr inbounds float, float addrspace(3)* %arg, i32 14 + %tmp27 = load float, float addrspace(3)* %tmp26, align 4 + %tmp28 = getelementptr inbounds float, float addrspace(3)* %arg, i32 15 + %tmp29 = load float, float addrspace(3)* %tmp28, align 4 + %tmp30 = getelementptr inbounds float, float addrspace(3)* %arg, i32 16 + %tmp31 = load float, float addrspace(3)* %tmp30, align 4 + %tmp32 = getelementptr inbounds float, float addrspace(3)* %arg, i32 17 + %tmp33 = load float, float addrspace(3)* %tmp32, align 4 + %tmp34 = getelementptr inbounds float, float addrspace(3)* %arg, i32 18 + %tmp35 = load float, float addrspace(3)* %tmp34, align 4 + %tmp36 = getelementptr inbounds float, float addrspace(3)* %arg, i32 19 + %tmp37 = load float, float addrspace(3)* %tmp36, align 4 + %tmp38 = getelementptr inbounds float, float addrspace(3)* %arg, i32 20 + %tmp39 = load float, float addrspace(3)* %tmp38, align 4 + %tmp40 = getelementptr inbounds float, float addrspace(3)* %arg, i32 21 + %tmp41 = load float, float addrspace(3)* %tmp40, align 4 + %tmp42 = getelementptr inbounds float, float addrspace(3)* %arg, i32 22 + %tmp43 = load float, float addrspace(3)* %tmp42, align 4 + %tmp44 = getelementptr inbounds float, float addrspace(3)* %arg, i32 23 + %tmp45 = load float, float addrspace(3)* %tmp44, align 4 + %tmp46 = 
getelementptr inbounds float, float addrspace(3)* %arg, i32 24 + %tmp47 = load float, float addrspace(3)* %tmp46, align 4 + %tmp48 = getelementptr inbounds float, float addrspace(3)* %arg, i32 25 + %tmp49 = load float, float addrspace(3)* %tmp48, align 4 + %tmp50 = getelementptr inbounds float, float addrspace(3)* %arg, i32 26 + %tmp51 = load float, float addrspace(3)* %tmp50, align 4 + %tmp52 = getelementptr inbounds float, float addrspace(3)* %arg, i32 27 + %tmp53 = load float, float addrspace(3)* %tmp52, align 4 + %tmp54 = getelementptr inbounds float, float addrspace(3)* %arg, i32 28 + %tmp55 = load float, float addrspace(3)* %tmp54, align 4 + %tmp56 = getelementptr inbounds float, float addrspace(3)* %arg, i32 29 + %tmp57 = load float, float addrspace(3)* %tmp56, align 4 + %tmp58 = getelementptr inbounds float, float addrspace(3)* %arg, i32 30 + %tmp59 = load float, float addrspace(3)* %tmp58, align 4 + %tmp60 = tail call float @llvm.fmuladd.f32(float %tmp1, float %tmp3, float %tmp5) + %tmp61 = tail call float @llvm.fmuladd.f32(float %tmp7, float %tmp9, float %tmp11) + %tmp62 = tail call float @llvm.fmuladd.f32(float %tmp13, float %tmp15, float %tmp17) + %tmp63 = tail call float @llvm.fmuladd.f32(float %tmp19, float %tmp21, float %tmp23) + %tmp64 = tail call float @llvm.fmuladd.f32(float %tmp25, float %tmp27, float %tmp29) + %tmp65 = tail call float @llvm.fmuladd.f32(float %tmp31, float %tmp33, float %tmp35) + %tmp66 = tail call float @llvm.fmuladd.f32(float %tmp37, float %tmp39, float %tmp41) + %tmp67 = tail call float @llvm.fmuladd.f32(float %tmp43, float %tmp45, float %tmp47) + %tmp68 = tail call float @llvm.fmuladd.f32(float %tmp49, float %tmp51, float %tmp53) + %tmp69 = tail call float @llvm.fmuladd.f32(float %tmp55, float %tmp57, float %tmp59) + %tmp70 = getelementptr inbounds float, float addrspace(3)* %arg1, i64 1 + store float %tmp60, float addrspace(3)* %tmp70, align 4 + %tmp71 = getelementptr inbounds float, float addrspace(3)* %arg1, i64 2 + store 
float %tmp61, float addrspace(3)* %tmp71, align 4 + %tmp72 = getelementptr inbounds float, float addrspace(3)* %arg1, i64 3 + store float %tmp62, float addrspace(3)* %tmp72, align 4 + %tmp73 = getelementptr inbounds float, float addrspace(3)* %arg1, i64 4 + store float %tmp63, float addrspace(3)* %tmp73, align 4 + %tmp74 = getelementptr inbounds float, float addrspace(3)* %arg1, i64 5 + store float %tmp64, float addrspace(3)* %tmp74, align 4 + %tmp75 = getelementptr inbounds float, float addrspace(3)* %arg1, i64 6 + store float %tmp65, float addrspace(3)* %tmp75, align 4 + %tmp76 = getelementptr inbounds float, float addrspace(3)* %arg1, i64 7 + store float %tmp66, float addrspace(3)* %tmp76, align 4 + %tmp77 = getelementptr inbounds float, float addrspace(3)* %arg1, i64 8 + store float %tmp67, float addrspace(3)* %tmp77, align 4 + %tmp78 = getelementptr inbounds float, float addrspace(3)* %arg1, i64 9 + store float %tmp68, float addrspace(3)* %tmp78, align 4 + %tmp79 = getelementptr inbounds float, float addrspace(3)* %arg1, i64 10 + store float %tmp69, float addrspace(3)* %tmp79, align 4 + ret void +} + +; Function Attrs: nounwind readnone +declare float @llvm.fmuladd.f32(float, float, float) #0 + +attributes #0 = { nounwind readnone } +attributes #1 = { "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" } diff --git a/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir b/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir --- a/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir @@ -7,10 +7,10 @@ # CHECK-LABEL: name: expecting_non_empty_interval -# CHECK: undef %7.sub1:vreg_64 = V_MAC_F32_e32 0, undef %1:vgpr_32, undef %7.sub1, implicit $mode, implicit $exec -# CHECK-NEXT: SI_SPILL_V64_SAVE %7, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) -# CHECK-NEXT: undef %5.sub1:vreg_64 = V_MOV_B32_e32 1786773504, implicit $exec +# CHECK: undef 
%5.sub1:vreg_64 = V_MOV_B32_e32 1786773504, implicit $exec # CHECK-NEXT: dead %3:vgpr_32 = V_MUL_F32_e32 0, %5.sub1, implicit $mode, implicit $exec +# CHECK-NEXT: undef %7.sub1:vreg_64 = V_MAC_F32_e32 0, undef %1:vgpr_32, undef %7.sub1, implicit $mode, implicit $exec +# CHECK-NEXT: SI_SPILL_V64_SAVE %7, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) # CHECK: S_NOP 0, implicit %6.sub1 # CHECK-NEXT: %8:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) @@ -47,9 +47,9 @@ # CHECK: bb.1: # CHECK-NEXT: S_NOP 0, implicit %1.sub2 -# CHECK-NEXT: S_NOP 0, implicit undef %4.sub0 # CHECK-NEXT: undef %2.sub2:vreg_128 = V_MOV_B32_e32 0, implicit $exec # CHECK-NEXT: S_NOP 0, implicit %2.sub2 +# CHECK-NEXT: S_NOP 0, implicit undef %4.sub0 name: rematerialize_empty_interval_has_reference tracksRegLiveness: true machineFunctionInfo: