Index: llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
===================================================================
--- llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -64,6 +64,14 @@
 
 class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
 
+  enum : unsigned {
+    Collect,
+    InitialSchedule,
+    UnclusteredReschedule,
+    ClusteredLowOccupancyReschedule,
+    LastStage = ClusteredLowOccupancyReschedule
+  };
+
   const GCNSubtarget &ST;
 
   SIMachineFunctionInfo &MFI;
@@ -84,6 +92,10 @@
   SmallVector<std::pair<MachineBasicBlock::iterator,
                         MachineBasicBlock::iterator>, 32> Regions;
 
+  // Records regions that are not yet scheduled, whose schedule has been
+  // reverted, or that we otherwise want to reschedule.
+  BitVector RescheduleRegions;
+
   // Region live-in cache.
   SmallVector<GCNRPTracker::LiveRegSet, 32> LiveIns;
Index: llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -316,13 +316,13 @@
     ST(MF.getSubtarget<GCNSubtarget>()),
     MFI(*MF.getInfo<SIMachineFunctionInfo>()),
     StartingOccupancy(MFI.getOccupancy()),
-    MinOccupancy(StartingOccupancy), Stage(0), RegionIdx(0) {
+    MinOccupancy(StartingOccupancy), Stage(Collect), RegionIdx(0) {
 
   LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
 }
 
 void GCNScheduleDAGMILive::schedule() {
-  if (Stage == 0) {
+  if (Stage == Collect) {
     // Just record regions at the first pass.
     Regions.push_back(std::make_pair(RegionBegin, RegionEnd));
     return;
@@ -348,6 +348,7 @@
 
   ScheduleDAGMILive::schedule();
   Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
+  RescheduleRegions[RegionIdx] = false;
 
   if (!LIS)
     return;
@@ -389,20 +390,28 @@
               << MinOccupancy << ".\n");
   }
 
+  unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
+  unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
+  if (PressureAfter.getVGPRNum() > MaxVGPRs ||
+      PressureAfter.getSGPRNum() > MaxSGPRs)
+    RescheduleRegions[RegionIdx] = true;
+
   if (WavesAfter >= MinOccupancy) {
-    unsigned TotalVGPRs = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST);
-    unsigned TotalSGPRs = AMDGPU::IsaInfo::getAddressableNumSGPRs(&ST);
-    if (WavesAfter > MFI.getMinWavesPerEU() ||
+    if (Stage == UnclusteredReschedule &&
+        !PressureAfter.less(ST, PressureBefore)) {
+      LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
+    } else if (WavesAfter > MFI.getMinWavesPerEU() ||
         PressureAfter.less(ST, PressureBefore) ||
-        (TotalVGPRs >= PressureAfter.getVGPRNum() &&
-         TotalSGPRs >= PressureAfter.getSGPRNum())) {
+        !RescheduleRegions[RegionIdx]) {
       Pressure[RegionIdx] = PressureAfter;
       return;
+    } else {
+      LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
     }
-    LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
   }
 
   LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
+  RescheduleRegions[RegionIdx] = true;
   RegionEnd = RegionBegin;
   for (MachineInstr *MI : Unsched) {
     if (MI->isDebugInstr())
@@ -532,33 +541,55 @@
   LiveIns.resize(Regions.size());
   Pressure.resize(Regions.size());
+  RescheduleRegions.resize(Regions.size());
+  RescheduleRegions.set();
 
   if (!Regions.empty())
     BBLiveInMap = getBBLiveInMap();
 
+  std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;
+
   do {
     Stage++;
     RegionIdx = 0;
     MachineBasicBlock *MBB = nullptr;
 
-    if (Stage > 1) {
+    if (Stage > InitialSchedule) {
+      if (!LIS)
+        break;
+
       // Retry function scheduling if we found resulting occupancy and it is
       // lower than used for first pass scheduling. This will give more freedom
       // to schedule low register pressure blocks.
       // Code is partially copied from MachineSchedulerBase::scheduleRegions().
 
-      if (!LIS || StartingOccupancy <= MinOccupancy)
-        break;
+      if (Stage == UnclusteredReschedule) {
+        if (RescheduleRegions.none())
+          continue;
+        LLVM_DEBUG(dbgs() <<
+          "Retrying function scheduling without clustering.\n");
+      }
+
+      if (Stage == ClusteredLowOccupancyReschedule) {
+        if (StartingOccupancy <= MinOccupancy)
+          break;
 
-      LLVM_DEBUG(
-          dbgs()
-          << "Retrying function scheduling with lowest recorded occupancy "
-          << MinOccupancy << ".\n");
+        LLVM_DEBUG(
+            dbgs()
+            << "Retrying function scheduling with lowest recorded occupancy "
+            << MinOccupancy << ".\n");
 
-      S.setTargetOccupancy(MinOccupancy);
+        S.setTargetOccupancy(MinOccupancy);
+      }
     }
 
+    if (Stage == UnclusteredReschedule)
+      SavedMutations.swap(Mutations);
+
     for (auto Region : Regions) {
+      if (Stage == UnclusteredReschedule && !RescheduleRegions[RegionIdx])
+        continue;
+
       RegionBegin = Region.first;
       RegionEnd = Region.second;
@@ -566,7 +597,7 @@
         if (MBB) finishBlock();
         MBB = RegionBegin->getParent();
         startBlock(MBB);
-        if (Stage == 1)
+        if (Stage == InitialSchedule)
           computeBlockPressure(MBB);
       }
@@ -594,5 +625,7 @@
     }
     finishBlock();
 
-  } while (Stage < 2);
+    if (Stage == UnclusteredReschedule)
+      SavedMutations.swap(Mutations);
+  } while (Stage != LastStage);
 }
Index: llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll
@@ -0,0 +1,36 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Interleave loads and stores to fit into the 9-VGPR limit.
+; This requires avoiding load/store clustering.
+
+; GCN: global_load_dwordx4
+; GCN: global_store_dwordx4
+; GCN: global_load_dwordx4
+; GCN: global_store_dwordx4
+; GCN: global_load_dwordx4
+; GCN: global_store_dwordx4
+; GCN: NumVgprs: {{[0-9]$}}
+; GCN: ScratchSize: 0{{$}}
+
+define amdgpu_kernel void @load_store_max_9vgprs(<4 x i32> addrspace(1)* nocapture noalias readonly %arg, <4 x i32> addrspace(1)* nocapture noalias %arg1) #1 {
+bb:
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  %base = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i32 %id
+  %tmp = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 1
+  %tmp2 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp, align 4
+  %tmp3 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 3
+  %tmp4 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp3, align 4
+  %tmp5 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 5
+  %tmp6 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp5, align 4
+  store <4 x i32> %tmp2, <4 x i32> addrspace(1)* %arg1, align 4
+  %tmp7 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 3
+  store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %tmp7, align 4
+  %tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 5
+  store <4 x i32> %tmp6, <4 x i32> addrspace(1)* %tmp8, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { "amdgpu-num-vgpr"="9" }
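
Reviewer note: the interplay between the stage enum, the RescheduleRegions bitmask, and the do/while driver in finalizeSchedule() is easier to see in isolation. The following is a minimal standalone C++ sketch of that control flow, not the LLVM code itself; NumRegions, exceedsPressureLimit(), and the printed trace are invented stand-ins for the real scheduling regions and the VGPR/SGPR pressure checks.

// Standalone sketch (not the LLVM sources): models the staged driver loop
// and the RescheduleRegions mask. NumRegions and exceedsPressureLimit() are
// invented stand-ins for real scheduling regions and pressure checks.
#include <bitset>
#include <cstdio>

enum : unsigned {
  Collect,                         // first pass: only record regions
  InitialSchedule,                 // normal scheduling, clustering enabled
  UnclusteredReschedule,           // retry dirty regions without clustering
  ClusteredLowOccupancyReschedule, // retry at the lowered occupancy target
  LastStage = ClusteredLowOccupancyReschedule
};

constexpr unsigned NumRegions = 3;

// Pretend region 1 exceeds the register budget until clustering is dropped.
static bool exceedsPressureLimit(unsigned Region, unsigned Stage) {
  return Region == 1 && Stage < UnclusteredReschedule;
}

int main() {
  std::bitset<NumRegions> RescheduleRegions;
  RescheduleRegions.set(); // every region starts out unscheduled, i.e. dirty

  unsigned Stage = Collect;
  do {
    ++Stage;
    // Nothing dirty: 'continue' jumps straight to the loop test and on to
    // the next stage, like the early 'continue' in finalizeSchedule().
    if (Stage == UnclusteredReschedule && RescheduleRegions.none())
      continue;

    for (unsigned I = 0; I < NumRegions; ++I) {
      // The unclustered retry only revisits regions marked dirty.
      if (Stage == UnclusteredReschedule && !RescheduleRegions[I])
        continue;
      // "Schedule" the region: the bit stays set only if the result would
      // still exceed the register budget and the schedule gets reverted.
      RescheduleRegions[I] = exceedsPressureLimit(I, Stage);
      std::printf("stage %u: region %u is %s\n", Stage, I,
                  RescheduleRegions[I] ? "dirty" : "clean");
    }
  } while (Stage != LastStage);
  return 0;
}

In the actual patch, the UnclusteredReschedule stage additionally swaps the clustering DAG mutations out via SavedMutations.swap(Mutations) before visiting the dirty regions and swaps them back at the end of the stage; that is what lets the loads and stores interleave in the test above.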