diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -351,6 +351,13 @@
   BitVector getAllocatableSet(const MachineFunction &MF,
                               const TargetRegisterClass *RC = nullptr) const;
 
+  /// Returns one bitset per register class in \p RCs, in the same order. Each
+  /// bitset is indexed by register number; a set bit marks a register that is
+  /// allocatable for that class and not reserved in \p MF.
+  SmallVector<BitVector>
+  getAllocatableSets(const MachineFunction &MF,
+                     SmallVectorImpl<const TargetRegisterClass *> &RCs) const;
+
   /// Get a list of cost values for all registers that correspond to the index
   /// returned by RegisterCostTableIndex.
   ArrayRef<uint8_t> getRegisterCosts(const MachineFunction &MF) const {
diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
--- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
@@ -273,6 +273,24 @@
   return Allocatable;
 }
 
+SmallVector<BitVector> TargetRegisterInfo::getAllocatableSets(
+    const MachineFunction &MF,
+    SmallVectorImpl<const TargetRegisterClass *> &RCs) const {
+  SmallVector<BitVector> AllocatableSets;
+  BitVector NotReserved = getReservedRegs(MF).flip(); // Shared by all RCs.
+  for (const TargetRegisterClass *RC : RCs) {
+    BitVector &Allocatable = AllocatableSets.emplace_back(getNumRegs());
+    // The set stays empty when RC has no allocatable subclass (null SubClass).
+    const TargetRegisterClass *SubClass = getAllocatableClass(RC);
+    if (SubClass)
+      getAllocatableSetForRC(MF, SubClass, Allocatable);
+    // Mask out the reserved registers; NotReserved holds their complement.
+    Allocatable &= NotReserved;
+  }
+
+  return AllocatableSets;
+}
+
 static inline
 const TargetRegisterClass *firstCommonClass(const uint32_t *A,
                                             const uint32_t *B,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1169,7 +1169,11 @@
 
   if (OptExecMaskPreRA)
     insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
-  insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
+
+  // This is not an essential optimization and it has a noticeable impact on
+  // compilation time, so it is only enabled at -O2 and above.
+  if (TM->getOptLevel() > CodeGenOpt::Less)
+    insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
 
   // This must be run immediately after phi elimination and before
   // TwoAddressInstructions, otherwise the processing of the tied operand of
diff --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
--- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@@ -273,8 +273,12 @@
   SlotIndexes *Ind = LIS->getSlotIndexes();
   bool Changed = false;
 
-  MaxVGPRs = TRI->getAllocatableSet(MF, &AMDGPU::VGPR_32RegClass).count();
-  MaxSGPRs = TRI->getAllocatableSet(MF, &AMDGPU::SGPR_32RegClass).count();
+  SmallVector<const TargetRegisterClass *> RCs(
+      {&AMDGPU::VGPR_32RegClass, &AMDGPU::SGPR_32RegClass});
+  SmallVector<BitVector, 2> AllocatableSets = TRI->getAllocatableSets(MF, RCs);
+  MaxVGPRs = AllocatableSets[0].count();
+  MaxSGPRs = AllocatableSets[1].count();
+
   unsigned FuncMaxClause = AMDGPU::getIntegerAttribute(
       MF.getFunction(), "amdgpu-max-memory-clause", MaxClause);
 
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -349,7 +349,6 @@
 ; GCN-O1-NEXT:         Live Register Matrix
 ; GCN-O1-NEXT:         SI Pre-allocate WWM Registers
 ; GCN-O1-NEXT:         SI optimize exec mask operations pre-RA
-; GCN-O1-NEXT:         SI Form memory clauses
 ; GCN-O1-NEXT:         Machine Natural Loop Construction
 ; GCN-O1-NEXT:         Machine Block Frequency Analysis
 ; GCN-O1-NEXT:         Debug Variable Analysis
@@ -632,7 +631,6 @@
 ; GCN-O1-OPTS-NEXT:         Live Register Matrix
 ; GCN-O1-OPTS-NEXT:         SI Pre-allocate WWM Registers
 ; GCN-O1-OPTS-NEXT:         SI optimize exec mask operations pre-RA
-; GCN-O1-OPTS-NEXT:         SI Form memory clauses
 ; GCN-O1-OPTS-NEXT:         Machine Natural Loop Construction
 ; GCN-O1-OPTS-NEXT:         Machine Block Frequency Analysis
 ; GCN-O1-OPTS-NEXT:         Debug Variable Analysis