Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -128,7 +128,8 @@
   PIXEL = 0,
   VERTEX = 1,
   GEOMETRY = 2,
-  COMPUTE = 3
+  COMPUTE = 3,
+  GL_COMPUTE = 4
 };
 }
 
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -60,6 +60,8 @@
   unsigned PSInputAddr;
   bool ReturnsVoid;
 
+  unsigned MaximumWorkGroupSize;
+
 public:
   // FIXME: Make private
   unsigned LDSWaveSpillSize;
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -48,6 +48,7 @@
     PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
     PSInputAddr(0),
     ReturnsVoid(true),
+    MaximumWorkGroupSize(256),
     LDSWaveSpillSize(0),
     PSInputEna(0),
     NumUserSGPRs(0),
@@ -120,6 +121,8 @@
   if (HasStackObjects && ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS &&
       ST.isAmdHsaOS())
     FlatScratchInit = true;
+
+  MaximumWorkGroupSize = AMDGPU::getMaximumWorkGroupSize(*F);
 }
 
 unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
@@ -202,5 +205,8 @@
   const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
   // FIXME: We should get this information from kernel attributes if it
   // is available.
-  return getShaderType() == ShaderType::COMPUTE ? 256 : ST.getWavefrontSize();
+  if (getShaderType() == ShaderType::COMPUTE ||
+      getShaderType() == ShaderType::GL_COMPUTE)
+    return MaximumWorkGroupSize;
+  return ST.getWavefrontSize();
 }
Index: lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -23,6 +23,44 @@
 
 using namespace llvm;
 
+namespace {
+
+unsigned getMinWaveCount(const MachineFunction &MF) {
+  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+  return (MFI.getMaximumWorkGroupSize(MF) + 255) / 256;
+}
+
+unsigned getAllowedSGPRCount(const MachineFunction &MF) {
+  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+  unsigned MinWaveCount = getMinWaveCount(MF);
+
+  unsigned AllowedSGPRCount;
+  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+    AllowedSGPRCount = 800;
+  else
+    AllowedSGPRCount = 512;
+
+  AllowedSGPRCount = (AllowedSGPRCount / MinWaveCount) & ~7;
+
+  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+    if (ST.hasSGPRInitBug())
+      AllowedSGPRCount = std::min(AllowedSGPRCount,
+          AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG) - 6;
+    else
+      AllowedSGPRCount = std::min(AllowedSGPRCount, 102U) - 6;
+  } else
+    AllowedSGPRCount = std::min(AllowedSGPRCount, 104U) - 2;
+  return AllowedSGPRCount;
+}
+
+unsigned getAllowedVGPRCount(const MachineFunction &MF) {
+  unsigned MinWaveCount = getMinWaveCount(MF);
+
+  return (256 / MinWaveCount) & ~3;
+}
+
+} // End anonymous namespace.
+
 SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() {
   unsigned NumRegPressureSets = getNumRegPressureSets();
@@ -47,38 +85,20 @@
 unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
   const MachineFunction &MF) const {
-  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
-  if (ST.hasSGPRInitBug()) {
-    // Leave space for flat_scr, xnack_mask, vcc, and alignment
-    unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 8 - 4;
-    unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
-    return getMatchingSuperReg(BaseReg,
-                               AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
-  }
-
-  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
-    // 96/97 need to be reserved for flat_scr, 98/99 for xnack_mask, and
-    // 100/101 for vcc. This is the next sgpr128 down.
-    return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95;
-  }
-
-  return AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99;
+  unsigned BaseIdx = (getAllowedSGPRCount(MF) & ~3) - 4;
+  unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
+  return getMatchingSuperReg(BaseReg,
+                             AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
 }
 
 unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
   const MachineFunction &MF) const {
-  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
-  if (ST.hasSGPRInitBug()) {
-    unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6 - 1;
-    return AMDGPU::SGPR_32RegClass.getRegister(Idx);
-  }
-
-  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
-    // Next register before reservations for flat_scr, xnack_mask, vcc,
-    // and scratch resource.
-    return AMDGPU::SGPR91;
-  }
-
-  return AMDGPU::SGPR95;
+  unsigned Idx = getAllowedSGPRCount(MF);
+  // Try to place it in a hole after PrivateSegmentBufferReg.
+  if (Idx & 3)
+    Idx -= 1;
+  else
+    Idx -= 5;
+  return AMDGPU::SGPR_32RegClass.getRegister(Idx);
 }
 
 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
@@ -90,35 +110,21 @@
   reserveRegisterTuples(Reserved, AMDGPU::EXEC);
   reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
 
-  // Reserve the last 2 registers so we will always have at least 2 more that
-  // will physically contain VCC.
-  reserveRegisterTuples(Reserved, AMDGPU::SGPR102_SGPR103);
-
-  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+  unsigned AllowedSGPRCount = getAllowedSGPRCount(MF);
+  unsigned AllowedVGPRCount = getAllowedVGPRCount(MF);
 
-  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
-    // SI/CI have 104 SGPRs. VI has 102. We need to shift down the reservation
-    // for VCC/XNACK_MASK/FLAT_SCR.
-    //
-    // TODO The SGPRs that alias to XNACK_MASK could be used as general purpose
-    // SGPRs when the XNACK feature is not used. This is currently not done
-    // because the code that counts SGPRs cannot account for such holes.
-    reserveRegisterTuples(Reserved, AMDGPU::SGPR96_SGPR97);
-    reserveRegisterTuples(Reserved, AMDGPU::SGPR98_SGPR99);
-    reserveRegisterTuples(Reserved, AMDGPU::SGPR100_SGPR101);
+  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
+  unsigned NumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
+
+  for (unsigned i = AllowedSGPRCount; i < NumSGPRs; ++i) {
+    unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
+    reserveRegisterTuples(Reserved, Reg);
   }
 
-  // Tonga and Iceland can only allocate a fixed number of SGPRs due
-  // to a hw bug.
-  if (ST.hasSGPRInitBug()) {
-    unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
-    // Reserve some SGPRs for FLAT_SCRATCH, XNACK_MASK, and VCC (6 SGPRs).
-    unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6;
-    for (unsigned i = Limit; i < NumSGPRs; ++i) {
-      unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
-      reserveRegisterTuples(Reserved, Reg);
-    }
+  for (unsigned i = AllowedVGPRCount; i < NumVGPRs; ++i) {
+    unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
+    reserveRegisterTuples(Reserved, Reg);
   }
 
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
===================================================================
--- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -45,6 +45,7 @@
 bool isReadOnlySegment(const GlobalValue *GV);
 
 unsigned getShaderType(const Function &F);
+unsigned getMaximumWorkGroupSize(const Function &F);
 
 unsigned getInitialPSInputAddr(const Function &F);
Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
===================================================================
--- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -128,6 +128,10 @@
   return getIntegerAttribute(F, "ShaderType", ShaderType::COMPUTE);
 }
 
+unsigned getMaximumWorkGroupSize(const Function &F) {
+  return getIntegerAttribute(F, "MaximumWorkGroupSize", 256);
+}
+
 unsigned getInitialPSInputAddr(const Function &F) {
   return getIntegerAttribute(F, "InitialPSInputAddr", 0);
 }
Index: test/CodeGen/AMDGPU/large-work-group.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/large-work-group.ll
@@ -0,0 +1,41 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling < %s | FileCheck %s
+
+; CHECK: NumVgprs: 64
+define void @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <8 x i32>] addrspace(2)* byval, [16 x <8 x i32>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, <3 x i32> inreg, <3 x i32> inreg, <3 x i32>) #0 {
+main_body:
+  %8 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %4, i64 0, i64 8
+  %9 = load <4 x i32>, <4 x i32> addrspace(2)* %8, align 16, !tbaa !0
+  %10 = extractelement <3 x i32> %7, i32 0
+  %11 = extractelement <3 x i32> %7, i32 1
+  %12 = mul i32 %10, %11
+  %bc = bitcast <3 x i32> %7 to <3 x float>
+  %13 = extractelement <3 x float> %bc, i32 1
+  %14 = insertelement <512 x float> undef, float %13, i32 %12
+  call void @llvm.amdgcn.s.barrier()
+  %15 = extractelement <3 x i32> %6, i32 0
+  %16 = extractelement <3 x i32> %7, i32 0
+  %17 = shl i32 %15, 5
+  %18 = add i32 %17, %16
+  %19 = shl i32 %18, 4
+  %20 = extractelement <3 x i32> %7, i32 1
+  %21 = shl i32 %20, 2
+  %22 = sext i32 %21 to i64
+  %23 = getelementptr i8, i8 addrspace(3)* null, i64 %22
+  %24 = bitcast i8 addrspace(3)* %23 to i32 addrspace(3)*
+  %25 = load i32, i32 addrspace(3)* %24, align 4
+  %26 = extractelement <512 x float> %14, i32 %25
+  call void @llvm.amdgcn.buffer.store.f32(float %26, <4 x i32> %9, i32 0, i32 %19, i1 false, i1 false)
+  ret void
+}
+
+; Function Attrs: convergent nounwind
+declare void @llvm.amdgcn.s.barrier() #1
+
+; Function Attrs: nounwind
+declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #2
+
+attributes #0 = { "MaximumWorkGroupSize"="1024" "ShaderType"="4" }
+attributes #1 = { convergent nounwind }
+attributes #2 = { nounwind }
+
+!0 = !{!"const", null, i32 1}
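Reviewer note (not part of the patch): a small standalone sketch of the new budgeting arithmetic, so the test's "NumVgprs: 64" expectation can be checked by hand. The helper names, the boolean subtarget flags, and the FIXED_SGPR_COUNT_FOR_INIT_BUG value of 80 are assumptions for illustration; verify the constant against AMDGPUSubtarget.h.

#include <algorithm>
#include <cassert>

// Waves per SIMD needed to fit the work group on one compute unit
// (64-lane waves spread over 4 SIMDs, i.e. 256 threads per wave slot).
unsigned minWaveCount(unsigned WorkGroupSize) {
  return (WorkGroupSize + 255) / 256;
}

unsigned allowedSGPRCount(unsigned WorkGroupSize, bool IsVIOrLater,
                          bool HasSGPRInitBug) {
  const unsigned FixedSGPRCountForInitBug = 80; // assumed value
  unsigned Count = IsVIOrLater ? 800 : 512;     // SGPR file size per SIMD
  Count = (Count / minWaveCount(WorkGroupSize)) & ~7u; // align down to 8
  if (IsVIOrLater)
    Count = std::min(Count,
                     HasSGPRInitBug ? FixedSGPRCountForInitBug : 102u) - 6;
  else
    Count = std::min(Count, 104u) - 2;
  return Count;
}

unsigned allowedVGPRCount(unsigned WorkGroupSize) {
  return (256 / minWaveCount(WorkGroupSize)) & ~3u; // align down to 4
}

int main() {
  // large-work-group.ll: Tonga (VI with the SGPR init bug), 1024 threads.
  assert(minWaveCount(1024) == 4);
  assert(allowedVGPRCount(1024) == 64); // matches "; CHECK: NumVgprs: 64"
  assert(allowedSGPRCount(1024, /*IsVIOrLater=*/true,
                          /*HasSGPRInitBug=*/true) == 74);
  // Default 256-thread compute kernel: one wave per SIMD, full VGPR file.
  assert(allowedVGPRCount(256) == 256);
  return 0;
}

The -6 and -2 adjustments mirror the reservations the deleted code made explicitly: on VI, flat_scr, xnack_mask, and vcc occupy the top six SGPRs, while SI/CI only lose the two registers backing vcc. Likewise, the "& ~3" on the base index in reservedPrivateSegmentBufferReg keeps the scratch resource descriptor aligned for an SGPR-128 tuple, matching the "alignment" note in the removed comment.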