Index: lib/Target/AMDGPU/AMDGPU.td
===================================================================
--- lib/Target/AMDGPU/AMDGPU.td
+++ lib/Target/AMDGPU/AMDGPU.td
@@ -182,6 +182,22 @@
   "Enable scratch buffer sizes greater than 128 GB"
 >;
 
+class MaxWorkGroupSizeFeature<int size> : SubtargetFeature<
+  "max-workgroup-size="#size, // -mattr name, e.g. max-workgroup-size=64
+  "MaxWorkGroupSize",         // subtarget attribute the feature applies to
+   !cast<string>(size),       // value for that attribute: the size itself
+  "Set max workgroup size to "#size#" ("#
+    !cast<string>(!srl(size, 6))#" wavefronts)" // size >> 6 == size / 64
+>;
+
+// Define one feature per multiple of 64 up to 2048 (1 to 32 wavefronts);
+// in theory any of these can be used as a maximum workgroup size.
+// XXX - I'm not sure what the maximum size is for R600.
+foreach NumWaves = 1-32 in {
+  def FeatureMaxWorkGroupSize#!shl(NumWaves, 6) :
+    MaxWorkGroupSizeFeature<!shl(NumWaves, 6)>; // NumWaves * 64
+}
+
 def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling",
   "EnableVGPRSpilling",
   "true",
Index: lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -35,10 +35,10 @@
   Module *Mod;
   const DataLayout *DL;
   MDNode *MaxWorkGroupSizeRange;
-  int MaxWorkGroupSize;
+  int32_t MaxWorkGroupSize;
 
   // FIXME: This should be per-kernel.
-  int LocalMemAvailable;
+  int32_t LocalMemAvailable;
 
   bool IsAMDGCN;
   bool IsAMDHSA;
@@ -489,7 +489,7 @@
   // available.
   uint64_t AllocaSize = MaxWorkGroupSize * DL->getTypeAllocSize(AllocaTy);
 
-  if (AllocaSize > LocalMemAvailable) {
+  if (AllocaSize > static_cast<uint32_t>(LocalMemAvailable)) {
     DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");
     return;
   }
Index: lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -80,7 +80,8 @@
   bool EnableXNACK;
   unsigned WavefrontSize;
   bool CFALUBug;
-  uint32_t LocalMemorySize;
+  int32_t LocalMemorySize;
+  int32_t MaxWorkGroupSize;
   bool EnableVGPRSpilling;
   bool SGPRInitBug;
   bool IsGCN;
@@ -249,14 +250,12 @@
     return CFALUBug;
   }
 
-  uint32_t getLocalMemorySize() const {
+  int32_t getLocalMemorySize() const {
     return LocalMemorySize;
   }
 
-  int getMaxWorkGroupSize() const {
-    // FIXME: The hardware maximum is 2048. The runtime usually enforces a 256
-    // limit though.
-    return 256;
+  int32_t getMaxWorkGroupSize() const {
+    return MaxWorkGroupSize; // Set from subtarget features; defaults to 256.
+  }
 
   int getHardwareMaxWorkGroupSize() const {
Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -51,6 +51,10 @@
 
   ParseSubtargetFeatures(GPU, FullFS);
 
+  // If no max-workgroup-size feature was given, fall back to the default.
+  if (MaxWorkGroupSize == 0)
+    MaxWorkGroupSize = 256;
+
   // FIXME: I don't think think Evergreen has any useful support for
   // denormals, but should be checked. Should we issue a warning somewhere
   // if someone tries to enable these?
@@ -74,7 +78,7 @@
       EnableUnsafeDSOffsetFolding(false),
       EnableXNACK(false),
       WavefrontSize(0), CFALUBug(false),
-      LocalMemorySize(0),
+      LocalMemorySize(0), MaxWorkGroupSize(0),
       EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false),
       GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0),
       IsaVersion(ISAVersion0_0_0), EnableHugeScratchBuffer(false),
Index: test/CodeGen/AMDGPU/max-workgroup-size-attr.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/max-workgroup-size-attr.ll
@@ -0,0 +1,50 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mattr="+max-workgroup-size=64" -amdgpu-promote-alloca < %s | FileCheck -check-prefix=MAXSIZE64 %s
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mattr="+max-workgroup-size=128" -amdgpu-promote-alloca < %s | FileCheck -check-prefix=MAXSIZE128 %s
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mattr="+max-workgroup-size=192" -amdgpu-promote-alloca < %s | FileCheck -check-prefix=MAXSIZE192 %s
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mattr="+max-workgroup-size=256" -amdgpu-promote-alloca < %s | FileCheck -check-prefix=MAXSIZE256 %s
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mattr="+max-workgroup-size=640" -amdgpu-promote-alloca < %s | FileCheck -check-prefix=MAXSIZE640 %s
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mattr="+max-workgroup-size=512" -amdgpu-promote-alloca < %s | FileCheck -check-prefix=MAXSIZE512 %s
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mattr="+max-workgroup-size=1024" -amdgpu-promote-alloca < %s | FileCheck -check-prefix=MAXSIZE1024 %s
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mattr="+max-workgroup-size=1600" -amdgpu-promote-alloca < %s | FileCheck -check-prefix=MAXSIZE1600 %s
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mattr="+max-workgroup-size=1664" -amdgpu-promote-alloca < %s | FileCheck -check-prefix=TOOBIG %s
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mattr="+max-workgroup-size=1984" -amdgpu-promote-alloca < %s | FileCheck -check-prefix=TOOBIG %s
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mattr="+max-workgroup-size=2048" -amdgpu-promote-alloca < %s | FileCheck -check-prefix=TOOBIG %s
+
+; When the attribute is repeated with different sizes, the maximum should be used.
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mattr="+max-workgroup-size=128,+max-workgroup-size=192,+max-workgroup-size=64" -amdgpu-promote-alloca < %s | FileCheck -check-prefix=MAXSIZE192 %s
+
+; MAXSIZE64: @promote_alloca_size.stack = internal unnamed_addr addrspace(3) global [64 x [5 x i32]] undef, align 4
+; MAXSIZE128: @promote_alloca_size.stack = internal unnamed_addr addrspace(3) global [128 x [5 x i32]] undef, align 4
+; MAXSIZE192: @promote_alloca_size.stack = internal unnamed_addr addrspace(3) global [192 x [5 x i32]] undef, align 4
+; MAXSIZE256: @promote_alloca_size.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4
+; MAXSIZE512: @promote_alloca_size.stack = internal unnamed_addr addrspace(3) global [512 x [5 x i32]] undef, align 4
+; MAXSIZE640: @promote_alloca_size.stack = internal unnamed_addr addrspace(3) global [640 x [5 x i32]] undef, align 4
+; MAXSIZE1024: @promote_alloca_size.stack = internal unnamed_addr addrspace(3) global [1024 x [5 x i32]] undef, align 4
+; MAXSIZE1600: @promote_alloca_size.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4
+
+
+; These sizes exceed the available LDS, so the alloca must not be promoted.
+; TOOBIG: alloca [5 x i32]
+
+; FIXME: We should use subtarget features for each function instead of many opt -S run lines
+define void @promote_alloca_size(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+entry:
+  %stack = alloca [5 x i32], align 4 ; private array that the promote-alloca pass may move to LDS
+  %0 = load i32, i32 addrspace(1)* %in, align 4 ; dynamic index in[0]
+  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
+  store i32 4, i32* %arrayidx1, align 4 ; stack[in[0]] = 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 ; dynamic index in[1]
+  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
+  store i32 5, i32* %arrayidx3, align 4 ; stack[in[1]] = 5
+  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+  %2 = load i32, i32* %arrayidx10, align 4
+  store i32 %2, i32 addrspace(1)* %out, align 4 ; out[0] = stack[0]
+  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+  %3 = load i32, i32* %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+  store i32 %3, i32 addrspace(1)* %arrayidx13 ; out[1] = stack[1]
+  ret void
+}
+
+attributes #0 = { nounwind }