Index: llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -343,11 +343,6 @@
 std::pair<unsigned, unsigned>
 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
   switch (CC) {
-  case CallingConv::AMDGPU_CS:
-  case CallingConv::AMDGPU_KERNEL:
-  case CallingConv::SPIR_KERNEL:
-    return std::make_pair(getWavefrontSize() * 2,
-                          std::max(getWavefrontSize() * 4, 256u));
   case CallingConv::AMDGPU_VS:
   case CallingConv::AMDGPU_LS:
   case CallingConv::AMDGPU_HS:
@@ -356,13 +351,12 @@
   case CallingConv::AMDGPU_PS:
     return std::make_pair(1, getWavefrontSize());
   default:
-    return std::make_pair(1, 16 * getWavefrontSize());
+    return std::make_pair(1u, getMaxFlatWorkGroupSize());
   }
 }
 
 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
   const Function &F) const {
-  // FIXME: 1024 if function.
   // Default minimum/maximum flat work group sizes.
   std::pair<unsigned, unsigned> Default =
     getDefaultFlatWorkGroupSize(F.getCallingConv());
Index: llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -412,7 +412,7 @@
 ; OPT-LABEL: @pointer_typed_alloca(
 ; OPT:  getelementptr inbounds [256 x i32 addrspace(1)*], [256 x i32 addrspace(1)*] addrspace(3)* @pointer_typed_alloca.A.addr, i32 0, i32 %{{[0-9]+}}
 ; OPT: load i32 addrspace(1)*, i32 addrspace(1)* addrspace(3)* %{{[0-9]+}}, align 4
-define amdgpu_kernel void @pointer_typed_alloca(i32 addrspace(1)* %A) {
+define amdgpu_kernel void @pointer_typed_alloca(i32 addrspace(1)* %A) #1 {
 entry:
   %A.addr = alloca i32 addrspace(1)*, align 4, addrspace(5)
   store i32 addrspace(1)* %A, i32 addrspace(1)* addrspace(5)* %A.addr, align 4
@@ -556,7 +556,8 @@
   ret void
 }
 
-attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" }
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" "amdgpu-flat-work-group-size"="1,256" }
+attributes #1 = { nounwind "amdgpu-flat-work-group-size"="1,256" }
 
 ; HSAOPT: !0 = !{}
 ; HSAOPT: !1 = !{i32 0, i32 257}
Index: llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
+++ llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
@@ -43,7 +43,7 @@
   ret void
 }
 
-attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" }
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" }
 attributes #1 = { nounwind readnone }
 attributes #2 = { nounwind convergent }
 
Index: llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
+++ llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
@@ -8,11 +8,11 @@
 ; CHECK: ---
 ; CHECK:  amdhsa.kernels:
 
-; CHECK:   - .args:           
+; CHECK:   - .args:
 ; CHECK:     .group_segment_fixed_size: 0
 ; CHECK:     .kernarg_segment_align: 8
 ; CHECK:     .kernarg_segment_size: 24
-; CHECK:     .max_flat_workgroup_size: 256
+; CHECK:     .max_flat_workgroup_size: 1024
 ; CHECK:     .name:           test
 ; CHECK:     .private_segment_fixed_size: 0
 ; WAVE64:    .sgpr_count:     8
@@ -33,6 +33,20 @@
   ret void
 }
 
+; CHECK:   - .args:
+; CHECK:     .max_flat_workgroup_size: 256
+define amdgpu_kernel void @test_max_flat_workgroup_size(
+    half addrspace(1)* %r,
+    half addrspace(1)* %a,
+    half addrspace(1)* %b) #2 {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %b.val = load half, half addrspace(1)* %b
+  %r.val = fadd half %a.val, %b.val
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
+
 ; CHECK:   .name:       num_spilled_sgprs
 ; GFX700:   .sgpr_spill_count: 40
 ; GFX803:   .sgpr_spill_count: 24
@@ -149,3 +163,4 @@
 
 attributes #0 = { "amdgpu-num-sgpr"="14" }
 attributes #1 = { "amdgpu-num-vgpr"="20" }
+attributes #2 = { "amdgpu-flat-work-group-size"="1,256" }
Index: llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
+++ llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
@@ -18,7 +18,7 @@
 ; CHECK:     WavefrontSize:           64
 ; CHECK:     NumSGPRs:                8
 ; CHECK:     NumVGPRs:                6
-; CHECK:     MaxFlatWorkGroupSize:    256
+; CHECK:     MaxFlatWorkGroupSize:    1024
 define amdgpu_kernel void @test(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
@@ -31,6 +31,29 @@
   ret void
 }
 
+; CHECK-LABEL: - Name:       test_max_flat_workgroup_size
+; CHECK:   SymbolName: 'test_max_flat_workgroup_size@kd'
+; CHECK:   CodeProps:
+; CHECK:     KernargSegmentSize:      24
+; CHECK:     GroupSegmentFixedSize:   0
+; CHECK:     PrivateSegmentFixedSize: 0
+; CHECK:     KernargSegmentAlign:     8
+; CHECK:     WavefrontSize:           64
+; CHECK:     NumSGPRs:                8
+; CHECK:     NumVGPRs:                6
+; CHECK:     MaxFlatWorkGroupSize:    256
+define amdgpu_kernel void @test_max_flat_workgroup_size(
+    half addrspace(1)* %r,
+    half addrspace(1)* %a,
+    half addrspace(1)* %b) #2 {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %b.val = load half, half addrspace(1)* %b
+  %r.val = fadd half %a.val, %b.val
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
+
 ; CHECK-LABEL: - Name:       num_spilled_sgprs
 ; CHECK:   SymbolName: 'num_spilled_sgprs@kd'
 ; CHECK:   CodeProps:
@@ -144,3 +167,4 @@
 
 attributes #0 = { "amdgpu-num-sgpr"="14" }
 attributes #1 = { "amdgpu-num-vgpr"="20" }
+attributes #2 = { "amdgpu-flat-work-group-size"="1,256" }
Index: llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll
+++ llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll
@@ -39,7 +39,7 @@
 
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
-attributes #0 = { norecurse nounwind }
+attributes #0 = { norecurse nounwind "amdgpu-flat-work-group-size"="1,256" }
 attributes #1 = { nounwind readnone }
 
 !0 = !{i32 0, i32 1024}
Index: llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
+++ llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
@@ -262,8 +262,8 @@
 }
 
 ; GCN-LABEL: {{^}}used_lds_6556:
-; GFX9:       ; Occupancy: 9
-; GFX1010W64: ; Occupancy: 19
+; GFX9:       ; Occupancy: 10
+; GFX1010W64: ; Occupancy: 20
 ; GFX1010W32: ; Occupancy: 20
 @lds6556 = internal addrspace(3) global [6556 x i8] undef, align 4
 define amdgpu_kernel void @used_lds_6556() {
@@ -273,9 +273,9 @@
 }
 
 ; GCN-LABEL: {{^}}used_lds_13112:
-; GFX9:       ; Occupancy: 4
-; GFX1010W64: ; Occupancy: 9
-; GFX1010W32: ; Occupancy: 19
+; GFX9:       ; Occupancy: 10
+; GFX1010W64: ; Occupancy: 20
+; GFX1010W32: ; Occupancy: 20
 @lds13112 = internal addrspace(3) global [13112 x i8] undef, align 4
 define amdgpu_kernel void @used_lds_13112() {
   %p = bitcast [13112 x i8] addrspace(3)* @lds13112 to i8 addrspace(3)*
Index: llvm/test/CodeGen/AMDGPU/private-memory-r600.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/private-memory-r600.ll
+++ llvm/test/CodeGen/AMDGPU/private-memory-r600.ll
@@ -300,4 +300,4 @@
 ; OPT: !0 = !{i32 0, i32 257}
 ; OPT: !1 = !{i32 0, i32 256}
 
-attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" }
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" "amdgpu-flat-work-group-size"="1,256" }
Index: llvm/test/CodeGen/AMDGPU/promote-alloca-addrspacecast.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/promote-alloca-addrspacecast.ll
+++ llvm/test/CodeGen/AMDGPU/promote-alloca-addrspacecast.ll
@@ -18,4 +18,4 @@
   ret void
 }
 
-attributes #0 = { nounwind }
+attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,256" }
Index: llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
+++ llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
@@ -64,4 +64,4 @@
 
 declare i32* @get_unknown_pointer() #0
 
-attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" }
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" }
Index: llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll
+++ llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll
@@ -201,4 +201,4 @@
 
 declare i32* @get_unknown_pointer() #0
 
-attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" }
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" }
Index: llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
+++ llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
@@ -131,5 +131,5 @@
   ret void
 }
 
-attributes #0 = { norecurse nounwind "amdgpu-waves-per-eu"="1,1" }
+attributes #0 = { norecurse nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" }
 attributes #1 = { norecurse nounwind }