diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -410,10 +410,10 @@
   // minimum/maximum flat work group sizes.
   unsigned MinImpliedByFlatWorkGroupSize =
     getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
+  Default.first = MinImpliedByFlatWorkGroupSize;
   bool RequestedFlatWorkGroupSize = false;
 
   if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
-    Default.first = MinImpliedByFlatWorkGroupSize;
     RequestedFlatWorkGroupSize = true;
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/InlineAsmCrash.ll b/llvm/test/CodeGen/AMDGPU/InlineAsmCrash.ll
--- a/llvm/test/CodeGen/AMDGPU/InlineAsmCrash.ll
+++ b/llvm/test/CodeGen/AMDGPU/InlineAsmCrash.ll
@@ -4,9 +4,11 @@
 ; CHECK-NEXT: s_nop 0
 ; CHECK-NEXT: ;;#ASMEND
 
-define void @foo(i32 addrspace(5)* %ptr) {
+define void @foo(i32 addrspace(5)* %ptr) #0 {
   %tmp = tail call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "s_nop 0", "=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65"(i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2)
   %tmp2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %tmp, 0
   store i32 %tmp2, i32 addrspace(5)* %ptr, align 4
   ret void
 }
+
+attributes #0 = { "amdgpu-flat-work-group-size"="1,768" }
diff --git a/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir b/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir
--- a/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir
@@ -1,6 +1,25 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass postrapseudos -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
 
+--- |
+    define amdgpu_kernel void @a_to_v() #0 { ret void }
+    define amdgpu_kernel void @a4_to_v4() #0 { ret void }
+    define amdgpu_kernel void @a16_to_v16() #0 { ret void }
+
+    define amdgpu_kernel void @v_to_a() #0 { ret void }
+    define amdgpu_kernel void @v4_to_a4() #0 { ret void }
+    define amdgpu_kernel void @v16_to_a16() #0 { ret void }
+
+    define amdgpu_kernel void @s_to_a() #0 { ret void }
+    define amdgpu_kernel void @s2_to_a2() #0 { ret void }
+
+    define amdgpu_kernel void @a_to_a() #0 { ret void }
+    define amdgpu_kernel void @a2_to_a2() #0 { ret void }
+
+    define amdgpu_kernel void @a_to_a_spill() #0 { ret void }
+    attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
+...
+
 ---
 name:            a_to_v
 tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll b/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll
--- a/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll
@@ -47,4 +47,4 @@
   ret void
 }
 
-attributes #0 = { nounwind noinline norecurse }
+attributes #0 = { nounwind noinline norecurse "amdgpu-flat-work-group-size"="1,256" }
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
--- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
@@ -199,7 +199,7 @@
 ; GFX908: ScratchSize: 0
 ; GCN:    VGPRBlocks: 63
 ; GCN:    NumVGPRsForWavesPerEU: 256
-define amdgpu_kernel void @max_256_vgprs_spill_9x32(<32 x float> addrspace(1)* %p) {
+define amdgpu_kernel void @max_256_vgprs_spill_9x32(<32 x float> addrspace(1)* %p) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
   %p2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p1, i32 %tid
@@ -250,7 +250,7 @@
 ; GFX908-FIXME: ScratchSize: 0
 ; GCN:    VGPRBlocks: 63
 ; GCN:    NumVGPRsForWavesPerEU: 256
-define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(<32 x float> addrspace(1)* %p) {
+define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(<32 x float> addrspace(1)* %p) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
   %p2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p1, i32 %tid
@@ -288,3 +288,4 @@
 declare i32 @llvm.amdgcn.workitem.id.x()
 
 attributes #0 = { nounwind "amdgpu-num-vgpr"="10" }
+attributes #1 = { "amdgpu-flat-work-group-size"="1,256" }
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir
@@ -2,6 +2,14 @@
 # RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX9,GFX9_10 %s
 # RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX10,GFX9_10 %s
 
+--- |
+    define amdgpu_kernel void @max-counter-lgkmcnt() #0 { ret void }
+    define amdgpu_kernel void @max-counter-vmcnt() #0 { ret void }
+    define amdgpu_kernel void @max-counter-expcnt() #0 { ret void }
+
+    attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
+...
+
 # Check that we handle cases where a counter has overflowed.
 
 # Overflows lgkmcnt with gfx9 but not with gfx10.