diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -404,10 +404,10 @@
   // minimum/maximum flat work group sizes.
   unsigned MinImpliedByFlatWorkGroupSize =
     getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
+  Default.first = MinImpliedByFlatWorkGroupSize;
   bool RequestedFlatWorkGroupSize = false;
 
   if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
-    Default.first = MinImpliedByFlatWorkGroupSize;
     RequestedFlatWorkGroupSize = true;
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/InlineAsmCrash.ll b/llvm/test/CodeGen/AMDGPU/InlineAsmCrash.ll
--- a/llvm/test/CodeGen/AMDGPU/InlineAsmCrash.ll
+++ b/llvm/test/CodeGen/AMDGPU/InlineAsmCrash.ll
@@ -4,9 +4,11 @@
 ; CHECK-NEXT: s_nop 0
 ; CHECK-NEXT: ;;#ASMEND
 
-define void @foo(i32 addrspace(5)* %ptr) {
+define void @foo(i32 addrspace(5)* %ptr) #0 {
   %tmp = tail call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "s_nop 0", "=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65"(i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2)
   %tmp2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %tmp, 0
   store i32 %tmp2, i32 addrspace(5)* %ptr, align 4
   ret void
 }
+
+attributes #0 = { "amdgpu-flat-work-group-size"="1,768" }
diff --git a/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir b/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir
--- a/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir
@@ -1,5 +1,24 @@
 # RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass postrapseudos -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+--- |
+  define amdgpu_kernel void @a_to_v() #0 { ret void }
+  define amdgpu_kernel void @a4_to_v4() #0 { ret void }
+  define amdgpu_kernel void @a16_to_v16() #0 { ret void }
+
+  define amdgpu_kernel void @v_to_a() #0 { ret void }
+  define amdgpu_kernel void @v4_to_a4() #0 { ret void }
+  define amdgpu_kernel void @v16_to_a16() #0 { ret void }
+
+  define amdgpu_kernel void @s_to_a() #0 { ret void }
+  define amdgpu_kernel void @s2_to_a2() #0 { ret void }
+
+  define amdgpu_kernel void @a_to_a() #0 { ret void }
+  define amdgpu_kernel void @a2_to_a2() #0 { ret void }
+
+  define amdgpu_kernel void @a_to_a_spill() #0 { ret void }
+
+  attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
+...
 
 # GCN-LABEL: name: a_to_v
 # GCN: $vgpr0 = V_ACCVGPR_READ_B32 killed $agpr0, implicit $exec
 ---
diff --git a/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll b/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll
--- a/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll
@@ -47,4 +47,4 @@
   ret void
 }
 
-attributes #0 = { nounwind noinline norecurse }
+attributes #0 = { nounwind noinline norecurse "amdgpu-flat-work-group-size"="1,256" }
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
--- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
@@ -199,7 +199,7 @@
 ; GFX908: ScratchSize: 0
 ; GCN: VGPRBlocks: 63
 ; GCN: NumVGPRsForWavesPerEU: 256
-define amdgpu_kernel void @max_256_vgprs_spill_9x32(<32 x float> addrspace(1)* %p) {
+define amdgpu_kernel void @max_256_vgprs_spill_9x32(<32 x float> addrspace(1)* %p) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
   %p2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p1, i32 %tid
@@ -250,7 +250,7 @@
 ; GFX908-FIXME: ScratchSize: 0
 ; GCN: VGPRBlocks: 63
 ; GCN: NumVGPRsForWavesPerEU: 256
-define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(<32 x float> addrspace(1)* %p) {
+define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(<32 x float> addrspace(1)* %p) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
   %p2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p1, i32 %tid
@@ -288,3 +288,4 @@
 declare i32 @llvm.amdgcn.workitem.id.x()
 
 attributes #0 = { nounwind "amdgpu-num-vgpr"="10" }
+attributes #1 = { "amdgpu-flat-work-group-size"="1,256" }
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir
@@ -2,6 +2,14 @@
 # RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX9,GFX9_10 %s
 # RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX10,GFX9_10 %s
 
+--- |
+  define amdgpu_kernel void @max-counter-lgkmcnt() #0 { ret void }
+  define amdgpu_kernel void @max-counter-vmcnt() #0 { ret void }
+  define amdgpu_kernel void @max-counter-expcnt() #0 { ret void }
+
+  attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
+...
+
 # Check that we handle cases where a counter has overflowed.
 # Overflows lgkmcnt with gfx9 but not with gfx10.