Index: llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -128,6 +128,17 @@
     return ST.hasApertureRegs();
   }
 
+  std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) {
+    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+    return ST.getFlatWorkGroupSizes(F);
+  }
+
+  std::pair<unsigned, unsigned>
+  getMaximumFlatWorkGroupRange(const Function &F) {
+    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+    return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
+  }
+
 private:
   /// Check if the ConstantExpr \p CE requires queue ptr attribute.
   static bool visitConstExpr(const ConstantExpr *CE) {
@@ -470,6 +481,118 @@
   llvm_unreachable("AAAMDAttributes is only valid for function position");
 }
 
+/// Propagate amdgpu-flat-work-group-size attribute.
+struct AAAMDFlatWorkGroupSize
+    : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
+  using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;
+  AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
+      : Base(IRP, 32) {}
+
+  /// See AbstractAttribute::getState(...).
+  IntegerRangeState &getState() override { return *this; }
+  const IntegerRangeState &getState() const override { return *this; }
+
+  void initialize(Attributor &A) override {
+    Function *F = getAssociatedFunction();
+    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
+    unsigned MinGroupSize, MaxGroupSize;
+    std::tie(MinGroupSize, MaxGroupSize) = InfoCache.getFlatWorkGroupSizes(*F);
+    intersectKnown(
+        ConstantRange(APInt(32, MinGroupSize), APInt(32, MaxGroupSize + 1)));
+  }
+
+  ChangeStatus updateImpl(Attributor &A) override {
+    ChangeStatus Change = ChangeStatus::UNCHANGED;
+
+    auto CheckCallSite = [&](AbstractCallSite CS) {
+      Function *Caller = CS.getInstruction()->getFunction();
+      LLVM_DEBUG(dbgs() << "[AAAMDFlatWorkGroupSize] Call " << Caller->getName()
+                        << "->" << getAssociatedFunction()->getName() << '\n');
+
+      const auto &CallerInfo = A.getAAFor<AAAMDFlatWorkGroupSize>(
+          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
+
+      Change |=
+          clampStateAndIndicateChange(this->getState(), CallerInfo.getState());
+
+      return true;
+    };
+
+    bool AllCallSitesKnown = true;
+    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
+      indicatePessimisticFixpoint();
+
+    return Change;
+  }
+
+  ChangeStatus manifest(Attributor &A) override {
+    SmallVector<Attribute, 8> AttrList;
+    Function *F = getAssociatedFunction();
+    LLVMContext &Ctx = F->getContext();
+
+    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
+    unsigned Min, Max;
+    std::tie(Min, Max) = InfoCache.getMaximumFlatWorkGroupRange(*F);
+
+    // Don't add the attribute if it's the implied default.
+    if (getAssumed().getLower() == Min && getAssumed().getUpper() - 1 == Max)
+      return ChangeStatus::UNCHANGED;
+
+    SmallString<10> Buffer;
+    raw_svector_ostream OS(Buffer);
+    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
+
+    AttrList.push_back(
+        Attribute::get(Ctx, "amdgpu-flat-work-group-size", OS.str()));
+    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
+                                              /* ForceReplace */ true);
+  }
+
+  const std::string getAsStr() const override {
+    std::string Str;
+    raw_string_ostream OS(Str);
+    OS << "AMDFlatWorkGroupSize[";
+    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
+    OS << ']';
+    return OS.str();
+  }
+
+  /// See AbstractAttribute::trackStatistics()
+  void trackStatistics() const override {}
+
+  /// Create an abstract attribute view for the position \p IRP.
+  static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
+                                                   Attributor &A);
+
+  /// See AbstractAttribute::getName()
+  const std::string getName() const override {
+    return "AAAMDFlatWorkGroupSize";
+  }
+
+  /// See AbstractAttribute::getIdAddr()
+  const char *getIdAddr() const override { return &ID; }
+
+  /// This function should return true if the type of the \p AA is
+  /// AAAMDFlatWorkGroupSize
+  static bool classof(const AbstractAttribute *AA) {
+    return (AA->getIdAddr() == &ID);
+  }
+
+  /// Unique ID (due to the unique address)
+  static const char ID;
+};
+
+const char AAAMDFlatWorkGroupSize::ID = 0;
+
+AAAMDFlatWorkGroupSize &
+AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
+                                          Attributor &A) {
+  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
+    return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
+  llvm_unreachable(
+      "AAAMDFlatWorkGroupSize is only valid for function position");
+}
+
 class AMDGPUAttributor : public ModulePass {
 public:
   AMDGPUAttributor() : ModulePass(ID) {}
@@ -497,7 +620,8 @@
   BumpPtrAllocator Allocator;
   AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM);
   DenseSet<const char *> Allowed(
-      {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID, &AACallEdges::ID});
+      {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
+       &AAAMDFlatWorkGroupSize::ID, &AACallEdges::ID});
 
   Attributor A(Functions, InfoCache, CGUpdater, &Allowed);
 
@@ -505,6 +629,9 @@
     if (!F.isIntrinsic()) {
       A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F));
      A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(F));
+      if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) {
+        A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(F));
+      }
     }
   }
 
Index: llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
@@ -55,10 +55,7 @@
 
 // Attributes to propagate.
 // TODO: Support conservative min/max merging instead of cloning.
-static constexpr const char* AttributeNames[] = {
-  "amdgpu-waves-per-eu",
-  "amdgpu-flat-work-group-size"
-};
+static constexpr const char *AttributeNames[] = {"amdgpu-waves-per-eu"};
 
 static constexpr unsigned NumAttr =
     sizeof(AttributeNames) / sizeof(AttributeNames[0]);
Index: llvm/test/CodeGen/AMDGPU/propagate-attributes-flat-work-group-size.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/propagate-attributes-flat-work-group-size.ll
+++ /dev/null
@@ -1,49 +0,0 @@
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late %s | FileCheck %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-propagate-attributes-late %s | FileCheck %s
-
-; CHECK: define internal void @max_flat_1_1024() #0 {
-define internal void @max_flat_1_1024() #0 {
-  ret void
-}
-
-; CHECK: define internal void @max_flat_1_256() #1 {
-define internal void @max_flat_1_256() #1 {
-  ret void
-}
-
-; CHECK: define amdgpu_kernel void @kernel_1_256_call_default() #1 {
-define amdgpu_kernel void @kernel_1_256_call_default() #1 {
-  call void @default()
-  ret void
-}
-
-; CHECK: define amdgpu_kernel void @kernel_1_256_call_1_256() #1 {
-define amdgpu_kernel void @kernel_1_256_call_1_256() #1 {
-  call void @max_flat_1_256()
-  ret void
-}
-
-; CHECK: define amdgpu_kernel void @kernel_1_256_call_64_64() #1 {
-define amdgpu_kernel void @kernel_1_256_call_64_64() #1 {
-  call void @max_flat_64_64()
-  ret void
-}
-
-; CHECK: define internal void @max_flat_64_64() #2 {
-define internal void @max_flat_64_64() #2 {
-  ret void
-}
-
-; CHECK: define internal void @default() #2 {
-define internal void @default() #3 {
-  ret void
-}
-
-attributes #0 = { noinline "amdgpu-flat-work-group-size"="1,1024" }
-attributes #1 = { noinline "amdgpu-flat-work-group-size"="1,256" }
-attributes #2 = { noinline "amdgpu-flat-work-group-size"="64,64" }
-attributes #3 = { noinline }
-
-; CHECK: attributes #0 = { noinline "amdgpu-flat-work-group-size"="1,1024"
-; CHECK-NEXT: attributes #1 = { noinline "amdgpu-flat-work-group-size"="1,256"
-; CHECK-NEXT: attributes #2 = { noinline "amdgpu-flat-work-group-size"="1,256"
Index: llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll
@@ -0,0 +1,214 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-attributor %s | FileCheck %s
+
+; Check propagation of amdgpu-flat-work-group-size attribute.
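+;
+; The expected merge rule (a sketch inferred from the checks below, not a
+; separate spec): a callee's assumed range becomes the union of all of its
+; callers' ranges, intersected with the callee's own declared range. For
+; example, a function called only from kernels annotated "64,128" and
+; "128,256" should be manifested with "amdgpu-flat-work-group-size"="64,256".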
+
+; Called from a single kernel with 1,256
+define internal void @default_to_1_256() {
+; CHECK-LABEL: define {{[^@]+}}@default_to_1_256
+; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+define amdgpu_kernel void @kernel_1_256() #0 {
+; CHECK-LABEL: define {{[^@]+}}@kernel_1_256
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT:    call void @default_to_1_256()
+; CHECK-NEXT:    ret void
+;
+  call void @default_to_1_256()
+  ret void
+}
+
+; Called from a single kernel with 64,128
+define internal void @default_to_64_128() {
+; CHECK-LABEL: define {{[^@]+}}@default_to_64_128
+; CHECK-SAME: () #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+define amdgpu_kernel void @kernel_64_128() #1 {
+; CHECK-LABEL: define {{[^@]+}}@kernel_64_128
+; CHECK-SAME: () #[[ATTR1]] {
+; CHECK-NEXT:    call void @default_to_64_128()
+; CHECK-NEXT:    call void @flat_group_64_64()
+; CHECK-NEXT:    call void @default_to_64_256()
+; CHECK-NEXT:    call void @flat_group_128_256()
+; CHECK-NEXT:    ret void
+;
+  call void @default_to_64_128()
+  call void @flat_group_64_64()
+  call void @default_to_64_256()
+  call void @flat_group_128_256()
+  ret void
+}
+
+; Called from kernels with 128,512 and 512,512
+define internal void @default_to_128_512() {
+; CHECK-LABEL: define {{[^@]+}}@default_to_128_512
+; CHECK-SAME: () #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+; This already has strict bounds, but is called from kernels with wider
+; bounds, and should not be changed.
+define internal void @flat_group_64_64() #2 {
+; CHECK-LABEL: define {{[^@]+}}@flat_group_64_64
+; CHECK-SAME: () #[[ATTR3:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+; 128,256 -> 128,128
+define internal void @flat_group_128_256() #3 {
+; CHECK-LABEL: define {{[^@]+}}@flat_group_128_256
+; CHECK-SAME: () #[[ATTR4:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+define internal void @flat_group_512_1024() #4 {
+; CHECK-LABEL: define {{[^@]+}}@flat_group_512_1024
+; CHECK-SAME: () #[[ATTR5:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+define amdgpu_kernel void @kernel_128_512() #5 {
+; CHECK-LABEL: define {{[^@]+}}@kernel_128_512
+; CHECK-SAME: () #[[ATTR2]] {
+; CHECK-NEXT:    call void @default_to_128_512()
+; CHECK-NEXT:    call void @flat_group_64_64()
+; CHECK-NEXT:    ret void
+;
+  call void @default_to_128_512()
+  call void @flat_group_64_64()
+  ret void
+}
+
+define amdgpu_kernel void @kernel_512_512() #6 {
+; CHECK-LABEL: define {{[^@]+}}@kernel_512_512
+; CHECK-SAME: () #[[ATTR5]] {
+; CHECK-NEXT:    call void @default_to_128_512()
+; CHECK-NEXT:    call void @flat_group_512_1024()
+; CHECK-NEXT:    ret void
+;
+  call void @default_to_128_512()
+  call void @flat_group_512_1024()
+  ret void
+}
+
+; Called from kernels with 128,256 and 64,128 => 64,256
+define internal void @default_to_64_256() {
+; CHECK-LABEL: define {{[^@]+}}@default_to_64_256
+; CHECK-SAME: () #[[ATTR6:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+; The kernel's lower bound is higher than the callee's lower bound, so
+; this should probably be illegal.
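+; (Concretely: this kernel is annotated "128,256", while @default_to_64_256
+; merges to "64,256" from its two callers, so the callee's assumed lower
+; bound of 64 sits below the kernel's minimum of 128.)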
+define amdgpu_kernel void @kernel_128_256() #3 {
+; CHECK-LABEL: define {{[^@]+}}@kernel_128_256
+; CHECK-SAME: () #[[ATTR7:[0-9]+]] {
+; CHECK-NEXT:    call void @default_to_64_256()
+; CHECK-NEXT:    ret void
+;
+  call void @default_to_64_256()
+  ret void
+}
+
+; 64,128 -> 64,128
+define internal void @merge_cycle_0() #1 {
+; CHECK-LABEL: define {{[^@]+}}@merge_cycle_0
+; CHECK-SAME: () #[[ATTR1]] {
+; CHECK-NEXT:    call void @merge_cycle_1()
+; CHECK-NEXT:    ret void
+;
+  call void @merge_cycle_1()
+  ret void
+}
+
+; 128,256 -> 128,128
+define internal void @merge_cycle_1() #3 {
+; CHECK-LABEL: define {{[^@]+}}@merge_cycle_1
+; CHECK-SAME: () #[[ATTR4]] {
+; CHECK-NEXT:    call void @merge_cycle_0()
+; CHECK-NEXT:    ret void
+;
+  call void @merge_cycle_0()
+  ret void
+}
+
+define amdgpu_kernel void @kernel_64_256() #7 {
+; CHECK-LABEL: define {{[^@]+}}@kernel_64_256
+; CHECK-SAME: () #[[ATTR6]] {
+; CHECK-NEXT:    call void @merge_cycle_0()
+; CHECK-NEXT:    call void @default_captured_address()
+; CHECK-NEXT:    call void @externally_visible_default()
+; CHECK-NEXT:    [[F32:%.*]] = call float bitcast (i32 ()* @bitcasted_function to float ()*)()
+; CHECK-NEXT:    ret void
+;
+  call void @merge_cycle_0()
+  call void @default_captured_address()
+  call void @externally_visible_default()
+  %f32 = call float bitcast (i32 ()* @bitcasted_function to float ()*)()
+  ret void
+}
+
+define internal void @default_captured_address() {
+; CHECK-LABEL: define {{[^@]+}}@default_captured_address
+; CHECK-SAME: () #[[ATTR8:[0-9]+]] {
+; CHECK-NEXT:    store volatile void ()* @default_captured_address, void ()** undef, align 8
+; CHECK-NEXT:    ret void
+;
+  store volatile void ()* @default_captured_address, void ()** undef, align 8
+  ret void
+}
+
+define void @externally_visible_default() {
+; CHECK-LABEL: define {{[^@]+}}@externally_visible_default
+; CHECK-SAME: () #[[ATTR8]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+; 1,1024 -> 64,256
+define internal i32 @bitcasted_function() {
+; CHECK-LABEL: define {{[^@]+}}@bitcasted_function
+; CHECK-SAME: () #[[ATTR6]] {
+; CHECK-NEXT:    ret i32 0
+;
+  ret i32 0
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
+attributes #1 = { "amdgpu-flat-work-group-size"="64,128" }
+attributes #2 = { "amdgpu-flat-work-group-size"="64,64" }
+attributes #3 = { "amdgpu-flat-work-group-size"="128,256" }
+attributes #4 = { "amdgpu-flat-work-group-size"="512,1024" }
+attributes #5 = { "amdgpu-flat-work-group-size"="128,512" }
+attributes #6 = { "amdgpu-flat-work-group-size"="512,512" }
+attributes #7 = { "amdgpu-flat-work-group-size"="64,256" }
+;.
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="64,128" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="128,512" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="64,64" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="128,128" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="64,256" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="128,256" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+;.