diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
@@ -30,11 +30,13 @@
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/CallGraph.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/Utils/Cloning.h"
+#include <queue>
 
 #define DEBUG_TYPE "amdgpu-propagate-attributes"
 
@@ -113,12 +115,17 @@
   // Clone functions as needed or just set attributes.
   bool AllowClone;
 
+  CallGraph *ModuleCG = nullptr;
+
   // Option propagation roots.
   SmallSet<Function *, 32> Roots;
 
   // Clones of functions with their attributes.
   SmallVector<Clone, 32> Clones;
 
+  // To memoize address taken functions.
+  SmallSet<Function *, 32> AddressTakenFunctions;
+
   // Find a clone with required features.
   Function *findFunction(const FnProperties &PropsNeeded,
                          Function *OrigF);
@@ -139,14 +146,23 @@
   bool process();
 
 public:
-  AMDGPUPropagateAttributes(const TargetMachine *TM, bool AllowClone) :
-    TM(TM), AllowClone(AllowClone) {}
+  AMDGPUPropagateAttributes(const TargetMachine *TM, bool AllowClone)
+      : TM(TM), AllowClone(AllowClone) {}
 
   // Use F as a root and propagate its attributes.
   bool process(Function &F);
 
   // Propagate attributes starting from kernel functions.
-  bool process(Module &M);
+  bool process(Module &M, CallGraph *CG);
+
+  // Remove attributes from F.
+  // This is used in presence of address taken functions.
+  bool removeAttributes(Function *F);
+
+  // Handle call graph rooted at address taken functions.
+  // This function will erase all attributes present
+  // on all functions called from address taken functions transitively.
+  bool handleAddressTakenFunctions(CallGraph *CG);
 };
 
 // Allows to propagate attributes early, but no clonning is allowed as it must
@@ -182,6 +198,7 @@
       *PassRegistry::getPassRegistry());
   }
 
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
   bool runOnModule(Module &M) override;
 };
 
@@ -199,6 +216,49 @@
                 "Late propagate attributes from kernels to functions",
                 false, false)
 
+bool AMDGPUPropagateAttributes::removeAttributes(Function *F) {
+  bool Changed = false;
+  if (!F)
+    return Changed;
+  LLVM_DEBUG(dbgs() << "Removing attributes from " << F->getName() << '\n');
+  for (unsigned I = 0; I < NumAttr; ++I) {
+    if (F->hasFnAttribute(AttributeNames[I])) {
+      F->removeFnAttr(AttributeNames[I]);
+      Changed = true;
+    }
+  }
+  return Changed;
+}
+
+bool AMDGPUPropagateAttributes::handleAddressTakenFunctions(CallGraph *CG) {
+  assert(ModuleCG && "Call graph not present");
+
+  bool Changed = false;
+  SmallSet<CallGraphNode *, 32> Visited;
+
+  for (Function *F : AddressTakenFunctions) {
+    CallGraphNode *CGN = (*CG)[F];
+    if (!Visited.count(CGN)) {
+      Changed |= removeAttributes(F);
+      Visited.insert(CGN);
+    }
+
+    std::queue<CallGraphNode *> SubGraph;
+    SubGraph.push(CGN);
+    while (!SubGraph.empty()) {
+      CallGraphNode *CGN = SubGraph.front();
+      SubGraph.pop();
+      if (!Visited.count(CGN)) {
+        Changed |= removeAttributes(CGN->getFunction());
+        Visited.insert(CGN);
+      }
+      for (auto N : *CGN)
+        SubGraph.push(N.second);
+    }
+  }
+  return Changed;
+}
+
 Function *
 AMDGPUPropagateAttributes::findFunction(const FnProperties &PropsNeeded,
                                         Function *OrigF) {
@@ -210,11 +270,12 @@
   return nullptr;
 }
 
-bool AMDGPUPropagateAttributes::process(Module &M) {
+bool AMDGPUPropagateAttributes::process(Module &M, CallGraph *CG) {
   for (auto &F : M.functions())
     if (AMDGPU::isEntryFunctionCC(F.getCallingConv()))
       Roots.insert(&F);
 
+  ModuleCG = CG;
   return process();
 }
 
@@ -240,6 +301,9 @@
       if (F.isDeclaration())
         continue;
 
+      if (F.hasAddressTaken(nullptr, true, true, true))
+        AddressTakenFunctions.insert(&F);
+
       const FnProperties CalleeProps(*TM, F);
       SmallVector<std::pair<CallBase *, Function *>, 32> ToReplace;
       SmallSet<CallBase *, 32> Visited;
@@ -310,6 +374,12 @@
   Roots.clear();
   Clones.clear();
 
+  // Keep the post processing related to indirect
+  // calls separate to handle them gracefully.
+  // The core traversal need not be affected by this.
+  if (AllowClone)
+    Changed |= handleAddressTakenFunctions(ModuleCG);
+
   return Changed;
 }
 
@@ -389,6 +459,10 @@
   return AMDGPUPropagateAttributes(TM, false).process(F);
 }
 
+void AMDGPUPropagateAttributesLate::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<CallGraphWrapperPass>();
+}
+
 bool AMDGPUPropagateAttributesLate::runOnModule(Module &M) {
   if (!TM) {
     auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
@@ -397,8 +471,8 @@
 
     TM = &TPC->getTM<TargetMachine>();
   }
-
-  return AMDGPUPropagateAttributes(TM, true).process(M);
+  CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
+  return AMDGPUPropagateAttributes(TM, true).process(M, &CG);
 }
 
 FunctionPass
@@ -423,8 +497,9 @@
 }
 
 PreservedAnalyses
-AMDGPUPropagateAttributesLatePass::run(Module &M, ModuleAnalysisManager &AM) {
-  return AMDGPUPropagateAttributes(&TM, true).process(M)
-             ? PreservedAnalyses::none()
-             : PreservedAnalyses::all();
+AMDGPUPropagateAttributesLatePass::run(Module &M, ModuleAnalysisManager &MAM) {
+  AMDGPUPropagateAttributes APA(&TM, true);
+  CallGraph &CG = MAM.getResult<CallGraphAnalysis>(M);
+  const bool Changed = APA.process(M, &CG);
+  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
 }
diff --git a/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll b/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
--- a/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
@@ -65,6 +65,7 @@
 ; GCN-O1-NEXT:     AMDGPU Printf lowering
 ; GCN-O1-NEXT:       FunctionPass Manager
 ; GCN-O1-NEXT:         Dominator Tree Construction
+; GCN-O1-NEXT:     CallGraph Construction
 ; GCN-O1-NEXT:     Late propagate attributes from kernels to functions
 ; GCN-O1-NEXT:     Interprocedural Sparse Conditional Constant Propagation
 ; GCN-O1-NEXT:       FunctionPass Manager
@@ -374,6 +375,7 @@
 ; GCN-O2-NEXT:     AMDGPU Printf lowering
 ; GCN-O2-NEXT:       FunctionPass Manager
 ; GCN-O2-NEXT:         Dominator Tree Construction
+; GCN-O2-NEXT:     CallGraph Construction
 ; GCN-O2-NEXT:     Late propagate attributes from kernels to functions
 ; GCN-O2-NEXT:     Interprocedural Sparse Conditional Constant Propagation
 ; GCN-O2-NEXT:       FunctionPass Manager
@@ -728,6 +730,7 @@
 ; GCN-O3-NEXT:     AMDGPU Printf lowering
 ; GCN-O3-NEXT:       FunctionPass Manager
 ; GCN-O3-NEXT:         Dominator Tree Construction
+; GCN-O3-NEXT:     CallGraph Construction
 ; GCN-O3-NEXT:     Late propagate attributes from kernels to functions
 ; GCN-O3-NEXT:     FunctionPass Manager
 ; GCN-O3-NEXT:       Dominator Tree Construction
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-common-callees.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-common-callees.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-common-callees.ll
@@ -0,0 +1,55 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late  %s | FileCheck %s
+
+; CHECK-LABEL: define float @common() {
+define float @common() {
+   ret float 0.0
+}
+
+; CHECK-LABEL: define float @foo() {
+define float @foo() {
+  %direct_call = call contract float @common()
+  ret float %direct_call
+}
+
+; CHECK-LABEL: define float @bar() {
+define float @bar() {
+   ret float 0.0
+}
+
+; CHECK-LABEL: define float @baz() {
+define float @baz() {
+   ret float 0.0
+}
+
+define amdgpu_kernel void @switch_indirect_kernel(float *%result, i32 %type) #1 {
+  %fn = alloca float ()*
+  switch i32 %type, label %sw.default [
+    i32 1, label %sw.bb
+    i32 2, label %sw.bb2
+    i32 3, label %sw.bb3
+  ]
+
+sw.bb:
+  store float ()* @foo, float ()** %fn
+  br label %sw.epilog
+
+sw.bb2:
+  store float ()* @bar, float ()** %fn
+  br label %sw.epilog
+
+sw.bb3:
+  store float ()* @baz, float ()** %fn
+  br label %sw.epilog
+
+sw.default:
+  br label %sw.epilog
+
+sw.epilog:
+  %fp = load float ()*, float ()** %fn
+  %direct_call = call contract float @common()
+  %indirect_call = call contract float %fp()
+  store float %indirect_call, float* %result
+  ret void
+}
+
+attributes #1 = { "amdgpu-flat-work-group-size"="1,256" }
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect-common-callee.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect-common-callee.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect-common-callee.ll
@@ -0,0 +1,53 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late  %s | FileCheck %s
+
+; Test to check if we skip propgating attributes even if
+; a function is called directly as well as
+; indirectly. "baz" is called directly as well indirectly.
+
+; CHECK-LABEL: define float @foo()
+define float @foo() #1 {
+  ret float 0.0
+}
+
+; CHECK-LABEL: define float @bar()
+define float @bar() #1 {
+ ret float 0.0
+}
+
+; CHECK-LABEL: define float @baz()
+define float @baz() #1 {
+ ret float 0.0
+}
+
+define amdgpu_kernel void @switch_indirect_kernel(float *%result, i32 %type) #1 {
+  %fn = alloca float ()*
+  switch i32 %type, label %sw.default [
+    i32 1, label %sw.bb
+    i32 2, label %sw.bb2
+    i32 3, label %sw.bb3
+  ]
+
+sw.bb:
+  store float ()* @foo, float ()** %fn
+  br label %sw.epilog
+
+sw.bb2:
+  store float ()* @bar, float ()** %fn
+  br label %sw.epilog
+
+sw.bb3:
+  store float ()* @baz, float ()** %fn
+  br label %sw.epilog
+
+sw.default:
+  br label %sw.epilog
+
+sw.epilog:
+  %fp = load float ()*, float ()** %fn
+  %direct_call = call contract float @baz()
+  %indirect_call = call contract float %fp()
+  store float %indirect_call, float* %result
+  ret void
+}
+
+attributes #1 = { "amdgpu-flat-work-group-size"="1,256" }
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-function-pointer-argument.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-function-pointer-argument.ll
--- a/llvm/test/CodeGen/AMDGPU/propagate-attributes-function-pointer-argument.ll
+++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-function-pointer-argument.ll
@@ -30,11 +30,14 @@
 ; In order to expose this bug, it is necessary that `g` have one of the
 ; propagated attributes, so that a clone and substitution would take place if g
 ; were actually the function being called.
-; CHECK-DAG: define private void @g.1() #1
-; CHECK-DAG: define internal void @g() #2
+; CHECK-DAG: define private void @g.1() #0
+; CHECK-DAG: define internal void @g() #1
 define private void @g() #1 {
     ret void
 }
 
 attributes #0 = { noinline }
 attributes #1 = { noinline "amdgpu-waves-per-eu"="1,10" }
+
+; CHECK: attributes #0 = { noinline }
+; CHECK-NEXT: attributes #1 = { noinline "target-features"="+enable-ds128,+enable-prt-strict-null,+flat-address-space,+flat-for-global,+load-store-opt,+promote-alloca,+trap-handler,+unaligned-access-mode,-wavefrontsize16,-wavefrontsize32,+wavefrontsize64" }
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-indirect.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-indirect.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-indirect.ll
@@ -0,0 +1,34 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late  %s | FileCheck %s
+
+; Test to check if we skip attributes on address
+; taken functions and its callees.
+
+; CHECK-LABEL: define float @bar() {
+define float @bar() {
+  ret float 0.0
+}
+
+; CHECK-LABEL: define float @baz() {
+define float @baz() {
+  ret float 0.0
+}
+
+; CHECK-LABEL: define float @foo() {
+define float @foo() {
+  %v1 = call contract float @bar()
+  %v2 = call contract float @baz()
+  %v3 = fadd float %v1, %v2
+  ret float %v3
+}
+
+; CHECK-LABEL: define amdgpu_kernel void @kernel(float* %result, i32 %type) #0 {
+define amdgpu_kernel void @kernel(float *%result, i32 %type) #1 {
+  %fn = alloca float ()*
+  store float ()* @foo, float ()** %fn
+  %fp = load float ()*, float ()** %fn
+  %indirect_call = call contract float %fp()
+  store float %indirect_call, float* %result
+  ret void
+}
+
+attributes #1 = { "amdgpu-flat-work-group-size"="1,256" }
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-launch-bounds.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-launch-bounds.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-launch-bounds.ll
@@ -0,0 +1,32 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late  %s | FileCheck %s
+
+; Test attributes on a function which
+; is called indirectly from two kernels
+; having different launch bounds.
+
+; This function should not have any attributes on it.
+; CHECK-LABEL: define float @foo() {
+define float @foo() {
+   ret float 0.0
+}
+
+define amdgpu_kernel void @kernel1(float *%result, i32 %type) #1 {
+  %fn = alloca float ()*
+  store float ()* @foo, float ()** %fn
+  %fp = load float ()*, float ()** %fn
+  %indirect_call = call contract float %fp()
+  store float %indirect_call, float* %result
+  ret void
+}
+
+define amdgpu_kernel void @kernel2(float *%result, i32 %type) #2 {
+  %fn = alloca float ()*
+  store float ()* @foo, float ()** %fn
+  %fp = load float ()*, float ()** %fn
+  %indirect_call = call contract float %fp()
+  store float %indirect_call, float* %result
+  ret void
+}
+
+attributes #1 = { "amdgpu-flat-work-group-size"="1,256" }
+attributes #2 = { "amdgpu-flat-work-group-size"="1,512" }