diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
@@ -236,6 +236,16 @@
       if (F.isDeclaration())
         continue;
 
+      // Skip propagating attributes and features to
+      // address taken functions.
+      if (F.hasAddressTaken()) {
+        if (!Roots.count(&F) && !NewRoots.count(&F)) {
+          NewRoots.insert(&F);
+          Changed = true;
+        }
+        continue;
+      }
+
       const FnProperties CalleeProps(*TM, F);
       SmallVector<std::pair<CallBase *, Function *>, 32> ToReplace;
       SmallSet<CallBase *, 32> Visited;
@@ -255,7 +265,11 @@
 
         const FnProperties CallerProps(*TM, *Caller);
 
-        if (CalleeProps == CallerProps) {
+        // Convergence is allowed if the caller has its
+        // address taken because all of the callee's (attributes + features)
+        // may not agree as the callee may be the target of
+        // more than one function (called directly or indirectly).
+        if (Caller->hasAddressTaken() || CalleeProps == CallerProps) {
           if (!Roots.count(&F))
             NewRoots.insert(&F);
           continue;
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-common-callees.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-common-callees.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-common-callees.ll
@@ -0,0 +1,79 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-early  %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+
+; Complicated call graph where a function is called
+; directly from a kernel and also from a function
+; whose address is taken.
+
+; CHECK-LABEL: define float @common_callee.gc(i32 %a) #0 {
+define float @common_callee.gc(i32 %a) {
+  %add = add i32 %a, 6
+  %mul = mul nsw i32 %add, 9
+  %div = sdiv i32 %mul, 8
+  %f = sitofp i32 %div to float
+  ret float %f
+}
+
+; CHECK-LABEL: define float @foo(i32 %a) {
+define float @foo(i32 %a) {
+entry:
+  %mul = mul nsw i32 %a, 5
+  %cast = sitofp i32 %mul to float
+
+  ret float %cast
+}
+
+; CHECK-LABEL: define float @bar(i32 %a) {
+define float @bar(i32 %a) {
+entry:
+  %div = sdiv i32 %a, 7
+  %direct_call = call contract float @common_callee.gc(i32 5)
+  ret float %direct_call
+}
+
+; CHECK-LABEL: define float @baz(i32 %a) {
+define float @baz(i32 %a) {
+entry:
+  %mul = mul nsw i32 %a, 6
+  %div = sdiv i32 %mul, 7
+  %conv = sitofp i32 %div to float
+  ret float %conv
+}
+
+define amdgpu_kernel void @switch_indirect_kernel(float *%result, i32 %type) #1 {
+entry:
+  %fn = alloca float (i32)*, align 8, addrspace(5)
+  switch i32 %type, label %sw.default [
+    i32 1, label %sw.bb
+    i32 2, label %sw.bb2
+    i32 3, label %sw.bb3
+  ]
+
+sw.bb:
+  store float (i32)* @foo, float (i32)* addrspace(5)* %fn, align 8
+  br label %sw.epilog
+
+sw.bb2:
+  store float (i32)* @bar, float (i32)* addrspace(5)* %fn, align 8
+  br label %sw.epilog
+
+sw.bb3:
+  store float (i32)* @baz, float (i32)* addrspace(5)* %fn, align 8
+  br label %sw.epilog
+
+sw.default:
+  br label %sw.epilog
+
+sw.epilog:
+  %fp = load float (i32)*, float (i32)* addrspace(5)* %fn, align 8
+  %direct_call = call contract float @common_callee.gc(i32 4)
+  %conv = fptosi float %direct_call to i32
+  %call4 = call contract float %fp(i32 %conv)
+  store float %call4, float* %result, align 4
+  ret void
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "target-features"="+16-bit-insts,+add-no-carry-insts,+aperture-regs,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dpp,+ds-src2-insts,+enable-ds128,+enable-prt-strict-null,+fast-denormal-f32,+fast-fmaf,+flat-address-space,+flat-for-global,+flat-global-insts,+flat-inst-offsets,+flat-scratch-insts,+fma-mix-insts,+fp64,+gcn3-encoding,+gfx7-gfx8-gfx9-insts,+gfx8-insts,+gfx9,+gfx9-insts,+half-rate-64-ops,+image-gather4-d16-bug,+int-clamp-insts,+inv-2pi-inline-imm,+ldsbankcount32,+load-store-opt,+localmemorysize65536,+mad-mac-f32-insts,+no-xnack-support,+promote-alloca,+r128-a16,+s-memrealtime,+s-memtime-inst,+scalar-atomics,+scalar-flat-scratch-insts,+scalar-stores,+sdwa,+sdwa-omod,+sdwa-scalar,+sdwa-sdst,+sram-ecc,+trap-handler,+unaligned-access-mode,+unaligned-buffer-access,+unaligned-ds-access,+vgpr-index-mode,+vop3p,-wavefrontsize16,-wavefrontsize32,+wavefrontsize64,+xnack" }
+attributes #1 = { convergent norecurse nounwind mustprogress
+    "amdgpu-flat-work-group-size"="1,256"} 
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect-common-callee.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect-common-callee.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect-common-callee.ll
@@ -0,0 +1,69 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-early  %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+
+; Test to check if we skip propagating attributes even if
+; a function is called directly as well as
+; indirectly. "baz" is called directly as well as indirectly.
+
+; CHECK-LABEL: define float @foo(i32 %a) {
+define float @foo(i32 %a) {
+entry:
+  %mul = mul nsw i32 %a, 5
+  %cast = sitofp i32 %mul to float
+
+  ret float %cast
+}
+
+; CHECK-LABEL: define float @bar(i32 %a) {
+define float @bar(i32 %a) {
+entry:
+  %div = sdiv i32 %a, 7
+  %conv = sitofp i32 %div to float
+  ret float %conv
+}
+
+; CHECK-LABEL: define float @baz(i32 %a) {
+define float @baz(i32 %a) {
+entry:
+  %mul = mul nsw i32 %a, 6
+  %div = sdiv i32 %mul, 7
+  %conv = sitofp i32 %div to float
+  ret float %conv
+}
+
+define amdgpu_kernel void @switch_indirect_kernel(float *%result, i32 %type) #1 {
+entry:
+  %fn = alloca float (i32)*, align 8, addrspace(5)
+  switch i32 %type, label %sw.default [
+    i32 1, label %sw.bb
+    i32 2, label %sw.bb2
+    i32 3, label %sw.bb3
+  ]
+
+sw.bb:
+  store float (i32)* @foo, float (i32)* addrspace(5)* %fn, align 8
+  br label %sw.epilog
+
+sw.bb2:
+  store float (i32)* @bar, float (i32)* addrspace(5)* %fn, align 8
+  br label %sw.epilog
+
+sw.bb3:
+  store float (i32)* @baz, float (i32)* addrspace(5)* %fn, align 8
+  br label %sw.epilog
+
+sw.default:
+  br label %sw.epilog
+
+sw.epilog:
+  %fp = load float (i32)*, float (i32)* addrspace(5)* %fn, align 8
+  %direct_call = call contract float @baz(i32 4)
+  %conv = fptosi float %direct_call to i32
+  %call4 = call contract float %fp(i32 %conv)
+  store float %call4, float* %result, align 4
+  ret void
+}
+
+attributes #1 = { convergent norecurse nounwind mustprogress
+    "amdgpu-flat-work-group-size"="1,256" } 
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect.ll
@@ -0,0 +1,79 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-early  %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+
+; Test to check if we skip attributes on address
+; taken functions but pass to direct callees.
+
+; CHECK-LABEL: define float @foo(i32 %a) {
+define float @foo(i32 %a) {
+entry:
+  %mul = mul nsw i32 %a, 5
+  %cast = sitofp i32 %mul to float
+
+  ret float %cast
+}
+
+; CHECK-LABEL: define float @bar(i32 %a) {
+define float @bar(i32 %a) {
+entry:
+  %div = sdiv i32 %a, 7
+  %conv = sitofp i32 %div to float
+  ret float %conv
+}
+
+; CHECK-LABEL: define float @baz(i32 %a) {
+define float @baz(i32 %a) {
+entry:
+  %mul = mul nsw i32 %a, 6
+  %div = sdiv i32 %mul, 7
+  %conv = sitofp i32 %div to float
+  ret float %conv
+}
+
+; CHECK-LABEL: define float @baz2(i32 %a) #0 {
+define float @baz2(i32 %a) {
+  %mul = mul nsw i32 %a, 6
+  %div = sdiv i32 %mul, 8
+  %add = add i32 %div , 12
+  %conv = sitofp i32 %add to float
+  ret float %conv
+}
+
+define amdgpu_kernel void @switch_indirect_kernel(float *%result, i32 %type) #1 {
+entry:
+  %fn = alloca float (i32)*, align 8, addrspace(5)
+  switch i32 %type, label %sw.default [
+    i32 1, label %sw.bb
+    i32 2, label %sw.bb2
+    i32 3, label %sw.bb3
+  ]
+
+sw.bb:
+  store float (i32)* @foo, float (i32)* addrspace(5)* %fn, align 8
+  br label %sw.epilog
+
+sw.bb2:
+  store float (i32)* @bar, float (i32)* addrspace(5)* %fn, align 8
+  br label %sw.epilog
+
+sw.bb3:
+  store float (i32)* @baz, float (i32)* addrspace(5)* %fn, align 8
+  br label %sw.epilog
+
+sw.default:
+  br label %sw.epilog
+
+sw.epilog:
+  %fp = load float (i32)*, float (i32)* addrspace(5)* %fn, align 8
+  %direct_call = call contract float @baz2(i32 5)
+  %conv = fptosi float %direct_call to i32
+  %call4 = call contract float %fp(i32 %conv)
+  store float %call4, float* %result, align 4
+  ret void
+}
+
+
+attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "target-features"="+16-bit-insts,+add-no-carry-insts,+aperture-regs,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dpp,+ds-src2-insts,+enable-ds128,+enable-prt-strict-null,+fast-denormal-f32,+fast-fmaf,+flat-address-space,+flat-for-global,+flat-global-insts,+flat-inst-offsets,+flat-scratch-insts,+fma-mix-insts,+fp64,+gcn3-encoding,+gfx7-gfx8-gfx9-insts,+gfx8-insts,+gfx9,+gfx9-insts,+half-rate-64-ops,+image-gather4-d16-bug,+int-clamp-insts,+inv-2pi-inline-imm,+ldsbankcount32,+load-store-opt,+localmemorysize65536,+mad-mac-f32-insts,+no-xnack-support,+promote-alloca,+r128-a16,+s-memrealtime,+s-memtime-inst,+scalar-atomics,+scalar-flat-scratch-insts,+scalar-stores,+sdwa,+sdwa-omod,+sdwa-scalar,+sdwa-sdst,+sram-ecc,+trap-handler,+unaligned-access-mode,+unaligned-buffer-access,+unaligned-ds-access,+vgpr-index-mode,+vop3p,-wavefrontsize16,-wavefrontsize32,+wavefrontsize64,+xnack" }
+attributes #1 = { convergent norecurse nounwind mustprogress
+    "amdgpu-flat-work-group-size"="1,256"} 
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-indirect.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-indirect.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-indirect.ll
@@ -0,0 +1,67 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-early  %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+
+; Test to check if we skip attributes on address
+; taken functions in a simple call graph.
+
+; CHECK-LABEL: define float @foo(i32 %a) {
+define float @foo(i32 %a) {
+entry:
+  %mul = mul nsw i32 %a, 5
+  %cast = sitofp i32 %mul to float
+
+  ret float %cast
+}
+
+; CHECK-LABEL: define float @bar(i32 %a) {
+define float @bar(i32 %a) {
+entry:
+  %div = sdiv i32 %a, 7
+  %conv = sitofp i32 %div to float
+  ret float %conv
+}
+
+; CHECK-LABEL: define float @baz(i32 %a) {
+define float @baz(i32 %a) {
+entry:
+  %mul = mul nsw i32 %a, 6
+  %div = sdiv i32 %mul, 7
+  %conv = sitofp i32 %div to float
+  ret float %conv
+}
+
+define amdgpu_kernel void @switch_indirect_kernel(float *%result, i32 %type) #1 {
+entry:
+  %fn = alloca float (i32)*, align 8, addrspace(5)
+  switch i32 %type, label %sw.default [
+    i32 1, label %sw.bb
+    i32 2, label %sw.bb2
+    i32 3, label %sw.bb3
+  ]
+
+sw.bb:
+  store float (i32)* @foo, float (i32)* addrspace(5)* %fn, align 8
+  br label %sw.epilog
+
+sw.bb2:
+  store float (i32)* @bar, float (i32)* addrspace(5)* %fn, align 8
+  br label %sw.epilog
+
+sw.bb3:
+  store float (i32)* @baz, float (i32)* addrspace(5)* %fn, align 8
+  br label %sw.epilog
+
+sw.default:
+  br label %sw.epilog
+
+sw.epilog:
+  %fp = load float (i32)*, float (i32)* addrspace(5)* %fn, align 8
+  %call4 = call contract float %fp(i32 7)
+  store float %call4, float* %result, align 4
+  ret void
+}
+
+attributes #1 = { convergent norecurse nounwind mustprogress
+    "amdgpu-flat-work-group-size"="1,256"} 
+