Index: lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -48,6 +48,7 @@
   const TargetMachine *TM = nullptr;
 
   bool addFeatureAttributes(Function &F);
+  bool propagateAttribute(Function &F);
 
 public:
   static char ID;
@@ -213,6 +214,40 @@
     handleAttr(Parent, Callee, AttrName);
 }
 
+/// Push "uniform-work-group-size" from \p F to its direct callees: a "false"
+/// caller forces "false", any other value marks an unannotated callee "true",
+/// and an unannotated caller demotes an annotated callee to "false". Callees
+/// are re-walked only when changed; values only move unset -> "true" ->
+/// "false", so recursive call chains terminate. \returns true on any change.
+bool AMDGPUAnnotateKernelFeatures::propagateAttribute(Function &F) {
+  static const char Name[] = "uniform-work-group-size";
+  bool Changed = false;
+  for (BasicBlock &BB : F) {
+    for (Instruction &I : BB) {
+      CallSite CS(&I);
+      Function *Callee = CS ? CS.getCalledFunction() : nullptr;
+      if (!Callee) // Not a call site, or an indirect call.
+        continue;
+
+      // Re-read F on every call: the recursion below may have rewritten it.
+      bool CallerHas = F.hasFnAttribute(Name);
+      bool CalleeHas = Callee->hasFnAttribute(Name);
+      bool CallerOn = CallerHas &&
+          !F.getFnAttribute(Name).getValueAsString().equals("false");
+      bool CalleeOff = CalleeHas &&
+          Callee->getFnAttribute(Name).getValueAsString().equals("false");
+      bool Apply = CallerOn ? !CalleeHas
+                            : (!CalleeOff && (CallerHas || CalleeHas));
+      if (!Apply)
+        continue;
+      Callee->addFnAttr(Name, CallerOn ? "true" : "false");
+      Changed = true;
+      propagateAttribute(*Callee); // Bring the callee's own callees in line.
+    }
+  }
+  return Changed;
+}
+
 bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
   const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
   bool HasFlat = ST.hasFlatAddressSpace();
@@ -293,16 +328,12 @@
 }
 
 bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
-  Module &M = SCC.getCallGraph().getModule();
-  Triple TT(M.getTargetTriple());
-
   bool Changed = false;
   for (CallGraphNode *I : SCC) {
     Function *F = I->getFunction();
     if (!F || F->isDeclaration())
       continue;
-
-    Changed |= addFeatureAttributes(*F);
+    Changed |= addFeatureAttributes(*F) | propagateAttribute(*F);
   }
 
   return Changed;
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -667,6 +667,10 @@
 }
 
 void AMDGPUPassConfig::addCodeGenPrepare() {
+
+  if (TM->getTargetTriple().getArch() == Triple::amdgcn)
+    addPass(createAMDGPUAnnotateKernelFeaturesPass());
+
   if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
       EnableLowerKernelArguments)
     addPass(createAMDGPULowerKernelArgumentsPass());
@@ -749,7 +753,6 @@
 
   // FIXME: We need to run a pass to propagate the attributes when calls are
   // supported.
-  addPass(createAMDGPUAnnotateKernelFeaturesPass());
 
   // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
   // regions formed by them.
Index: test/CodeGen/AMDGPU/uniform-work-group-test1.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/uniform-work-group-test1.ll
@@ -0,0 +1,19 @@
+; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=GCN %s 
+
+; A kernel marked "false" pushes "uniform-work-group-size"="false" onto a callee that lacks the attribute.
+; GCN: define void @foo() #[[FOO:[0-9]+]] {
+define void @foo() #0 {
+  ret void
+}
+
+; GCN: define amdgpu_kernel void @kernel1() #[[KERNEL1:[0-9]+]] {
+define amdgpu_kernel void @kernel1() #1 {
+  call void @foo()
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { "uniform-work-group-size"="false" }
+
+; GCN: attributes #[[FOO]] = { nounwind "uniform-work-group-size"="false" }
+; GCN: attributes #[[KERNEL1]] = { "uniform-work-group-size"="false" }
Index: test/CodeGen/AMDGPU/uniform-work-group-test2.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/uniform-work-group-test2.ll
@@ -0,0 +1,19 @@
+; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=GCN %s 
+
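+; A kernel marked "false" pushes "false" onto a callee whose only attribute is the empty string attribute "".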
+; GCN: define void @boo() #[[BOO:[0-9]+]] {
+define void @boo() #2 {
+  ret void
+}
+
+; GCN: define amdgpu_kernel void @kernel2() #[[KERNEL2:[0-9]+]] {
+define amdgpu_kernel void @kernel2() #3 {
+  call void @boo()
+  ret void
+}
+
+attributes #2 = { "" }
+attributes #3 = { "uniform-work-group-size"="false" }
+
+; GCN: attributes #[[BOO]] = { "" "uniform-work-group-size"="false" }
+; GCN: attributes #[[KERNEL2]] = { "uniform-work-group-size"="false" }
Index: test/CodeGen/AMDGPU/uniform-work-group-test3.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/uniform-work-group-test3.ll
@@ -0,0 +1,27 @@
+; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=GCN %s 
+
+; "true" on the kernel must reach @foo through the intermediate, unannotated @boo.
+; GCN: define void @foo() #[[FOO:[0-9]+]] {
+define void @foo() #0 {
+  ret void
+}
+
+; GCN: define void @boo() #[[BOO:[0-9]+]] {
+define void @boo() #1 {
+  call void @foo()
+  ret void
+}
+
+; GCN: define amdgpu_kernel void @kernel3() #[[KERNEL3:[0-9]+]] {
+define amdgpu_kernel void @kernel3() #2 {
+  call void @boo()
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { "" }
+attributes #2 = { "uniform-work-group-size"="true" }
+
+; GCN: attributes #[[FOO]] = { nounwind "uniform-work-group-size"="true" }
+; GCN: attributes #[[BOO]] = { "" "uniform-work-group-size"="true" }
+; GCN: attributes #[[KERNEL3]] = { "uniform-work-group-size"="true" }
Index: test/CodeGen/AMDGPU/uniform-work-group-test4.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/uniform-work-group-test4.ll
@@ -0,0 +1,27 @@
+; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=GCN %s 
+
+; @foo starts out "true" but must end up "false": it is reached from a "false" kernel through @boo.
+; GCN: define void @foo() #[[FOO:[0-9]+]] {
+define void @foo() #0 {
+  ret void
+}
+
+; GCN: define void @boo() #[[BOO:[0-9]+]] {
+define void @boo() #1 {
+  call void @foo()
+  ret void
+}
+
+; GCN: define amdgpu_kernel void @kernel4() #[[KERNEL4:[0-9]+]] {
+define amdgpu_kernel void @kernel4() #2 {
+  call void @boo()
+  ret void
+}
+
+attributes #0 = { nounwind "uniform-work-group-size"="true"}
+attributes #1 = { "" }
+attributes #2 = { "uniform-work-group-size"="false" }
+
+; GCN: attributes #[[FOO]] = { nounwind "uniform-work-group-size"="false" }
+; GCN: attributes #[[BOO]] = { "" "uniform-work-group-size"="false" }
+; GCN: attributes #[[KERNEL4]] = { "uniform-work-group-size"="false" }
Index: test/CodeGen/AMDGPU/uniform-work-group-test5.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/uniform-work-group-test5.ll
@@ -0,0 +1,27 @@
+; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=GCN %s 
+
+; @foo is called from a "true" kernel and from an unannotated kernel, so it must end up "false".
+; GCN: define void @foo() #[[FOO:[0-9]+]] {
+define void @foo() #0 {
+  ret void
+}
+
+; GCN: define amdgpu_kernel void @kernel1() #[[KERNEL1:[0-9]+]] {
+define amdgpu_kernel void @kernel1() #1 {
+  call void @foo()
+  ret void
+}
+
+; GCN: define amdgpu_kernel void @kernel2() #[[KERNEL2:[0-9]+]] {
+define amdgpu_kernel void @kernel2() #2 {
+  call void @foo()
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { "uniform-work-group-size"="true" }
+attributes #2 = { "" }
+
+; GCN: attributes #[[FOO]] = { nounwind "uniform-work-group-size"="false" }
+; GCN: attributes #[[KERNEL1]] = { "uniform-work-group-size"="true" }
+; GCN: attributes #[[KERNEL2]] = { "" }