Index: lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -48,6 +48,13 @@
   const TargetMachine *TM = nullptr;

   bool addFeatureAttributes(Function &F);
+  bool processUniformWorkGroupAttribute();
+  bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);
+
+  // Call-graph nodes accumulated across runOnSCC invocations in bottom-up
+  // order (callees before callers).  Flushed and cleared when the graph
+  // root (a node with no references) is reached, so no state leaks across
+  // modules or pass runs.
+  SmallVector<CallGraphNode *, 8> NodeList;

 public:
   static char ID;
@@ -213,6 +220,62 @@
     handleAttr(Parent, Callee, AttrName);
 }

+// Walk the accumulated nodes caller-first and propagate the
+// "uniform-work-group-size" attribute along every call edge.  NodeList was
+// filled in SCC (bottom-up) order, so the reverse walk visits callers before
+// their callees and the attribute flows from kernels down to leaf functions.
+//
+// Returns true if any function attribute was changed.
+bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
+  bool Changed = false;
+
+  for (CallGraphNode *Node : llvm::reverse(NodeList)) {
+    Function *Caller = Node->getFunction();
+    // Artificial nodes (external-calling / calls-external) have no function.
+    if (!Caller)
+      continue;
+
+    for (const auto &Edge : *Node) {
+      // Edges to the artificial "calls external" node also yield a null
+      // function; skip them instead of dereferencing a missing entry.
+      if (Function *Callee = Edge.second->getFunction())
+        Changed |= propagateUniformWorkGroupAttribute(*Caller, *Callee);
+    }
+  }
+
+  return Changed;
+}
+
+// Propagate the attribute along a single caller -> callee edge.  Returns true
+// if either function was modified.
+bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
+    Function &Caller, Function &Callee) {
+  if (!Caller.hasFnAttribute("uniform-work-group-size")) {
+    // A caller without the attribute cannot guarantee a uniform work-group
+    // size, so conservatively pin both ends of the edge to "false".
+    Caller.addFnAttr("uniform-work-group-size", "false");
+    Callee.addFnAttr("uniform-work-group-size", "false");
+    return true;
+  }
+
+  if (Caller.getFnAttribute("uniform-work-group-size")
+          .getValueAsString() == "true") {
+    // Seed the callee only if it does not already carry a value of its own.
+    if (!Callee.hasFnAttribute("uniform-work-group-size")) {
+      Callee.addFnAttr("uniform-work-group-size", "true");
+      return true;
+    }
+    return false;
+  }
+
+  // Caller is explicitly "false": the callee may be reached with non-uniform
+  // work-group sizes, so force it to "false" as well.
+  Callee.addFnAttr("uniform-work-group-size", "false");
+  return true;
+}
+
 bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
   const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
   bool HasFlat = ST.hasFlatAddressSpace();
@@ -293,18 +356,22 @@
 }

 bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
-  Module &M = SCC.getCallGraph().getModule();
-  Triple TT(M.getTargetTriple());
-
   bool Changed = false;
+
   for (CallGraphNode *I : SCC) {
-    Function *F = I->getFunction();
+    // Defer attribute propagation until a node with no references is seen
+    // (the call graph's root); by then every caller has been recorded.
+    if (I->getNumReferences())
+      NodeList.push_back(I);
+    else {
+      Changed |= processUniformWorkGroupAttribute();
+      NodeList.clear();
+    }
+
+    Function *F = I->getFunction();
+    // Add feature attributes.
     if (!F || F->isDeclaration())
       continue;
-
     Changed |= addFeatureAttributes(*F);
   }
-
   return Changed;
 }
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
---
lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -680,6 +680,10 @@
 }

 void AMDGPUPassConfig::addCodeGenPrepare() {
+  // Annotate kernel features (including the propagated
+  // uniform-work-group-size attribute) before kernel-argument lowering.
+  if (TM->getTargetTriple().getArch() == Triple::amdgcn)
+    addPass(createAMDGPUAnnotateKernelFeaturesPass());
+
   if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
       EnableLowerKernelArguments)
     addPass(createAMDGPULowerKernelArgumentsPass());
@@ -767,7 +771,6 @@

   // FIXME: We need to run a pass to propagate the attributes when calls are
   // supported.
-  addPass(createAMDGPUAnnotateKernelFeaturesPass());

   // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
   // regions formed by them.
Index: test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
===================================================================
--- test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
+++ test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
@@ -244,52 +244,52 @@
   ret void
 }

-; HSA: define void @use_implicitarg_ptr() #15 {
+; HSA: define void @use_implicitarg_ptr() #16 {
 define void @use_implicitarg_ptr() #1 {
   %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
   store volatile i8 addrspace(4)* %implicitarg.ptr, i8 addrspace(4)* addrspace(1)* undef
   ret void
 }

-; HSA: define void @func_indirect_use_implicitarg_ptr() #15 {
+; HSA: define void @func_indirect_use_implicitarg_ptr() #16 {
 define void @func_indirect_use_implicitarg_ptr() #1 {
   call void @use_implicitarg_ptr()
   ret void
 }

-; HSA: declare void @external.func() #16
+; HSA: declare void @external.func() #17
 declare void @external.func() #3

-; HSA: define internal void @defined.func() #16 {
+; HSA: define internal void @defined.func() #17 {
 define internal void @defined.func() #3 {
   ret void
 }

-; HSA: define void @func_call_external() #16 {
+; HSA: define void @func_call_external() #17 {
 define void @func_call_external() #3 {
   call void @external.func()
   ret void
 }

-; HSA: define void @func_call_defined() #16 {
+; HSA: define void @func_call_defined() #17 {
 define void @func_call_defined() #3 {
   call void @defined.func()
   ret void
 }

-; HSA: define void @func_call_asm() #16 {
+; HSA: define void @func_call_asm() #17 {
 define void @func_call_asm() #3 {
   call void asm sideeffect "", ""() #3
   ret void
 }

-; HSA: define amdgpu_kernel void @kern_call_external() #17 {
+; HSA: define amdgpu_kernel void @kern_call_external() #18 {
 define amdgpu_kernel void @kern_call_external() #3 {
   call void @external.func()
   ret void
 }

-; HSA: define amdgpu_kernel void @func_kern_defined() #17 {
+; HSA: define amdgpu_kernel void @func_kern_defined() #18 {
 define amdgpu_kernel void @func_kern_defined() #3 {
   call void @defined.func()
   ret void
@@ -301,20 +301,21 @@
 attributes #3 = { nounwind }

 ; HSA: attributes #0 = { nounwind readnone speculatable }
-; HSA: attributes #1 = { nounwind "amdgpu-work-item-id-x" "target-cpu"="fiji" }
-; HSA: attributes #2 = { nounwind "amdgpu-work-item-id-y" "target-cpu"="fiji" }
-; HSA: attributes #3 = { nounwind "amdgpu-work-item-id-z" "target-cpu"="fiji" }
-; HSA: attributes #4 = { nounwind "amdgpu-work-group-id-x" "target-cpu"="fiji" }
-; HSA: attributes #5 = { nounwind "amdgpu-work-group-id-y" "target-cpu"="fiji" }
-; HSA: attributes #6 = { nounwind "amdgpu-work-group-id-z" "target-cpu"="fiji" }
-; HSA: attributes #7 = { nounwind "amdgpu-dispatch-ptr" "target-cpu"="fiji" }
-; HSA: attributes #8 = { nounwind "amdgpu-queue-ptr" "target-cpu"="fiji" }
-; HSA: attributes #9 = { nounwind "amdgpu-dispatch-id" "target-cpu"="fiji" }
+; HSA: attributes #1 = { nounwind "amdgpu-work-item-id-x" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; HSA: attributes #2 = { nounwind "amdgpu-work-item-id-y" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; HSA: attributes #3 = { nounwind "amdgpu-work-item-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; HSA: attributes #4 = { nounwind "amdgpu-work-group-id-x" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; HSA: attributes #5 = { nounwind "amdgpu-work-group-id-y" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; HSA: attributes #6 = { nounwind "amdgpu-work-group-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; HSA: attributes #7 = { nounwind "amdgpu-dispatch-ptr" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; HSA: attributes #8 = { nounwind "amdgpu-queue-ptr" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; HSA: attributes #9 = { nounwind "amdgpu-dispatch-id" "target-cpu"="fiji" "uniform-work-group-size"="false" }
 ; HSA: attributes #10 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "target-cpu"="fiji" }
-; HSA: attributes #11 = { nounwind "target-cpu"="fiji" }
-; HSA: attributes #12 = { nounwind "target-cpu"="gfx900" }
-; HSA: attributes #13 = { nounwind "amdgpu-queue-ptr" "target-cpu"="gfx900" }
-; HSA: attributes #14 = { nounwind "amdgpu-kernarg-segment-ptr" "target-cpu"="fiji" }
+; HSA: attributes #11 = { nounwind "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; HSA: attributes #12 = { nounwind "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; HSA: attributes #13 = { nounwind "amdgpu-queue-ptr" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; HSA: attributes #14 = { nounwind "amdgpu-kernarg-segment-ptr" "target-cpu"="fiji" "uniform-work-group-size"="false" }
 ; HSA: attributes #15 = { nounwind "amdgpu-implicitarg-ptr" "target-cpu"="fiji" }
-; HSA: attributes #16 = { nounwind }
-; HSA: attributes #17 = { nounwind "amdgpu-flat-scratch" }
+; HSA: attributes #16 = { nounwind "amdgpu-implicitarg-ptr" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; HSA: attributes #17 = { nounwind "uniform-work-group-size"="false" }
+; HSA: attributes #18 = { nounwind "amdgpu-flat-scratch" "uniform-work-group-size"="false" }
Index: test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll
@@ -0,0 +1,20 @@
+; RUN: opt -S
-mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck %s
+
+; Test 1
+; "If the kernel does not have the uniform-work-group-size attribute, set both caller and callee to false"
+; CHECK: define void @foo() #[[FOO:[0-9]+]] {
+define void @foo() #0 {
+  ret void
+}
+
+; kernel1 deliberately carries no attribute group: referencing an undefined
+; group would be rejected by the IR parser.
+; CHECK: define amdgpu_kernel void @kernel1() #[[FOO]] {
+define amdgpu_kernel void @kernel1() {
+  call void @foo()
+  ret void
+}
+
+attributes #0 = { "uniform-work-group-size"="true" }
+
+; CHECK: attributes #[[FOO]] = { "uniform-work-group-size"="false" }
Index: test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll
@@ -0,0 +1,24 @@
+; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck %s
+
+; Test 3
+; "Verify that the attribute is propagated across nested function calls"
+; CHECK: define void @func1() #[[FUNC:[0-9]+]] {
+define void @func1() {
+  ret void
+}
+
+; CHECK: define void @func2() #[[FUNC]] {
+define void @func2() {
+  call void @func1()
+  ret void
+}
+
+; All three functions end up in the same attribute group, so reuse [[FUNC]]
+; instead of re-capturing it.
+; CHECK: define amdgpu_kernel void @kernel3() #[[FUNC]] {
+define amdgpu_kernel void @kernel3() #2 {
+  call void @func2()
+  ret void
+}
+
+attributes #2 = { "uniform-work-group-size"="true" }
+
+; CHECK: attributes #[[FUNC]] = { "uniform-work-group-size"="true" }
Index: test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll
@@ -0,0 +1,26 @@
+; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck %s
+
+; Test 5
+; "Two kernels with different values of the uniform-work-group-size attribute call the same function"
+; CHECK: define void @func() #[[FUNC:[0-9]+]] {
+define void @func() {
+  ret void
+}
+
+; CHECK: define amdgpu_kernel void @kernel1() #[[KERNEL1:[0-9]+]] {
+define amdgpu_kernel void @kernel1() #1 {
+  call void @func()
+  ret void
+}
+
+; kernel2 has no attribute, so it and @func must both end up "false".
+; CHECK: define amdgpu_kernel void @kernel2() #[[FUNC]] {
+define amdgpu_kernel void @kernel2() {
+  call void @func()
+  ret void
+}
+
+attributes #1 = { "uniform-work-group-size"="true" }
+
+; CHECK: attributes #[[FUNC]] = { "uniform-work-group-size"="false" }
+; CHECK: attributes #[[KERNEL1]] = { "uniform-work-group-size"="true" }
Index: test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll
@@ -0,0 +1,20 @@
+; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck %s
+
+; Test 2
+; "Propagate the uniform-work-group-size attribute from the kernel to a callee that lacks it"
+; CHECK: define void @func() #[[FUNC:[0-9]+]] {
+define void @func() #0 {
+  ret void
+}
+
+; CHECK: define amdgpu_kernel void @kernel() #[[KERNEL:[0-9]+]] {
+define amdgpu_kernel void @kernel() #1 {
+  call void @func()
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { "uniform-work-group-size"="false" }
+
+; CHECK: attributes #[[FUNC]] = { nounwind "uniform-work-group-size"="false" }
+; CHECK: attributes #[[KERNEL]] = { "uniform-work-group-size"="false" }