Index: lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -237,6 +237,27 @@
           continue;
         }
 
+        // On HSA, push a kernel's "uniform-work-group-size" attribute down to
+        // the functions it calls.
+        if (ST.isAmdHsaOS() &&
+            F.getCallingConv() == CallingConv::AMDGPU_KERNEL &&
+            F.hasFnAttribute("uniform-work-group-size")) {
+          // A callee with no value yet, or with "true", takes the kernel's
+          // value; any other explicit value (notably "false") is kept.
+          bool TakesKernelValue = true;
+          if (Callee->hasFnAttribute("uniform-work-group-size")) {
+            Attribute CalleeAttr =
+                Callee->getFnAttribute("uniform-work-group-size");
+            TakesKernelValue = CalleeAttr.getValueAsString() == "true";
+          }
+
+          // Deliberately no `continue` for the skipped case: the call must
+          // still reach the HaveCall bookkeeping below.
+          if (TakesKernelValue)
+            Callee->addFnAttr(F.getFnAttribute("uniform-work-group-size"));
+        }
+
         Intrinsic::ID IID = Callee->getIntrinsicID();
         if (IID == Intrinsic::not_intrinsic) {
           HaveCall = true;
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -667,6 +667,10 @@
 }
 
 void AMDGPUPassConfig::addCodeGenPrepare() {
+  // Annotate kernel features up front, before kernel arguments are lowered.
+  if (TM->getTargetTriple().getArch() == Triple::amdgcn)
+    addPass(createAMDGPUAnnotateKernelFeaturesPass());
+
   if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
       EnableLowerKernelArguments)
     addPass(createAMDGPULowerKernelArgumentsPass());
@@ -749,7 +753,3 @@
 
-  // FIXME: We need to run a pass to propagate the attributes when calls are
-  // supported.
-  addPass(createAMDGPUAnnotateKernelFeaturesPass());
-
   // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
   // regions formed by them.
Index: test/CodeGen/AMDGPU/uniform-workgroup-test1.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/uniform-workgroup-test1.ll
@@ -0,0 +1,48 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck %s
+
+; Check propagation of "uniform-work-group-size" from kernels to the functions
+; they call when several kernels share a callee.
+
+; @foo starts as "true" but is also called from @kernel2, which is "false", so
+; the kernel's value is expected to replace it.
+; CHECK: define void @foo() #[[FOO:[0-9]+]]
+define void @foo() #0 {
+  ret void
+}
+
+; @boo has no value of its own and is only called from "true" kernels.
+; CHECK: define void @boo() #[[BOO:[0-9]+]]
+define void @boo() #1 {
+  ret void
+}
+
+; CHECK: define amdgpu_kernel void @kernel1() #[[KERNEL1:[0-9]+]]
+define amdgpu_kernel void @kernel1() #2 {
+  call void @foo()
+  call void @boo()
+  ret void
+}
+
+; CHECK: define amdgpu_kernel void @kernel2() #[[KERNEL2:[0-9]+]]
+define amdgpu_kernel void @kernel2() #3 {
+  call void @foo()
+  ret void
+}
+
+; CHECK: define amdgpu_kernel void @kernel3() #[[KERNEL3:[0-9]+]]
+define amdgpu_kernel void @kernel3() #4 {
+  call void @boo()
+  ret void
+}
+
+attributes #0 = { nounwind "uniform-work-group-size"="true" }
+attributes #1 = { nounwind "amdgpu-dispatch-ptr" }
+attributes #2 = { "amdgpu-work-item-id-y" "uniform-work-group-size"="true" }
+attributes #3 = { "uniform-work-group-size"="false" }
+attributes #4 = { "uniform-work-group-size"="true" }
+
+; CHECK-DAG: attributes #[[FOO]] = {{.*}}"uniform-work-group-size"="false"
+; CHECK-DAG: attributes #[[BOO]] = {{.*}}"uniform-work-group-size"="true"
+; CHECK-DAG: attributes #[[KERNEL1]] = {{.*}}"uniform-work-group-size"="true"
+; CHECK-DAG: attributes #[[KERNEL2]] = {{.*}}"uniform-work-group-size"="false"
+; CHECK-DAG: attributes #[[KERNEL3]] = {{.*}}"uniform-work-group-size"="true"
Index: test/CodeGen/AMDGPU/uniform-workgroup-test2.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/uniform-workgroup-test2.ll
@@ -0,0 +1,23 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck %s
+
+; A callee that is explicitly "uniform-work-group-size"="false" is expected to
+; keep that value when it is called from a kernel marked "true".
+
+; CHECK: define void @func{{.*}} #[[FUNC:[0-9]+]]
+define void @func(i32 %a, i32 addrspace(1)* %r) #0 {
+  store i32 %a, i32 addrspace(1)* %r
+  ret void
+}
+
+; CHECK: define amdgpu_kernel void @kernel{{.*}} #[[KERNEL:[0-9]+]]
+define amdgpu_kernel void @kernel(i32 addrspace(1)* %a, i32 addrspace(1)* %r) #1 {
+  %a.val = load i32, i32 addrspace(1)* %a
+  call void @func(i32 %a.val, i32 addrspace(1)* %r)
+  ret void
+}
+
+attributes #0 = { nounwind "uniform-work-group-size"="false" }
+attributes #1 = { "amdgpu-work-item-id-y" "uniform-work-group-size"="true" }
+
+; CHECK-DAG: attributes #[[FUNC]] = {{.*}}"uniform-work-group-size"="false"
+; CHECK-DAG: attributes #[[KERNEL]] = {{.*}}"uniform-work-group-size"="true"