Index: llvm/lib/Analysis/UniformityAnalysis.cpp
===================================================================
--- llvm/lib/Analysis/UniformityAnalysis.cpp
+++ llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -120,7 +120,7 @@
   auto &CI = FAM.getResult<CycleAnalysis>(F);
   UniformityInfo UI{F, DT, CI, &TTI};
   // Skip computation if we can assume everything is uniform.
-  if (TTI.hasBranchDivergence())
+  if (TTI.hasBranchDivergence(&F))
     UI.compute();
 
   return UI;
@@ -175,7 +175,7 @@
       UniformityInfo{F, domTree, cycleInfo, &targetTransformInfo};
 
   // Skip computation if we can assume everything is uniform.
-  if (targetTransformInfo.hasBranchDivergence())
+  if (targetTransformInfo.hasBranchDivergence(m_function))
     m_uniformityInfo.compute();
 
   return false;
Index: llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp
@@ -14,6 +14,7 @@
 
 #include "AMDGPURegBankSelect.h"
 #include "AMDGPU.h"
+#include "GCNSubtarget.h"
 #include "llvm/CodeGen/MachineUniformityAnalysis.h"
 #include "llvm/InitializePasses.h"
 
@@ -59,13 +60,14 @@
 
   assert(checkFunctionIsLegal(MF));
 
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   MachineCycleInfo &CycleInfo =
       getAnalysis<MachineCycleInfoWrapperPass>().getCycleInfo();
   MachineDominatorTree &DomTree = getAnalysis<MachineDominatorTree>();
 
-  // TODO: Check for single lane execution.
   MachineUniformityInfo Uniformity =
-      computeMachineUniformityInfo(MF, CycleInfo, DomTree.getBase(), true);
+      computeMachineUniformityInfo(MF, CycleInfo, DomTree.getBase(),
+                                   !ST.isSingleLaneExecution(F));
   (void)Uniformity; // TODO: Use this
 
   assignRegisterBanks(MF);
Index: llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -269,6 +269,9 @@
   /// 2) dimension.
   unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
 
+  /// Return true if only a single workitem can be active in a wave.
+  bool isSingleLaneExecution(const Function &Kernel) const;
+
   /// Creates value range metadata on an workitemid.* intrinsic call or load.
   bool makeLIDRangeMetadata(Instruction *I) const;
 
Index: llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -469,6 +469,16 @@
   return getFlatWorkGroupSizes(Kernel).second - 1;
 }
 
+bool AMDGPUSubtarget::isSingleLaneExecution(const Function &Func) const {
+  // Any dimension with a nonzero maximum workitem ID admits a second workitem.
+  for (unsigned Dim = 0; Dim < 3; ++Dim) {
+    if (getMaxWorkitemID(Func, Dim) > 0)
+      return false;
+  }
+
+  return true;
+}
+
 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
   Function *Kernel = I->getParent()->getParent();
   unsigned MinSize = 0;
Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -298,7 +298,7 @@
 }
 
 bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
-  return true;
+  return !F || !ST->isSingleLaneExecution(*F);
 }
 
 unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
Index: llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll
===================================================================
--- llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll
+++ llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll
@@ -52,6 +52,12 @@
   ret void
 }
 
+; CHECK-LABEL: for function 'single_lane_func_arguments':
+; CHECK-NOT: DIVERGENT
+define void @single_lane_func_arguments(i32 %i32, i1 %i1) #2 {
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 declare i32 @llvm.amdgcn.readfirstlane(i32) #0
 declare i64 @llvm.amdgcn.icmp.i32(i32, i32, i32) #1
@@ -60,3 +66,4 @@
 
 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind readnone convergent }
+attributes #2 = { "amdgpu-flat-work-group-size"="1,1" }
Index: llvm/test/Analysis/UniformityAnalysis/AMDGPU/workitem-intrinsics.ll
===================================================================
--- llvm/test/Analysis/UniformityAnalysis/AMDGPU/workitem-intrinsics.ll
+++ llvm/test/Analysis/UniformityAnalysis/AMDGPU/workitem-intrinsics.ll
@@ -41,5 +41,83 @@
   ret void
 }
 
+; CHECK-LABEL: UniformityInfo for function 'workitem_id_x_singlethreaded':
+; CHECK-NOT: DIVERGENT
+define amdgpu_kernel void @workitem_id_x_singlethreaded() #2 {
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  store volatile i32 %id.x, ptr addrspace(1) undef
+  ret void
+}
+
+; CHECK-LABEL: UniformityInfo for function 'workitem_id_y_singlethreaded':
+; CHECK-NOT: DIVERGENT
+define amdgpu_kernel void @workitem_id_y_singlethreaded() #2 {
+  %id.y = call i32 @llvm.amdgcn.workitem.id.y()
+  store volatile i32 %id.y, ptr addrspace(1) undef
+  ret void
+}
+
+; CHECK-LABEL: UniformityInfo for function 'workitem_id_z_singlethreaded':
+; CHECK-NOT: DIVERGENT
+define amdgpu_kernel void @workitem_id_z_singlethreaded() #2 {
+  %id.z = call i32 @llvm.amdgcn.workitem.id.z()
+  store volatile i32 %id.z, ptr addrspace(1) undef
+  ret void
+}
+
+; CHECK-LABEL: UniformityInfo for function 'workitem_id_x_singlethreaded_md':
+; CHECK-NOT: DIVERGENT
+define amdgpu_kernel void @workitem_id_x_singlethreaded_md() !reqd_work_group_size !0 {
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  store volatile i32 %id.x, ptr addrspace(1) undef
+  ret void
+}
+
+; CHECK-LABEL: UniformityInfo for function 'workitem_id_y_singlethreaded_md':
+; CHECK-NOT: DIVERGENT
+define amdgpu_kernel void @workitem_id_y_singlethreaded_md() !reqd_work_group_size !0 {
+  %id.y = call i32 @llvm.amdgcn.workitem.id.y()
+  store volatile i32 %id.y, ptr addrspace(1) undef
+  ret void
+}
+
+; CHECK-LABEL: UniformityInfo for function 'workitem_id_z_singlethreaded_md':
+; CHECK-NOT: DIVERGENT
+define amdgpu_kernel void @workitem_id_z_singlethreaded_md() !reqd_work_group_size !0 {
+  %id.z = call i32 @llvm.amdgcn.workitem.id.z()
+  store volatile i32 %id.z, ptr addrspace(1) undef
+  ret void
+}
+
+; CHECK-LABEL: UniformityInfo for function 'workitem_id_x_not_singlethreaded_dimx':
+; CHECK: DIVERGENT: %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+define amdgpu_kernel void @workitem_id_x_not_singlethreaded_dimx() !reqd_work_group_size !1 {
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  store volatile i32 %id.x, ptr addrspace(1) undef
+  ret void
+}
+
+; CHECK-LABEL: UniformityInfo for function 'workitem_id_x_not_singlethreaded_dimy':
+; CHECK: DIVERGENT: %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+define amdgpu_kernel void @workitem_id_x_not_singlethreaded_dimy() !reqd_work_group_size !2 {
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  store volatile i32 %id.x, ptr addrspace(1) undef
+  ret void
+}
+
+; CHECK-LABEL: UniformityInfo for function 'workitem_id_x_not_singlethreaded_dimz':
+; CHECK: DIVERGENT: %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+define amdgpu_kernel void @workitem_id_x_not_singlethreaded_dimz() !reqd_work_group_size !3 {
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  store volatile i32 %id.x, ptr addrspace(1) undef
+  ret void
+}
+
 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind }
+attributes #2 = { "amdgpu-flat-work-group-size"="1,1" }
+
+!0 = !{i32 1, i32 1, i32 1}
+!1 = !{i32 2, i32 1, i32 1}
+!2 = !{i32 1, i32 2, i32 1}
+!3 = !{i32 1, i32 1, i32 2}