Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -918,20 +918,36 @@
     return false;
   }
 
+  // In most cases TID / wavefrontsize is uniform.
+  //
+  // However, if a kernel has uneven dimensions, the value of workitem-id-x
+  // divided by the wavefrontsize can be non-uniform. For example,
+  // dimensions (65, 2) will have workitems with addresses (64, 0) and (0, 1)
+  // packed into the same wave, which gives 1 and 0 after the division by 64
+  // respectively.
+  //
+  // FIXME: limit it to 1D kernels only, although it should be possible
+  // to perform this optimization if the size of the X dimension is a power
+  // of 2; we just do not currently have the infrastructure to query it.
   using namespace llvm::PatternMatch;
   uint64_t C;
   if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                       m_ConstantInt(C))) ||
       match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
-                      m_ConstantInt(C))))
-    return C >= ST->getWavefrontSizeLog2();
+                      m_ConstantInt(C)))) {
+    const Function *F = cast<Instruction>(V)->getFunction();
+    return C >= ST->getWavefrontSizeLog2() &&
+           ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
+  }
 
   Value *Mask;
   if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                        m_Value(Mask)))) {
-    const DataLayout &DL = cast<Instruction>(V)->getModule()->getDataLayout();
+    const Function *F = cast<Instruction>(V)->getFunction();
+    const DataLayout &DL = F->getParent()->getDataLayout();
     return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
-           ST->getWavefrontSizeLog2();
+               ST->getWavefrontSizeLog2() &&
+           ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
   }
 
   const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
Index: llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
+++ llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
@@ -10,8 +10,8 @@
 
 ; OPT-LABEL: @lshr_threadid
 ; OPT-W64: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
-; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform !0
-define amdgpu_kernel void @lshr_threadid(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) {
+; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform
+define amdgpu_kernel void @lshr_threadid(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !0 {
 entry:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
   %div = lshr i32 %lid, 5
@@ -30,8 +30,8 @@
 
 ; OPT-LABEL: @ashr_threadid
 ; OPT-W64: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
-; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform !0
-define amdgpu_kernel void @ashr_threadid(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) {
+; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform
+define amdgpu_kernel void @ashr_threadid(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !0 {
 entry:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
   %div = ashr i32 %lid, 5
@@ -50,8 +50,96 @@
 
 ; OPT-LABEL: @and_threadid
 ; OPT-W64: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
-; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform !0
-define amdgpu_kernel void @and_threadid(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) {
+; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform
+define amdgpu_kernel void @and_threadid(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !0 {
+entry:
+  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %and = and i32 %lid, -32
+  %div4 = zext i32 %and to i64
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4
+  %load = load i32, ptr addrspace(1) %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %div4
+  store i32 %load, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}lshr_threadid_no_dim_info:
+; GCN: global_load_dword
+
+; OPT-LABEL: @lshr_threadid_no_dim_info
+; OPT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
+define amdgpu_kernel void @lshr_threadid_no_dim_info(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) {
+entry:
+  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %div = lshr i32 %lid, 5
+  %div4 = zext i32 %div to i64
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4
+  %load = load i32, ptr addrspace(1) %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %div4
+  store i32 %load, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}lshr_threadid_2d:
+; GCN: global_load_dword
+
+; OPT-LABEL: @lshr_threadid_2d
+; OPT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
+define amdgpu_kernel void @lshr_threadid_2d(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !1 {
+entry:
+  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %div = lshr i32 %lid, 5
+  %div4 = zext i32 %div to i64
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4
+  %load = load i32, ptr addrspace(1) %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %div4
+  store i32 %load, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}lshr_threadid_3d:
+; GCN: global_load_dword
+
+; OPT-LABEL: @lshr_threadid_3d
+; OPT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
+define amdgpu_kernel void @lshr_threadid_3d(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !2 {
+entry:
+  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %div = lshr i32 %lid, 5
+  %div4 = zext i32 %div to i64
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4
+  %load = load i32, ptr addrspace(1) %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %div4
+  store i32 %load, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}lshr_threadid_1d_uneven:
+; W64: global_load_dword
+; W32: v_readfirstlane_b32 [[OFFSET:s[0-9]+]], v0
+; W32: s_load_dword s{{[0-9]+}}, s[{{[0-9:]+}}], [[OFFSET]]
+
+; OPT-LABEL: @lshr_threadid_1d_uneven
+; OPT-W64: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
+; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform
+define amdgpu_kernel void @lshr_threadid_1d_uneven(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !3 {
+entry:
+  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %div = lshr i32 %lid, 5
+  %div4 = zext i32 %div to i64
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4
+  %load = load i32, ptr addrspace(1) %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %div4
+  store i32 %load, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}and_threadid_2d:
+; GCN: global_load_dword
+
+; OPT-LABEL: @and_threadid_2d
+; OPT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
+define amdgpu_kernel void @and_threadid_2d(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !1 {
 entry:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
   %and = and i32 %lid, -32
@@ -64,3 +152,8 @@
 }
 
 declare i32 @llvm.amdgcn.workitem.id.x()
+
+!0 = !{i32 64, i32 1, i32 1}
+!1 = !{i32 65, i32 2, i32 1}
+!2 = !{i32 64, i32 1, i32 2}
+!3 = !{i32 65, i32 1, i32 1}
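
For reference, the non-uniformity described by the new comment can be checked with a small standalone C++ sketch (not part of the patch). It assumes workitem ids are linearized as flat = x + dimX * y and that a wave holds 64 consecutive flat ids; under that assumption, for dimensions (65, 2) the workitems (64, 0) and (0, 1) land in the same wave while workitem-id-x / 64 evaluates to 1 and 0 respectively:

#include <cstdio>

int main() {
  // Dimensions from the (65, 2) example in the comment; wave size 64 assumed.
  const unsigned DimX = 65, DimY = 2, WaveSize = 64;
  for (unsigned Y = 0; Y < DimY; ++Y) {
    for (unsigned X = 0; X < DimX; ++X) {
      unsigned Flat = X + DimX * Y;   // assumed linearized workitem id
      if (Flat / WaveSize == 1)       // members of wave 1 (flat ids 64..127)
        printf("wave 1: x=%u y=%u  x/64=%u\n", X, Y, X / WaveSize);
    }
  }
  // Wave 1 contains (64, 0) with x/64 == 1 and (0, 1)..(62, 1) with x/64 == 0,
  // so the quotient is not uniform within a single wave.
  return 0;
}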