diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -81,10 +81,37 @@
   return M.getFunction(Name);
 }
 
-} // end anonymous namespace
+LoadInst *getUniqueSimpleLoadUser(Value *V) {
+  if (LoadInst *L = dyn_cast<LoadInst>(V)) {
+    return L->isSimple() ? L : nullptr;
+  }
+
+  if (!V->getType()->isPointerTy()) {
+    return nullptr;
+  }
+
+  LoadInst *UniqueLoad = nullptr;
+  for (User *U : V->users()) {
+    if (LoadInst *L = getUniqueSimpleLoadUser(U)) {
+      if (UniqueLoad && UniqueLoad != L)
+        return nullptr;
+      UniqueLoad = L;
+    }
+  }
+  return UniqueLoad;
+}
+
+auto m_MatchAnyOf(ArrayRef<Value *> V) {
+  struct anyofval_ty {
+    const ArrayRef<Value *> Vals;
+    anyofval_ty(const ArrayRef<Value *> V) : Vals(V) {}
+    bool match(Value *V) { return is_contained(Vals, V); }
+  };
+  return anyofval_ty(V);
+}
 
 static bool processUse(CallInst *CI, bool IsV5OrAbove) {
-  Function *F = CI->getParent()->getParent();
+  Function *F = CI->getFunction();
 
   auto MD = F->getMetadata("reqd_work_group_size");
   const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;
@@ -95,107 +122,87 @@
   if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize)
     return false;
 
-  Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
-  Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
-  Value *Remainders[3] = {nullptr, nullptr, nullptr};
-  Value *GridSizes[3] = {nullptr, nullptr, nullptr};
+  SmallVector<Value *> BlockCounts[3], GroupSizes[3], Remainders,
+      GridSizes[3];
 
   const DataLayout &DL = F->getParent()->getDataLayout();
 
   // We expect to see several GEP users, casted to the appropriate type and
   // loaded.
   for (User *U : CI->users()) {
-    if (!U->hasOneUse())
+    LoadInst *Load = getUniqueSimpleLoadUser(U);
+    if (!Load)
       continue;
 
-    int64_t Offset = 0;
-    auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr/DispatchPtr?
-    auto *BCI = dyn_cast<BitCastInst>(U);
-    if (!Load && !BCI) {
-      if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
-        continue;
-      Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
-      BCI = dyn_cast<BitCastInst>(*U->user_begin());
-    }
-
-    if (BCI) {
-      if (!BCI->hasOneUse())
-        continue;
-      Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load from BCI?
-    }
-
-    if (!Load || !Load->isSimple())
+    APInt Offset(64, 0U);
+    if (Load != U &&
+        U->stripAndAccumulateConstantOffsets(DL, Offset, true) != CI)
       continue;
 
     unsigned LoadSize = DL.getTypeStoreSize(Load->getType());
 
     // TODO: Handle merged loads.
+    auto const OffsetValue = Offset.getSExtValue();
     if (IsV5OrAbove) { // Base is ImplicitArgPtr.
-      switch (Offset) {
+      switch (OffsetValue) {
       case HIDDEN_BLOCK_COUNT_X:
         if (LoadSize == 4)
-          BlockCounts[0] = Load;
+          BlockCounts[0].push_back(Load);
         break;
       case HIDDEN_BLOCK_COUNT_Y:
         if (LoadSize == 4)
-          BlockCounts[1] = Load;
+          BlockCounts[1].push_back(Load);
         break;
       case HIDDEN_BLOCK_COUNT_Z:
        if (LoadSize == 4)
-          BlockCounts[2] = Load;
+          BlockCounts[2].push_back(Load);
         break;
       case HIDDEN_GROUP_SIZE_X:
         if (LoadSize == 2)
-          GroupSizes[0] = Load;
+          GroupSizes[0].push_back(Load);
         break;
       case HIDDEN_GROUP_SIZE_Y:
         if (LoadSize == 2)
-          GroupSizes[1] = Load;
+          GroupSizes[1].push_back(Load);
         break;
       case HIDDEN_GROUP_SIZE_Z:
         if (LoadSize == 2)
-          GroupSizes[2] = Load;
+          GroupSizes[2].push_back(Load);
         break;
       case HIDDEN_REMAINDER_X:
-        if (LoadSize == 2)
-          Remainders[0] = Load;
-        break;
       case HIDDEN_REMAINDER_Y:
-        if (LoadSize == 2)
-          Remainders[1] = Load;
-        break;
       case HIDDEN_REMAINDER_Z:
         if (LoadSize == 2)
-          Remainders[2] = Load;
+          Remainders.push_back(Load);
         break;
       default:
         break;
       }
     } else { // Base is DispatchPtr.
-      switch (Offset) {
+      switch (OffsetValue) {
      case WORKGROUP_SIZE_X:
         if (LoadSize == 2)
-          GroupSizes[0] = Load;
+          GroupSizes[0].push_back(Load);
         break;
       case WORKGROUP_SIZE_Y:
         if (LoadSize == 2)
-          GroupSizes[1] = Load;
+          GroupSizes[1].push_back(Load);
         break;
       case WORKGROUP_SIZE_Z:
         if (LoadSize == 2)
-          GroupSizes[2] = Load;
+          GroupSizes[2].push_back(Load);
         break;
       case GRID_SIZE_X:
         if (LoadSize == 4)
-          GridSizes[0] = Load;
+          GridSizes[0].push_back(Load);
         break;
       case GRID_SIZE_Y:
         if (LoadSize == 4)
-          GridSizes[1] = Load;
+          GridSizes[1].push_back(Load);
         break;
       case GRID_SIZE_Z:
         if (LoadSize == 4)
-          GridSizes[2] = Load;
+          GridSizes[2].push_back(Load);
         break;
       default:
         break;
@@ -213,31 +220,29 @@
   // workgroup_id < hidden_block_count as true, and thus hidden_group_size is returned
   // for __ockl_get_local_size.
   for (int I = 0; I < 3; ++I) {
-    Value *BlockCount = BlockCounts[I];
-    if (!BlockCount)
-      continue;
-
     using namespace llvm::PatternMatch;
     auto GroupIDIntrin =
         I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
               : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                          : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
 
-    for (User *ICmp : BlockCount->users()) {
-      ICmpInst::Predicate Pred;
-      if (match(ICmp, m_ICmp(Pred, GroupIDIntrin, m_Specific(BlockCount)))) {
-        if (Pred != ICmpInst::ICMP_ULT)
-          continue;
-        ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType()));
-        MadeChange = true;
+    for (Value *BlockCount : BlockCounts[I]) {
+      for (User *ICmp : BlockCount->users()) {
+        ICmpInst::Predicate Pred;
+        if (match(ICmp,
+                  m_ICmp(Pred, GroupIDIntrin, m_Specific(BlockCount)))) {
+          if (Pred != ICmpInst::ICMP_ULT)
+            continue;
+          ICmp->replaceAllUsesWith(
+              llvm::ConstantInt::getTrue(ICmp->getType()));
+          MadeChange = true;
+        }
       }
     }
   }
 
   // All remainders should be 0 with uniform work group size.
   for (Value *Remainder : Remainders) {
-    if (!Remainder)
-      continue;
     Remainder->replaceAllUsesWith(Constant::getNullValue(Remainder->getType()));
     MadeChange = true;
   }
@@ -262,37 +267,38 @@
   // condition is false (except for group_id == 0, where the select result is
   // the same).
   for (int I = 0; I < 3; ++I) {
-    Value *GroupSize = GroupSizes[I];
-    Value *GridSize = GridSizes[I];
-    if (!GroupSize || !GridSize)
-      continue;
-
     using namespace llvm::PatternMatch;
     auto GroupIDIntrin =
         I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
               : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                          : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
+    auto MatchGroupSizesZExt = m_ZExt(m_MatchAnyOf(GroupSizes[I]));
 
-    for (User *U : GroupSize->users()) {
-      auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
-      if (!ZextGroupSize)
+    for (Value *GridSize : GridSizes[I]) {
+      if (GroupSizes[I].empty())
         continue;
 
-      for (User *UMin : ZextGroupSize->users()) {
-        if (match(UMin,
-                  m_UMin(m_Sub(m_Specific(GridSize),
-                               m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize))),
-                         m_Specific(ZextGroupSize)))) {
-          if (HasReqdWorkGroupSize) {
-            ConstantInt *KnownSize
-              = mdconst::extract<ConstantInt>(MD->getOperand(I));
-            UMin->replaceAllUsesWith(ConstantExpr::getIntegerCast(
-                KnownSize, UMin->getType(), false));
-          } else {
-            UMin->replaceAllUsesWith(ZextGroupSize);
+      for (User *Sub : GridSize->users()) {
+        Instruction *ZExtGroupSize;
+        auto MatchAndBindGroupSizesZExt =
+            m_CombineAnd(MatchGroupSizesZExt, m_Instruction(ZExtGroupSize));
+        if (!match(Sub,
+                   m_Sub(m_Specific(GridSize),
+                         m_Mul(GroupIDIntrin, MatchAndBindGroupSizesZExt))))
+          continue;
+        for (User *UMin : Sub->users()) {
+          if (match(UMin, m_UMin(m_Specific(Sub), MatchGroupSizesZExt))) {
+            if (HasReqdWorkGroupSize) {
+              ConstantInt *KnownSize =
+                  mdconst::extract<ConstantInt>(MD->getOperand(I));
+              UMin->replaceAllUsesWith(ConstantExpr::getIntegerCast(
+                  KnownSize, UMin->getType(), false));
+            } else {
+              UMin->replaceAllUsesWith(ZExtGroupSize);
+            }
+
+            MadeChange = true;
          }
-
-          MadeChange = true;
         }
      }
    }
@@ -304,19 +310,18 @@
     return MadeChange;
 
   for (int I = 0; I < 3; I++) {
-    Value *GroupSize = GroupSizes[I];
-    if (!GroupSize)
-      continue;
-
-    ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
-    GroupSize->replaceAllUsesWith(
-        ConstantExpr::getIntegerCast(KnownSize, GroupSize->getType(), false));
-    MadeChange = true;
+    for (Value *GroupSize : GroupSizes[I]) {
+      ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
+      GroupSize->replaceAllUsesWith(
+          ConstantExpr::getIntegerCast(KnownSize, GroupSize->getType(), false));
+      MadeChange = true;
+    }
   }
 
   return MadeChange;
 }
 
+} // end anonymous namespace
+
 // TODO: Move makeLIDRangeMetadata usage into here. Seem to not get
 // TargetPassConfig for subtarget.
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt-opaque-ptr.ll b/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt-opaque-ptr.ll
--- a/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt-opaque-ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt-opaque-ptr.ll
@@ -63,6 +63,32 @@
   ret void
 }
 
+; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
+define amdgpu_kernel void @get_local_size_x_2_block_count_users_opaque_pointer(i16 addrspace(1)* %out) #0 {
+; GCN-LABEL: @get_local_size_x_2_block_count_users_opaque_pointer(
+; GCN-NEXT:    [[IMPLICITARG_PTR:%.*]] = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; GCN-NEXT:    [[GEP_LOCAL_SIZE:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 12
+; GCN-NEXT:    [[LOCAL_SIZE:%.*]] = load i16, ptr addrspace(4) [[GEP_LOCAL_SIZE]], align 4
+; GCN-NEXT:    store volatile i16 [[LOCAL_SIZE]], ptr addrspace(1) [[OUT:%.*]], align 2
+; GCN-NEXT:    store volatile i16 [[LOCAL_SIZE]], ptr addrspace(1) [[OUT]], align 2
+; GCN-NEXT:    ret void
+;
+  %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
+  %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %block.count.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4
+  %cmp.id.count = icmp ult i32 %group.id, %block.count.x
+  %local.size.offset = select i1 %cmp.id.count, i64 12, i64 18
+  %gep.local.size = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 %local.size.offset
+  %local.size = load i16, ptr addrspace(4) %gep.local.size, align 2
+  store volatile i16 %local.size, i16 addrspace(1)* %out
+  %cmp.id.count.1 = icmp ult i32 %group.id, %block.count.x
+  %local.size.offset.1 = select i1 %cmp.id.count.1, i64 12, i64 18
+  %gep.local.size.1 = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 %local.size.offset.1
+  %local.size.1 = load i16, ptr addrspace(4) %gep.local.size.1, align 2
+  store volatile i16 %local.size.1, i16 addrspace(1)* %out
+  ret void
+}
+
 declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #1
 declare i32 @llvm.amdgcn.workgroup.id.x() #1
 declare i32 @llvm.amdgcn.workgroup.id.y() #1
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll b/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll
--- a/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll
@@ -93,7 +93,7 @@
 ; GCN-NEXT:    ret void
 ;
   %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep.y = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 18
+  %gep.y = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 20
   %bc.y = bitcast i8 addrspace(4)* %gep.y to i16 addrspace(4)*
   %remainder.y = load i16, i16 addrspace(4)* %bc.y, align 2
   store i16 %remainder.y, i16 addrspace(1)* %out
@@ -107,13 +107,37 @@
 ; GCN-NEXT:    ret void
 ;
   %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep.z = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 18
+  %gep.z = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 22
   %bc.z = bitcast i8 addrspace(4)* %gep.z to i16 addrspace(4)*
   %remainder.z = load i16, i16 addrspace(4)* %bc.z, align 2
   store i16 %remainder.z, i16 addrspace(1)* %out
   ret void
 }
 
+; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
+define amdgpu_kernel void @get_remainder_xyz(i16 addrspace(1)* %out) #0 {
+; GCN-LABEL: @get_remainder_xyz(
+; GCN-NEXT:    store volatile i16 0, i16 addrspace(1)* [[OUT:%.*]], align 2
+; GCN-NEXT:    store volatile i16 0, i16 addrspace(1)* [[OUT]], align 2
+; GCN-NEXT:    store volatile i16 0, i16 addrspace(1)* [[OUT]], align 2
+; GCN-NEXT:    ret void
+;
+  %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %gep.x = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 18
+  %bc.x = bitcast i8 addrspace(4)* %gep.x to i16 addrspace(4)*
+  %remainder.x = load i16, i16 addrspace(4)* %bc.x, align 2
+  store volatile i16 %remainder.x, i16 addrspace(1)* %out
+  %gep.y = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 20
+  %bc.y = bitcast i8 addrspace(4)* %gep.y to i16 addrspace(4)*
+  %remainder.y = load i16, i16 addrspace(4)* %bc.y, align 2
+  store volatile i16 %remainder.y, i16 addrspace(1)* %out
+  %gep.z = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 22
+  %bc.z = bitcast i8 addrspace(4)* %gep.z to i16 addrspace(4)*
+  %remainder.z = load i16, i16 addrspace(4)* %bc.z, align 2
+  store volatile i16 %remainder.z, i16 addrspace(1)* %out
+  ret void
+}
+
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
 define amdgpu_kernel void @get_work_group_size_x(i16 addrspace(1)* %out) #0 {
 ; GCN-LABEL: @get_work_group_size_x(
@@ -210,6 +234,30 @@
   ret void
 }
 
+define amdgpu_kernel void @get_all_work_group_sizes_reqd(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
+; GCN-LABEL: @get_all_work_group_sizes_reqd(
+; GCN-NEXT:    store volatile i16 8, i16 addrspace(1)* [[OUT:%.*]], align 2
+; GCN-NEXT:    store volatile i16 16, i16 addrspace(1)* [[OUT]], align 2
+; GCN-NEXT:    store volatile i16 2, i16 addrspace(1)* [[OUT]], align 2
+; GCN-NEXT:    ret void
+;
+  %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %gep.x = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 12
+  %gep.y = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 14
+  %gep.z = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 16
+  %bc.x = bitcast i8 addrspace(4)* %gep.x to i16 addrspace(4)*
+  %bc.y.0 = bitcast i8 addrspace(4)* %gep.y to i16 addrspace(4)*
+  %bc.y = bitcast i16 addrspace(4)* %bc.y.0 to i16 addrspace(4)*
+  %bc.z = bitcast i8 addrspace(4)* %gep.z to i16 addrspace(4)*
+  %bc.z.select = select i1 true, i16 addrspace(4)* %bc.z, i16 addrspace(4)* %bc.z
+  %group.size.x = load i16, i16 addrspace(4)* %bc.x, align 2
+  %group.size.y = load i16, i16 addrspace(4)* %bc.y, align 2
+  %group.size.z = load i16, i16 addrspace(4)* %bc.z.select, align 2
+  store volatile i16 %group.size.x, i16 addrspace(1)* %out
+  store volatile i16 %group.size.y, i16 addrspace(1)* %out
+  store volatile i16 %group.size.z, i16 addrspace(1)* %out
+  ret void
+}
 
 declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #1
 declare i32 @llvm.amdgcn.workgroup.id.x() #1
diff --git a/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll
--- a/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll
@@ -372,6 +372,96 @@
   ret void
 }
 
+; CHECK-LABEL: @all_local_size_twice(
+; CHECK-NEXT: store volatile i64 8, i64 addrspace(1)* %out, align 4
+; CHECK-NEXT: store volatile i64 16, i64 addrspace(1)* %out, align 4
+; CHECK-NEXT: store volatile i64 2, i64 addrspace(1)* %out, align 4
+; CHECK-NEXT: store volatile i64 8, i64 addrspace(1)* %out, align 4
+; CHECK-NEXT: store volatile i64 16, i64 addrspace(1)* %out, align 4
+; CHECK-NEXT: store volatile i64 2, i64 addrspace(1)* %out, align 4
+define amdgpu_kernel void @all_local_size_twice(i64 addrspace(1)* nocapture readnone %out) #0 !reqd_work_group_size !0 {
+  %tmp.i = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
+  %tmp2.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #0
+  %tmp3.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 12
+  %tmp4.i = bitcast i8 addrspace(4)* %tmp3.i to i32 addrspace(4)*
+  %tmp5.i = load i32, i32 addrspace(4)* %tmp4.i, align 4
+  %tmp6.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 4
+  %tmp7.i = bitcast i8 addrspace(4)* %tmp6.i to i16 addrspace(4)*
+  %tmp8.i = load i16, i16 addrspace(4)* %tmp7.i, align 4
+  %tmp29.i = zext i16 %tmp8.i to i32
+  %tmp30.i = mul i32 %tmp2.i, %tmp29.i
+  %tmp31.i = sub i32 %tmp5.i, %tmp30.i
+  %umin0 = call i32 @llvm.umin.i32(i32 %tmp31.i, i32 %tmp29.i)
+  %tmp34.i = zext i32 %umin0 to i64
+  %tmp10.i = tail call i32 @llvm.amdgcn.workgroup.id.y() #0
+  %tmp11.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 16
+  %tmp12.i = bitcast i8 addrspace(4)* %tmp11.i to i32 addrspace(4)*
+  %tmp13.i = load i32, i32 addrspace(4)* %tmp12.i, align 8
+  %tmp14.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 6
+  %tmp15.i = bitcast i8 addrspace(4)* %tmp14.i to i16 addrspace(4)*
+  %tmp16.i = load i16, i16 addrspace(4)* %tmp15.i, align 2
+  %tmp29.i9 = zext i16 %tmp16.i to i32
+  %tmp30.i10 = mul i32 %tmp10.i, %tmp29.i9
+  %tmp31.i11 = sub i32 %tmp13.i, %tmp30.i10
+  %umin1 = call i32 @llvm.umin.i32(i32 %tmp31.i11, i32 %tmp29.i9)
+  %tmp34.i14 = zext i32 %umin1 to i64
+  %tmp18.i = tail call i32 @llvm.amdgcn.workgroup.id.z() #0
+  %tmp19.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 20
+  %tmp20.i = bitcast i8 addrspace(4)* %tmp19.i to i32 addrspace(4)*
+  %tmp21.i = load i32, i32 addrspace(4)* %tmp20.i, align 4
+  %tmp22.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 8
+  %tmp23.i = bitcast i8 addrspace(4)* %tmp22.i to i16 addrspace(4)*
+  %tmp24.i = load i16, i16 addrspace(4)* %tmp23.i, align 8
+  %tmp29.i2 = zext i16 %tmp24.i to i32
+  %tmp30.i3 = mul i32 %tmp18.i, %tmp29.i2
+  %tmp31.i4 = sub i32 %tmp21.i, %tmp30.i3
+  %umin2 = call i32 @llvm.umin.i32(i32 %tmp31.i4, i32 %tmp29.i2)
+  %tmp34.i7 = zext i32 %umin2 to i64
+  %tmp2.i.1 = tail call i32 @llvm.amdgcn.workgroup.id.x() #0
+  %tmp3.i.1 = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 12
+  %tmp4.i.1 = bitcast i8 addrspace(4)* %tmp3.i.1 to i32 addrspace(4)*
+  %tmp5.i.1 = load i32, i32 addrspace(4)* %tmp4.i.1, align 4
+  %tmp6.i.1 = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 4
+  %tmp7.i.1 = bitcast i8 addrspace(4)* %tmp6.i.1 to i16 addrspace(4)*
+  %tmp8.i.1 = load i16, i16 addrspace(4)* %tmp7.i.1, align 4
+  %tmp29.i.1 = zext i16 %tmp8.i.1 to i32
+  %tmp30.i.1 = mul i32 %tmp2.i.1, %tmp29.i.1
+  %tmp31.i.1 = sub i32 %tmp5.i.1, %tmp30.i.1
+  %umin0.1 = call i32 @llvm.umin.i32(i32 %tmp31.i.1, i32 %tmp29.i.1)
+  %tmp34.i.1 = zext i32 %umin0.1 to i64
+  %tmp10.i.1 = tail call i32 @llvm.amdgcn.workgroup.id.y() #0
+  %tmp11.i.1 = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 16
+  %tmp12.i.1 = bitcast i8 addrspace(4)* %tmp11.i.1 to i32 addrspace(4)*
+  %tmp13.i.1 = load i32, i32 addrspace(4)* %tmp12.i.1, align 8
+  %tmp14.i.1 = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 6
+  %tmp15.i.1 = bitcast i8 addrspace(4)* %tmp14.i.1 to i16 addrspace(4)*
+  %tmp16.i.1 = load i16, i16 addrspace(4)* %tmp15.i.1, align 2
+  %tmp29.i9.1 = zext i16 %tmp16.i.1 to i32
+  %tmp30.i10.1 = mul i32 %tmp10.i.1, %tmp29.i9.1
+  %tmp31.i11.1 = sub i32 %tmp13.i.1, %tmp30.i10.1
+  %umin1.1 = call i32 @llvm.umin.i32(i32 %tmp31.i11.1, i32 %tmp29.i9) ; use tmp29.i9 instead of tmp29.i9.1
+  %tmp34.i14.1 = zext i32 %umin1.1 to i64
+  %tmp18.i.1 = tail call i32 @llvm.amdgcn.workgroup.id.z() #0
+  %tmp19.i.1 = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 20
+  %tmp20.i.1 = bitcast i8 addrspace(4)* %tmp19.i.1 to i32 addrspace(4)*
+  %tmp21.i.1 = load i32, i32 addrspace(4)* %tmp20.i.1, align 4
+  %tmp22.i.1 = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 8
+  %tmp23.i.1 = bitcast i8 addrspace(4)* %tmp22.i.1 to i16 addrspace(4)*
+  %tmp24.i.1 = load i16, i16 addrspace(4)* %tmp23.i.1, align 8
+  %tmp29.i2.1 = zext i16 %tmp24.i.1 to i32
+  %tmp30.i3.1 = mul i32 %tmp18.i.1, %tmp29.i2.1
+  %tmp31.i4.1 = sub i32 %tmp21.i.1, %tmp30.i3.1
+  %umin2.1 = call i32 @llvm.umin.i32(i32 %tmp31.i4.1, i32 %tmp29.i2.1)
+  %tmp34.i7.1 = zext i32 %umin2.1 to i64
+  store volatile i64 %tmp34.i, i64 addrspace(1)* %out, align 4
+  store volatile i64 %tmp34.i14, i64 addrspace(1)* %out, align 4
+  store volatile i64 %tmp34.i7, i64 addrspace(1)* %out, align 4
+  store volatile i64 %tmp34.i.1, i64 addrspace(1)* %out, align 4
+  store volatile i64 %tmp34.i14.1, i64 addrspace(1)* %out, align 4
+  store volatile i64 %tmp34.i7.1, i64 addrspace(1)* %out, align 4
+  ret void
+}
+
 ; TODO: Should be able to handle this, but not much reason to.
 ; CHECK-LABEL: @partial_load_group_size_x(
 ; CHECK-NEXT: %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()