diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h
--- a/llvm/include/llvm/Transforms/IPO/Attributor.h
+++ b/llvm/include/llvm/Transforms/IPO/Attributor.h
@@ -5067,8 +5067,9 @@
             OAS.getOffset() < getOffset() + getSize();
     }
 
-    /// Constant used to represent unknown offset or sizes.
-    static constexpr int64_t Unknown = 1 << 31;
+    /// Constants used to represent special offsets or sizes.
+    static constexpr int64_t Unassigned = -1;
+    static constexpr int64_t Unknown = -2;
   };
 
   /// Call \p CB on all accesses that might interfere with \p OAS and return
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -1226,7 +1226,7 @@
 
 /// Helper struct, will support ranges eventually.
 struct OffsetInfo {
-  int64_t Offset = OffsetAndSize::Unknown;
+  int64_t Offset = OffsetAndSize::Unassigned;
 
   bool operator==(const OffsetInfo &OI) const { return Offset == OI.Offset; }
 };
@@ -1243,6 +1243,8 @@
 
     auto HandlePassthroughUser = [&](Value *Usr, OffsetInfo PtrOI,
                                      bool &Follow) {
+      assert(PtrOI.Offset != OffsetAndSize::Unassigned &&
+             "Cannot pass through if the input Ptr was not visited!");
       OffsetInfo &UsrOI = OffsetInfoMap[Usr];
       UsrOI = PtrOI;
       Follow = true;
@@ -1283,11 +1285,15 @@
       APInt GEPOffset(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
       if (PtrOI.Offset == OffsetAndSize::Unknown ||
           !GEP->accumulateConstantOffset(DL, GEPOffset)) {
+        LLVM_DEBUG(dbgs() << "[AAPointerInfo] GEP offset not constant "
+                          << *GEP << "\n");
         UsrOI.Offset = OffsetAndSize::Unknown;
         Follow = true;
         return true;
       }
 
+      LLVM_DEBUG(dbgs() << "[AAPointerInfo] GEP offset is constant " << *GEP
+                        << "\n");
       UsrOI.Offset = PtrOI.Offset + GEPOffset.getZExtValue();
       Follow = true;
       return true;
@@ -1306,15 +1312,22 @@
       bool IsFirstPHIUser = !OffsetInfoMap.count(Usr);
       OffsetInfo &UsrOI = OffsetInfoMap[Usr];
       OffsetInfo &PtrOI = OffsetInfoMap[CurPtr];
-      // Check if the PHI is invariant (so far).
-      if (UsrOI == PtrOI)
-        return true;
 
       // Check if the PHI operand has already an unknown offset as we can't
       // improve on that anymore.
       if (PtrOI.Offset == OffsetAndSize::Unknown) {
+        LLVM_DEBUG(dbgs() << "[AAPointerInfo] PHI operand offset unknown "
+                          << *CurPtr << " in " << *Usr << "\n");
+        Follow = UsrOI.Offset != OffsetAndSize::Unknown;
         UsrOI = PtrOI;
-        Follow = true;
+        return true;
+      }
+
+      // Check if the PHI is invariant (so far).
+      if (UsrOI == PtrOI) {
+        assert(PtrOI.Offset != OffsetAndSize::Unassigned &&
+               "Cannot assign if the current Ptr was not visited!");
+        LLVM_DEBUG(dbgs() << "[AAPointerInfo] PHI is invariant (so far)");
         return true;
       }
 
diff --git a/llvm/test/CodeGen/AMDGPU/attributor.ll b/llvm/test/CodeGen/AMDGPU/attributor.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/attributor.ll
@@ -0,0 +1,42 @@
+; RUN: llc < %s | FileCheck %s
+
+target triple = "amdgcn-amd-amdhsa"
+
+; The call to the intrinsic implicitarg_ptr reaches a load through a phi. The
+; offsets of the phi cannot be determined, and hence the attributor assumes
+; that hostcall is in use.
+
+; CHECK: .value_kind: hidden_hostcall_buffer
+; CHECK: .value_kind: hidden_multigrid_sync_arg
+
+define amdgpu_kernel void @the_kernel(i32 addrspace(1)* %a, i64 %index1, i64 %index2, i1 %cond) {
+entry:
+  %tmp7 = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  br i1 %cond, label %old, label %new
+
+old:                                              ; preds = %entry
+  %tmp4 = getelementptr i8, i8 addrspace(4)* %tmp7, i64 %index1
+  br label %join
+
+new:                                              ; preds = %entry
+  %tmp12 = getelementptr inbounds i8, i8 addrspace(4)* %tmp7, i64 %index2
+  br label %join
+
+join:                                             ; preds = %new, %old
+  %.in.in.in = phi i8 addrspace(4)* [ %tmp12, %new ], [ %tmp4, %old ]
+  %.in.in = bitcast i8 addrspace(4)* %.in.in.in to i16 addrspace(4)*
+
+  ;;; THIS USE is where the offset into implicitarg_ptr is unknown
+  %.in = load i16, i16 addrspace(4)* %.in.in, align 2
+
+  %idx.ext = sext i16 %.in to i64
+  %add.ptr3 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idx.ext
+  %tmp16 = atomicrmw add i32 addrspace(1)* %add.ptr3, i32 15 syncscope("agent-one-as") monotonic, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+
+declare align 4 i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+
+declare i32 @llvm.amdgcn.workgroup.id.x()
diff --git a/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll b/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll
--- a/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll
+++ b/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll
@@ -1616,6 +1616,71 @@
   ret i32 %add1
 }
 
+define i8 @local_alloca_not_simplifiable_2(i64 %index1, i64 %index2, i1 %cnd) {
+; TUNIT: Function Attrs: nofree norecurse nosync nounwind willreturn
+; TUNIT-LABEL: define {{[^@]+}}@local_alloca_not_simplifiable_2
+; TUNIT-SAME: (i64 [[INDEX1:%.*]], i64 [[INDEX2:%.*]], i1 [[CND:%.*]]) #[[ATTR3]] {
+; TUNIT-NEXT:  entry:
+; TUNIT-NEXT:    [[BYTES:%.*]] = alloca [1024 x i8], align 16
+; TUNIT-NEXT:    [[GEP0:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[BYTES]], i64 0, i64 0
+; TUNIT-NEXT:    store i8 7, i8* [[GEP0]], align 16
+; TUNIT-NEXT:    br i1 [[CND]], label [[LEFT:%.*]], label [[RIGHT:%.*]]
+; TUNIT:       left:
+; TUNIT-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[BYTES]], i64 0, i64 [[INDEX1]]
+; TUNIT-NEXT:    br label [[JOIN:%.*]]
+; TUNIT:       right:
+; TUNIT-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[BYTES]], i64 0, i64 [[INDEX2]]
+; TUNIT-NEXT:    br label [[JOIN]]
+; TUNIT:       join:
+; TUNIT-NEXT:    [[GEP_JOIN:%.*]] = phi i8* [ [[GEP1]], [[LEFT]] ], [ [[GEP2]], [[RIGHT]] ]
+; TUNIT-NEXT:    store i8 9, i8* [[GEP_JOIN]], align 4
+; TUNIT-NEXT:    [[I:%.*]] = load i8, i8* [[GEP0]], align 16
+; TUNIT-NEXT:    ret i8 [[I]]
+;
+; CGSCC: Function Attrs: nofree norecurse nosync nounwind willreturn
+; CGSCC-LABEL: define {{[^@]+}}@local_alloca_not_simplifiable_2
+; CGSCC-SAME: (i64 [[INDEX1:%.*]], i64 [[INDEX2:%.*]], i1 [[CND:%.*]]) #[[ATTR5]] {
+; CGSCC-NEXT:  entry:
+; CGSCC-NEXT:    [[BYTES:%.*]] = alloca [1024 x i8], align 16
+; CGSCC-NEXT:    [[GEP0:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[BYTES]], i64 0, i64 0
+; CGSCC-NEXT:    store i8 7, i8* [[GEP0]], align 16
+; CGSCC-NEXT:    br i1 [[CND]], label [[LEFT:%.*]], label [[RIGHT:%.*]]
+; CGSCC:       left:
+; CGSCC-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[BYTES]], i64 0, i64 [[INDEX1]]
+; CGSCC-NEXT:    br label [[JOIN:%.*]]
+; CGSCC:       right:
+; CGSCC-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[BYTES]], i64 0, i64 [[INDEX2]]
+; CGSCC-NEXT:    br label [[JOIN]]
+; CGSCC:       join:
+; CGSCC-NEXT:    [[GEP_JOIN:%.*]] = phi i8* [ [[GEP1]], [[LEFT]] ], [ [[GEP2]], [[RIGHT]] ]
+; CGSCC-NEXT:    store i8 9, i8* [[GEP_JOIN]], align 4
+; CGSCC-NEXT:    [[I:%.*]] = load i8, i8* [[GEP0]], align 16
+; CGSCC-NEXT:    ret i8 [[I]]
+;
+entry:
+  %Bytes = alloca [1024 x i8], align 16
+  %gep0 = getelementptr inbounds [1024 x i8], [1024 x i8]* %Bytes, i64 0, i64 0
+  store i8 7, i8* %gep0, align 4
+  br i1 %cnd, label %left, label %right
+
+left:                                             ; preds = %entry
+  %gep1 = getelementptr inbounds [1024 x i8], [1024 x i8]* %Bytes, i64 0, i64 %index1
+  br label %join
+
+right:                                            ; preds = %entry
+  %gep2 = getelementptr inbounds [1024 x i8], [1024 x i8]* %Bytes, i64 0, i64 %index2
+  br label %join
+
+join:                                             ; preds = %right, %left
+  %gep.join = phi i8* [ %gep1, %left ], [ %gep2, %right ]
+  store i8 9, i8* %gep.join, align 4
+
+  ; This load cannot be replaced by the value 7 from %entry, since the previous
+  ; store interferes due to its unknown offset.
+  %i = load i8, i8* %gep0, align 4
+  ret i8 %i
+}
+
 ; We could simplify these if we separate accessed bins wrt. alignment (here mod 4).
 define i32 @unknown_access_mixed_simplifiable(i32 %arg1, i32 %arg2) {
 ; CHECK: Function Attrs: nofree norecurse nosync nounwind readnone willreturn