Index: include/llvm/Analysis/MemoryDependenceAnalysis.h =================================================================== --- include/llvm/Analysis/MemoryDependenceAnalysis.h +++ include/llvm/Analysis/MemoryDependenceAnalysis.h @@ -302,6 +302,10 @@ NonLocalPointerInfo() : Size(MemoryLocation::UnknownSize) {} }; + /// Cache storing single nonlocal def for the instruction. + /// It is set when nonlocal def would be found in function returning only + /// local dependencies. + DenseMap NonLocalDefsCache; /// This map stores the cached results of doing a pointer lookup at the /// bottom of a block. /// @@ -441,9 +445,9 @@ /// This analysis looks for other loads and stores with invariant.group /// metadata and the same pointer operand. Returns Unknown if it does not /// find anything, and Def if it can be assumed that 2 instructions load or - /// store the same value. - /// FIXME: This analysis works only on single block because of restrictions - /// at the call site. + /// store the same value and NonLocal which indicate that non-local Def was + /// found, which can be retrieved by calling getNonLocalPointerDependency + /// with the same queried instruction. MemDepResult getInvariantGroupPointerDependency(LoadInst *LI, BasicBlock *BB); /// Looks at a memory location for a load (specified by MemLocBase, Offs, and Index: lib/Analysis/MemoryDependenceAnalysis.cpp =================================================================== --- lib/Analysis/MemoryDependenceAnalysis.cpp +++ lib/Analysis/MemoryDependenceAnalysis.cpp @@ -323,17 +323,27 @@ const MemoryLocation &MemLoc, bool isLoad, BasicBlock::iterator ScanIt, BasicBlock *BB, Instruction *QueryInst, unsigned *Limit) { + MemDepResult InvariantGroupDependency = MemDepResult::getUnknown(); if (QueryInst != nullptr) { if (auto *LI = dyn_cast(QueryInst)) { - MemDepResult invariantGroupDependency = - getInvariantGroupPointerDependency(LI, BB); + InvariantGroupDependency = getInvariantGroupPointerDependency(LI, BB); - if (invariantGroupDependency.isDef()) - return invariantGroupDependency; + if (InvariantGroupDependency.isDef()) + return InvariantGroupDependency; } } - return getSimplePointerDependencyFrom(MemLoc, isLoad, ScanIt, BB, QueryInst, - Limit); + MemDepResult SimpleDep = getSimplePointerDependencyFrom( + MemLoc, isLoad, ScanIt, BB, QueryInst, Limit); + if (SimpleDep.isDef()) + return SimpleDep; + // Non-local invariant group dependency indicates there is non local Def, + // which is better than local clobber and everything else. + if (InvariantGroupDependency.isNonLocal()) + return InvariantGroupDependency; + + assert(InvariantGroupDependency.isUnknown() && + "InvariantGroupDependency should be only unknown at this point"); + return SimpleDep; } MemDepResult @@ -353,6 +363,17 @@ // Queue to process all pointers that are equivalent to load operand. SmallVector LoadOperandsQueue; SmallSet SeenValues; + Instruction *ClosestDependency = nullptr; + // Order of instructions in uses list is unpredictible. In order to always + // get the same result, we will look for the closest dominance. + auto GetClosestDependency = [this](Instruction *Best, Instruction *Other) { + assert(Other != nullptr); + if (Best == nullptr) + return Other; + if (DT.dominates(Best, Other)) + return Other; + return Best; + }; auto TryInsertToQueue = [&](Value *V) { if (SeenValues.insert(V).second) LoadOperandsQueue.push_back(V); @@ -398,12 +419,27 @@ // If we hit load/store with the same invariant.group metadata (and the // same pointer operand) we can assume that value pointed by pointer // operand didn't change. - if ((isa(U) || isa(U)) && U->getParent() == BB && - U->getMetadata(LLVMContext::MD_invariant_group) == InvariantGroupMD) - return MemDepResult::getDef(U); + if ((isa(U) || isa(U)) && + U->getMetadata(LLVMContext::MD_invariant_group) == InvariantGroupMD) { + ClosestDependency = GetClosestDependency(ClosestDependency, U); + } } } - return MemDepResult::getUnknown(); + + if (!ClosestDependency) + return MemDepResult::getUnknown(); + if (ClosestDependency->getParent() == BB) + return MemDepResult::getDef(ClosestDependency); + // Def(U) can't be returned here because it is non-local. If local + // dependency won't be found then return nonLocal counting that the + // user will call getNonLocalPointerDependency, which will return cached + // result. + NonLocalDefsCache.try_emplace( + LI, NonLocalDepResult(ClosestDependency->getParent(), + MemDepResult::getDef(ClosestDependency), + nullptr)); + return MemDepResult::getNonLocal(); + } MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( @@ -887,7 +923,14 @@ assert(Loc.Ptr->getType()->isPointerTy() && "Can't get pointer deps of a non-pointer!"); Result.clear(); - + { + auto NonLocalDefIt = NonLocalDefsCache.find(QueryInst); + if (NonLocalDefIt != NonLocalDefsCache.end()) { + Result.push_back(std::move(NonLocalDefIt->second)); + NonLocalDefsCache.erase(NonLocalDefIt); + return; + } + } // This routine does not expect to deal with volatile instructions. // Doing so would require piping through the QueryInst all the way through. // TODO: volatiles can't be elided, but they can be reordered with other Index: test/Transforms/GVN/assume-equal.ll =================================================================== --- test/Transforms/GVN/assume-equal.ll +++ test/Transforms/GVN/assume-equal.ll @@ -65,22 +65,20 @@ %vtable1 = load i8**, i8*** %1, align 8, !invariant.group !0 %vtable2.cast = bitcast i8** %vtable1 to i32 (%struct.A*)** %call1 = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable2.cast, align 8 -; FIXME: those loads could be also direct, but right now the invariant.group -; analysis works only on single block -; CHECK-NOT: call i32 @_ZN1A3fooEv( +; CHECK: call i32 @_ZN1A3fooEv( %callx = tail call i32 %call1(%struct.A* %0) #1 %vtable2 = load i8**, i8*** %1, align 8, !invariant.group !0 %vtable3.cast = bitcast i8** %vtable2 to i32 (%struct.A*)** %call4 = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable3.cast, align 8 -; CHECK-NOT: call i32 @_ZN1A3fooEv( +; CHECK: call i32 @_ZN1A3fooEv( %cally = tail call i32 %call4(%struct.A* %0) #1 %b = bitcast i8* %call to %struct.A** %vtable3 = load %struct.A*, %struct.A** %b, align 8, !invariant.group !0 %vtable4.cast = bitcast %struct.A* %vtable3 to i32 (%struct.A*)** %vfun = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable4.cast, align 8 -; CHECK-NOT: call i32 @_ZN1A3fooEv( +; CHECK: call i32 @_ZN1A3fooEv( %unknown = tail call i32 %vfun(%struct.A* %0) #1 br label %if.end Index: test/Transforms/GVN/invariant.group.ll =================================================================== --- test/Transforms/GVN/invariant.group.ll +++ test/Transforms/GVN/invariant.group.ll @@ -344,6 +344,45 @@ ret void } +; CHECK-LABEL: define void @handling_loops() +define void @handling_loops() { + %a = alloca %struct.A, align 8 + %1 = bitcast %struct.A* %a to i8* + %2 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 0 + store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1A, i64 0, i64 2) to i32 (...)**), i32 (...)*** %2, align 8, !invariant.group !0 + %3 = load i8, i8* @unknownPtr, align 4 + %4 = icmp sgt i8 %3, 0 + br i1 %4, label %.lr.ph.i, label %_Z2g2R1A.exit + +.lr.ph.i: ; preds = %0 + %5 = bitcast %struct.A* %a to void (%struct.A*)*** + %6 = load i8, i8* @unknownPtr, align 4 + %7 = icmp sgt i8 %6, 1 + br i1 %7, label %._crit_edge.preheader, label %_Z2g2R1A.exit + +._crit_edge.preheader: ; preds = %.lr.ph.i + br label %._crit_edge + +._crit_edge: ; preds = %._crit_edge.preheader, %._crit_edge + %8 = phi i8 [ %10, %._crit_edge ], [ 1, %._crit_edge.preheader ] + %.pre = load void (%struct.A*)**, void (%struct.A*)*** %5, align 8, !invariant.group !0 + %9 = load void (%struct.A*)*, void (%struct.A*)** %.pre, align 8 + ; CHECK: call void @_ZN1A3fooEv(%struct.A* nonnull %a) + call void %9(%struct.A* nonnull %a) #3 + ; CHECK-NOT: call void % + %10 = add nuw nsw i8 %8, 1 + %11 = load i8, i8* @unknownPtr, align 4 + %12 = icmp slt i8 %10, %11 + br i1 %12, label %._crit_edge, label %_Z2g2R1A.exit.loopexit + +_Z2g2R1A.exit.loopexit: ; preds = %._crit_edge + br label %_Z2g2R1A.exit + +_Z2g2R1A.exit: ; preds = %_Z2g2R1A.exit.loopexit, %.lr.ph.i, %0 + ret void +} + + declare void @foo(i8*) declare void @bar(i8) declare i8* @getPointer(i8*) Index: test/Transforms/NewGVN/assume-equal.ll =================================================================== --- test/Transforms/NewGVN/assume-equal.ll +++ test/Transforms/NewGVN/assume-equal.ll @@ -66,22 +66,20 @@ %vtable1 = load i8**, i8*** %1, align 8, !invariant.group !0 %vtable2.cast = bitcast i8** %vtable1 to i32 (%struct.A*)** %call1 = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable2.cast, align 8 -; FIXME: those loads could be also direct, but right now the invariant.group -; analysis works only on single block -; CHECK-NOT: call i32 @_ZN1A3fooEv( +; CHECK: call i32 @_ZN1A3fooEv( %callx = tail call i32 %call1(%struct.A* %0) #1 %vtable2 = load i8**, i8*** %1, align 8, !invariant.group !0 %vtable3.cast = bitcast i8** %vtable2 to i32 (%struct.A*)** %call4 = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable3.cast, align 8 -; CHECK-NOT: call i32 @_ZN1A3fooEv( +; CHECK: call i32 @_ZN1A3fooEv( %cally = tail call i32 %call4(%struct.A* %0) #1 %b = bitcast i8* %call to %struct.A** %vtable3 = load %struct.A*, %struct.A** %b, align 8, !invariant.group !0 %vtable4.cast = bitcast %struct.A* %vtable3 to i32 (%struct.A*)** %vfun = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable4.cast, align 8 -; CHECK-NOT: call i32 @_ZN1A3fooEv( +; CHECK: call i32 @_ZN1A3fooEv( %unknown = tail call i32 %vfun(%struct.A* %0) #1 br label %if.end Index: test/Transforms/NewGVN/invariant.group.ll =================================================================== --- test/Transforms/NewGVN/invariant.group.ll +++ test/Transforms/NewGVN/invariant.group.ll @@ -345,6 +345,46 @@ ret void } +; CHECK-LABEL: define void @handling_loops() +define void @handling_loops() { + %a = alloca %struct.A, align 8 + %1 = bitcast %struct.A* %a to i8* + %2 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 0 + store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1A, i64 0, i64 2) to i32 (...)**), i32 (...)*** %2, align 8, !invariant.group !0 + %3 = load i8, i8* @unknownPtr, align 4 + %4 = icmp sgt i8 %3, 0 + br i1 %4, label %.lr.ph.i, label %_Z2g2R1A.exit + +.lr.ph.i: ; preds = %0 + %5 = bitcast %struct.A* %a to void (%struct.A*)*** + %6 = load i8, i8* @unknownPtr, align 4 + %7 = icmp sgt i8 %6, 1 + br i1 %7, label %._crit_edge.preheader, label %_Z2g2R1A.exit + +._crit_edge.preheader: ; preds = %.lr.ph.i + br label %._crit_edge + +._crit_edge: ; preds = %._crit_edge.preheader, %._crit_edge + %8 = phi i8 [ %10, %._crit_edge ], [ 1, %._crit_edge.preheader ] + %.pre = load void (%struct.A*)**, void (%struct.A*)*** %5, align 8, !invariant.group !0 + %9 = load void (%struct.A*)*, void (%struct.A*)** %.pre, align 8 +; CHECK: call void @_ZN1A3fooEv(%struct.A* nonnull %a) + call void %9(%struct.A* nonnull %a) #3 + +; CHECK-NOT: call void % + %10 = add nuw nsw i8 %8, 1 + %11 = load i8, i8* @unknownPtr, align 4 + %12 = icmp slt i8 %10, %11 + br i1 %12, label %._crit_edge, label %_Z2g2R1A.exit.loopexit + +_Z2g2R1A.exit.loopexit: ; preds = %._crit_edge + br label %_Z2g2R1A.exit + +_Z2g2R1A.exit: ; preds = %_Z2g2R1A.exit.loopexit, %.lr.ph.i, %0 + ret void +} + + declare void @foo(i8*) declare void @bar(i8) declare i8* @getPointer(i8*)