Index: llvm/trunk/include/llvm/Analysis/MemoryDependenceAnalysis.h =================================================================== --- llvm/trunk/include/llvm/Analysis/MemoryDependenceAnalysis.h +++ llvm/trunk/include/llvm/Analysis/MemoryDependenceAnalysis.h @@ -302,6 +302,10 @@ NonLocalPointerInfo() : Size(MemoryLocation::UnknownSize) {} }; + /// Cache storing single nonlocal def for the instruction. + /// It is set when nonlocal def would be found in function returning only + /// local dependencies. + DenseMap NonLocalDefsCache; /// This map stores the cached results of doing a pointer lookup at the /// bottom of a block. /// @@ -441,9 +445,9 @@ /// This analysis looks for other loads and stores with invariant.group /// metadata and the same pointer operand. Returns Unknown if it does not /// find anything, and Def if it can be assumed that 2 instructions load or - /// store the same value. - /// FIXME: This analysis works only on single block because of restrictions - /// at the call site. + /// store the same value and NonLocal which indicate that non-local Def was + /// found, which can be retrieved by calling getNonLocalPointerDependency + /// with the same queried instruction. MemDepResult getInvariantGroupPointerDependency(LoadInst *LI, BasicBlock *BB); /// Looks at a memory location for a load (specified by MemLocBase, Offs, and Index: llvm/trunk/lib/Analysis/MemoryDependenceAnalysis.cpp =================================================================== --- llvm/trunk/lib/Analysis/MemoryDependenceAnalysis.cpp +++ llvm/trunk/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -323,17 +323,28 @@ const MemoryLocation &MemLoc, bool isLoad, BasicBlock::iterator ScanIt, BasicBlock *BB, Instruction *QueryInst, unsigned *Limit) { + MemDepResult InvariantGroupDependency = MemDepResult::getUnknown(); if (QueryInst != nullptr) { if (auto *LI = dyn_cast(QueryInst)) { - MemDepResult InvariantGroupDependency = - getInvariantGroupPointerDependency(LI, BB); + InvariantGroupDependency = getInvariantGroupPointerDependency(LI, BB); if (InvariantGroupDependency.isDef()) return InvariantGroupDependency; } } - return getSimplePointerDependencyFrom(MemLoc, isLoad, ScanIt, BB, QueryInst, - Limit); + MemDepResult SimpleDep = getSimplePointerDependencyFrom( + MemLoc, isLoad, ScanIt, BB, QueryInst, Limit); + if (SimpleDep.isDef()) + return SimpleDep; + // Non-local invariant group dependency indicates there is non local Def + // (it only returns nonLocal if it finds nonLocal def), which is better than + // local clobber and everything else. + if (InvariantGroupDependency.isNonLocal()) + return InvariantGroupDependency; + + assert(InvariantGroupDependency.isUnknown() && + "InvariantGroupDependency should be only unknown at this point"); + return SimpleDep; } MemDepResult @@ -358,6 +369,20 @@ // Queue to process all pointers that are equivalent to load operand. SmallVector LoadOperandsQueue; LoadOperandsQueue.push_back(LoadOperand); + + Instruction *ClosestDependency = nullptr; + // Order of instructions in uses list is unpredictible. In order to always + // get the same result, we will look for the closest dominance. + auto GetClosestDependency = [this](Instruction *Best, Instruction *Other) { + assert(Other && "Must call it with not null instruction"); + if (Best == nullptr || DT.dominates(Best, Other)) + return Other; + return Best; + }; + + + // FIXME: This loop is O(N^2) because dominates can be O(n) and in worst case + // we will see all the instructions. This should be fixed in MSSA. while (!LoadOperandsQueue.empty()) { const Value *Ptr = LoadOperandsQueue.pop_back_val(); assert(Ptr && !isa(Ptr) && @@ -388,12 +413,24 @@ // If we hit load/store with the same invariant.group metadata (and the // same pointer operand) we can assume that value pointed by pointer // operand didn't change. - if ((isa(U) || isa(U)) && U->getParent() == BB && + if ((isa(U) || isa(U)) && U->getMetadata(LLVMContext::MD_invariant_group) == InvariantGroupMD) - return MemDepResult::getDef(U); + ClosestDependency = GetClosestDependency(ClosestDependency, U); } } - return MemDepResult::getUnknown(); + + if (!ClosestDependency) + return MemDepResult::getUnknown(); + if (ClosestDependency->getParent() == BB) + return MemDepResult::getDef(ClosestDependency); + // Def(U) can't be returned here because it is non-local. If local + // dependency won't be found then return nonLocal counting that the + // user will call getNonLocalPointerDependency, which will return cached + // result. + NonLocalDefsCache.try_emplace( + LI, NonLocalDepResult(ClosestDependency->getParent(), + MemDepResult::getDef(ClosestDependency), nullptr)); + return MemDepResult::getNonLocal(); } MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( @@ -877,7 +914,17 @@ assert(Loc.Ptr->getType()->isPointerTy() && "Can't get pointer deps of a non-pointer!"); Result.clear(); - + { + // Check if there is cached Def with invariant.group. FIXME: cache might be + // invalid if cached instruction would be removed between call to + // getPointerDependencyFrom and this function. + auto NonLocalDefIt = NonLocalDefsCache.find(QueryInst); + if (NonLocalDefIt != NonLocalDefsCache.end()) { + Result.push_back(std::move(NonLocalDefIt->second)); + NonLocalDefsCache.erase(NonLocalDefIt); + return; + } + } // This routine does not expect to deal with volatile instructions. // Doing so would require piping through the QueryInst all the way through. // TODO: volatiles can't be elided, but they can be reordered with other Index: llvm/trunk/test/Transforms/GVN/assume-equal.ll =================================================================== --- llvm/trunk/test/Transforms/GVN/assume-equal.ll +++ llvm/trunk/test/Transforms/GVN/assume-equal.ll @@ -65,22 +65,20 @@ %vtable1 = load i8**, i8*** %1, align 8, !invariant.group !0 %vtable2.cast = bitcast i8** %vtable1 to i32 (%struct.A*)** %call1 = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable2.cast, align 8 -; FIXME: those loads could be also direct, but right now the invariant.group -; analysis works only on single block -; CHECK-NOT: call i32 @_ZN1A3fooEv( +; CHECK: call i32 @_ZN1A3fooEv( %callx = tail call i32 %call1(%struct.A* %0) #1 %vtable2 = load i8**, i8*** %1, align 8, !invariant.group !0 %vtable3.cast = bitcast i8** %vtable2 to i32 (%struct.A*)** %call4 = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable3.cast, align 8 -; CHECK-NOT: call i32 @_ZN1A3fooEv( +; CHECK: call i32 @_ZN1A3fooEv( %cally = tail call i32 %call4(%struct.A* %0) #1 %b = bitcast i8* %call to %struct.A** %vtable3 = load %struct.A*, %struct.A** %b, align 8, !invariant.group !0 %vtable4.cast = bitcast %struct.A* %vtable3 to i32 (%struct.A*)** %vfun = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable4.cast, align 8 -; CHECK-NOT: call i32 @_ZN1A3fooEv( +; CHECK: call i32 @_ZN1A3fooEv( %unknown = tail call i32 %vfun(%struct.A* %0) #1 br label %if.end Index: llvm/trunk/test/Transforms/GVN/invariant.group.ll =================================================================== --- llvm/trunk/test/Transforms/GVN/invariant.group.ll +++ llvm/trunk/test/Transforms/GVN/invariant.group.ll @@ -392,6 +392,44 @@ ret void } +; CHECK-LABEL: define void @handling_loops() +define void @handling_loops() { + %a = alloca %struct.A, align 8 + %1 = bitcast %struct.A* %a to i8* + %2 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 0 + store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1A, i64 0, i64 2) to i32 (...)**), i32 (...)*** %2, align 8, !invariant.group !0 + %3 = load i8, i8* @unknownPtr, align 4 + %4 = icmp sgt i8 %3, 0 + br i1 %4, label %.lr.ph.i, label %_Z2g2R1A.exit + +.lr.ph.i: ; preds = %0 + %5 = bitcast %struct.A* %a to void (%struct.A*)*** + %6 = load i8, i8* @unknownPtr, align 4 + %7 = icmp sgt i8 %6, 1 + br i1 %7, label %._crit_edge.preheader, label %_Z2g2R1A.exit + +._crit_edge.preheader: ; preds = %.lr.ph.i + br label %._crit_edge + +._crit_edge: ; preds = %._crit_edge.preheader, %._crit_edge + %8 = phi i8 [ %10, %._crit_edge ], [ 1, %._crit_edge.preheader ] + %.pre = load void (%struct.A*)**, void (%struct.A*)*** %5, align 8, !invariant.group !0 + %9 = load void (%struct.A*)*, void (%struct.A*)** %.pre, align 8 + ; CHECK: call void @_ZN1A3fooEv(%struct.A* nonnull %a) + call void %9(%struct.A* nonnull %a) #3 + ; CHECK-NOT: call void % + %10 = add nuw nsw i8 %8, 1 + %11 = load i8, i8* @unknownPtr, align 4 + %12 = icmp slt i8 %10, %11 + br i1 %12, label %._crit_edge, label %_Z2g2R1A.exit.loopexit + +_Z2g2R1A.exit.loopexit: ; preds = %._crit_edge + br label %_Z2g2R1A.exit + +_Z2g2R1A.exit: ; preds = %_Z2g2R1A.exit.loopexit, %.lr.ph.i, %0 + ret void +} + declare void @foo(i8*) declare void @foo2(i8*, i8) Index: llvm/trunk/test/Transforms/NewGVN/assume-equal.ll =================================================================== --- llvm/trunk/test/Transforms/NewGVN/assume-equal.ll +++ llvm/trunk/test/Transforms/NewGVN/assume-equal.ll @@ -66,22 +66,20 @@ %vtable1 = load i8**, i8*** %1, align 8, !invariant.group !0 %vtable2.cast = bitcast i8** %vtable1 to i32 (%struct.A*)** %call1 = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable2.cast, align 8 -; FIXME: those loads could be also direct, but right now the invariant.group -; analysis works only on single block -; CHECK-NOT: call i32 @_ZN1A3fooEv( +; CHECK: call i32 @_ZN1A3fooEv( %callx = tail call i32 %call1(%struct.A* %0) #1 %vtable2 = load i8**, i8*** %1, align 8, !invariant.group !0 %vtable3.cast = bitcast i8** %vtable2 to i32 (%struct.A*)** %call4 = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable3.cast, align 8 -; CHECK-NOT: call i32 @_ZN1A3fooEv( +; CHECK: call i32 @_ZN1A3fooEv( %cally = tail call i32 %call4(%struct.A* %0) #1 %b = bitcast i8* %call to %struct.A** %vtable3 = load %struct.A*, %struct.A** %b, align 8, !invariant.group !0 %vtable4.cast = bitcast %struct.A* %vtable3 to i32 (%struct.A*)** %vfun = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable4.cast, align 8 -; CHECK-NOT: call i32 @_ZN1A3fooEv( +; CHECK: call i32 @_ZN1A3fooEv( %unknown = tail call i32 %vfun(%struct.A* %0) #1 br label %if.end Index: llvm/trunk/test/Transforms/NewGVN/invariant.group.ll =================================================================== --- llvm/trunk/test/Transforms/NewGVN/invariant.group.ll +++ llvm/trunk/test/Transforms/NewGVN/invariant.group.ll @@ -393,6 +393,45 @@ ret void } +; CHECK-LABEL: define void @handling_loops() +define void @handling_loops() { + %a = alloca %struct.A, align 8 + %1 = bitcast %struct.A* %a to i8* + %2 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 0 + store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1A, i64 0, i64 2) to i32 (...)**), i32 (...)*** %2, align 8, !invariant.group !0 + %3 = load i8, i8* @unknownPtr, align 4 + %4 = icmp sgt i8 %3, 0 + br i1 %4, label %.lr.ph.i, label %_Z2g2R1A.exit + +.lr.ph.i: ; preds = %0 + %5 = bitcast %struct.A* %a to void (%struct.A*)*** + %6 = load i8, i8* @unknownPtr, align 4 + %7 = icmp sgt i8 %6, 1 + br i1 %7, label %._crit_edge.preheader, label %_Z2g2R1A.exit + +._crit_edge.preheader: ; preds = %.lr.ph.i + br label %._crit_edge + +._crit_edge: ; preds = %._crit_edge.preheader, %._crit_edge + %8 = phi i8 [ %10, %._crit_edge ], [ 1, %._crit_edge.preheader ] + %.pre = load void (%struct.A*)**, void (%struct.A*)*** %5, align 8, !invariant.group !0 + %9 = load void (%struct.A*)*, void (%struct.A*)** %.pre, align 8 +; CHECK: call void @_ZN1A3fooEv(%struct.A* nonnull %a) + call void %9(%struct.A* nonnull %a) #3 + +; CHECK-NOT: call void % + %10 = add nuw nsw i8 %8, 1 + %11 = load i8, i8* @unknownPtr, align 4 + %12 = icmp slt i8 %10, %11 + br i1 %12, label %._crit_edge, label %_Z2g2R1A.exit.loopexit + +_Z2g2R1A.exit.loopexit: ; preds = %._crit_edge + br label %_Z2g2R1A.exit + +_Z2g2R1A.exit: ; preds = %_Z2g2R1A.exit.loopexit, %.lr.ph.i, %0 + ret void +} + declare void @foo(i8*) declare void @foo2(i8*, i8)