Index: include/llvm/Analysis/MemoryDependenceAnalysis.h =================================================================== --- include/llvm/Analysis/MemoryDependenceAnalysis.h +++ include/llvm/Analysis/MemoryDependenceAnalysis.h @@ -381,7 +381,7 @@ /// /// This method assumes the pointer has a "NonLocal" dependency within /// QueryInst's parent basic block. - void getNonLocalPointerDependency(Instruction *QueryInst, + void getNonLocalPointerDependency(MemoryLocation &Loc, bool isLoad, Instruction *QueryInst, SmallVectorImpl<NonLocalDepResult> &Result); /// Removes an instruction from the dependence analysis, updating the Index: lib/Analysis/MemDepPrinter.cpp =================================================================== --- lib/Analysis/MemDepPrinter.cpp +++ lib/Analysis/MemDepPrinter.cpp @@ -119,7 +119,9 @@ SmallVector<NonLocalDepResult, 4> NLDI; assert( (isa<LoadInst>(Inst) || isa<StoreInst>(Inst) || isa<VAArgInst>(Inst)) && "Unknown memory instruction!"); - MDA.getNonLocalPointerDependency(Inst, NLDI); + + MemoryLocation Loc = MemoryLocation::get(Inst); + MDA.getNonLocalPointerDependency(Loc, isa<LoadInst>(Inst), Inst, NLDI); DepSet &InstDeps = Deps[Inst]; for (const NonLocalDepResult &I : NLDI) { Index: lib/Analysis/MemoryDependenceAnalysis.cpp =================================================================== --- lib/Analysis/MemoryDependenceAnalysis.cpp +++ lib/Analysis/MemoryDependenceAnalysis.cpp @@ -875,9 +875,8 @@ } void MemoryDependenceResults::getNonLocalPointerDependency( - Instruction *QueryInst, SmallVectorImpl<NonLocalDepResult> &Result) { - const MemoryLocation Loc = MemoryLocation::get(QueryInst); - bool isLoad = isa<LoadInst>(QueryInst); + MemoryLocation &Loc, bool isLoad, Instruction *QueryInst, + SmallVectorImpl<NonLocalDepResult> &Result) { BasicBlock *FromBB = QueryInst->getParent(); assert(FromBB); Index: lib/Transforms/Scalar/GVN.cpp =================================================================== --- lib/Transforms/Scalar/GVN.cpp +++ lib/Transforms/Scalar/GVN.cpp @@ -1621,7 +1621,8 @@ // Step 1: Find the non-local dependencies of the load. 
LoadDepVect Deps; - MD->getNonLocalPointerDependency(LI, Deps); + MemoryLocation Loc = MemoryLocation::get(LI); + MD->getNonLocalPointerDependency(Loc, true, LI, Deps); // If we had to process more than one hundred blocks to find the // dependencies, this load isn't worth worrying about. Optimizing Index: lib/Transforms/Scalar/MemCpyOptimizer.cpp =================================================================== --- lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -989,9 +989,17 @@ // // NOTE: This is conservative, it will stop on any read from the source loc, // not just the defining memcpy. - MemDepResult SourceDep = - MD->getPointerDependencyFrom(MemoryLocation::getForSource(MDep), false, - M->getIterator(), M->getParent()); + MemoryLocation SourceLoc = MemoryLocation::getForSource(MDep); + MemDepResult SourceDep = MD->getPointerDependencyFrom(SourceLoc, false, + M->getIterator(), M->getParent()); + + SmallVector<NonLocalDepResult, 4> NonLocalDepResults; + if (SourceDep.isNonLocal()) { + MD->getNonLocalPointerDependency(SourceLoc, false, M, NonLocalDepResults); + if (NonLocalDepResults.size() == 1) + SourceDep = NonLocalDepResults[0].getResult(); + } + if (!SourceDep.isClobber() || SourceDep.getInst() != MDep) return false; @@ -1190,6 +1198,13 @@ MemDepResult SrcDepInfo = MD->getPointerDependencyFrom( SrcLoc, true, M->getIterator(), M->getParent()); + SmallVector<NonLocalDepResult, 4> NonLocalDepResults; + if (SrcDepInfo.isNonLocal()) { + MD->getNonLocalPointerDependency(SrcLoc, true, M, NonLocalDepResults); + if (NonLocalDepResults.size() == 1) + SrcDepInfo = NonLocalDepResults[0].getResult(); + } + if (SrcDepInfo.isClobber()) { if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst())) return processMemCpyMemCpyDependence(M, MDep); Index: test/Transforms/MemCpyOpt/nonlocal-memcpy-memcpy.ll =================================================================== --- /dev/null +++ test/Transforms/MemCpyOpt/nonlocal-memcpy-memcpy.ll @@ -0,0 +1,118 @@ +; RUN: opt < %s 
-memcpyopt -S | FileCheck %s +; Make sure memcpy-memcpy dependence is optimized across +; basic blocks (conditional branches and invokes). + +; ModuleID = 'nonlocal-memcpy-memcpy.cpp' +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.11.0" + +%struct.s = type { i32, i32 } + +@foo_s = private unnamed_addr constant %struct.s { i32 1, i32 2 }, align 4 +@baz_s = private unnamed_addr constant %struct.s { i32 1, i32 2 }, align 4 +@i = external constant i8* + +; Function Attrs: ssp uwtable +define i32 @foo() #0 { + %x = alloca i32, align 4 + %s = alloca %struct.s, align 4 + %t = alloca %struct.s, align 4 + %1 = call i32 @bar() + store i32 %1, i32* %x, align 4 + %2 = bitcast %struct.s* %s to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* bitcast (%struct.s* @foo_s to i8*), i64 8, i32 4, i1 false) + %3 = load i32, i32* %x, align 4 + %4 = icmp ne i32 %3, 0 + br i1 %4, label %5, label %8 + +;