This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
lib/Transforms/Scalar/
-
Transforms/
-
Scalar/
5
MemCpyOptimizer.cpp

Differential D25175

[MemCpyOpt] Optimize memcpy-memcpy dependencies more aggressively.
Needs ReviewPublic

Authored by bryant on Oct 2 2016, 11:02 PM.

Download Raw Diff

Details

Reviewers

aaron.ballman
majnemer
eli.friedman
rnk
efriedma

Summary

Currently, memcpy-memcpy pairs are only considered when there are no mods or
refs of either the source or dest memory operands of the examined memcpy:

ir
memcpy(b <- a)  ; the "dependee" memcpy
...  ; no mod/ref of a, b, or c in between
memcpy(c <- b)  ; the examined memcpy

In the above, if b and/or c are mod/refed in the space between the two
memcopies, then the mod/ref-ing instruction closest to the examined memcpy is
matched and the dependee is never seen. If, on the other hand, only a is
mod/refed in between, then the memcpy pair is recognized but ultimately ignored
because the processMemCpyMemCpyDependence transformation would be invalid:

ir
memcpy(b <- a); *a = 42; memcpy(c <- b)
    =>
memcpy(b <- a); *a = 42; memcpy(c <- a)

What this patch does is search harder for memcpy pairs and then match and
transform them against three general cases:

Case 1:

ir
memcpy(b <- a); ...; *b = 42; ...; memcpy(a <- b);
    => if a is never mod/refed in between the two memcpys
...; *a = 42; ...; memcpy(b <- a);

Case 2 (essentially the todo mentioned in processMemCpyMemCpyDependence):

ir
memcpy(b <- a); ...;  memcpy(c <- b);
    => if "..." doesn't mod/ref either c or b
memcpy(c <- a); memcpy(b <- a); ...;

Case 3:

ir
memcpy(b <- a); ...; memcpy(c <- b)
    => if "..." doesn't mod/ref b or a
...; memcpy(b <- a); memcpy(c <- b)

Feedback on the soundness of these three cases is eagerly sought.

At this time, only case 2 has been implemented because it's the easiest and
most useful. For instance:

c
typedef struct { unsigned char large[65536]; } S;

extern void g_(S *);

S p1(unsigned g) {
  S rv = {0};
  if (g) {
    S rv2;
    g_(&rv2);
    return rv2;
  }
  rv.large[g] = g + 1;
  return rv;
}

S p0() {
  S k = p1(32);
  k.large[445] = 2302;
  return k;
}

S set(S x, unsigned n) {
  x.large[n] = n;
  return x;
}

S p() {
  S k = p0();
  k = set(k, 99);
  k.large[22] += 23;
  return k;
}

produces, at -O3 (without the patch; extraneous memcopies marked):

ir
define void @p(%struct.S* noalias nocapture sret) local_unnamed_addr #0 {
  %2 = alloca %struct.S, align 1
  %3 = alloca [22 x i8], align 8
  %4 = alloca [76 x i8], align 1
  %5 = alloca [345 x i8], align 1
  %6 = alloca [65090 x i8], align 2
  %7 = getelementptr inbounds [22 x i8], [22 x i8]* %3, i64 0, i64 0
  call void @llvm.lifetime.start(i64 22, i8* %7)
  %8 = getelementptr inbounds [76 x i8], [76 x i8]* %4, i64 0, i64 0
  call void @llvm.lifetime.start(i64 76, i8* %8)
  %9 = getelementptr inbounds [345 x i8], [345 x i8]* %5, i64 0, i64 0
  call void @llvm.lifetime.start(i64 345, i8* %9)
  %10 = getelementptr inbounds [65090 x i8], [65090 x i8]* %6, i64 0, i64 0
  call void @llvm.lifetime.start(i64 65090, i8* %10)
  %11 = getelementptr inbounds %struct.S, %struct.S* %2, i64 0, i32 0, i64 0
  call void @llvm.lifetime.start(i64 65536, i8* %11) #3, !noalias !8
  call void @g_(%struct.S* nonnull %2) #3, !noalias !8
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %7, i8* %11, i64 22, i32 1, i1 false)            <===
  %12 = getelementptr inbounds %struct.S, %struct.S* %2, i64 0, i32 0, i64 22
  %13 = load i8, i8* %12, align 1
  %14 = getelementptr inbounds %struct.S, %struct.S* %2, i64 0, i32 0, i64 23
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %8, i8* %14, i64 76, i32 1, i1 false)            <===
  %15 = getelementptr inbounds %struct.S, %struct.S* %2, i64 0, i32 0, i64 100
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %9, i8* %15, i64 345, i32 1, i1 false)           <===
  %16 = getelementptr inbounds %struct.S, %struct.S* %2, i64 0, i32 0, i64 446
  %17 = getelementptr inbounds [65090 x i8], [65090 x i8]* %6, i64 0, i64 0
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %17, i8* %16, i64 65090, i32 1, i1 false) #3     <===
  call void @llvm.lifetime.end(i64 65536, i8* %11) #3, !noalias !8
  %18 = add i8 %13, 23
  %19 = getelementptr inbounds %struct.S, %struct.S* %0, i64 0, i32 0, i64 0
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %19, i8* %7, i64 22, i32 1, i1 false)
  %20 = getelementptr inbounds %struct.S, %struct.S* %0, i64 0, i32 0, i64 22
  store i8 %18, i8* %20, align 1
  %21 = getelementptr inbounds %struct.S, %struct.S* %0, i64 0, i32 0, i64 23
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %21, i8* %8, i64 76, i32 1, i1 false)
  %22 = getelementptr inbounds %struct.S, %struct.S* %0, i64 0, i32 0, i64 99
  store i8 99, i8* %22, align 1
  %23 = getelementptr inbounds %struct.S, %struct.S* %0, i64 0, i32 0, i64 100
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %23, i8* %9, i64 345, i32 1, i1 false)
  %24 = getelementptr inbounds %struct.S, %struct.S* %0, i64 0, i32 0, i64 445
  store i8 -2, i8* %24, align 1
  %25 = getelementptr inbounds %struct.S, %struct.S* %0, i64 0, i32 0, i64 446
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %25, i8* %10, i64 65090, i32 1, i1 false)
  call void @llvm.lifetime.end(i64 22, i8* %7)
  call void @llvm.lifetime.end(i64 76, i8* %8)
  call void @llvm.lifetime.end(i64 345, i8* %9)
  call void @llvm.lifetime.end(i64 65090, i8* %10)
  ret void
}

With this patch, the highlighted memcopies are properly seen, transformed, and
later removed by DSE.

Diff Detail

Repository: rL LLVM

Event Timeline

bryant updated this revision to Diff 73235. Oct 2 2016, 11:02 PM

bryant retitled this revision from to [MemCpyOpt] Optimize memcpy-memcpy dependencies more aggressively..

bryant updated this object.

bryant added reviewers: efriedma, majnemer, rnk, aaron.ballman, eli.friedman.

bryant set the repository for this revision to rL LLVM.

bryant added a subscriber: llvm-commits.

Hoisting a memcpy past unrelated instructions is generally sound... and probably a good idea if it leads to simplifications.

This needs a lot of IR tests; I don't see any at the moment.

lib/Transforms/Scalar/MemCpyOptimizer.cpp
1002	You're not counting the case where nothing mods/refs a, b, or c. We can obviously either hoist or sink if we want to in that case, but it's not clear it's actually helpful.
1027	Might want to note that you're specifically trying to hoist the computation of M->getDest(). You're missing a check whether it's actually safe to move these instructions; they could have side-effects. You're also missing a check to make sure you aren't hoisting a memcpy past a call which throws an exception.
1051	You need to sort tomove so the instructions are in source order. Or there might be a better algorithm to do this; I haven't really thought it through. Do you actually want to move M before MDep? I think that doesn't interact correctly with the UseMemMove case.
1088	You probably want to check this before you start moving instructions around.
1317	This looks like a duplicate of the check at the beginning of processMemCpyMemCpyDependence?

Revision Contents

Path

Size

lib/

Transforms/

Scalar/

MemCpyOptimizer.cpp

104 lines

Diff 73235

lib/Transforms/Scalar/MemCpyOptimizer.cpp

Show First 20 Lines • Show All 986 Lines • ▼ Show 20 Lines	bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
// TODO: If the code between M and MDep is transparent to the destination "c",		// TODO: If the code between M and MDep is transparent to the destination "c",
// then we could still perform the xform by moving M up to the first memcpy.		// then we could still perform the xform by moving M up to the first memcpy.
//		//
// NOTE: This is conservative, it will stop on any read from the source loc,		// NOTE: This is conservative, it will stop on any read from the source loc,
// not just the defining memcpy.		// not just the defining memcpy.
MemDepResult SourceDep =		MemDepResult SourceDep =
MD->getPointerDependencyFrom(MemoryLocation::getForSource(MDep), false,		MD->getPointerDependencyFrom(MemoryLocation::getForSource(MDep), false,
M->getIterator(), M->getParent());		M->getIterator(), M->getParent());
if (!SourceDep.isClobber() \|\| SourceDep.getInst() != MDep)		MemDepResult MSourceDep = MD->getPointerDependencyFrom(
		MemoryLocation::getForSource(M), false, M->getIterator(), M->getParent());
		MemDepResult DestDep =
		MD->getPointerDependencyFrom(MemoryLocation::getForDest(M), false,
		M->getIterator(), M->getParent(), M);
		DominatorTree &DT = LookupDomTree();

		// Three cases:
		efriedmaUnsubmitted Not Done Reply Inline Actions You're not counting the case where nothing mods/refs a, b, or c. We can obviously either hoist or sink if we want to in that case, but it's not clear it's actually helpful. efriedma: You're not counting the case where nothing mods/refs a, b, or c. We can obviously either hoist…
		// Case 1:
		// memcpy(b <- a); ...; *b = 42; ...; memcpy(a <- b);
		// => if a is never mod/refed in between the two memcpys
		// ...; *a = 42; ...; memcpy(b <- a);
		if (M->getDest() == MDep->getSource() && DestDep.getInst() == MDep) {
		// TODO: figure out how to replace uses within a basic block range
		DEBUG(dbgs() << "TODO: for case 1, figure out how to replace uses within "
		"bb range\n");
return false;		return false;
		}

		// Case 2:
		// memcpy(b <- a); ...; memcpy(c <- b);
		// => if "..." doesn't mod/ref either c or b
		// memcpy(c <- a); memcpy(b <- a); *a = 42;
		else if (MSourceDep.getInst() == MDep &&
		(!DestDep.getInst() \|\| DestDep.getInst() == MDep \|\|
		DT.dominates(DestDep.getInst(), MDep))) {
		DEBUG(dbgs() << "case 2: " << *MDep << "\n");
		// move our memcpy up to just after mdep
		DenseSet<Instruction *> inrange, visited;
		for (Instruction &i : make_range(MDep->getIterator(), M->getIterator())) {
		inrange.insert(&i);
		}
		// identify dependencies of the memcpy that also need to be moved upwards.
		efriedmaUnsubmitted Not Done Reply Inline Actions Might want to note that you're specifically trying to hoist the computation of M->getDest(). You're missing a check whether it's actually safe to move these instructions; they could have side-effects. You're also missing a check to make sure you aren't hoisting a memcpy past a call which throws an exception. efriedma: Might want to note that you're specifically trying to hoist the computation of M->getDest().
		SmallVector<Instruction *, 8> tomove, stack{M};
		while (!stack.empty()) {
		SmallVector<Instruction *, 8> next;
		Instruction *cur = stack.back();
		for (Use &op : cur->operands()) {
		if (Instruction *i = dyn_cast<Instruction>(op.get())) {
		if (inrange.find(i) != inrange.end() &&
		visited.find(i) == visited.end()) {
		next.push_back(i);
		}
		}
		}
		if (next.empty()) {
		// leaf node
		tomove.push_back(cur);
		visited.insert(cur);
		stack.pop_back();
		} else {
		stack.append(next.begin(), next.end());
		}
		}

		for (auto i : tomove) {
		i->moveBefore(MDep);
		efriedmaUnsubmitted Not Done Reply Inline Actions You need to sort tomove so the instructions are in source order. Or there might be a better algorithm to do this; I haven't really thought it through. Do you actually want to move M before MDep? I think that doesn't interact correctly with the UseMemMove case. efriedma: You need to sort tomove so the instructions are in source order. Or there might be a better…
		// refresh MemDep cache
		MD->removeInstruction(i);
		}
		}

		// TODO: Case 3:
		// memcpy(b <- a); ...; memcpy(c <- b)
		// => if "..." doesn't mod/ref b or a
		// ...; memcpy(b <- a); memcpy(c <- b)
		else if (MSourceDep.getInst() == MDep &&
		(!SourceDep.getInst() \|\| SourceDep.getInst() == MDep \|\|
		DT.dominates(SourceDep.getInst(), MDep))) {
		DEBUG(dbgs() << "TODO: case 3.\n");
		return false;
		} else {
		// none of the cases match; ignore.
		DEBUG(dbgs() << "No matching case. Ignoring. " << *M << "\n"
		<< *MDep << "\n");
		return false;
		}

		// Bail early if `memcpy(a <- b); memcpy(b <- a)`
		if (AA.isMustAlias(MemoryLocation::getForDest(M),
		MemoryLocation::getForSource(MDep))) {
		MD->removeInstruction(M);
		M->eraseFromParent();
		++NumMemCpyInstr;
		return true;
		}

// If the dest of the second might alias the source of the first, then the		// If the dest of the second might alias the source of the first, then the
// source and dest might overlap. We still want to eliminate the intermediate		// source and dest might overlap. We still want to eliminate the intermediate
// value, but we have to generate a memmove instead of memcpy.		// value, but we have to generate a memmove instead of memcpy.
bool UseMemMove = false;		bool UseMemMove = false;
if (!AA.isNoAlias(MemoryLocation::getForDest(M),		if (!AA.isNoAlias(MemoryLocation::getForDest(M),
MemoryLocation::getForSource(MDep)))		MemoryLocation::getForSource(MDep)))
UseMemMove = true;		UseMemMove = true;
		efriedmaUnsubmitted Not Done Reply Inline Actions You probably want to check this before you start moving instructions around. efriedma: You probably want to check this before you start moving instructions around.

// If all checks passed, then we can transform M.		// If all checks passed, then we can transform M.

// Make sure to use the lesser of the alignment of the source and the dest		// Make sure to use the lesser of the alignment of the source and the dest
// since we're changing where we're reading from, but don't want to increase		// since we're changing where we're reading from, but don't want to increase
// the alignment past what can be read from or written to.		// the alignment past what can be read from or written to.
// TODO: Is this worth it if we're creating a less aligned memcpy? For		// TODO: Is this worth it if we're creating a less aligned memcpy? For
// example we could be moving from movaps -> movq on x86.		// example we could be moving from movaps -> movq on x86.
▲ Show 20 Lines • Show All 175 Lines • ▼ Show 20 Lines	if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) {
}		}
}		}
}		}

MemoryLocation SrcLoc = MemoryLocation::getForSource(M);		MemoryLocation SrcLoc = MemoryLocation::getForSource(M);
MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(		MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(
SrcLoc, true, M->getIterator(), M->getParent());		SrcLoc, true, M->getIterator(), M->getParent());

if (SrcDepInfo.isClobber()) {		if (SrcDepInfo.isDef()) {
if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst()))
return processMemCpyMemCpyDependence(M, MDep);
} else if (SrcDepInfo.isDef()) {
Instruction *I = SrcDepInfo.getInst();		Instruction *I = SrcDepInfo.getInst();
bool hasUndefContents = false;		bool hasUndefContents = false;

if (isa<AllocaInst>(I)) {		if (isa<AllocaInst>(I)) {
hasUndefContents = true;		hasUndefContents = true;
} else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {		} else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
if (II->getIntrinsicID() == Intrinsic::lifetime_start)		if (II->getIntrinsicID() == Intrinsic::lifetime_start)
if (ConstantInt *LTSize = dyn_cast<ConstantInt>(II->getArgOperand(0)))		if (ConstantInt *LTSize = dyn_cast<ConstantInt>(II->getArgOperand(0)))
Show All 13 Lines	if (SrcDepInfo.isClobber())
if (MemSetInst *MDep = dyn_cast<MemSetInst>(SrcDepInfo.getInst()))		if (MemSetInst *MDep = dyn_cast<MemSetInst>(SrcDepInfo.getInst()))
if (performMemCpyToMemSetOptzn(M, MDep)) {		if (performMemCpyToMemSetOptzn(M, MDep)) {
MD->removeInstruction(M);		MD->removeInstruction(M);
M->eraseFromParent();		M->eraseFromParent();
++NumCpyToSet;		++NumCpyToSet;
return true;		return true;
}		}

		// search upwards within bb for possible memcpy-memcpy dep
		for (MemDepResult d = MD->getPointerDependencyFrom(
		SrcLoc, false, M->getIterator(), M->getParent());
		!d.isNonLocal() && d.getInst();
		d = MD->getPointerDependencyFrom(
		SrcLoc, false, d.getInst()->getIterator(), M->getParent(), M)) {
		if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(d.getInst())) {
		if (MDep->getDest() == M->getSource()) {
		efriedmaUnsubmitted Not Done Reply Inline Actions This looks like a duplicate of the check at the beginning of processMemCpyMemCpyDependence? efriedma: This looks like a duplicate of the check at the beginning of processMemCpyMemCpyDependence?
		return processMemCpyMemCpyDependence(M, MDep);
		}
		}
		}

return false;		return false;
}		}

/// Transforms memmove calls to memcpy calls when the src/dst are guaranteed		/// Transforms memmove calls to memcpy calls when the src/dst are guaranteed
/// not to alias.		/// not to alias.
bool MemCpyOptPass::processMemMove(MemMoveInst *M) {		bool MemCpyOptPass::processMemMove(MemMoveInst *M) {
AliasAnalysis &AA = LookupAliasAnalysis();		AliasAnalysis &AA = LookupAliasAnalysis();

▲ Show 20 Lines • Show All 208 Lines • Show Last 20 Lines