This is an archive of the discontinued LLVM Phabricator instance.

[MemCpyOpt] Optimize memcpy-memcpy dependencies more aggressively.
Needs ReviewPublic

Authored by bryant on Oct 2 2016, 11:02 PM.

Details

Summary

Currently, memcpy-memcpy pairs are only considered when there are no mods or
refs of either the source or dest memory operands of the examined memcpy:

ir
memcpy(b <- a)  ; the "dependee" memcpy
...  ; no mod/ref of a, b, or c in between
memcpy(c <- b)  ; the examined memcpy

In the above, if b and/or c are mod/refed in the space between the two
memcopies, then the mod/ref-ing instruction closest to the examined memcpy is
matched and the dependee is never seen. If on the other hand only a is
mod/refed in between, then the memcpy pair is recognized but ultimately ignored
because the processMemCpyMemCpyDependence transformation would be invalid:

ir
memcpy(b <- a); *a = 42; memcpy(c <- b)
    =>
memcpy(b <- a); *a = 42; memcpy(c <- a)

What this patch does is search harder for memcpy pairs and then match and
transform them against three general cases:

Case 1:

ir
memcpy(b <- a); ...; *b = 42; ...; memcpy(a <- b);
    => if a is never mod/refed in between the two memcpys
...; *a = 42; ...; memcpy(b <- a);

Case 2 (essentially the todo mentioned in processMemCpyMemCpyDependence):

ir
memcpy(b <- a); ...;  memcpy(c <- b);
    => if "..." doesn't mod/ref either c or b
memcpy(c <- a); memcpy(b <- a); *a = 42;

Case 3:

ir
memcpy(b <- a); ...; memcpy(c <- b)
    => if "..." doesn't mod/ref b or a
...; memcpy(b <- a); memcpy(c <- b)

Feedback on the soundness of these three cases is eagerly sought.

At this time, only case 2 has been implemented because it's the easiest and
most useful. For instance:

c
typedef struct { unsigned char large[65536]; } S;

extern void g_(S *);

S p1(unsigned g) {
  S rv = {0};
  if (g) {
    S rv2;
    g_(&rv2);
    return rv2;
  }
  rv.large[g] = g + 1;
  return rv;
}

S p0() {
  S k = p1(32);
  k.large[445] = 2302;
  return k;
}

S set(S x, unsigned n) {
  x.large[n] = n;
  return x;
}

S p() {
  S k = p0();
  k = set(k, 99);
  k.large[22] += 23;
  return k;
}

produces, at -O3 (without the patch; extraneous memcopies marked):

ir
define void @p(%struct.S* noalias nocapture sret) local_unnamed_addr #0 {
  %2 = alloca %struct.S, align 1
  %3 = alloca [22 x i8], align 8
  %4 = alloca [76 x i8], align 1
  %5 = alloca [345 x i8], align 1
  %6 = alloca [65090 x i8], align 2
  %7 = getelementptr inbounds [22 x i8], [22 x i8]* %3, i64 0, i64 0
  call void @llvm.lifetime.start(i64 22, i8* %7)
  %8 = getelementptr inbounds [76 x i8], [76 x i8]* %4, i64 0, i64 0
  call void @llvm.lifetime.start(i64 76, i8* %8)
  %9 = getelementptr inbounds [345 x i8], [345 x i8]* %5, i64 0, i64 0
  call void @llvm.lifetime.start(i64 345, i8* %9)
  %10 = getelementptr inbounds [65090 x i8], [65090 x i8]* %6, i64 0, i64 0
  call void @llvm.lifetime.start(i64 65090, i8* %10)
  %11 = getelementptr inbounds %struct.S, %struct.S* %2, i64 0, i32 0, i64 0
  call void @llvm.lifetime.start(i64 65536, i8* %11) #3, !noalias !8
  call void @g_(%struct.S* nonnull %2) #3, !noalias !8
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %7, i8* %11, i64 22, i32 1, i1 false)            <===
  %12 = getelementptr inbounds %struct.S, %struct.S* %2, i64 0, i32 0, i64 22
  %13 = load i8, i8* %12, align 1
  %14 = getelementptr inbounds %struct.S, %struct.S* %2, i64 0, i32 0, i64 23
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %8, i8* %14, i64 76, i32 1, i1 false)            <===
  %15 = getelementptr inbounds %struct.S, %struct.S* %2, i64 0, i32 0, i64 100
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %9, i8* %15, i64 345, i32 1, i1 false)           <===
  %16 = getelementptr inbounds %struct.S, %struct.S* %2, i64 0, i32 0, i64 446
  %17 = getelementptr inbounds [65090 x i8], [65090 x i8]* %6, i64 0, i64 0
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %17, i8* %16, i64 65090, i32 1, i1 false) #3     <===
  call void @llvm.lifetime.end(i64 65536, i8* %11) #3, !noalias !8
  %18 = add i8 %13, 23
  %19 = getelementptr inbounds %struct.S, %struct.S* %0, i64 0, i32 0, i64 0
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %19, i8* %7, i64 22, i32 1, i1 false)
  %20 = getelementptr inbounds %struct.S, %struct.S* %0, i64 0, i32 0, i64 22
  store i8 %18, i8* %20, align 1
  %21 = getelementptr inbounds %struct.S, %struct.S* %0, i64 0, i32 0, i64 23
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %21, i8* %8, i64 76, i32 1, i1 false)
  %22 = getelementptr inbounds %struct.S, %struct.S* %0, i64 0, i32 0, i64 99
  store i8 99, i8* %22, align 1
  %23 = getelementptr inbounds %struct.S, %struct.S* %0, i64 0, i32 0, i64 100
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %23, i8* %9, i64 345, i32 1, i1 false)
  %24 = getelementptr inbounds %struct.S, %struct.S* %0, i64 0, i32 0, i64 445
  store i8 -2, i8* %24, align 1
  %25 = getelementptr inbounds %struct.S, %struct.S* %0, i64 0, i32 0, i64 446
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %25, i8* %10, i64 65090, i32 1, i1 false)
  call void @llvm.lifetime.end(i64 22, i8* %7)
  call void @llvm.lifetime.end(i64 76, i8* %8)
  call void @llvm.lifetime.end(i64 345, i8* %9)
  call void @llvm.lifetime.end(i64 65090, i8* %10)
  ret void
}

With this patch, The highlighted memcopies are properly seen, transformed, and
later removed by DSE.

Diff Detail

Repository
rL LLVM

Event Timeline

bryant updated this revision to Diff 73235.Oct 2 2016, 11:02 PM
bryant retitled this revision from to [MemCpyOpt] Optimize memcpy-memcpy dependencies more aggressively..
bryant updated this object.
bryant set the repository for this revision to rL LLVM.
bryant added a subscriber: llvm-commits.
efriedma edited edge metadata.Oct 6 2016, 12:36 PM

Hoisting a memcpy past unrelated instruction is generally sound... and probably a good idea if it leads to simplifications.

This needs a lot of IR tests; I don't see any at the moment.

lib/Transforms/Scalar/MemCpyOptimizer.cpp
1002

You're not counting the case where nothing mods/refs a, b, or c. We can obviously either hoist or sink if we want to in that case, but it's not clear it's actually helpful.

1027

Might want to note that you're specifically trying to hoist the computation of M->getDest().

You're missing a check whether it's actually safe to move these instructions; they could have side-effects.

You're also missing a check to make sure you aren't hoisting a memcpy past a call which throws an exception.

1051

You need to sort tomove so the instructions are in source order. Or there might be a better algorithm to do this; I haven't really thought it through.

Do you actually want to move M before MDep? I think that doesn't interact correctly with the UseMemMove case.

1088

You probably want to check this before you start moving instructions around.

1317

This looks like a duplicate of the check at the beginning of processMemCpyMemCpyDependence?