Currently, memcpy-memcpy pairs are only considered when there are no mods or
refs of either the source or dest memory operands of the examined memcpy:
ir memcpy(b <- a) ; the "dependee" memcpy ... ; no mod/ref of a, b, or c in between memcpy(c <- b) ; the examined memcpy
In the above, if b and/or c are mod/refed in the space between the two
memcopies, then the mod/ref-ing instruction closest to the examined memcpy is
matched and the dependee is never seen. If on the other hand only a is
mod/refed in between, then the memcpy pair is recognized but ultimately ignored
because the processMemCpyMemCpyDependence transformation would be invalid:
ir memcpy(b <- a); *a = 42; memcpy(c <- b) => memcpy(b <- a); *a = 42; memcpy(c <- a)
What this patch does is search harder for memcpy pairs and then match and
transform them against three general cases:
Case 1:
ir memcpy(b <- a); ...; *b = 42; ...; memcpy(a <- b); => if a is never mod/refed in between the two memcpys ...; *a = 42; ...; memcpy(b <- a);
Case 2 (essentially the todo mentioned in processMemCpyMemCpyDependence):
ir memcpy(b <- a); ...; memcpy(c <- b); => if "..." doesn't mod/ref either c or b memcpy(c <- a); memcpy(b <- a); *a = 42;
Case 3:
ir memcpy(b <- a); ...; memcpy(c <- b) => if "..." doesn't mod/ref b or a ...; memcpy(b <- a); memcpy(c <- b)
Feedback on the soundness of these three cases is eagerly sought.
At this time, only case 2 has been implemented because it's the easiest and
most useful. For instance:
c typedef struct { unsigned char large[65536]; } S; extern void g_(S *); S p1(unsigned g) { S rv = {0}; if (g) { S rv2; g_(&rv2); return rv2; } rv.large[g] = g + 1; return rv; } S p0() { S k = p1(32); k.large[445] = 2302; return k; } S set(S x, unsigned n) { x.large[n] = n; return x; } S p() { S k = p0(); k = set(k, 99); k.large[22] += 23; return k; }
produces, at -O3 (without the patch; extraneous memcopies marked):
ir define void @p(%struct.S* noalias nocapture sret) local_unnamed_addr #0 { %2 = alloca %struct.S, align 1 %3 = alloca [22 x i8], align 8 %4 = alloca [76 x i8], align 1 %5 = alloca [345 x i8], align 1 %6 = alloca [65090 x i8], align 2 %7 = getelementptr inbounds [22 x i8], [22 x i8]* %3, i64 0, i64 0 call void @llvm.lifetime.start(i64 22, i8* %7) %8 = getelementptr inbounds [76 x i8], [76 x i8]* %4, i64 0, i64 0 call void @llvm.lifetime.start(i64 76, i8* %8) %9 = getelementptr inbounds [345 x i8], [345 x i8]* %5, i64 0, i64 0 call void @llvm.lifetime.start(i64 345, i8* %9) %10 = getelementptr inbounds [65090 x i8], [65090 x i8]* %6, i64 0, i64 0 call void @llvm.lifetime.start(i64 65090, i8* %10) %11 = getelementptr inbounds %struct.S, %struct.S* %2, i64 0, i32 0, i64 0 call void @llvm.lifetime.start(i64 65536, i8* %11) #3, !noalias !8 call void @g_(%struct.S* nonnull %2) #3, !noalias !8 call void @llvm.memcpy.p0i8.p0i8.i64(i8* %7, i8* %11, i64 22, i32 1, i1 false) <=== %12 = getelementptr inbounds %struct.S, %struct.S* %2, i64 0, i32 0, i64 22 %13 = load i8, i8* %12, align 1 %14 = getelementptr inbounds %struct.S, %struct.S* %2, i64 0, i32 0, i64 23 call void @llvm.memcpy.p0i8.p0i8.i64(i8* %8, i8* %14, i64 76, i32 1, i1 false) <=== %15 = getelementptr inbounds %struct.S, %struct.S* %2, i64 0, i32 0, i64 100 call void @llvm.memcpy.p0i8.p0i8.i64(i8* %9, i8* %15, i64 345, i32 1, i1 false) <=== %16 = getelementptr inbounds %struct.S, %struct.S* %2, i64 0, i32 0, i64 446 %17 = getelementptr inbounds [65090 x i8], [65090 x i8]* %6, i64 0, i64 0 call void @llvm.memcpy.p0i8.p0i8.i64(i8* %17, i8* %16, i64 65090, i32 1, i1 false) #3 <=== call void @llvm.lifetime.end(i64 65536, i8* %11) #3, !noalias !8 %18 = add i8 %13, 23 %19 = getelementptr inbounds %struct.S, %struct.S* %0, i64 0, i32 0, i64 0 call void @llvm.memcpy.p0i8.p0i8.i64(i8* %19, i8* %7, i64 22, i32 1, i1 false) %20 = getelementptr inbounds %struct.S, %struct.S* %0, i64 0, i32 0, i64 22 store i8 %18, i8* %20, align 1 %21 = getelementptr inbounds %struct.S, %struct.S* %0, i64 0, i32 0, i64 23 call void @llvm.memcpy.p0i8.p0i8.i64(i8* %21, i8* %8, i64 76, i32 1, i1 false) %22 = getelementptr inbounds %struct.S, %struct.S* %0, i64 0, i32 0, i64 99 store i8 99, i8* %22, align 1 %23 = getelementptr inbounds %struct.S, %struct.S* %0, i64 0, i32 0, i64 100 call void @llvm.memcpy.p0i8.p0i8.i64(i8* %23, i8* %9, i64 345, i32 1, i1 false) %24 = getelementptr inbounds %struct.S, %struct.S* %0, i64 0, i32 0, i64 445 store i8 -2, i8* %24, align 1 %25 = getelementptr inbounds %struct.S, %struct.S* %0, i64 0, i32 0, i64 446 call void @llvm.memcpy.p0i8.p0i8.i64(i8* %25, i8* %10, i64 65090, i32 1, i1 false) call void @llvm.lifetime.end(i64 22, i8* %7) call void @llvm.lifetime.end(i64 76, i8* %8) call void @llvm.lifetime.end(i64 345, i8* %9) call void @llvm.lifetime.end(i64 65090, i8* %10) ret void }
With this patch, The highlighted memcopies are properly seen, transformed, and
later removed by DSE.
You're not counting the case where nothing mods/refs a, b, or c. We can obviously either hoist or sink if we want to in that case, but it's not clear it's actually helpful.