Index: llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp =================================================================== --- llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -945,12 +945,51 @@ return false; } - // Check that src isn't captured by the called function since the - // transformation can cause aliasing issues in that case. - for (unsigned ArgI = 0, E = C->arg_size(); ArgI != E; ++ArgI) - if (C->getArgOperand(ArgI) == cpySrc && !C->doesNotCapture(ArgI)) + // Check whether src is captured by the called function, in which case there + // may be further indirect uses of src. + bool SrcIsCaptured = any_of(C->args(), [&](Use &U) { + return U == cpySrc && !C->doesNotCapture(C->getArgOperandNo(&U)); + }); + + // If src is captured, then check whether there are any potential uses of + // src through the captured pointer before the lifetime of src ends, either + // due to a lifetime.end or a return from the function. + if (SrcIsCaptured) { + // We ensure below that dest is not captured in this function, but it + // might be captured before, e.g. in case of a global. Make sure the dest + // is identified function local. + if (!isIdentifiedFunctionLocal(getUnderlyingObject(cpyDest))) return false; + MemoryLocation SrcLoc = + MemoryLocation(srcAlloca, LocationSize::precise(srcSize)); + for (Instruction &I : + make_range(++C->getIterator(), C->getParent()->end())) { + // Lifetime of srcAlloca ends at lifetime.end. + if (auto *II = dyn_cast<IntrinsicInst>(&I)) { + if (II->getIntrinsicID() == Intrinsic::lifetime_end && + II->getArgOperand(1)->stripPointerCasts() == srcAlloca && + cast<ConstantInt>(II->getArgOperand(0))->uge(srcSize)) + break; + } + + // Lifetime of srcAlloca ends at return. + if (isa<ReturnInst>(&I)) + break; + + // Ignore the direct read of src in the load. + if (&I == cpyLoad) + continue; + + // Check whether this instruction may mod/ref src through the captured + // pointer (we have already checked direct mod/refs in the loop above). 
+ // Also bail if we hit a terminator, as we don't want to scan into other + // blocks. + if (isModOrRefSet(AA->getModRefInfo(&I, SrcLoc)) || I.isTerminator()) + return false; + } + } + // Since we're changing the parameter to the callsite, we need to make sure // that what would be the new parameter dominates the callsite. if (!DT->dominates(cpyDest, C)) { Index: llvm/test/Transforms/MemCpyOpt/capturing-func.ll =================================================================== --- llvm/test/Transforms/MemCpyOpt/capturing-func.ll +++ llvm/test/Transforms/MemCpyOpt/capturing-func.ll @@ -1,5 +1,5 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -basic-aa -memcpyopt -S -verify-memoryssa | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature +; RUN: opt < %s -basic-aa -globals-aa -memcpyopt -S -verify-memoryssa | FileCheck %s target datalayout = "e" @@ -11,7 +11,7 @@ ; Check that the transformation isn't applied if the called function can ; capture the pointer argument (i.e. the nocapture attribute isn't present) define void @test() { -; CHECK-LABEL: @test( +; CHECK-LABEL: define {{[^@]+}}@test() { ; CHECK-NEXT: [[PTR1:%.*]] = alloca i8, align 1 ; CHECK-NEXT: [[PTR2:%.*]] = alloca i8, align 1 ; CHECK-NEXT: call void @foo(i8* [[PTR2]]) @@ -30,7 +30,7 @@ ; Same as previous test, but with a bitcasted argument. ; TODO: Call slot optimization should not be applied here. define void @test_bitcast() { -; CHECK-LABEL: @test_bitcast( +; CHECK-LABEL: define {{[^@]+}}@test_bitcast() { ; CHECK-NEXT: [[PTR1:%.*]] = alloca [2 x i8], align 1 ; CHECK-NEXT: [[PTR2:%.*]] = alloca [2 x i8], align 1 ; CHECK-NEXT: [[PTR1_CAST:%.*]] = bitcast [2 x i8]* [[PTR1]] to i8* @@ -53,12 +53,11 @@ ; Lifetime of %ptr2 ends before the potential use of the capture in the second ; call. 
define void @test_lifetime_end() { -; CHECK-LABEL: @test_lifetime_end( +; CHECK-LABEL: define {{[^@]+}}@test_lifetime_end() { ; CHECK-NEXT: [[PTR1:%.*]] = alloca i8, align 1 ; CHECK-NEXT: [[PTR2:%.*]] = alloca i8, align 1 ; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 1, i8* [[PTR2]]) -; CHECK-NEXT: call void @foo(i8* [[PTR2]]) -; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[PTR1]], i8* [[PTR2]], i32 1, i1 false) +; CHECK-NEXT: call void @foo(i8* [[PTR1]]) ; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 1, i8* [[PTR2]]) ; CHECK-NEXT: call void @foo(i8* [[PTR1]]) ; CHECK-NEXT: ret void @@ -73,19 +72,126 @@ ret void } +; Lifetime of %ptr2 does not end, because of size mismatch. +define void @test_lifetime_not_end() { +; CHECK-LABEL: define {{[^@]+}}@test_lifetime_not_end() { +; CHECK-NEXT: [[PTR1:%.*]] = alloca i8, align 1 +; CHECK-NEXT: [[PTR2:%.*]] = alloca i8, align 1 +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 1, i8* [[PTR2]]) +; CHECK-NEXT: call void @foo(i8* [[PTR2]]) +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[PTR1]], i8* [[PTR2]], i32 1, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 0, i8* [[PTR2]]) +; CHECK-NEXT: call void @foo(i8* [[PTR1]]) +; CHECK-NEXT: ret void +; + %ptr1 = alloca i8 + %ptr2 = alloca i8 + call void @llvm.lifetime.start.p0i8(i64 1, i8* %ptr2) + call void @foo(i8* %ptr2) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %ptr1, i8* %ptr2, i32 1, i1 false) + call void @llvm.lifetime.end.p0i8(i64 0, i8* %ptr2) + call void @foo(i8* %ptr1) + ret void +} + ; Lifetime of %ptr2 ends before any potential use of the capture because we ; return from the function. 
define void @test_function_end() { -; CHECK-LABEL: @test_function_end( +; CHECK-LABEL: define {{[^@]+}}@test_function_end() { +; CHECK-NEXT: [[PTR1:%.*]] = alloca i8, align 1 +; CHECK-NEXT: [[PTR2:%.*]] = alloca i8, align 1 +; CHECK-NEXT: call void @foo(i8* [[PTR1]]) +; CHECK-NEXT: ret void +; + %ptr1 = alloca i8 + %ptr2 = alloca i8 + call void @foo(i8* %ptr2) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %ptr1, i8* %ptr2, i32 1, i1 false) + ret void +} + +; A potential use of the capture occurs in a later block, can't be optimized. +define void @test_terminator() { +; CHECK-LABEL: define {{[^@]+}}@test_terminator() { ; CHECK-NEXT: [[PTR1:%.*]] = alloca i8, align 1 ; CHECK-NEXT: [[PTR2:%.*]] = alloca i8, align 1 ; CHECK-NEXT: call void @foo(i8* [[PTR2]]) ; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[PTR1]], i8* [[PTR2]], i32 1, i1 false) +; CHECK-NEXT: br label [[NEXT:%.*]] +; CHECK: next: +; CHECK-NEXT: call void @foo(i8* [[PTR1]]) +; CHECK-NEXT: ret void +; + %ptr1 = alloca i8 + %ptr2 = alloca i8 + call void @foo(i8* %ptr2) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %ptr1, i8* %ptr2, i32 1, i1 false) + br label %next + +next: + call void @foo(i8* %ptr1) + ret void +} + +; This case can be optimized, but would require a scan across multiple blocks +; and is currently not performed. 
+define void @test_terminator2() { +; CHECK-LABEL: define {{[^@]+}}@test_terminator2() { +; CHECK-NEXT: [[PTR1:%.*]] = alloca i8, align 1 +; CHECK-NEXT: [[PTR2:%.*]] = alloca i8, align 1 +; CHECK-NEXT: call void @foo(i8* [[PTR2]]) +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[PTR1]], i8* [[PTR2]], i32 1, i1 false) +; CHECK-NEXT: br label [[NEXT:%.*]] +; CHECK: next: ; CHECK-NEXT: ret void ; %ptr1 = alloca i8 %ptr2 = alloca i8 call void @foo(i8* %ptr2) call void @llvm.memcpy.p0i8.p0i8.i32(i8* %ptr1, i8* %ptr2, i32 1, i1 false) + br label %next + +next: + ret void +} + +@g = internal global i8 0 + +; This case should not be optimized, because @g is captured before the call +; (being a global) and @icmp_g might depend on its identity. +define void @test_global() { +; CHECK-LABEL: define {{[^@]+}}@test_global() { +; CHECK-NEXT: [[PTR:%.*]] = alloca i8, align 1 +; CHECK-NEXT: call void @icmp_g(i8* [[PTR]]) +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* @g, i8* [[PTR]], i32 1, i1 false) +; CHECK-NEXT: ret void +; + %ptr = alloca i8 + call void @icmp_g(i8* %ptr) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* @g, i8* %ptr, i32 1, i1 false) + ret void +} + +define void @icmp_g(i8* %p) { +; CHECK-LABEL: define {{[^@]+}}@icmp_g +; CHECK-SAME: (i8* [[P:%.*]]) { +; CHECK-NEXT: [[C:%.*]] = icmp eq i8* [[P]], @g +; CHECK-NEXT: br i1 [[C]], label [[IF:%.*]], label [[ELSE:%.*]] +; CHECK: if: +; CHECK-NEXT: store i8 1, i8* [[P]], align 1 +; CHECK-NEXT: ret void +; CHECK: else: +; CHECK-NEXT: store i8 2, i8* [[P]], align 1 +; CHECK-NEXT: ret void +; + %c = icmp eq i8* %p, @g + br i1 %c, label %if, label %else + +if: + store i8 1, i8* %p + ret void + +else: + store i8 2, i8* %p ret void }