Index: clang/test/CodeGen/aggregate-assign-call.c
===================================================================
--- clang/test/CodeGen/aggregate-assign-call.c
+++ clang/test/CodeGen/aggregate-assign-call.c
@@ -13,25 +13,39 @@
 struct S foo(void);
 
 // CHECK-LABEL: define dso_local void @bar
+__attribute__((optnone))
 struct S bar() {
   // O0-NOT: @llvm.lifetime.start
   // O0-NOT: @llvm.lifetime.end
 
   struct S r;
-  // O1: call void @llvm.lifetime.start.p0i8({{[^,]*}}, i8* nonnull %[[TMP1:[^)]+]])
+  // O1: %[[TMP1_ALLOCA:[^ ]+]] = alloca %struct.S
+  // O1: %[[TMP2_ALLOCA:[^ ]+]] = alloca %struct.S
+  // O1: %[[TMP3_ALLOCA:[^ ]+]] = alloca %struct.S
+
+  // O1: %[[P:[^ ]+]] = bitcast %struct.S* %[[TMP1_ALLOCA]] to i8*
+  // O1: call void @llvm.lifetime.start.p0i8({{[^,]*}}, i8* %[[P]])
   // O1: call void @foo
   r = foo();
-  // O1: call void @llvm.lifetime.end.p0i8({{[^,]*}}, i8* nonnull %[[TMP1]])
+  // O1: memcpy
+  // O1: %[[P:[^ ]+]] = bitcast %struct.S* %[[TMP1_ALLOCA]] to i8*
+  // O1: call void @llvm.lifetime.end.p0i8({{[^,]*}}, i8* %[[P]])
 
-  // O1: call void @llvm.lifetime.start.p0i8({{[^,]*}}, i8* nonnull %[[TMP2:[^)]+]])
+  // O1: %[[P:[^ ]+]] = bitcast %struct.S* %[[TMP2_ALLOCA]] to i8*
+  // O1: call void @llvm.lifetime.start.p0i8({{[^,]*}}, i8* %[[P]])
   // O1: call void @foo
   r = foo();
-  // O1: call void @llvm.lifetime.end.p0i8({{[^,]*}}, i8* nonnull %[[TMP2]])
+  // O1: memcpy
+  // O1: %[[P:[^ ]+]] = bitcast %struct.S* %[[TMP2_ALLOCA]] to i8*
+  // O1: call void @llvm.lifetime.end.p0i8({{[^,]*}}, i8* %[[P]])
 
-  // O1: call void @llvm.lifetime.start.p0i8({{[^,]*}}, i8* nonnull %[[TMP3:[^)]+]])
+  // O1: %[[P:[^ ]+]] = bitcast %struct.S* %[[TMP3_ALLOCA]] to i8*
+  // O1: call void @llvm.lifetime.start.p0i8({{[^,]*}}, i8* %[[P]])
   // O1: call void @foo
   r = foo();
-  // O1: call void @llvm.lifetime.end.p0i8({{[^,]*}}, i8* nonnull %[[TMP3]])
+  // O1: memcpy
+  // O1: %[[P:[^ ]+]] = bitcast %struct.S* %[[TMP3_ALLOCA]] to i8*
+  // O1: call void @llvm.lifetime.end.p0i8({{[^,]*}}, i8* %[[P]])
 
   return r;
 }
Index: llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -945,11 +945,44 @@
     return false;
   }
 
-  // Check that src isn't captured by the called function since the
-  // transformation can cause aliasing issues in that case.
-  for (unsigned ArgI = 0, E = C->arg_size(); ArgI != E; ++ArgI)
-    if (C->getArgOperand(ArgI) == cpySrc && !C->doesNotCapture(ArgI))
-      return false;
+  // Check whether src is captured by the called function, in which case there
+  // may be further indirect uses of src.
+  bool SrcIsCaptured = any_of(C->args(), [&](Use &U) {
+    return U == cpySrc && !C->doesNotCapture(C->getArgOperandNo(&U));
+  });
+
+  // If src is captured, then check whether there are any potential uses of
+  // src through the captured pointer before the lifetime of src ends, either
+  // due to a lifetime.end or a return from the function.
+  if (SrcIsCaptured) {
+    MemoryLocation SrcLoc =
+        MemoryLocation(srcAlloca, LocationSize::precise(srcSize));
+    for (Instruction &I :
+         make_range(++C->getIterator(), C->getParent()->end())) {
+      // Lifetime of srcAlloca ends at lifetime.end.
+      if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
+        if (II->getIntrinsicID() == Intrinsic::lifetime_end &&
+            II->getArgOperand(1)->stripPointerCasts() == srcAlloca &&
+            cast<ConstantInt>(II->getArgOperand(0))->uge(srcSize))
+          break;
+      }
+
+      // Lifetime of srcAlloca ends at return.
+      if (isa<ReturnInst>(&I))
+        break;
+
+      // Ignore the direct read of src in the load.
+      if (&I == cpyLoad)
+        continue;
+
+      // Check whether this instruction may mod/ref src through the captured
+      // pointer (we have already checked any direct mod/refs in the loop
+      // above). Also bail if we hit a terminator, as we don't want to scan
+      // into other blocks.
+      if (isModOrRefSet(AA->getModRefInfo(&I, SrcLoc)) || I.isTerminator())
+        return false;
+    }
+  }
 
   // Since we're changing the parameter to the callsite, we need to make sure
   // that what would be the new parameter dominates the callsite.
Index: llvm/test/Transforms/MemCpyOpt/capturing-func.ll
===================================================================
--- llvm/test/Transforms/MemCpyOpt/capturing-func.ll
+++ llvm/test/Transforms/MemCpyOpt/capturing-func.ll
@@ -57,8 +57,7 @@
 ; CHECK-NEXT: [[PTR1:%.*]] = alloca i8, align 1
 ; CHECK-NEXT: [[PTR2:%.*]] = alloca i8, align 1
 ; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 1, i8* [[PTR2]])
-; CHECK-NEXT: call void @foo(i8* [[PTR2]])
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[PTR1]], i8* [[PTR2]], i32 1, i1 false)
+; CHECK-NEXT: call void @foo(i8* [[PTR1]])
 ; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 1, i8* [[PTR2]])
 ; CHECK-NEXT: call void @foo(i8* [[PTR1]])
 ; CHECK-NEXT: ret void
@@ -73,19 +72,85 @@
   ret void
 }
 
+; Lifetime of %ptr2 does not end, because of size mismatch.
+define void @test_lifetime_not_end() {
+; CHECK-LABEL: @test_lifetime_not_end(
+; CHECK-NEXT: [[PTR1:%.*]] = alloca i8, align 1
+; CHECK-NEXT: [[PTR2:%.*]] = alloca i8, align 1
+; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 1, i8* [[PTR2]])
+; CHECK-NEXT: call void @foo(i8* [[PTR2]])
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[PTR1]], i8* [[PTR2]], i32 1, i1 false)
+; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 0, i8* [[PTR2]])
+; CHECK-NEXT: call void @foo(i8* [[PTR1]])
+; CHECK-NEXT: ret void
+;
+  %ptr1 = alloca i8
+  %ptr2 = alloca i8
+  call void @llvm.lifetime.start.p0i8(i64 1, i8* %ptr2)
+  call void @foo(i8* %ptr2)
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %ptr1, i8* %ptr2, i32 1, i1 false)
+  call void @llvm.lifetime.end.p0i8(i64 0, i8* %ptr2)
+  call void @foo(i8* %ptr1)
+  ret void
+}
+
 ; Lifetime of %ptr2 ends before any potential use of the capture because we
 ; return from the function.
 define void @test_function_end() {
 ; CHECK-LABEL: @test_function_end(
 ; CHECK-NEXT: [[PTR1:%.*]] = alloca i8, align 1
 ; CHECK-NEXT: [[PTR2:%.*]] = alloca i8, align 1
+; CHECK-NEXT: call void @foo(i8* [[PTR1]])
+; CHECK-NEXT: ret void
+;
+  %ptr1 = alloca i8
+  %ptr2 = alloca i8
+  call void @foo(i8* %ptr2)
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %ptr1, i8* %ptr2, i32 1, i1 false)
+  ret void
+}
+
+; A potential use of the capture occurs in a later block, can't be optimized.
+define void @test_terminator() {
+; CHECK-LABEL: @test_terminator(
+; CHECK-NEXT: [[PTR1:%.*]] = alloca i8, align 1
+; CHECK-NEXT: [[PTR2:%.*]] = alloca i8, align 1
+; CHECK-NEXT: call void @foo(i8* [[PTR2]])
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[PTR1]], i8* [[PTR2]], i32 1, i1 false)
+; CHECK-NEXT: br label [[NEXT:%.*]]
+; CHECK: next:
+; CHECK-NEXT: call void @foo(i8* [[PTR1]])
+; CHECK-NEXT: ret void
+;
+  %ptr1 = alloca i8
+  %ptr2 = alloca i8
+  call void @foo(i8* %ptr2)
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %ptr1, i8* %ptr2, i32 1, i1 false)
+  br label %next
+
+next:
+  call void @foo(i8* %ptr1)
+  ret void
+}
+
+; This case can be optimized, but would require a scan across multiple blocks
+; and is currently not performed.
+define void @test_terminator2() {
+; CHECK-LABEL: @test_terminator2(
+; CHECK-NEXT: [[PTR1:%.*]] = alloca i8, align 1
+; CHECK-NEXT: [[PTR2:%.*]] = alloca i8, align 1
 ; CHECK-NEXT: call void @foo(i8* [[PTR2]])
 ; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[PTR1]], i8* [[PTR2]], i32 1, i1 false)
+; CHECK-NEXT: br label [[NEXT:%.*]]
+; CHECK: next:
 ; CHECK-NEXT: ret void
 ;
   %ptr1 = alloca i8
   %ptr2 = alloca i8
   call void @foo(i8* %ptr2)
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %ptr1, i8* %ptr2, i32 1, i1 false)
+  br label %next
+
+next:
   ret void
 }