Index: llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -797,23 +797,29 @@
   // a number of cases to consider:
   // 1. cpyDest cannot be accessed between C and cpyStore as a precondition of
   //    the transform.
-  // 2. C itself may not access cpyDest (prior to the transform). This is
-  //    checked further below.
-  // 3. If cpyDest is accessible to the caller of this function (potentially
-  //    captured and not based on an alloca), we need to ensure that we cannot
-  //    unwind between C and cpyStore. This is checked here.
+  // 2. C itself must not access cpyDest (prior to the transform).
+  // 3. If cpyDest is potentially accessible to the caller of this function
+  //    (captured or an argument), we need to ensure that we cannot unwind
+  //    between C and cpyStore.
   // 4. If cpyDest is potentially captured, there may be accesses to it from
   //    another thread. In this case, we need to check that cpyStore is
   //    guaranteed to be executed if C is. As it is a non-atomic access, it
   //    renders accesses from other threads undefined.
-  //    TODO: This is currently not checked.
-  // TODO: Check underlying object, so we can look through GEPs.
-  if (!isa<AllocaInst>(cpyDest)) {
-    assert(C->getParent() == cpyStore->getParent() &&
-           "call and copy must be in the same block");
+
+  MemoryLocation destLoc(cpyDest, srcSize);
+  bool callCaptures = isModOrRefSet(AA->callCapturesBefore(C, destLoc, DT));
+
+  // Ensure dest is not accessed by the call.
+  if (callCaptures && isModOrRefSet(AA->getModRefInfo(C, destLoc)))
+    return false;
+
+  // Explicitly check for arguments, as callCapturesBefore() considers
+  // noalias arguments to be non-capturing.
+  if (callCaptures || isa<Argument>(getUnderlyingObject(cpyDest))) {
+    // Ensure cpyStore is executed, rendering accesses from other threads UB.
     for (const Instruction &I : make_range(C->getIterator(),
                                            cpyStore->getIterator())) {
-      if (I.mayThrow())
+      if (!isGuaranteedToTransferExecutionToSuccessor(&I))
         return false;
     }
   }
@@ -869,17 +875,6 @@
   if (!DT->dominates(cpyDestInst, C))
     return false;

-  // In addition to knowing that the call does not access src in some
-  // unexpected manner, for example via a global, which we deduce from
-  // the use analysis, we also need to know that it does not sneakily
-  // access dest. We rely on AA to figure this out for us.
-  ModRefInfo MR = AA->getModRefInfo(C, cpyDest, LocationSize::precise(srcSize));
-  // If necessary, perform additional analysis.
-  if (isModOrRefSet(MR))
-    MR = AA->callCapturesBefore(C, cpyDest, LocationSize::precise(srcSize), DT);
-  if (isModOrRefSet(MR))
-    return false;
-
   // We can't create address space casts here because we don't know if they're
   // safe for the target.
   if (cpySrc->getType()->getPointerAddressSpace() !=
Index: llvm/test/Transforms/MemCpyOpt/callslot.ll
===================================================================
--- llvm/test/Transforms/MemCpyOpt/callslot.ll
+++ llvm/test/Transforms/MemCpyOpt/callslot.ll
@@ -130,8 +130,9 @@
 ; CHECK-NEXT:    [[SRC:%.*]] = alloca [8 x i8], align 1
 ; CHECK-NEXT:    [[SRC_I8:%.*]] = bitcast [8 x i8]* [[SRC]] to i8*
 ; CHECK-NEXT:    [[DEST_I8:%.*]] = getelementptr [16 x i8], [16 x i8]* [[DEST]], i64 0, i64 8
-; CHECK-NEXT:    call void @accept_ptr(i8* [[SRC_I8]])
-; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DEST_I8]], i8* [[SRC_I8]], i64 8, i1 false)
+; CHECK-NEXT:    [[DEST_I81:%.*]] = bitcast i8* [[DEST_I8]] to [8 x i8]*
+; CHECK-NEXT:    [[DEST_I812:%.*]] = bitcast [8 x i8]* [[DEST_I81]] to i8*
+; CHECK-NEXT:    call void @accept_ptr(i8* [[DEST_I812]])
 ; CHECK-NEXT:    ret void
 ;
   %dest = alloca [16 x i8]
@@ -169,8 +170,8 @@
 ; CHECK-NEXT:    [[DEST_I8:%.*]] = bitcast [16 x i8]* [[DEST]] to i8*
 ; CHECK-NEXT:    [[SRC_I8:%.*]] = bitcast [16 x i8]* [[SRC]] to i8*
 ; CHECK-NEXT:    call void @accept_ptr(i8* [[DEST_I8]])
-; CHECK-NEXT:    [[DEST1:%.*]] = bitcast [16 x i8]* [[DEST]] to i8*
-; CHECK-NEXT:    call void @accept_ptr(i8* [[DEST1]]) [[ATTR4:#.*]]
+; CHECK-NEXT:    call void @accept_ptr(i8* [[SRC_I8]]) [[ATTR4:#.*]]
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DEST_I8]], i8* [[SRC_I8]], i64 16, i1 false)
 ; CHECK-NEXT:    ret void
 ;
   %dest = alloca [16 x i8]
Index: llvm/test/Transforms/MemCpyOpt/loadstore-sret.ll
===================================================================
--- llvm/test/Transforms/MemCpyOpt/loadstore-sret.ll
+++ llvm/test/Transforms/MemCpyOpt/loadstore-sret.ll
@@ -26,4 +26,4 @@
   ret void
 }

-declare void @_Z3barv(%"class.std::auto_ptr"* nocapture sret) nounwind
+declare void @_Z3barv(%"class.std::auto_ptr"* nocapture sret) nounwind willreturn
Index: llvm/test/Transforms/MemCpyOpt/sret.ll
===================================================================
--- llvm/test/Transforms/MemCpyOpt/sret.ll
+++ llvm/test/Transforms/MemCpyOpt/sret.ll
@@ -44,6 +44,6 @@
   ret void
 }

-declare void @ccoshl(%0* noalias nocapture sret, %0* byval) nounwind
+declare void @ccoshl(%0* noalias nocapture sret, %0* byval) nounwind willreturn

 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i1) nounwind