Index: lib/Transforms/Scalar/MemCpyOptimizer.cpp
===================================================================
--- lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -832,46 +832,31 @@
   if (cpyLen < srcSize)
     return false;
 
+  // The store to dest may never happen if the call can throw.
+  if (C->mayThrow() && !isa(cpyDest))
+    return false;
+
   // Check that accessing the first srcSize bytes of dest will not cause a
   // trap. Otherwise the transform is invalid since it might cause a trap
   // to occur earlier than it otherwise would.
-  if (AllocaInst *A = dyn_cast(cpyDest)) {
-    // The destination is an alloca. Check it is larger than srcSize.
-    ConstantInt *destArraySize = dyn_cast(A->getArraySize());
-    if (!destArraySize)
-      return false;
-
-    uint64_t destSize = DL.getTypeAllocSize(A->getAllocatedType()) *
-                        destArraySize->getZExtValue();
-
-    if (destSize < srcSize)
-      return false;
-  } else if (Argument *A = dyn_cast(cpyDest)) {
-    // The store to dest may never happen if the call can throw.
-    if (C->mayThrow())
-      return false;
-
-    if (A->getDereferenceableBytes() < srcSize) {
-      // If the destination is an sret parameter then only accesses that are
-      // outside of the returned struct type can trap.
-      if (!A->hasStructRetAttr())
-        return false;
-
-      Type *StructTy = cast(A->getType())->getElementType();
-      if (!StructTy->isSized()) {
-        // The call may never return and hence the copy-instruction may never
-        // be executed, and therefore it's not safe to say "the destination
-        // has at least bytes, as implied by the copy-instruction",
-        return false;
-      }
-
-      uint64_t destSize = DL.getTypeAllocSize(StructTy);
-      if (destSize < srcSize)
-        return false;
+  bool CanBeNull = true; // Conservative: bail out unless a query clears it.
+  uint64_t DestSize = 0;
+
+  if (const auto GEP = dyn_cast(cpyDest)) {
+    // Only a constant, non-negative offset may be deducted from the base size.
+    APInt APOffset(DL.getPointerSizeInBits(GEP->getPointerAddressSpace()), 0);
+    if (GEP->accumulateConstantOffset(DL, APOffset) && !APOffset.isNegative()) {
+      const uint64_t Offset = APOffset.getZExtValue();
+      auto InnerSize = GEP->getPointerOperand()->getPointerDereferenceableBytes(
+          DL, CanBeNull);
+      if (Offset <= InnerSize)
+        DestSize = InnerSize - Offset;
     }
-  } else {
+  } else
+    DestSize = cpyDest->getPointerDereferenceableBytes(DL, CanBeNull);
+
+  if (CanBeNull || DestSize < srcSize)
     return false;
-  }
 
   // Check that dest points to memory that is at least as aligned as src.
   unsigned srcAlign = srcAlloca->getAlignment();
@@ -923,9 +908,21 @@
   // Since we're changing the parameter to the callsite, we need to make sure
   // that what would be the new parameter dominates the callsite.
   DominatorTree &DT = LookupDomTree();
-  if (Instruction *cpyDestInst = dyn_cast(cpyDest))
-    if (!DT.dominates(cpyDestInst, C))
-      return false;
+  if (Instruction *cpyDestInst = dyn_cast(cpyDest)) {
+    if (!DT.dominates(cpyDestInst, C)) {
+      // Try to move a GEP up to allow the optimization to happen
+      if (auto G = dyn_cast(cpyDest)) {
+        auto P = G->getPointerOperand();
+        auto PI = dyn_cast(P);
+        if (isa(P) || (PI && DT.dominates(PI, C)))
+          G->moveBefore(C);
+        else
+          return false;
+      }
+      else
+        return false;
+    }
+  }
 
   // In addition to knowing that the call does not access src in some
   // unexpected manner, for example via a global, which we deduce from
Index: test/Transforms/MemCpyOpt/callslot_gep.ll
===================================================================
--- /dev/null
+++ test/Transforms/MemCpyOpt/callslot_gep.ll
@@ -0,0 +1,20 @@
+; RUN: opt -S -memcpyopt < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+define void @foo(i8* noalias dereferenceable(201) %d) #0 {
+; CHECK-LABEL: @foo(
+; CHECK-NOT: call void @llvm.memcpy.p0i8.p0i8.i64
+start:
+  %s = alloca i8, i64 200
+  call void @writer(i8* nocapture %s)
+  %0 = getelementptr inbounds i8, i8* %d, i32 1
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %s, i64 200, i32 1, i1 false)
+  ret void
+}
+
+declare void @writer(i8* nocapture) #0
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { argmemonly nounwind }