diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1050,10 +1050,12 @@
 
   // Second, the length of the memcpy's must be the same, or the preceding one
   // must be larger than the following one.
-  ConstantInt *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
-  ConstantInt *MLen = dyn_cast<ConstantInt>(M->getLength());
-  if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue())
-    return false;
+  if (MDep->getLength() != M->getLength()) {
+    ConstantInt *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
+    ConstantInt *MLen = dyn_cast<ConstantInt>(M->getLength());
+    if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue())
+      return false;
+  }
 
   // Verify that the copied-from memory doesn't change in between the two
   // transfers.  For example, in:
@@ -1229,21 +1231,23 @@
 
 /// Determine whether the instruction has undefined content for the given Size,
 /// either because it was freshly alloca'd or started its lifetime.
-static bool hasUndefContents(Instruction *I, ConstantInt *Size) {
+static bool hasUndefContents(Instruction *I, Value *Size) {
   if (isa<AllocaInst>(I))
     return true;
 
-  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
-    if (II->getIntrinsicID() == Intrinsic::lifetime_start)
-      if (ConstantInt *LTSize = dyn_cast<ConstantInt>(II->getArgOperand(0)))
-        if (LTSize->getZExtValue() >= Size->getZExtValue())
-          return true;
+  if (ConstantInt *CSize = dyn_cast<ConstantInt>(Size)) {
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+      if (II->getIntrinsicID() == Intrinsic::lifetime_start)
+        if (ConstantInt *LTSize = dyn_cast<ConstantInt>(II->getArgOperand(0)))
+          if (LTSize->getZExtValue() >= CSize->getZExtValue())
+            return true;
+  }
 
   return false;
 }
 
 static bool hasUndefContentsMSSA(MemorySSA *MSSA, AliasAnalysis *AA, Value *V,
-                                 MemoryDef *Def, ConstantInt *Size) {
+                                 MemoryDef *Def, Value *Size) {
   if (MSSA->isLiveOnEntryDef(Def))
     return isa<AllocaInst>(getUnderlyingObject(V));
 
@@ -1251,14 +1255,17 @@
           dyn_cast_or_null<IntrinsicInst>(Def->getMemoryInst())) {
     if (II->getIntrinsicID() == Intrinsic::lifetime_start) {
       ConstantInt *LTSize = cast<ConstantInt>(II->getArgOperand(0));
 
-      if (AA->isMustAlias(V, II->getArgOperand(1)) &&
-          LTSize->getZExtValue() >= Size->getZExtValue())
-        return true;
-      // If the lifetime.start covers a whole alloca (as it almost always does)
-      // and we're querying a pointer based on that alloca, then we know the
-      // memory is definitely undef, regardless of how exactly we alias. The
-      // size also doesn't matter, as an out-of-bounds access would be UB.
+      if (ConstantInt *CSize = dyn_cast<ConstantInt>(Size)) {
+        if (AA->isMustAlias(V, II->getArgOperand(1)) &&
+            LTSize->getZExtValue() >= CSize->getZExtValue())
+          return true;
+      }
+
+      // If the lifetime.start covers a whole alloca (as it almost always
+      // does) and we're querying a pointer based on that alloca, then we know
+      // the memory is definitely undef, regardless of how exactly we alias.
+      // The size also doesn't matter, as an out-of-bounds access would be UB.
       AllocaInst *Alloca = dyn_cast<AllocaInst>(getUnderlyingObject(V));
       if (getUnderlyingObject(II->getArgOperand(1)) == Alloca) {
         DataLayout DL = Alloca->getModule()->getDataLayout();
@@ -1284,8 +1291,6 @@
 ///     memset(dst2, c, dst2_size);
 ///   \endcode
 /// When dst2_size <= dst1_size.
-///
-/// The \p MemCpy must have a Constant length.
 bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
                                                MemSetInst *MemSet) {
   // Make sure that memcpy(..., memset(...), ...), that is we are memsetting and
@@ -1293,38 +1298,47 @@
   if (!AA->isMustAlias(MemSet->getRawDest(), MemCpy->getRawSource()))
     return false;
 
-  // A known memset size is required.
-  ConstantInt *MemSetSize = dyn_cast<ConstantInt>(MemSet->getLength());
-  if (!MemSetSize)
-    return false;
+  Value *MemSetSize = MemSet->getLength();
+  Value *CopySize = MemCpy->getLength();
 
-  // Make sure the memcpy doesn't read any more than what the memset wrote.
-  // Don't worry about sizes larger than i64.
-  ConstantInt *CopySize = cast<ConstantInt>(MemCpy->getLength());
-  if (CopySize->getZExtValue() > MemSetSize->getZExtValue()) {
-    // If the memcpy is larger than the memset, but the memory was undef prior
-    // to the memset, we can just ignore the tail. Technically we're only
-    // interested in the bytes from MemSetSize..CopySize here, but as we can't
-    // easily represent this location, we use the full 0..CopySize range.
-    MemoryLocation MemCpyLoc = MemoryLocation::getForSource(MemCpy);
-    bool CanReduceSize = false;
-    if (EnableMemorySSA) {
-      MemoryUseOrDef *MemSetAccess = MSSA->getMemoryAccess(MemSet);
-      MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
-          MemSetAccess->getDefiningAccess(), MemCpyLoc);
-      if (auto *MD = dyn_cast<MemoryDef>(Clobber))
-        if (hasUndefContentsMSSA(MSSA, AA, MemCpy->getSource(), MD, CopySize))
-          CanReduceSize = true;
-    } else {
-      MemDepResult DepInfo = MD->getPointerDependencyFrom(
-          MemCpyLoc, true, MemSet->getIterator(), MemSet->getParent());
-      if (DepInfo.isDef() && hasUndefContents(DepInfo.getInst(), CopySize))
-        CanReduceSize = true;
-    }
+  if (MemSetSize != CopySize) {
+    // Make sure the memcpy doesn't read any more than what the memset wrote.
+    // Don't worry about sizes larger than i64.
+
+    // A known memset size is required.
+    ConstantInt *CMemSetSize = dyn_cast<ConstantInt>(MemSetSize);
+    if (!CMemSetSize)
+      return false;
 
-    if (!CanReduceSize)
+    // A known memcpy size is also required.
+    ConstantInt *CCopySize = dyn_cast<ConstantInt>(CopySize);
+    if (!CCopySize)
       return false;
-    CopySize = MemSetSize;
+    if (CCopySize->getZExtValue() > CMemSetSize->getZExtValue()) {
+      // If the memcpy is larger than the memset, but the memory was undef prior
+      // to the memset, we can just ignore the tail. Technically we're only
+      // interested in the bytes from MemSetSize..CopySize here, but as we can't
+      // easily represent this location, we use the full 0..CopySize range.
+      MemoryLocation MemCpyLoc = MemoryLocation::getForSource(MemCpy);
+      bool CanReduceSize = false;
+      if (EnableMemorySSA) {
+        MemoryUseOrDef *MemSetAccess = MSSA->getMemoryAccess(MemSet);
+        MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
+            MemSetAccess->getDefiningAccess(), MemCpyLoc);
+        if (auto *MD = dyn_cast<MemoryDef>(Clobber))
+          if (hasUndefContentsMSSA(MSSA, AA, MemCpy->getSource(), MD, CopySize))
+            CanReduceSize = true;
+      } else {
+        MemDepResult DepInfo = MD->getPointerDependencyFrom(
+            MemCpyLoc, true, MemSet->getIterator(), MemSet->getParent());
+        if (DepInfo.isDef() && hasUndefContents(DepInfo.getInst(), CopySize))
+          CanReduceSize = true;
+      }
+
+      if (!CanReduceSize)
+        return false;
+      CopySize = MemSetSize;
+    }
   }
 
   IRBuilder<> Builder(MemCpy);
@@ -1396,10 +1410,6 @@
       if (processMemSetMemCpyDependence(M, MDep))
         return true;
 
-    // The optimizations after this point require the memcpy size.
-    ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
-    if (!CopySize) return false;
-
     MemoryAccess *SrcClobber = MSSA->getWalker()->getClobberingMemoryAccess(
         AnyClobber, MemoryLocation::getForSource(M));
 
@@ -1412,26 +1422,29 @@
   //   d) memcpy from a just-memset'd source can be turned into memset.
   if (auto *MD = dyn_cast<MemoryDef>(SrcClobber)) {
     if (Instruction *MI = MD->getMemoryInst()) {
-      if (auto *C = dyn_cast<CallInst>(MI)) {
-        // The memcpy must post-dom the call. Limit to the same block for now.
-        // Additionally, we need to ensure that there are no accesses to dest
-        // between the call and the memcpy. Accesses to src will be checked
-        // by performCallSlotOptzn().
-        // TODO: Support non-local call-slot optimization?
-        if (C->getParent() == M->getParent() &&
-            !accessedBetween(*AA, DestLoc, MD, MA)) {
-          // FIXME: Can we pass in either of dest/src alignment here instead
-          // of conservatively taking the minimum?
-          Align Alignment = std::min(M->getDestAlign().valueOrOne(),
-                                     M->getSourceAlign().valueOrOne());
-          if (performCallSlotOptzn(M, M, M->getDest(), M->getSource(),
-                                   CopySize->getZExtValue(), Alignment, C)) {
-            LLVM_DEBUG(dbgs() << "Performed call slot optimization:\n"
-                              << "    call: " << *C << "\n"
-                              << "    memcpy: " << *M << "\n");
-            eraseInstruction(M);
-            ++NumMemCpyInstr;
-            return true;
+      if (ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength())) {
+        if (auto *C = dyn_cast<CallInst>(MI)) {
+          // The memcpy must post-dom the call. Limit to the same block for
+          // now. Additionally, we need to ensure that there are no accesses
+          // to dest between the call and the memcpy. Accesses to src will be
+          // checked by performCallSlotOptzn().
+          // TODO: Support non-local call-slot optimization?
+          if (C->getParent() == M->getParent() &&
+              !accessedBetween(*AA, DestLoc, MD, MA)) {
+            // FIXME: Can we pass in either of dest/src alignment here instead
+            // of conservatively taking the minimum?
+            Align Alignment = std::min(M->getDestAlign().valueOrOne(),
+                                       M->getSourceAlign().valueOrOne());
+            if (performCallSlotOptzn(M, M, M->getDest(), M->getSource(),
+                                     CopySize->getZExtValue(), Alignment,
+                                     C)) {
+              LLVM_DEBUG(dbgs() << "Performed call slot optimization:\n"
+                                << "    call: " << *C << "\n"
+                                << "    memcpy: " << *M << "\n");
+              eraseInstruction(M);
+              ++NumMemCpyInstr;
+              return true;
+            }
           }
         }
       }
@@ -1447,7 +1460,7 @@
       }
     }
 
-    if (hasUndefContentsMSSA(MSSA, AA, M->getSource(), MD, CopySize)) {
+    if (hasUndefContentsMSSA(MSSA, AA, M->getSource(), MD, M->getLength())) {
       LLVM_DEBUG(dbgs() << "Removed memcpy from undef\n");
       eraseInstruction(M);
       ++NumMemCpyInstr;
@@ -1464,10 +1477,6 @@
     if (processMemSetMemCpyDependence(M, MDep))
       return true;
 
-  // The optimizations after this point require the memcpy size.
-  ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
-  if (!CopySize) return false;
-
   // There are four possible optimizations we can do for memcpy:
   //   a) memcpy-memcpy xform which exposes redundance for DSE.
   //   b) call-memcpy xform for return slot optimization.
@@ -1475,17 +1484,19 @@
   //      its lifetime copies undefined data, and we can therefore eliminate
   //      the memcpy in favor of the data that was already at the destination.
   //   d) memcpy from a just-memset'd source can be turned into memset.
-  if (DepInfo.isClobber()) {
-    if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) {
-      // FIXME: Can we pass in either of dest/src alignment here instead
-      // of conservatively taking the minimum?
-      Align Alignment = std::min(M->getDestAlign().valueOrOne(),
-                                 M->getSourceAlign().valueOrOne());
-      if (performCallSlotOptzn(M, M, M->getDest(), M->getSource(),
-                               CopySize->getZExtValue(), Alignment, C)) {
-        eraseInstruction(M);
-        ++NumMemCpyInstr;
-        return true;
+  if (ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength())) {
+    if (DepInfo.isClobber()) {
+      if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) {
+        // FIXME: Can we pass in either of dest/src alignment here instead
+        // of conservatively taking the minimum?
+        Align Alignment = std::min(M->getDestAlign().valueOrOne(),
+                                   M->getSourceAlign().valueOrOne());
+        if (performCallSlotOptzn(M, M, M->getDest(), M->getSource(),
+                                 CopySize->getZExtValue(), Alignment, C)) {
+          eraseInstruction(M);
+          ++NumMemCpyInstr;
+          return true;
+        }
       }
     }
   }
@@ -1498,7 +1509,7 @@
     if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst()))
       return processMemCpyMemCpyDependence(M, MDep);
   } else if (SrcDepInfo.isDef()) {
-    if (hasUndefContents(SrcDepInfo.getInst(), CopySize)) {
+    if (hasUndefContents(SrcDepInfo.getInst(), M->getLength())) {
       eraseInstruction(M);
       ++NumMemCpyInstr;
       return true;
diff --git a/llvm/test/Transforms/MemCpyOpt/variable-sized-memcpy-memcpy.ll b/llvm/test/Transforms/MemCpyOpt/variable-sized-memcpy-memcpy.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/MemCpyOpt/variable-sized-memcpy-memcpy.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=0 | FileCheck %s
+; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @test(i8* %src, i64 %size) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca i8, i64 [[SIZE:%.*]], align 1
+; CHECK-NEXT:    [[DST:%.*]] = alloca i8, i64 [[SIZE]], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP]], i8* align 8 [[SRC:%.*]], i64 [[SIZE]], i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[DST]], i8* align 8 [[SRC]], i64 [[SIZE]], i1 false)
+; CHECK-NEXT:    ret void
+;
+  %tmp = alloca i8, i64 %size
+  %dst = alloca i8, i64 %size
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %tmp, i8* align 8 %src, i64 %size, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %dst, i8* align 8 %tmp, i64 %size, i1 false)
+
+  ret void
+}
+
+; Differing sizes, so the memcpy is left as it is.
+define void @negative_test(i8* %src, i64 %size1, i64 %size2) {
+; CHECK-LABEL: @negative_test(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca i8, i64 [[SIZE1:%.*]], align 1
+; CHECK-NEXT:    [[DST:%.*]] = alloca i8, i64 [[SIZE2:%.*]], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP]], i8* align 8 [[SRC:%.*]], i64 [[SIZE1]], i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[DST]], i8* align 8 [[TMP]], i64 [[SIZE2]], i1 false)
+; CHECK-NEXT:    ret void
+;
+  %tmp = alloca i8, i64 %size1
+  %dst = alloca i8, i64 %size2
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %tmp, i8* align 8 %src, i64 %size1, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %dst, i8* align 8 %tmp, i64 %size2, i1 false)
+
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i1)
diff --git a/llvm/test/Transforms/MemCpyOpt/variable-sized-memcpy-uninit.ll b/llvm/test/Transforms/MemCpyOpt/variable-sized-memcpy-uninit.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/MemCpyOpt/variable-sized-memcpy-uninit.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=0 | FileCheck %s
+; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @test(i64 %size) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:    [[SRC:%.*]] = alloca i8, i64 [[SIZE:%.*]], align 1
+; CHECK-NEXT:    [[DST:%.*]] = alloca i8, i64 [[SIZE]], align 1
+; CHECK-NEXT:    ret void
+;
+  %src = alloca i8, i64 %size
+  %dst = alloca i8, i64 %size
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %dst, i8* align 8 %src, i64 %size, i1 false)
+
+  ret void
+}
+
+define void @test2(i64 %size1, i64 %size2, i64 %cpy_size) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    [[SRC:%.*]] = alloca i8, i64 [[SIZE1:%.*]], align 1
+; CHECK-NEXT:    [[DST:%.*]] = alloca i8, i64 [[SIZE2:%.*]], align 1
+; CHECK-NEXT:    ret void
+;
+  %src = alloca i8, i64 %size1
+  %dst = alloca i8, i64 %size2
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %dst, i8* align 8 %src, i64 %cpy_size, i1 false)
+
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i1)
diff --git a/llvm/test/Transforms/MemCpyOpt/variable-sized-memset-memcpy.ll b/llvm/test/Transforms/MemCpyOpt/variable-sized-memset-memcpy.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/MemCpyOpt/variable-sized-memset-memcpy.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=0 | FileCheck %s
+; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @test(i8* %src, i8 %c, i64 %size) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:    [[DST1:%.*]] = alloca i8, i64 [[SIZE:%.*]], align 1
+; CHECK-NEXT:    [[DST2:%.*]] = alloca i8, i64 [[SIZE]], align 1
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[DST1]], i8 [[C:%.*]], i64 [[SIZE]], i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[DST2]], i8 [[C]], i64 [[SIZE]], i1 false)
+; CHECK-NEXT:    ret void
+;
+  %dst1 = alloca i8, i64 %size
+  %dst2 = alloca i8, i64 %size
+  call void @llvm.memset.p0i8.i64(i8* align 8 %dst1, i8 %c, i64 %size, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %dst2, i8* align 8 %dst1, i64 %size, i1 false)
+
+  ret void
+}
+
+; Differing sizes, so the memcpy is left as it is.
+define void @negative_test(i8* %src, i8 %c, i64 %size1, i64 %size2) {
+; CHECK-LABEL: @negative_test(
+; CHECK-NEXT:    [[DST1:%.*]] = alloca i8, i64 [[SIZE1:%.*]], align 1
+; CHECK-NEXT:    [[DST2:%.*]] = alloca i8, i64 [[SIZE2:%.*]], align 1
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[DST1]], i8 [[C:%.*]], i64 [[SIZE1]], i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[DST2]], i8* align 8 [[DST1]], i64 [[SIZE2]], i1 false)
+; CHECK-NEXT:    ret void
+;
+  %dst1 = alloca i8, i64 %size1
+  %dst2 = alloca i8, i64 %size2
+  call void @llvm.memset.p0i8.i64(i8* align 8 %dst1, i8 %c, i64 %size1, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %dst2, i8* align 8 %dst1, i64 %size2, i1 false)
+
+  ret void
+}
+
+declare void @llvm.memset.p0i8.i64(i8*, i8, i64, i1)
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i1)
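
Note on the intended behavior: the patch drops the constant-length requirement from these transforms whenever the two length operands are the same SSA value. As a minimal before/after sketch of the memcpy-memcpy case (the value names here are illustrative, mirroring @test in variable-sized-memcpy-memcpy.ll above):

  ; before: %dst is copied from the intermediate buffer %tmp
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp, i8* %src, i64 %size, i1 false)
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %tmp, i64 %size, i1 false)

  ; after: the second copy reads directly from %src, leaving %tmp dead
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp, i8* %src, i64 %size, i1 false)
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %size, i1 false)

The memset-memcpy and undef-source cases are relaxed the same way. When the lengths are distinct SSA values, the transforms still fall back to requiring constants, as the @negative_test functions check.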