Index: lib/Transforms/IPO/PassManagerBuilder.cpp
===================================================================
--- lib/Transforms/IPO/PassManagerBuilder.cpp
+++ lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -20,7 +20,9 @@
 #include "llvm/Analysis/Verifier.h"
 #include "llvm/PassManager.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/Scalar.h"
@@ -114,6 +116,7 @@
 }
 
 void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
+  DEBUG(dbgs() << "top of PassManagerBuilder::populateModulePassManager\n");
   // If all optimizations are disabled, just run the always-inline pass.
   if (OptLevel == 0) {
     if (Inliner) {
@@ -201,6 +204,9 @@
   // Run instcombine after redundancy elimination to exploit opportunities
   // opened up by them.
   MPM.add(createInstructionCombiningPass());
+  // Run MemCpyOpt after instcombine, as instcombine calls SimplifyLibCalls,
+  // which can create llvm.mem* intrinsics that MemCpyOpt matches on.
+  MPM.add(createMemCpyOptPass());
   MPM.add(createJumpThreadingPass());         // Thread jumps
   MPM.add(createCorrelatedValuePropagationPass());
   MPM.add(createDeadStoreEliminationPass());  // Delete dead stores
Index: lib/Transforms/Scalar/MemCpyOptimizer.cpp
===================================================================
--- lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -335,6 +335,7 @@
                               uint64_t cpyLen, unsigned cpyAlign, CallInst *C);
     bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep,
                                        uint64_t MSize);
+    bool processMemSetMemCpyDependence(MemCpyInst *M, MemSetInst *MDep);
     bool processByValArgument(CallSite CS, unsigned ArgNo);
     Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr,
                                       Value *ByteVal);
@@ -800,6 +801,74 @@
   return true;
 }
 
+/// \brief We've found that the (upward scanning) memory dependence of
+/// \p MemCpy is \p MemSet.  Try to simplify \p MemSet to only set the bytes
+/// of its destination that aren't overwritten by \p MemCpy.
+///
+/// In other words, transform:
+/// \code
+/// memset(dest, setVal, dest_size);
+/// memcpy(dest, src, src_size);
+/// \endcode
+/// into:
+/// \code
+/// memcpy(dest, src, src_size);
+/// memset((char *)dest + src_size, setVal,
+///        dest_size > src_size ? dest_size - src_size : 0);
+/// \endcode
+bool MemCpyOpt::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
+                                              MemSetInst *MemSet) {
+  // We can only transform memset/memcpy with the same destination.
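+  // (Checking pointer equality, rather than querying alias analysis, keeps
+  // this conservative: if the two destinations are distinct values that
+  // merely alias, we simply bail out.)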
+  if (MemSet->getDest() != MemCpy->getDest() || MemSet->isVolatile())
+    return false;
+
+  // Just blat out:
+  // call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 %src_size,
+  //                                      i32 1, i1 false)
+  // %newDest = getelementptr i8* %dest, i32 %src_size
+  // %destSizeGT = icmp ugt i32 %dest_size, %src_size
+  // %lenDiff = sub i32 %dest_size, %src_size
+  // %newLen = select i1 %destSizeGT, i32 %lenDiff, i32 0
+  // call void @llvm.memset.p0i8.i32(i8* %newDest, i8 %setVal,
+  //                                 i32 %newLen, i32 1, i1 false)
+
+  // First, get all of the values needed: dest, dest_size, src, src_size and
+  // setVal.
+  Value *Dest = MemSet->getOperand(0);
+  Value *DestSize = MemSet->getOperand(2);
+  Value *Src = MemCpy->getOperand(1);
+  Value *SrcSize = MemCpy->getOperand(2);
+  Value *SetVal = MemSet->getOperand(1);
+  unsigned Align = MemCpy->getAlignment();
+  bool IsVolatile = MemCpy->isVolatile();
+  IRBuilder<> Builder(MemCpy);
+
+  Builder.CreateMemCpy(Dest, Src, SrcSize, Align, IsVolatile);
+
+  Value *NewDest = Builder.CreateGEP(Dest, SrcSize);
+  NewDest->setName("NewDest");
+  Value *DestSizeGT = Builder.CreateICmpUGT(DestSize, SrcSize);
+  DestSizeGT->setName("DestSizeGT");
+  Value *LenDiff = Builder.CreateSub(DestSize, SrcSize);
+  LenDiff->setName("LenDiff");
+  Constant *Zero = ConstantInt::getNullValue(LenDiff->getType());
+  Value *NewLen = Builder.CreateSelect(DestSizeGT, LenDiff, Zero);
+  NewLen->setName("NewLen");
+  CallInst *NewMemSet =
+      Builder.CreateMemSet(NewDest, SetVal, NewLen, Align, IsVolatile);
+
+  MD->removeInstruction(MemCpy);
+  MemCpy->eraseFromParent();
+  MD->removeInstruction(MemSet);
+  MemSet->eraseFromParent();
+  DEBUG(NewMemSet->getParent()->dump());
+
+  return true;
+}
 
 /// processMemCpy - perform simplification of memcpy's. If we have memcpy A
 /// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite
@@ -807,6 +876,14 @@
 /// circumstances). This allows later passes to remove the first memcpy
 /// altogether.
 bool MemCpyOpt::processMemCpy(MemCpyInst *M) {
+  AliasAnalysis::Location SrcLoc = AliasAnalysis::getLocationForSource(M);
+  MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(SrcLoc, true,
+                                                         M, M->getParent());
+  if (SrcDepInfo.isClobber())
+    if (MemSetInst *MDep = dyn_cast<MemSetInst>(SrcDepInfo.getInst()))
+      if (processMemSetMemCpyDependence(M, MDep))
+        return true;
+
   // We can only optimize statically-sized memcpy's that are non-volatile.
   ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
   if (CopySize == 0 || M->isVolatile()) return false;
@@ -831,7 +908,7 @@
     return true;
   }
 
-  // The are two possible optimizations we can do for memcpy:
+  // There are two possible optimizations we can do for memcpy:
   // a) memcpy-memcpy xform which exposes redundance for DSE.
   // b) call-memcpy xform for return slot optimization.
   MemDepResult DepInfo = MD->getDependency(M);
@@ -847,9 +924,6 @@
     }
   }
 
-  AliasAnalysis::Location SrcLoc = AliasAnalysis::getLocationForSource(M);
-  MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(SrcLoc, true,
-                                                         M, M->getParent());
   if (SrcDepInfo.isClobber()) {
     if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst()))
       return processMemCpyMemCpyDependence(M, MDep, CopySize->getZExtValue());
@@ -998,6 +1072,7 @@
 // function.
 //
 bool MemCpyOpt::runOnFunction(Function &F) {
+  DEBUG(dbgs() << "top of MemCpyOpt::runOnFunction\n");
   bool MadeChange = false;
   MD = &getAnalysis<MemoryDependenceAnalysis>();
   TD = getAnalysisIfAvailable<DataLayout>();
Index: test/Transforms/MemCpyOpt/memSetMemCpyArgRewrite.ll
===================================================================
--- test/Transforms/MemCpyOpt/memSetMemCpyArgRewrite.ll
+++ test/Transforms/MemCpyOpt/memSetMemCpyArgRewrite.ll
@@ -0,0 +1,16 @@
+; RUN: opt -memcpyopt %s -S | FileCheck %s
+define void @foo(i8* %src, i32 %src_size, i8* %dest, i32 %dest_size, i8 %setVal) {
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 %src_size, i32 1, i1 false)
+; CHECK: %NewDest = getelementptr i8* %dest, i32 %src_size
+; CHECK: %DestSizeGT = icmp ugt i32 %dest_size, %src_size
+; CHECK: %LenDiff = sub i32 %dest_size, %src_size
+; CHECK: %NewLen = select i1 %DestSizeGT, i32 %LenDiff, i32 0
+; CHECK: call void @llvm.memset.p0i8.i32(i8* %NewDest, i8 %setVal, i32 %NewLen, i32 1, i1 false)
+
+  call void @llvm.memset.p0i8.i32(i8* %dest, i8 %setVal, i32 %dest_size, i32 1, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 %src_size, i32 1, i1 false)
+  ret void
+}
+
+declare void @llvm.memset.p0i8.i32(i8*, i8, i32, i32, i1)
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8*, i8*, i32, i32, i1)
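
For reference, the source-level pattern this combine is aimed at looks roughly like the sketch below. This is illustrative only and not part of the patch; the function and parameter names are made up.

    #include <cstring>

    // A buffer is cleared first, then its head is overwritten; this typically
    // ends up as an llvm.memset followed by an llvm.memcpy on the same
    // destination.
    void clear_then_copy(char *dest, const char *src,
                         std::size_t src_size, std::size_t dest_size) {
      std::memset(dest, 0, dest_size);   // -> llvm.memset(dest, 0, dest_size)
      std::memcpy(dest, src, src_size);  // -> llvm.memcpy(dest, src, src_size)
    }

    // After the rewrite the IR still performs the same copy, but only memsets
    // the tail that the copy did not overwrite, conceptually:
    //   memcpy(dest, src, src_size);
    //   memset(dest + src_size, 0,
    //          dest_size > src_size ? dest_size - src_size : 0);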