Index: llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -22,7 +22,7 @@
 // TODO List:
 //
 // Future loop memory idioms to recognize:
-//   memcmp, memmove, strlen, etc.
+//   memcmp, strlen, etc.
 // Future floating point idioms to recognize in -ffast-math mode:
 //   fpowi
 // Future integer operation idioms to recognize:
@@ -44,6 +44,7 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
@@ -70,6 +71,7 @@
 STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
 STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
+STATISTIC(NumMemMove, "Number of memmove's formed from loop load+stores");
 
 static cl::opt<bool> UseLIRCodeSizeHeurs(
     "use-lir-code-size-heurs",
@@ -131,6 +133,7 @@
                                SmallPtrSetImpl<Instruction *> &Stores,
                                const SCEVAddRecExpr *Ev, const SCEV *BECount,
                                bool NegStride, bool IsLoopMemset = false);
+  bool coverLoop(Loop *L, SmallVectorImpl<Instruction *> &Insts) const;
   bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
   bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
                                  bool IsLoopMemset = false);
@@ -229,7 +232,7 @@
   // Disable loop idiom recognition if the function's name is a common idiom.
   StringRef Name = L->getHeader()->getParent()->getName();
-  if (Name == "memset" || Name == "memcpy")
+  if (Name == "memset" || Name == "memcpy" || Name == "memmove")
     return false;
 
   // Determine if code size heuristics need to be applied.
@@ -849,6 +852,58 @@
   return true;
 }
 
+// \brief Check if the instructions in Insts, together with their dependencies,
+// cover the loop in the sense that the loop could be safely eliminated once
+// the instructions in Insts are removed.
+bool LoopIdiomRecognize::coverLoop(Loop *L,
+                                   SmallVectorImpl<Instruction *> &Insts) const {
+  SmallSet<BasicBlock *, 8> LoopBlocks;
+  for (auto *B : L->blocks())
+    LoopBlocks.insert(B);
+
+  SetVector<Instruction *> Worklist(Insts.begin(), Insts.end());
+
+  // Collect all instructions from the loop that the instructions in Insts
+  // depend on (plus their dependencies, etc.).  These instructions will
+  // constitute the expression trees that feed those in Insts, but the trees
+  // will be limited only to instructions contained in the loop.
+  for (unsigned i = 0; i < Worklist.size(); ++i) {
+    Instruction *In = Worklist[i];
+    for (auto I = In->op_begin(), E = In->op_end(); I != E; ++I) {
+      Instruction *OpI = dyn_cast<Instruction>(I);
+      if (!OpI)
+        continue;
+      BasicBlock *PB = OpI->getParent();
+      if (!LoopBlocks.count(PB))
+        continue;
+      Worklist.insert(OpI);
+    }
+  }
+
+  // Scan all instructions in the loop.  If any of them has a user outside
+  // of the loop, or outside of the expressions collected above, then either
+  // the loop has a side-effect visible outside of it, or there are
+  // instructions in it that are not involved in the original set Insts.
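+  // For example, with Insts = {load, store} for a loop like
+  //   for (i = 0; i < n; i++) { a[i] = b[i]; s += b[i]; }
+  // the sum feeding s is live after the loop, so the loop is not covered
+  // and cannot be removed.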
+  for (auto *B : L->blocks()) {
+    for (auto &In : *B) {
+      if (isa<BranchInst>(In) || isa<DbgInfoIntrinsic>(In))
+        continue;
+      if (!Worklist.count(&In) && In.mayHaveSideEffects())
+        return false;
+      for (const auto &K : In.users()) {
+        Instruction *UseI = dyn_cast<Instruction>(K);
+        if (!UseI)
+          continue;
+        BasicBlock *UseB = UseI->getParent();
+        if (LI->getLoopFor(UseB) != L)
+          return false;
+      }
+    }
+  }
+
+  return true;
+}
+
 /// If the stored value is a strided load in the same loop with the same stride
 /// this may be transformable into a memcpy.  This kicks in for stuff like
 ///   for (i) A[i] = B[i];
@@ -857,26 +912,26 @@
   assert(SI->isSimple() && "Expected only non-volatile stores.");
 
   Value *StorePtr = SI->getPointerOperand();
-  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
+  auto *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
   APInt Stride = getStoreStride(StoreEv);
   unsigned StoreSize = getStoreSizeInBytes(SI, DL);
   bool NegStride = StoreSize == -Stride;
 
   // The store must be feeding a non-volatile load.
-  LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
-  assert(LI->isSimple() && "Expected only non-volatile stores.");
+  LoadInst *LdI = cast<LoadInst>(SI->getValueOperand());
+  assert(LdI->isSimple() && "Expected only non-volatile loads.");
 
   // See if the pointer expression is an AddRec like {base,+,1} on the current
   // loop, which indicates a strided load.  If we have something else, it's a
   // random load we can't handle.
-  const SCEVAddRecExpr *LoadEv =
-      cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
+  auto *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LdI->getPointerOperand()));
 
   // The trip count of the loop and the base pointer of the addrec SCEV is
   // guaranteed to be loop invariant, which means that it should dominate the
   // header.  This allows us to insert code for it in the preheader.
   BasicBlock *Preheader = CurLoop->getLoopPreheader();
-  IRBuilder<> Builder(Preheader->getTerminator());
+  Instruction *ExpPt = Preheader->getTerminator();
+  IRBuilder<> Builder(ExpPt);
   SCEVExpander Expander(*SE, *DL, "loop-idiom");
 
   const SCEV *StrStart = StoreEv->getStart();
@@ -893,21 +948,48 @@
   // or write the memory region we're storing to.  This includes the load that
   // feeds the stores.  Check for an alias by generating the base address and
   // checking everything.
-  Value *StoreBasePtr = Expander.expandCodeFor(
-      StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
+  Value *StoreBasePtr =
+      Expander.expandCodeFor(StrStart, Builder.getInt8PtrTy(StrAS), ExpPt);
+  Value *LoadBasePtr = nullptr;
 
-  SmallPtrSet<Instruction *, 1> Stores;
-  Stores.insert(SI);
+  bool IsMemcpySafe = true;
+  SmallPtrSet<Instruction *, 2> Ignore1;
+  Ignore1.insert(SI);
   if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount,
-                            StoreSize, *AA, Stores)) {
-    Expander.clear();
-    // If we generated new code for the base pointer, clean up.
-    RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
-    return false;
+                            StoreSize, *AA, Ignore1)) {
+    // Check if the load is the offending instruction.
+    Ignore1.insert(LdI);
+    if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount,
+                              StoreSize, *AA, Ignore1)) {
+    CleanupAndExit:
+      Expander.clear();
+      // If we generated new code for the base pointer, clean up.
+      if (StoreBasePtr && (LoadBasePtr != StoreBasePtr)) {
+        RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
+        StoreBasePtr = nullptr;
+      }
+      if (LoadBasePtr) {
+        RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI);
+        LoadBasePtr = nullptr;
+      }
+      return false;
+    }
+    // It worked with the load ignored.
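+    // That is, the only instruction aliasing the stored region is the load
+    // itself, so a memmove, which allows the source and destination to
+    // overlap, is still a candidate.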
+    IsMemcpySafe = false;
+
+    // In the case of a memmove, the call to memmove will be executed instead
+    // of the loop, so we need to make sure that there is nothing else in
+    // the loop than the load, the store, and the instructions that these
+    // two depend on.
+    SmallVector<Instruction *, 2> Insts;
+    Insts.push_back(SI);
+    Insts.push_back(LdI);
+    if (!coverLoop(CurLoop, Insts))
+      goto CleanupAndExit;
   }
 
   const SCEV *LdStart = LoadEv->getStart();
-  unsigned LdAS = LI->getPointerAddressSpace();
+  unsigned LdAS = LdI->getPointerAddressSpace();
 
   // Handle negative strided loops.
   if (NegStride)
@@ -915,26 +997,36 @@
   // For a memcpy, we have to make sure that the input array is not being
   // mutated by the loop.
-  Value *LoadBasePtr = Expander.expandCodeFor(
+  LoadBasePtr = Expander.expandCodeFor(
       LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
 
+  SmallPtrSet<Instruction *, 2> Ignore2;
+  Ignore2.insert(SI);
   if (mayLoopAccessLocation(LoadBasePtr, MRI_Mod, CurLoop, BECount, StoreSize,
-                            *AA, Stores)) {
-    Expander.clear();
-    // If we generated new code for the base pointer, clean up.
-    RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI);
-    RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
-    return false;
-  }
+                            *AA, Ignore2))
+    goto CleanupAndExit;
 
   if (avoidLIRForMultiBlockLoop())
-    return false;
+    goto CleanupAndExit;
 
   // Okay, everything is safe, we can transform this!
 
+  BasicBlock *ExitB = nullptr;
+  if (!IsMemcpySafe) {
+    // The runtime check needs a single exit block.
+    SmallVector<BasicBlock *, 8> ExitBlocks;
+    CurLoop->getUniqueExitBlocks(ExitBlocks);
+    if (ExitBlocks.size() != 1)
+      goto CleanupAndExit;
+    ExitB = ExitBlocks[0];
+  }
+
   // The # stored bytes is (BECount+1)*Size.  Expand the trip count out to
   // pointer size if it isn't already.
+  LLVMContext &Ctx = SI->getContext();
   BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy);
+  unsigned Align = std::min(SI->getAlignment(), LdI->getAlignment());
 
   const SCEV *NumBytesS =
       SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW);
@@ -942,22 +1034,122 @@
   NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize),
                              SCEV::FlagNUW);
 
-  Value *NumBytes =
-      Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator());
+  Value *NumBytes = Expander.expandCodeFor(NumBytesS, IntPtrTy, ExpPt);
+  if (Instruction *In = dyn_cast<Instruction>(NumBytes))
+    if (Value *Simp = SimplifyInstruction(In, *DL, TLI, DT))
+      NumBytes = Simp;
+
+  CallInst *NewCall;
+
+  if (IsMemcpySafe) {
+    NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, Align);
+    ++NumMemCpy;
+  } else {
+    unsigned Threshold = 0 /* RuntimeMemSizeThreshold */;
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes)) {
+      uint64_t C = CI->getZExtValue();
+      if (Threshold != 0 && C < Threshold)
+        goto CleanupAndExit;
+      /* if (C < CompileTimeMemSizeThreshold)
+           goto CleanupAndExit; */
+    }
+
+    BasicBlock *Header = CurLoop->getHeader();
+    Function *Func = Header->getParent();
+    Loop *ParentL = LI->getLoopFor(Preheader);
+    StringRef HeaderName = Header->getName();
+
+    // Create a new (empty) preheader, and update the PHI nodes in the
+    // header to use the new preheader.
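+    // The resulting control flow has the shape (assuming the single exit
+    // block checked for above):
+    //
+    //   preheader.old:   ...runtime check...
+    //                    br i1 %Cond, label %header.rtli, label %header.rtli.ph
+    //   header.rtli:     call memmove; br label %exit
+    //   header.rtli.ph:  br label %header   ; original loop runs when the
+    //                                       ; runtime check fails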
+    BasicBlock *NewPreheader = BasicBlock::Create(Ctx, HeaderName + ".rtli.ph",
+                                                  Func, Header);
+    if (ParentL)
+      ParentL->addBasicBlockToLoop(NewPreheader, *LI);
+    IRBuilder<>(NewPreheader).CreateBr(Header);
+    for (auto &In : *Header) {
+      PHINode *PN = dyn_cast<PHINode>(&In);
+      if (!PN)
+        break;
+      int bx = PN->getBasicBlockIndex(Preheader);
+      if (bx >= 0)
+        PN->setIncomingBlock(bx, NewPreheader);
+    }
+    DT->addNewBlock(NewPreheader, Preheader);
+    DT->changeImmediateDominator(Header, NewPreheader);
+
+    // Check for safe conditions to execute memmove.
+    // If stride is positive, copying things from higher to lower addresses
+    // is equivalent to memmove.  For negative stride, it's the other way
+    // around.  Copying forward in memory with positive stride may not be
+    // the same as memmove, since we may be copying values that we just
+    // stored in some previous iteration.
+    Value *LA = Builder.CreatePtrToInt(LoadBasePtr, IntPtrTy);
+    Value *SA = Builder.CreatePtrToInt(StoreBasePtr, IntPtrTy);
+    Value *LowA = NegStride ? LA : SA;
+    Value *HighA = NegStride ? SA : LA;
+    Value *CmpA = Builder.CreateICmpULT(LowA, HighA);
+    Value *Cond = CmpA;
+
+    // Check the distance between the pointers.  The case LowA < HighA is
+    // handled by CmpA, so assume LowA >= HighA here: the regions cannot
+    // overlap if the gap is at least the number of bytes copied.
+    Value *Dist = Builder.CreateSub(LowA, HighA);
+    Value *CmpD = Builder.CreateICmpSLE(NumBytes, Dist);
+    Value *CmpEither = Builder.CreateOr(Cond, CmpD);
+    Cond = CmpEither;
+
+    if (Threshold != 0) {
+      Type *Ty = NumBytes->getType();
+      Value *Thr = ConstantInt::get(Ty, Threshold);
+      Value *CmpB = Builder.CreateICmpULT(Thr, NumBytes);
+      Value *CmpBoth = Builder.CreateAnd(Cond, CmpB);
+      Cond = CmpBoth;
+    }
+    BasicBlock *MemmoveB = BasicBlock::Create(Ctx, Header->getName() + ".rtli",
+                                              Func, NewPreheader);
+    if (ParentL)
+      ParentL->addBasicBlockToLoop(MemmoveB, *LI);
+    Instruction *OldT = Preheader->getTerminator();
+    Builder.CreateCondBr(Cond, MemmoveB, NewPreheader);
+    OldT->eraseFromParent();
+    Preheader->setName(Preheader->getName() + ".old");
+    DT->addNewBlock(MemmoveB, Preheader);
+    // Find the new immediate dominator of the exit block.
+    BasicBlock *ExitD = Preheader;
+    for (auto PI = pred_begin(ExitB), PE = pred_end(ExitB); PI != PE; ++PI) {
+      BasicBlock *PB = *PI;
+      ExitD = DT->findNearestCommonDominator(ExitD, PB);
+      if (!ExitD)
+        break;
+    }
+    // If the prior immediate dominator of ExitB was dominated by the
+    // old preheader, then the old preheader becomes the new immediate
+    // dominator.  Otherwise don't change anything (because the newly
+    // added blocks are dominated by the old preheader).
+    if (ExitD && DT->dominates(Preheader, ExitD)) {
+      DomTreeNode *BN = DT->getNode(ExitB);
+      DomTreeNode *DN = DT->getNode(ExitD);
+      BN->setIDom(DN);
+    }
+
+    // Add a call to memmove to the conditional block.
+    IRBuilder<> CondBuilder(MemmoveB);
+    CondBuilder.CreateBr(ExitB);
+    CondBuilder.SetInsertPoint(MemmoveB->getTerminator());
+
+    NewCall = CondBuilder.CreateMemMove(StoreBasePtr, LoadBasePtr,
+                                        NumBytes, Align);
+    ++NumMemMove;
+  }
 
-  CallInst *NewCall =
-      Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes,
-                           std::min(SI->getAlignment(), LI->getAlignment()));
   NewCall->setDebugLoc(SI->getDebugLoc());
 
-  DEBUG(dbgs() << "  Formed memcpy: " << *NewCall << "\n"
-               << "    from load ptr=" << *LoadEv << " at: " << *LI << "\n"
+  DEBUG(dbgs() << "  Formed " << (IsMemcpySafe ? "memcpy: " : "memmove: ")
+               << *NewCall << "\n"
+               << "    from load ptr=" << *LoadEv << " at: " << *LdI << "\n"
               << "    from store ptr=" << *StoreEv << " at: " << *SI << "\n");
 
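+  // Runtime-check semantics for the memmove case, with positive stride:
+  //   for (i) a[i] = a[i+d], d > 0: the store base is below the load base,
+  //     so Cond is true and the memmove block runs instead of the loop;
+  //   for (i) a[i+d] = a[i], d > 0: each iteration re-reads values stored
+  //     d iterations earlier (a "smear" that memmove would not reproduce),
+  //     so Cond is false and the original loop executes.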
-  // Okay, the memcpy has been formed.  Zap the original store and anything that
-  // feeds into it.
-  deleteDeadInstruction(SI);
-  ++NumMemCpy;
+  // In the memcpy case the loop is now dead: zap the original store and
+  // anything that feeds into it.  In the memmove case the loop must remain
+  // intact, since it is the fallback executed when the runtime check fails.
+  if (IsMemcpySafe)
+    deleteDeadInstruction(SI);
   return true;
 }
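Conceptually, the patch versions the copying loop (a sketch of the intent, not
actual pass output; names are illustrative):

  // before
  for (int i = 0; i < n; i++)
    a[i] = b[i];

  // after, when the load may alias the stored region
  if (/* safe direction */ || /* regions disjoint */)
    memmove(a, b, n * sizeof(int));
  else
    for (int i = 0; i < n; i++)  // fallback: the original loop
      a[i] = b[i];

When alias analysis already proves the regions disjoint, the loop is still
replaced by an unconditional memcpy, as before.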