Index: llvm/include/llvm/InitializePasses.h =================================================================== --- llvm/include/llvm/InitializePasses.h +++ llvm/include/llvm/InitializePasses.h @@ -197,6 +197,7 @@ void initializeLoopDistributeLegacyPass(PassRegistry&); void initializeLoopExtractorPass(PassRegistry&); void initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry&); +void initializeLoopVersioningIdiomRecognizeLegacyPassPass(PassRegistry&); void initializeLoopInfoWrapperPassPass(PassRegistry&); void initializeLoopInstSimplifyLegacyPassPass(PassRegistry&); void initializeLoopInterchangePass(PassRegistry &); Index: llvm/include/llvm/LinkAllPasses.h =================================================================== --- llvm/include/llvm/LinkAllPasses.h +++ llvm/include/llvm/LinkAllPasses.h @@ -125,6 +125,7 @@ (void) llvm::createLoopUnswitchPass(); (void) llvm::createLoopVersioningLICMPass(); (void) llvm::createLoopIdiomPass(); + (void) llvm::createLoopVersioningIdiomPass(); (void) llvm::createLoopRotatePass(); (void) llvm::createLowerExpectIntrinsicPass(); (void) llvm::createLowerInvokePass(); Index: llvm/include/llvm/Transforms/Scalar.h =================================================================== --- llvm/include/llvm/Transforms/Scalar.h +++ llvm/include/llvm/Transforms/Scalar.h @@ -207,6 +207,13 @@ //===----------------------------------------------------------------------===// // +// LoopVersioningIdiom - This pass recognizes and replaces idioms in loops with the +// help of loop versioning for runtime information. +// +Pass *createLoopVersioningIdiomPass(); + +//===----------------------------------------------------------------------===// +// // LoopVersioningLICM - This pass is a loop versioning pass for LICM. // Pass *createLoopVersioningLICMPass(); Index: llvm/include/llvm/Transforms/Scalar/LoopIdiomRecognize.h =================================================================== --- llvm/include/llvm/Transforms/Scalar/LoopIdiomRecognize.h +++ llvm/include/llvm/Transforms/Scalar/LoopIdiomRecognize.h @@ -28,6 +28,14 @@ PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U); }; + +/// Performs Loop Idiom Recognize Pass with Loop Versioning. +class LoopVersioningIdiomRecognizePass + : public PassInfoMixin { +public: + PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &U); +}; } // end namespace llvm #endif // LLVM_TRANSFORMS_SCALAR_LOOPIDIOMRECOGNIZE_H Index: llvm/lib/Passes/PassBuilder.cpp =================================================================== --- llvm/lib/Passes/PassBuilder.cpp +++ llvm/lib/Passes/PassBuilder.cpp @@ -577,6 +577,11 @@ // llvm.loop.distribute=true or when -enable-loop-distribute is specified. OptimizePM.addPass(LoopDistributePass()); + // Recognize loop patterns and use loop versioning to compute at run-time + // correctness and cost of the transform. + OptimizePM.addPass( + createFunctionToLoopPassAdaptor(LoopVersioningIdiomRecognizePass())); + // Now run the core loop vectorizer. OptimizePM.addPass(LoopVectorizePass()); Index: llvm/lib/Passes/PassRegistry.def =================================================================== --- llvm/lib/Passes/PassRegistry.def +++ llvm/lib/Passes/PassRegistry.def @@ -218,6 +218,7 @@ LOOP_PASS("invalidate", InvalidateAllAnalysesPass()) LOOP_PASS("licm", LICMPass()) LOOP_PASS("loop-idiom", LoopIdiomRecognizePass()) +LOOP_PASS("loop-versioning-idiom", LoopVersioningIdiomRecognizePass()) LOOP_PASS("loop-instsimplify", LoopInstSimplifyPass()) LOOP_PASS("rotate", LoopRotatePass()) LOOP_PASS("no-op-loop", NoOpLoopPass()) Index: llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp =================================================================== --- llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp +++ llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp @@ -33,35 +33,6 @@ using namespace llvm; -static cl::opt DisableMemcpyIdiom("disable-memcpy-idiom", - cl::Hidden, cl::init(false), - cl::desc("Disable generation of memcpy in loop idiom recognition")); - -static cl::opt DisableMemmoveIdiom("disable-memmove-idiom", - cl::Hidden, cl::init(false), - cl::desc("Disable generation of memmove in loop idiom recognition")); - -static cl::opt RuntimeMemSizeThreshold("runtime-mem-idiom-threshold", - cl::Hidden, cl::init(0), cl::desc("Threshold (in bytes) for the runtime " - "check guarding the memmove.")); - -static cl::opt CompileTimeMemSizeThreshold( - "compile-time-mem-idiom-threshold", cl::Hidden, cl::init(64), - cl::desc("Threshold (in bytes) to perform the transformation, if the " - "runtime loop count (mem transfer size) is known at compile-time.")); - -static cl::opt OnlyNonNestedMemmove("only-nonnested-memmove-idiom", - cl::Hidden, cl::init(true), - cl::desc("Only enable generating memmove in non-nested loops")); - -cl::opt HexagonVolatileMemcpy("disable-hexagon-volatile-memcpy", - cl::Hidden, cl::init(false), - cl::desc("Enable Hexagon-specific memcpy for volatile destination.")); - -static const char *HexagonVolatileMemcpyName - = "hexagon_memcpy_forward_vp4cp4n2"; - - namespace llvm { void initializeHexagonLoopIdiomRecognizePass(PassRegistry&); Pass *createHexagonLoopIdiomPass(); @@ -93,16 +64,6 @@ bool runOnLoop(Loop *L, LPPassManager &LPM) override; private: - unsigned getStoreSizeInBytes(StoreInst *SI); - int getSCEVStride(const SCEVAddRecExpr *StoreEv); - bool isLegalStore(Loop *CurLoop, StoreInst *SI); - void collectStores(Loop *CurLoop, BasicBlock *BB, - SmallVectorImpl &Stores); - bool processCopyingStore(Loop *CurLoop, StoreInst *SI, const SCEV *BECount); - bool coverLoop(Loop *L, SmallVectorImpl &Insts) const; - bool runOnLoopBlock(Loop *CurLoop, BasicBlock *BB, const SCEV *BECount, - SmallVectorImpl &ExitBlocks); - bool runOnCountableLoop(Loop *L); AliasAnalysis *AA; const DataLayout *DL; @@ -110,7 +71,6 @@ LoopInfo *LF; const TargetLibraryInfo *TLI; ScalarEvolution *SE; - bool HasMemcpy, HasMemmove; }; } @@ -1063,519 +1023,6 @@ return true; } - -unsigned HexagonLoopIdiomRecognize::getStoreSizeInBytes(StoreInst *SI) { - uint64_t SizeInBits = DL->getTypeSizeInBits(SI->getValueOperand()->getType()); - assert(((SizeInBits & 7) || (SizeInBits >> 32) == 0) && - "Don't overflow unsigned."); - return (unsigned)SizeInBits >> 3; -} - - -int HexagonLoopIdiomRecognize::getSCEVStride(const SCEVAddRecExpr *S) { - if (const SCEVConstant *SC = dyn_cast(S->getOperand(1))) - return SC->getAPInt().getSExtValue(); - return 0; -} - - -bool HexagonLoopIdiomRecognize::isLegalStore(Loop *CurLoop, StoreInst *SI) { - // Allow volatile stores if HexagonVolatileMemcpy is enabled. - if (!(SI->isVolatile() && HexagonVolatileMemcpy) && !SI->isSimple()) - return false; - - Value *StoredVal = SI->getValueOperand(); - Value *StorePtr = SI->getPointerOperand(); - - // Reject stores that are so large that they overflow an unsigned. - uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType()); - if ((SizeInBits & 7) || (SizeInBits >> 32) != 0) - return false; - - // See if the pointer expression is an AddRec like {base,+,1} on the current - // loop, which indicates a strided store. If we have something else, it's a - // random store we can't handle. - auto *StoreEv = dyn_cast(SE->getSCEV(StorePtr)); - if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine()) - return false; - - // Check to see if the stride matches the size of the store. If so, then we - // know that every byte is touched in the loop. - int Stride = getSCEVStride(StoreEv); - if (Stride == 0) - return false; - unsigned StoreSize = getStoreSizeInBytes(SI); - if (StoreSize != unsigned(std::abs(Stride))) - return false; - - // The store must be feeding a non-volatile load. - LoadInst *LI = dyn_cast(SI->getValueOperand()); - if (!LI || !LI->isSimple()) - return false; - - // See if the pointer expression is an AddRec like {base,+,1} on the current - // loop, which indicates a strided load. If we have something else, it's a - // random load we can't handle. - Value *LoadPtr = LI->getPointerOperand(); - auto *LoadEv = dyn_cast(SE->getSCEV(LoadPtr)); - if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine()) - return false; - - // The store and load must share the same stride. - if (StoreEv->getOperand(1) != LoadEv->getOperand(1)) - return false; - - // Success. This store can be converted into a memcpy. - return true; -} - - -/// mayLoopAccessLocation - Return true if the specified loop might access the -/// specified pointer location, which is a loop-strided access. The 'Access' -/// argument specifies what the verboten forms of access are (read or write). -static bool -mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, - const SCEV *BECount, unsigned StoreSize, - AliasAnalysis &AA, - SmallPtrSetImpl &Ignored) { - // Get the location that may be stored across the loop. Since the access - // is strided positively through memory, we say that the modified location - // starts at the pointer and has infinite size. - uint64_t AccessSize = MemoryLocation::UnknownSize; - - // If the loop iterates a fixed number of times, we can refine the access - // size to be exactly the size of the memset, which is (BECount+1)*StoreSize - if (const SCEVConstant *BECst = dyn_cast(BECount)) - AccessSize = (BECst->getValue()->getZExtValue() + 1) * StoreSize; - - // TODO: For this to be really effective, we have to dive into the pointer - // operand in the store. Store to &A[i] of 100 will always return may alias - // with store of &A[100], we need to StoreLoc to be "A" with size of 100, - // which will then no-alias a store to &A[100]. - MemoryLocation StoreLoc(Ptr, AccessSize); - - for (auto *B : L->blocks()) - for (auto &I : *B) - if (Ignored.count(&I) == 0 && (AA.getModRefInfo(&I, StoreLoc) & Access)) - return true; - - return false; -} - - -void HexagonLoopIdiomRecognize::collectStores(Loop *CurLoop, BasicBlock *BB, - SmallVectorImpl &Stores) { - Stores.clear(); - for (Instruction &I : *BB) - if (StoreInst *SI = dyn_cast(&I)) - if (isLegalStore(CurLoop, SI)) - Stores.push_back(SI); -} - - -bool HexagonLoopIdiomRecognize::processCopyingStore(Loop *CurLoop, - StoreInst *SI, const SCEV *BECount) { - assert((SI->isSimple() || (SI->isVolatile() && HexagonVolatileMemcpy)) && - "Expected only non-volatile stores, or Hexagon-specific memcpy" - "to volatile destination."); - - Value *StorePtr = SI->getPointerOperand(); - auto *StoreEv = cast(SE->getSCEV(StorePtr)); - unsigned Stride = getSCEVStride(StoreEv); - unsigned StoreSize = getStoreSizeInBytes(SI); - if (Stride != StoreSize) - return false; - - // See if the pointer expression is an AddRec like {base,+,1} on the current - // loop, which indicates a strided load. If we have something else, it's a - // random load we can't handle. - LoadInst *LI = dyn_cast(SI->getValueOperand()); - auto *LoadEv = cast(SE->getSCEV(LI->getPointerOperand())); - - // The trip count of the loop and the base pointer of the addrec SCEV is - // guaranteed to be loop invariant, which means that it should dominate the - // header. This allows us to insert code for it in the preheader. - BasicBlock *Preheader = CurLoop->getLoopPreheader(); - Instruction *ExpPt = Preheader->getTerminator(); - IRBuilder<> Builder(ExpPt); - SCEVExpander Expander(*SE, *DL, "hexagon-loop-idiom"); - - Type *IntPtrTy = Builder.getIntPtrTy(*DL, SI->getPointerAddressSpace()); - - // Okay, we have a strided store "p[i]" of a loaded value. We can turn - // this into a memcpy/memmove in the loop preheader now if we want. However, - // this would be unsafe to do if there is anything else in the loop that may - // read or write the memory region we're storing to. For memcpy, this - // includes the load that feeds the stores. Check for an alias by generating - // the base address and checking everything. - Value *StoreBasePtr = Expander.expandCodeFor(StoreEv->getStart(), - Builder.getInt8PtrTy(SI->getPointerAddressSpace()), ExpPt); - Value *LoadBasePtr = nullptr; - - bool Overlap = false; - bool DestVolatile = SI->isVolatile(); - Type *BECountTy = BECount->getType(); - - if (DestVolatile) { - // The trip count must fit in i32, since it is the type of the "num_words" - // argument to hexagon_memcpy_forward_vp4cp4n2. - if (StoreSize != 4 || DL->getTypeSizeInBits(BECountTy) > 32) { -CleanupAndExit: - // If we generated new code for the base pointer, clean up. - Expander.clear(); - if (StoreBasePtr && (LoadBasePtr != StoreBasePtr)) { - RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI); - StoreBasePtr = nullptr; - } - if (LoadBasePtr) { - RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI); - LoadBasePtr = nullptr; - } - return false; - } - } - - SmallPtrSet Ignore1; - Ignore1.insert(SI); - if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount, - StoreSize, *AA, Ignore1)) { - // Check if the load is the offending instruction. - Ignore1.insert(LI); - if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount, - StoreSize, *AA, Ignore1)) { - // Still bad. Nothing we can do. - goto CleanupAndExit; - } - // It worked with the load ignored. - Overlap = true; - } - - if (!Overlap) { - if (DisableMemcpyIdiom || !HasMemcpy) - goto CleanupAndExit; - } else { - // Don't generate memmove if this function will be inlined. This is - // because the caller will undergo this transformation after inlining. - Function *Func = CurLoop->getHeader()->getParent(); - if (Func->hasFnAttribute(Attribute::AlwaysInline)) - goto CleanupAndExit; - - // In case of a memmove, the call to memmove will be executed instead - // of the loop, so we need to make sure that there is nothing else in - // the loop than the load, store and instructions that these two depend - // on. - SmallVector Insts; - Insts.push_back(SI); - Insts.push_back(LI); - if (!coverLoop(CurLoop, Insts)) - goto CleanupAndExit; - - if (DisableMemmoveIdiom || !HasMemmove) - goto CleanupAndExit; - bool IsNested = CurLoop->getParentLoop() != 0; - if (IsNested && OnlyNonNestedMemmove) - goto CleanupAndExit; - } - - // For a memcpy, we have to make sure that the input array is not being - // mutated by the loop. - LoadBasePtr = Expander.expandCodeFor(LoadEv->getStart(), - Builder.getInt8PtrTy(LI->getPointerAddressSpace()), ExpPt); - - SmallPtrSet Ignore2; - Ignore2.insert(SI); - if (mayLoopAccessLocation(LoadBasePtr, MRI_Mod, CurLoop, BECount, StoreSize, - *AA, Ignore2)) - goto CleanupAndExit; - - // Check the stride. - bool StridePos = getSCEVStride(LoadEv) >= 0; - - // Currently, the volatile memcpy only emulates traversing memory forward. - if (!StridePos && DestVolatile) - goto CleanupAndExit; - - bool RuntimeCheck = (Overlap || DestVolatile); - - BasicBlock *ExitB; - if (RuntimeCheck) { - // The runtime check needs a single exit block. - SmallVector ExitBlocks; - CurLoop->getUniqueExitBlocks(ExitBlocks); - if (ExitBlocks.size() != 1) - goto CleanupAndExit; - ExitB = ExitBlocks[0]; - } - - // The # stored bytes is (BECount+1)*Size. Expand the trip count out to - // pointer size if it isn't already. - LLVMContext &Ctx = SI->getContext(); - BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy); - unsigned Alignment = std::min(SI->getAlignment(), LI->getAlignment()); - DebugLoc DLoc = SI->getDebugLoc(); - - const SCEV *NumBytesS = - SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW); - if (StoreSize != 1) - NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize), - SCEV::FlagNUW); - Value *NumBytes = Expander.expandCodeFor(NumBytesS, IntPtrTy, ExpPt); - if (Instruction *In = dyn_cast(NumBytes)) - if (Value *Simp = SimplifyInstruction(In, *DL, TLI, DT)) - NumBytes = Simp; - - CallInst *NewCall; - - if (RuntimeCheck) { - unsigned Threshold = RuntimeMemSizeThreshold; - if (ConstantInt *CI = dyn_cast(NumBytes)) { - uint64_t C = CI->getZExtValue(); - if (Threshold != 0 && C < Threshold) - goto CleanupAndExit; - if (C < CompileTimeMemSizeThreshold) - goto CleanupAndExit; - } - - BasicBlock *Header = CurLoop->getHeader(); - Function *Func = Header->getParent(); - Loop *ParentL = LF->getLoopFor(Preheader); - StringRef HeaderName = Header->getName(); - - // Create a new (empty) preheader, and update the PHI nodes in the - // header to use the new preheader. - BasicBlock *NewPreheader = BasicBlock::Create(Ctx, HeaderName+".rtli.ph", - Func, Header); - if (ParentL) - ParentL->addBasicBlockToLoop(NewPreheader, *LF); - IRBuilder<>(NewPreheader).CreateBr(Header); - for (auto &In : *Header) { - PHINode *PN = dyn_cast(&In); - if (!PN) - break; - int bx = PN->getBasicBlockIndex(Preheader); - if (bx >= 0) - PN->setIncomingBlock(bx, NewPreheader); - } - DT->addNewBlock(NewPreheader, Preheader); - DT->changeImmediateDominator(Header, NewPreheader); - - // Check for safe conditions to execute memmove. - // If stride is positive, copying things from higher to lower addresses - // is equivalent to memmove. For negative stride, it's the other way - // around. Copying forward in memory with positive stride may not be - // same as memmove since we may be copying values that we just stored - // in some previous iteration. - Value *LA = Builder.CreatePtrToInt(LoadBasePtr, IntPtrTy); - Value *SA = Builder.CreatePtrToInt(StoreBasePtr, IntPtrTy); - Value *LowA = StridePos ? SA : LA; - Value *HighA = StridePos ? LA : SA; - Value *CmpA = Builder.CreateICmpULT(LowA, HighA); - Value *Cond = CmpA; - - // Check for distance between pointers. - Value *Dist = Builder.CreateSub(HighA, LowA); - Value *CmpD = Builder.CreateICmpSLT(NumBytes, Dist); - Value *CmpEither = Builder.CreateOr(Cond, CmpD); - Cond = CmpEither; - - if (Threshold != 0) { - Type *Ty = NumBytes->getType(); - Value *Thr = ConstantInt::get(Ty, Threshold); - Value *CmpB = Builder.CreateICmpULT(Thr, NumBytes); - Value *CmpBoth = Builder.CreateAnd(Cond, CmpB); - Cond = CmpBoth; - } - BasicBlock *MemmoveB = BasicBlock::Create(Ctx, Header->getName()+".rtli", - Func, NewPreheader); - if (ParentL) - ParentL->addBasicBlockToLoop(MemmoveB, *LF); - Instruction *OldT = Preheader->getTerminator(); - Builder.CreateCondBr(Cond, MemmoveB, NewPreheader); - OldT->eraseFromParent(); - Preheader->setName(Preheader->getName()+".old"); - DT->addNewBlock(MemmoveB, Preheader); - // Find the new immediate dominator of the exit block. - BasicBlock *ExitD = Preheader; - for (auto PI = pred_begin(ExitB), PE = pred_end(ExitB); PI != PE; ++PI) { - BasicBlock *PB = *PI; - ExitD = DT->findNearestCommonDominator(ExitD, PB); - if (!ExitD) - break; - } - // If the prior immediate dominator of ExitB was dominated by the - // old preheader, then the old preheader becomes the new immediate - // dominator. Otherwise don't change anything (because the newly - // added blocks are dominated by the old preheader). - if (ExitD && DT->dominates(Preheader, ExitD)) { - DomTreeNode *BN = DT->getNode(ExitB); - DomTreeNode *DN = DT->getNode(ExitD); - BN->setIDom(DN); - } - - // Add a call to memmove to the conditional block. - IRBuilder<> CondBuilder(MemmoveB); - CondBuilder.CreateBr(ExitB); - CondBuilder.SetInsertPoint(MemmoveB->getTerminator()); - - if (DestVolatile) { - Type *Int32Ty = Type::getInt32Ty(Ctx); - Type *Int32PtrTy = Type::getInt32PtrTy(Ctx); - Type *VoidTy = Type::getVoidTy(Ctx); - Module *M = Func->getParent(); - Constant *CF = M->getOrInsertFunction(HexagonVolatileMemcpyName, VoidTy, - Int32PtrTy, Int32PtrTy, Int32Ty, - nullptr); - Function *Fn = cast(CF); - Fn->setLinkage(Function::ExternalLinkage); - - const SCEV *OneS = SE->getConstant(Int32Ty, 1); - const SCEV *BECount32 = SE->getTruncateOrZeroExtend(BECount, Int32Ty); - const SCEV *NumWordsS = SE->getAddExpr(BECount32, OneS, SCEV::FlagNUW); - Value *NumWords = Expander.expandCodeFor(NumWordsS, Int32Ty, - MemmoveB->getTerminator()); - if (Instruction *In = dyn_cast(NumWords)) - if (Value *Simp = SimplifyInstruction(In, *DL, TLI, DT)) - NumWords = Simp; - - Value *Op0 = (StoreBasePtr->getType() == Int32PtrTy) - ? StoreBasePtr - : CondBuilder.CreateBitCast(StoreBasePtr, Int32PtrTy); - Value *Op1 = (LoadBasePtr->getType() == Int32PtrTy) - ? LoadBasePtr - : CondBuilder.CreateBitCast(LoadBasePtr, Int32PtrTy); - NewCall = CondBuilder.CreateCall(Fn, {Op0, Op1, NumWords}); - } else { - NewCall = CondBuilder.CreateMemMove(StoreBasePtr, LoadBasePtr, - NumBytes, Alignment); - } - } else { - NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, - NumBytes, Alignment); - // Okay, the memcpy has been formed. Zap the original store and - // anything that feeds into it. - RecursivelyDeleteTriviallyDeadInstructions(SI, TLI); - } - - NewCall->setDebugLoc(DLoc); - - DEBUG(dbgs() << " Formed " << (Overlap ? "memmove: " : "memcpy: ") - << *NewCall << "\n" - << " from load ptr=" << *LoadEv << " at: " << *LI << "\n" - << " from store ptr=" << *StoreEv << " at: " << *SI << "\n"); - - return true; -} - - -// \brief Check if the instructions in Insts, together with their dependencies -// cover the loop in the sense that the loop could be safely eliminated once -// the instructions in Insts are removed. -bool HexagonLoopIdiomRecognize::coverLoop(Loop *L, - SmallVectorImpl &Insts) const { - SmallSet LoopBlocks; - for (auto *B : L->blocks()) - LoopBlocks.insert(B); - - SetVector Worklist(Insts.begin(), Insts.end()); - - // Collect all instructions from the loop that the instructions in Insts - // depend on (plus their dependencies, etc.). These instructions will - // constitute the expression trees that feed those in Insts, but the trees - // will be limited only to instructions contained in the loop. - for (unsigned i = 0; i < Worklist.size(); ++i) { - Instruction *In = Worklist[i]; - for (auto I = In->op_begin(), E = In->op_end(); I != E; ++I) { - Instruction *OpI = dyn_cast(I); - if (!OpI) - continue; - BasicBlock *PB = OpI->getParent(); - if (!LoopBlocks.count(PB)) - continue; - Worklist.insert(OpI); - } - } - - // Scan all instructions in the loop, if any of them have a user outside - // of the loop, or outside of the expressions collected above, then either - // the loop has a side-effect visible outside of it, or there are - // instructions in it that are not involved in the original set Insts. - for (auto *B : L->blocks()) { - for (auto &In : *B) { - if (isa(In) || isa(In)) - continue; - if (!Worklist.count(&In) && In.mayHaveSideEffects()) - return false; - for (const auto &K : In.users()) { - Instruction *UseI = dyn_cast(K); - if (!UseI) - continue; - BasicBlock *UseB = UseI->getParent(); - if (LF->getLoopFor(UseB) != L) - return false; - } - } - } - - return true; -} - -/// runOnLoopBlock - Process the specified block, which lives in a counted loop -/// with the specified backedge count. This block is known to be in the current -/// loop and not in any subloops. -bool HexagonLoopIdiomRecognize::runOnLoopBlock(Loop *CurLoop, BasicBlock *BB, - const SCEV *BECount, SmallVectorImpl &ExitBlocks) { - // We can only promote stores in this block if they are unconditionally - // executed in the loop. For a block to be unconditionally executed, it has - // to dominate all the exit blocks of the loop. Verify this now. - auto DominatedByBB = [this,BB] (BasicBlock *EB) -> bool { - return DT->dominates(BB, EB); - }; - if (!std::all_of(ExitBlocks.begin(), ExitBlocks.end(), DominatedByBB)) - return false; - - bool MadeChange = false; - // Look for store instructions, which may be optimized to memset/memcpy. - SmallVector Stores; - collectStores(CurLoop, BB, Stores); - - // Optimize the store into a memcpy, if it feeds an similarly strided load. - for (auto &SI : Stores) - MadeChange |= processCopyingStore(CurLoop, SI, BECount); - - return MadeChange; -} - - -bool HexagonLoopIdiomRecognize::runOnCountableLoop(Loop *L) { - PolynomialMultiplyRecognize PMR(L, *DL, *DT, *TLI, *SE); - if (PMR.recognize()) - return true; - - if (!HasMemcpy && !HasMemmove) - return false; - - const SCEV *BECount = SE->getBackedgeTakenCount(L); - assert(!isa(BECount) && - "runOnCountableLoop() called on a loop without a predictable" - "backedge-taken count"); - - SmallVector ExitBlocks; - L->getUniqueExitBlocks(ExitBlocks); - - bool Changed = false; - - // Scan all the blocks in the loop that are not in subloops. - for (auto *BB : L->getBlocks()) { - // Ignore blocks in subloops. - if (LF->getLoopFor(BB) != L) - continue; - Changed |= runOnLoopBlock(L, BB, BECount, ExitBlocks); - } - - return Changed; -} - - bool HexagonLoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { const Module &M = *L->getHeader()->getParent()->getParent(); if (Triple(M.getTargetTriple()).getArch() != Triple::hexagon) @@ -1601,11 +1048,12 @@ TLI = &getAnalysis().getTLI(); SE = &getAnalysis().getSE(); - HasMemcpy = TLI->has(LibFunc_memcpy); - HasMemmove = TLI->has(LibFunc_memmove); + if (SE->hasLoopInvariantBackedgeTakenCount(L)) { + PolynomialMultiplyRecognize PMR(L, *DL, *DT, *TLI, *SE); + if (PMR.recognize()) + return true; + } - if (SE->hasLoopInvariantBackedgeTakenCount(L)) - return runOnCountableLoop(L); return false; } Index: llvm/lib/Transforms/IPO/PassManagerBuilder.cpp =================================================================== --- llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -562,6 +562,8 @@ // llvm.loop.distribute=true or when -enable-loop-distribute is specified. MPM.add(createLoopDistributePass()); + MPM.add(createLoopVersioningIdiomPass()); + MPM.add(createLoopVectorizePass(DisableUnrollLoops, LoopVectorize)); // Eliminate loads by forwarding stores from the previous iteration to loads Index: llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp =================================================================== --- llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -22,7 +22,7 @@ // TODO List: // // Future loop memory idioms to recognize: -// memcmp, memmove, strlen, etc. +// memcmp, strlen, etc. // Future floating point idioms to recognize in -ffast-math mode: // fpowi // Future integer operation idioms to recognize: @@ -44,7 +44,9 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" @@ -70,6 +72,7 @@ STATISTIC(NumMemSet, "Number of memset's formed from loop stores"); STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores"); +STATISTIC(NumMemMove, "Number of memmove's formed from loop load+stores"); static cl::opt UseLIRCodeSizeHeurs( "use-lir-code-size-heurs", @@ -77,6 +80,30 @@ "with -Os/-Oz"), cl::init(true), cl::Hidden); +static cl::opt + RuntimeMemSizeThreshold("runtime-mem-idiom-threshold", cl::Hidden, + cl::init(0), + cl::desc("Threshold (in bytes) for the runtime " + "check guarding the memmove.")); + +static cl::opt CompileTimeMemSizeThreshold( + "compile-time-mem-idiom-threshold", cl::Hidden, cl::init(64), + cl::desc( + "Threshold (in bytes) to perform the transformation, if the " + "runtime loop count (mem transfer size) is known at compile-time.")); + +static cl::opt DisableMemcpyIdiom( + "disable-memcpy-idiom", cl::Hidden, cl::init(false), + cl::desc("Disable generation of memcpy in loop idiom recognition")); + +static cl::opt DisableMemmoveIdiom( + "disable-memmove-idiom", cl::Hidden, cl::init(false), + cl::desc("Disable generation of memmove in loop idiom recognition")); + +static cl::opt OnlyNonNestedMemmove( + "only-nonnested-memmove-idiom", cl::Hidden, cl::init(false), + cl::desc("Only enable generating memmove in non-nested loops")); + namespace { class LoopIdiomRecognize { @@ -88,6 +115,7 @@ TargetLibraryInfo *TLI; const TargetTransformInfo *TTI; const DataLayout *DL; + bool UseLoopVersioning; bool ApplyCodeSizeHeuristics; public: @@ -95,9 +123,9 @@ LoopInfo *LI, ScalarEvolution *SE, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, - const DataLayout *DL) + const DataLayout *DL, bool ULV) : CurLoop(nullptr), AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), - DL(DL) {} + DL(DL), UseLoopVersioning(ULV) {} bool runOnLoop(Loop *L); @@ -110,6 +138,7 @@ bool HasMemset; bool HasMemsetPattern; bool HasMemcpy; + bool HasMemmove; /// \name Countable Loop Idiom Handling /// @{ @@ -131,6 +160,7 @@ SmallPtrSetImpl &Stores, const SCEVAddRecExpr *Ev, const SCEV *BECount, bool NegStride, bool IsLoopMemset = false); + bool coverLoop(Loop *L, SmallVectorImpl &Insts) const; bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount); bool avoidLIRForMultiBlockLoop(bool IsMemset = false, bool IsLoopMemset = false); @@ -171,7 +201,44 @@ *L->getHeader()->getParent()); const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout(); - LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL); + LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL, false); + return LIR.runOnLoop(L); + } + + /// This transformation requires natural loop information & requires that + /// loop preheaders be inserted into the CFG. + /// + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + getLoopAnalysisUsage(AU); + } +}; + +class LoopVersioningIdiomRecognizeLegacyPass : public LoopPass { +public: + static char ID; + explicit LoopVersioningIdiomRecognizeLegacyPass() : LoopPass(ID) { + initializeLoopVersioningIdiomRecognizeLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM) override { + if (skipLoop(L)) + return false; + + AliasAnalysis *AA = &getAnalysis().getAAResults(); + DominatorTree *DT = &getAnalysis().getDomTree(); + LoopInfo *LI = &getAnalysis().getLoopInfo(); + ScalarEvolution *SE = &getAnalysis().getSE(); + TargetLibraryInfo *TLI = + &getAnalysis().getTLI(); + const TargetTransformInfo *TTI = + &getAnalysis().getTTI( + *L->getHeader()->getParent()); + const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout(); + + LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL, true); return LIR.runOnLoop(L); } @@ -181,9 +248,14 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); + AU.addRequiredID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + AU.addRequired(); + AU.addRequired(); getLoopAnalysisUsage(AU); } }; + } // End anonymous namespace. PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM, @@ -191,7 +263,22 @@ LPMUpdater &) { const auto *DL = &L.getHeader()->getModule()->getDataLayout(); - LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, DL); + LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, DL, + false); + if (!LIR.runOnLoop(&L)) + return PreservedAnalyses::all(); + + return getLoopPassPreservedAnalyses(); +} + +PreservedAnalyses +LoopVersioningIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &) { + const auto *DL = &L.getHeader()->getModule()->getDataLayout(); + + LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, DL, + true); if (!LIR.runOnLoop(&L)) return PreservedAnalyses::all(); @@ -207,7 +294,27 @@ INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom", "Recognize loop idioms", false, false) +char LoopVersioningIdiomRecognizeLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(LoopVersioningIdiomRecognizeLegacyPass, + "loop-versioning-idiom", + "Recognize loop idioms with loop versioning", false, + false) +INITIALIZE_PASS_DEPENDENCY(LoopPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END(LoopVersioningIdiomRecognizeLegacyPass, + "loop-versioning-idiom", + "Recognize loop idioms with loop versioning", false, false) + Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); } +Pass *llvm::createLoopVersioningIdiomPass() { + return new LoopVersioningIdiomRecognizeLegacyPass(); +} static void deleteDeadInstruction(Instruction *I) { I->replaceAllUsesWith(UndefValue::get(I->getType())); @@ -229,7 +336,7 @@ // Disable loop idiom recognition if the function's name is a common idiom. StringRef Name = L->getHeader()->getParent()->getName(); - if (Name == "memset" || Name == "memcpy") + if (Name == "memset" || Name == "memcpy" || Name == "memmove") return false; // Determine if code size heuristics need to be applied. @@ -239,8 +346,9 @@ HasMemset = TLI->has(LibFunc_memset); HasMemsetPattern = TLI->has(LibFunc_memset_pattern16); HasMemcpy = TLI->has(LibFunc_memcpy); + HasMemmove = TLI->has(LibFunc_memmove); - if (HasMemset || HasMemsetPattern || HasMemcpy) + if (HasMemset || HasMemsetPattern || HasMemcpy || HasMemmove) if (SE->hasLoopInvariantBackedgeTakenCount(L)) return runOnCountableLoop(); @@ -397,7 +505,7 @@ } // Otherwise, see if the store can be turned into a memcpy. - if (HasMemcpy) { + if (HasMemcpy || HasMemmove) { // Check to see if the stride matches the size of the store. If so, then we // know that every byte is touched in the loop. APInt Stride = getStoreStride(StoreEv); @@ -849,6 +957,58 @@ return true; } +// \brief Check if the instructions in Insts, together with their dependencies +// cover the loop in the sense that the loop could be safely eliminated once +// the instructions in Insts are removed. +bool LoopIdiomRecognize::coverLoop(Loop *L, + SmallVectorImpl &Insts) const { + SmallSet LoopBlocks; + for (auto *B : L->blocks()) + LoopBlocks.insert(B); + + SetVector Worklist(Insts.begin(), Insts.end()); + + // Collect all instructions from the loop that the instructions in Insts + // depend on (plus their dependencies, etc.). These instructions will + // constitute the expression trees that feed those in Insts, but the trees + // will be limited only to instructions contained in the loop. + for (unsigned i = 0; i < Worklist.size(); ++i) { + Instruction *In = Worklist[i]; + for (auto I = In->op_begin(), E = In->op_end(); I != E; ++I) { + Instruction *OpI = dyn_cast(I); + if (!OpI) + continue; + BasicBlock *PB = OpI->getParent(); + if (!LoopBlocks.count(PB)) + continue; + Worklist.insert(OpI); + } + } + + // Scan all instructions in the loop, if any of them have a user outside + // of the loop, or outside of the expressions collected above, then either + // the loop has a side-effect visible outside of it, or there are + // instructions in it that are not involved in the original set Insts. + for (auto *B : L->blocks()) { + for (auto &In : *B) { + if (isa(In) || isa(In)) + continue; + if (!Worklist.count(&In) && In.mayHaveSideEffects()) + return false; + for (const auto &K : In.users()) { + Instruction *UseI = dyn_cast(K); + if (!UseI) + continue; + BasicBlock *UseB = UseI->getParent(); + if (LI->getLoopFor(UseB) != L) + return false; + } + } + } + + return true; +} + /// If the stored value is a strided load in the same loop with the same stride /// this may be transformable into a memcpy. This kicks in for stuff like /// for (i) A[i] = B[i]; @@ -857,26 +1017,26 @@ assert(SI->isSimple() && "Expected only non-volatile stores."); Value *StorePtr = SI->getPointerOperand(); - const SCEVAddRecExpr *StoreEv = cast(SE->getSCEV(StorePtr)); + auto *StoreEv = cast(SE->getSCEV(StorePtr)); APInt Stride = getStoreStride(StoreEv); unsigned StoreSize = getStoreSizeInBytes(SI, DL); bool NegStride = StoreSize == -Stride; // The store must be feeding a non-volatile load. - LoadInst *LI = cast(SI->getValueOperand()); - assert(LI->isSimple() && "Expected only non-volatile stores."); + LoadInst *LdI = cast(SI->getValueOperand()); + assert(LdI->isSimple() && "Expected only non-volatile stores."); // See if the pointer expression is an AddRec like {base,+,1} on the current // loop, which indicates a strided load. If we have something else, it's a // random load we can't handle. - const SCEVAddRecExpr *LoadEv = - cast(SE->getSCEV(LI->getPointerOperand())); + auto *LoadEv = cast(SE->getSCEV(LdI->getPointerOperand())); // The trip count of the loop and the base pointer of the addrec SCEV is // guaranteed to be loop invariant, which means that it should dominate the // header. This allows us to insert code for it in the preheader. BasicBlock *Preheader = CurLoop->getLoopPreheader(); - IRBuilder<> Builder(Preheader->getTerminator()); + Instruction *ExpPt = Preheader->getTerminator(); + IRBuilder<> Builder(ExpPt); SCEVExpander Expander(*SE, *DL, "loop-idiom"); const SCEV *StrStart = StoreEv->getStart(); @@ -893,21 +1053,61 @@ // or write the memory region we're storing to. This includes the load that // feeds the stores. Check for an alias by generating the base address and // checking everything. - Value *StoreBasePtr = Expander.expandCodeFor( - StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator()); + Value *StoreBasePtr = + Expander.expandCodeFor(StrStart, Builder.getInt8PtrTy(StrAS), ExpPt); + Value *LoadBasePtr = nullptr; - SmallPtrSet Stores; - Stores.insert(SI); + bool RuntimeCheck = false; + SmallPtrSet Ignore1; + Ignore1.insert(SI); if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount, - StoreSize, *AA, Stores)) { - Expander.clear(); - // If we generated new code for the base pointer, clean up. - RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI); - return false; + StoreSize, *AA, Ignore1)) { + if (!UseLoopVersioning) + goto CleanupAndExit; + + // Check if the load is the offending instruction. + Ignore1.insert(LdI); + if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount, + StoreSize, *AA, Ignore1)) { + CleanupAndExit: + Expander.clear(); + // If we generated new code for the base pointer, clean up. + if (StoreBasePtr && (LoadBasePtr != StoreBasePtr)) { + RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI); + StoreBasePtr = nullptr; + } + if (LoadBasePtr) { + RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI); + LoadBasePtr = nullptr; + } + return false; + } + // It worked with the load ignored: we need a runtime check to disambiguate + // the dependence relation between the load and the store. + RuntimeCheck = true; + + // In case of a memmove, the call to memmove will be executed instead + // of the loop, so we need to make sure that there is nothing else in + // the loop than the load, store and instructions that these two depend + // on. + SmallVector Insts; + Insts.push_back(SI); + Insts.push_back(LdI); + if (!coverLoop(CurLoop, Insts)) + goto CleanupAndExit; } + if (RuntimeCheck) { + if (DisableMemmoveIdiom) + goto CleanupAndExit; + } else if (DisableMemcpyIdiom) + goto CleanupAndExit; + + if (OnlyNonNestedMemmove && CurLoop->getParentLoop()) + goto CleanupAndExit; + const SCEV *LdStart = LoadEv->getStart(); - unsigned LdAS = LI->getPointerAddressSpace(); + unsigned LdAS = LdI->getPointerAddressSpace(); // Handle negative strided loops. if (NegStride) @@ -915,26 +1115,36 @@ // For a memcpy, we have to make sure that the input array is not being // mutated by the loop. - Value *LoadBasePtr = Expander.expandCodeFor( + LoadBasePtr = Expander.expandCodeFor( LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator()); + SmallPtrSet Ignore2; + Ignore2.insert(SI); if (mayLoopAccessLocation(LoadBasePtr, MRI_Mod, CurLoop, BECount, StoreSize, - *AA, Stores)) { - Expander.clear(); - // If we generated new code for the base pointer, clean up. - RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI); - RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI); - return false; - } + *AA, Ignore2)) + goto CleanupAndExit; if (avoidLIRForMultiBlockLoop()) - return false; - - // Okay, everything is safe, we can transform this! + goto CleanupAndExit; + + // The runtime check needs a single exit block. + BasicBlock *ExitB = nullptr; + if (RuntimeCheck) { + ExitB = CurLoop->getUniqueExitBlock(); + if (!ExitB) + goto CleanupAndExit; + + // Make sure the loop is in simple form. + assert(CurLoop->isLoopSimplifyForm() && + "loop versioning idiom pass should run on simple loops"); + } // The # stored bytes is (BECount+1)*Size. Expand the trip count out to // pointer size if it isn't already. + LLVMContext &Ctx = SI->getContext(); BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy); + unsigned Align = std::min(SI->getAlignment(), LdI->getAlignment()); + DebugLoc DLoc = SI->getDebugLoc(); const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW); @@ -942,22 +1152,104 @@ NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize), SCEV::FlagNUW); - Value *NumBytes = - Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator()); + Value *NumBytes = Expander.expandCodeFor(NumBytesS, IntPtrTy, ExpPt); + CallInst *NewCall; - CallInst *NewCall = - Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, - std::min(SI->getAlignment(), LI->getAlignment())); - NewCall->setDebugLoc(SI->getDebugLoc()); + if (!RuntimeCheck) { + NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, Align); + NewCall->setDebugLoc(SI->getDebugLoc()); + ++NumMemCpy; + // Okay, the memcpy has been formed. Zap the original store and anything that + // feeds into it. + deleteDeadInstruction(SI); + } else { + unsigned Threshold = RuntimeMemSizeThreshold; + if (ConstantInt *CI = dyn_cast(NumBytes)) { + uint64_t C = CI->getZExtValue(); + if (Threshold != 0 && C < Threshold) + goto CleanupAndExit; + if (C < CompileTimeMemSizeThreshold) + goto CleanupAndExit; + } - DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n" - << " from load ptr=" << *LoadEv << " at: " << *LI << "\n" + BasicBlock *Header = CurLoop->getHeader(); + Function *Func = Header->getParent(); + Loop *ParentL = LI->getLoopFor(Preheader); + StringRef HeaderName = Header->getName(); + + // Create a new (empty) preheader, and update the PHI nodes in the + // header to use the new preheader. + BasicBlock *NewPreheader = BasicBlock::Create(Ctx, HeaderName+".rtli.ph", + Func, Header); + if (ParentL) + ParentL->addBasicBlockToLoop(NewPreheader, *LI); + IRBuilder<>(NewPreheader).CreateBr(Header); + for (auto &In : *Header) { + PHINode *PN = dyn_cast(&In); + if (!PN) + break; + int bx = PN->getBasicBlockIndex(Preheader); + if (bx >= 0) + PN->setIncomingBlock(bx, NewPreheader); + } + DT->addNewBlock(NewPreheader, Preheader); + DT->changeImmediateDominator(Header, NewPreheader); + + // Check for safe conditions to execute memmove. + // If stride is positive, copying things from higher to lower addresses + // is equivalent to memmove. For negative stride, it's the other way + // around. Copying forward in memory with positive stride may not be + // same as memmove since we may be copying values that we just stored + // in some previous iteration. + Value *LA = Builder.CreatePtrToInt(LoadBasePtr, IntPtrTy); + Value *SA = Builder.CreatePtrToInt(StoreBasePtr, IntPtrTy); + Value *LowA = NegStride ? LA : SA; + Value *HighA = NegStride ? SA : LA; + // Check whether the dependence is a Write After Read: a WAR dependence has + // the same semantics as memmove. + Value *IsWAR = Builder.CreateICmpULT(LowA, HighA); + + // Check for distance between pointers: in the case of a RAW dependence, we + // need to make sure that there is no overlap. + Value *Dist = Builder.CreateSub(HighA, LowA); + Value *CmpD = Builder.CreateICmpSLT(NumBytes, Dist); + Value *CmpEither = Builder.CreateOr(IsWAR, CmpD); + Value *Cond = CmpEither; + + if (Threshold != 0) { + Type *Ty = NumBytes->getType(); + Value *Thr = ConstantInt::get(Ty, Threshold); + Value *CmpB = Builder.CreateICmpULT(Thr, NumBytes); + Value *CmpBoth = Builder.CreateAnd(Cond, CmpB); + Cond = CmpBoth; + } + + BasicBlock *MemmoveB = BasicBlock::Create(Ctx, Header->getName() + ".rtli", + Func, NewPreheader); + if (ParentL) + ParentL->addBasicBlockToLoop(MemmoveB, *LI); + Instruction *OldT = Preheader->getTerminator(); + Builder.CreateCondBr(Cond, MemmoveB, NewPreheader); + OldT->eraseFromParent(); + Preheader->setName(Preheader->getName() + ".old"); + DT->addNewBlock(MemmoveB, Preheader); + DT->getNode(ExitB)->setIDom(DT->getNode(Preheader)); + + // Add a call to memmove in the conditional block. + IRBuilder<> CondBuilder(MemmoveB); + CondBuilder.CreateBr(ExitB); + CondBuilder.SetInsertPoint(MemmoveB->getTerminator()); + NewCall = + CondBuilder.CreateMemMove(StoreBasePtr, LoadBasePtr, NumBytes, Align); + NewCall->setDebugLoc(SI->getDebugLoc()); + ++NumMemMove; + } + + DEBUG(dbgs() << " Formed " << (!RuntimeCheck ? "memcpy: " : "memmove: ") + << *NewCall << "\n" + << " from load ptr=" << *LoadEv << " at: " << *LdI << "\n" << " from store ptr=" << *StoreEv << " at: " << *SI << "\n"); - // Okay, the memcpy has been formed. Zap the original store and anything that - // feeds into it. - deleteDeadInstruction(SI); - ++NumMemCpy; return true; } Index: llvm/lib/Transforms/Scalar/Scalar.cpp =================================================================== --- llvm/lib/Transforms/Scalar/Scalar.cpp +++ llvm/lib/Transforms/Scalar/Scalar.cpp @@ -67,6 +67,7 @@ initializeLoopUnswitchPass(Registry); initializeLoopVersioningLICMPass(Registry); initializeLoopIdiomRecognizeLegacyPassPass(Registry); + initializeLoopVersioningIdiomRecognizeLegacyPassPass(Registry); initializeLowerAtomicLegacyPassPass(Registry); initializeLowerExpectIntrinsicPass(Registry); initializeLowerGuardIntrinsicLegacyPassPass(Registry); @@ -165,6 +166,10 @@ unwrap(PM)->add(createLoopIdiomPass()); } +void LLVMAddLoopVersioningIdiomPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createLoopVersioningIdiomPass()); +} + void LLVMAddLoopRotatePass(LLVMPassManagerRef PM) { unwrap(PM)->add(createLoopRotatePass()); } Index: llvm/test/CodeGen/Hexagon/loop-idiom/hexagon-memmove1.ll =================================================================== --- llvm/test/CodeGen/Hexagon/loop-idiom/hexagon-memmove1.ll +++ /dev/null @@ -1,36 +0,0 @@ -; Check for recognizing the "memmove" idiom. -; RUN: opt -basicaa -hexagon-loop-idiom -S -mtriple hexagon-unknown-elf < %s \ -; RUN: | FileCheck %s -; CHECK: call void @llvm.memmove - -; Function Attrs: norecurse nounwind -define void @foo(i32* nocapture %A, i32* nocapture readonly %B, i32 %n) #0 { -entry: - %cmp1 = icmp sgt i32 %n, 0 - br i1 %cmp1, label %for.body.preheader, label %for.end - -for.body.preheader: ; preds = %entry - %arrayidx.gep = getelementptr i32, i32* %B, i32 0 - %arrayidx1.gep = getelementptr i32, i32* %A, i32 0 - br label %for.body - -for.body: ; preds = %for.body.preheader, %for.body - %arrayidx.phi = phi i32* [ %arrayidx.gep, %for.body.preheader ], [ %arrayidx.inc, %for.body ] - %arrayidx1.phi = phi i32* [ %arrayidx1.gep, %for.body.preheader ], [ %arrayidx1.inc, %for.body ] - %i.02 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] - %0 = load i32, i32* %arrayidx.phi, align 4 - store i32 %0, i32* %arrayidx1.phi, align 4 - %inc = add nuw nsw i32 %i.02, 1 - %exitcond = icmp ne i32 %inc, %n - %arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1 - %arrayidx1.inc = getelementptr i32, i32* %arrayidx1.phi, i32 1 - br i1 %exitcond, label %for.body, label %for.end.loopexit - -for.end.loopexit: ; preds = %for.body - br label %for.end - -for.end: ; preds = %for.end.loopexit, %entry - ret void -} - -attributes #0 = { nounwind } Index: llvm/test/CodeGen/Hexagon/loop-idiom/hexagon-memmove2.ll =================================================================== --- llvm/test/CodeGen/Hexagon/loop-idiom/hexagon-memmove2.ll +++ /dev/null @@ -1,36 +0,0 @@ -; RUN: opt -basicaa -hexagon-loop-idiom -S -mtriple hexagon-unknown-elf < %s \ -; RUN: | FileCheck %s - -define void @PR14241(i32* %s, i64 %size) #0 { -; Ensure that we don't form a memcpy for strided loops. Briefly, when we taught -; LoopIdiom about memmove and strided loops, this got miscompiled into a memcpy -; instead of a memmove. If we get the memmove transform back, this will catch -; regressions. -; -; CHECK-LABEL: @PR14241( - -entry: - %end.idx = add i64 %size, -1 - %end.ptr = getelementptr inbounds i32, i32* %s, i64 %end.idx - br label %while.body -; CHECK-NOT: memcpy -; CHECK: memmove - -while.body: - %phi.ptr = phi i32* [ %s, %entry ], [ %next.ptr, %while.body ] - %src.ptr = getelementptr inbounds i32, i32* %phi.ptr, i64 1 - %val = load i32, i32* %src.ptr, align 4 -; CHECK: load - %dst.ptr = getelementptr inbounds i32, i32* %phi.ptr, i64 0 - store i32 %val, i32* %dst.ptr, align 4 -; CHECK: store - %next.ptr = getelementptr inbounds i32, i32* %phi.ptr, i64 1 - %cmp = icmp eq i32* %next.ptr, %end.ptr - br i1 %cmp, label %exit, label %while.body - -exit: - ret void -; CHECK: ret void -} - -attributes #0 = { nounwind } Index: llvm/test/Other/new-pm-defaults.ll =================================================================== --- llvm/test/Other/new-pm-defaults.ll +++ llvm/test/Other/new-pm-defaults.ll @@ -137,6 +137,7 @@ ; CHECK-O-NEXT: Running pass: Float2IntPass ; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LoopRotatePass ; CHECK-O-NEXT: Running pass: LoopDistributePass +; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LoopVersioningIdiomRecognizePass ; CHECK-O-NEXT: Running pass: LoopVectorizePass ; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis ; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis Index: llvm/test/Other/pass-pipelines.ll =================================================================== --- llvm/test/Other/pass-pipelines.ll +++ llvm/test/Other/pass-pipelines.ll @@ -68,6 +68,8 @@ ; We rotate loops prior to vectorization. ; CHECK-O2: Loop Pass Manager ; CHECK-O2-NEXT: Rotate Loops +; CHECK-O2: Loop Pass Manager +; CHECK-O2-NEXT: Recognize loop idioms with loop versioning ; CHECK-O2-NOT: Manager ; CHECK-O2: Loop Vectorization ; CHECK-O2-NOT: Manager Index: llvm/test/Transforms/LoopIdiom/basic.ll =================================================================== --- llvm/test/Transforms/LoopIdiom/basic.ll +++ llvm/test/Transforms/LoopIdiom/basic.ll @@ -353,8 +353,6 @@ ; CHECK: ret void } - - ; PR9815 - This is a partial overlap case that cannot be safely transformed ; into a memcpy. @g_50 = global [7 x i32] [i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0], align 16 Index: llvm/test/Transforms/LoopIdiom/memmove.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopIdiom/memmove.ll @@ -0,0 +1,43 @@ +; RUN: opt -S -basicaa -loop-versioning-idiom < %s | FileCheck %s +; RUN: opt -S -basicaa -loop-idiom < %s | FileCheck %s -check-prefix=LOOP-IDIOM + +declare i64 @foo() nounwind + +; CHECK-LABEL: @test1 +; CHECK: call void @llvm.memmove.p0i8.p0i8.i64( +; LOOP-IDIOM-LABEL: @test1 +; LOOP-IDIOM-NOT: call void @llvm.memmove.p0i8.p0i8.i64( + +; Nested loops +define void @test1(i8* nocapture %A, i64 %n) nounwind { +entry: + %call8 = tail call i64 @foo() nounwind + %tobool9 = icmp eq i64 %call8, 0 + br i1 %tobool9, label %while.end, label %for.cond.preheader.lr.ph + +for.cond.preheader.lr.ph: ; preds = %entry + %cmp6 = icmp eq i64 %n, 0 + br label %for.cond.preheader + +while.cond.loopexit: ; preds = %for.body, %for.cond.preheader + %call = tail call i64 @foo() nounwind + %tobool = icmp eq i64 %call, 0 + br i1 %tobool, label %while.end, label %for.cond.preheader + +for.cond.preheader: ; preds = %for.cond.preheader.lr.ph, %while.cond.loopexit + br i1 %cmp6, label %while.cond.loopexit, label %for.body + +for.body: ; preds = %for.cond.preheader, %for.body + %i.07 = phi i64 [ %inc, %for.body ], [ 0, %for.cond.preheader ] + %add = add i64 %i.07, 10 + %arrayidx = getelementptr inbounds i8, i8* %A, i64 %add + %0 = load i8, i8* %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, i8* %A, i64 %i.07 + store i8 %0, i8* %arrayidx1, align 1 + %inc = add i64 %i.07, 1 + %exitcond = icmp eq i64 %inc, %n + br i1 %exitcond, label %while.cond.loopexit, label %for.body + +while.end: ; preds = %while.cond.loopexit, %entry + ret void +} Index: llvm/test/Transforms/LoopIdiom/memmove1.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopIdiom/memmove1.ll @@ -0,0 +1,36 @@ +; Check for recognizing the "memmove" idiom. +; RUN: opt -loop-versioning-idiom -S < %s | FileCheck %s + +; CHECK-LABEL: @foo( +; CHECK: call void @llvm.memmove + +define void @foo(i32* nocapture %A, i32* nocapture readonly %B, i32 %n) #0 { +entry: + %cmp1 = icmp sgt i32 %n, 0 + br i1 %cmp1, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %arrayidx.gep = getelementptr i32, i32* %B, i32 0 + %arrayidx1.gep = getelementptr i32, i32* %A, i32 0 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %arrayidx.phi = phi i32* [ %arrayidx.gep, %for.body.preheader ], [ %arrayidx.inc, %for.body ] + %arrayidx1.phi = phi i32* [ %arrayidx1.gep, %for.body.preheader ], [ %arrayidx1.inc, %for.body ] + %i.02 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %0 = load i32, i32* %arrayidx.phi, align 4 + store i32 %0, i32* %arrayidx1.phi, align 4 + %inc = add nuw nsw i32 %i.02, 1 + %exitcond = icmp ne i32 %inc, %n + %arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1 + %arrayidx1.inc = getelementptr i32, i32* %arrayidx1.phi, i32 1 + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} + +attributes #0 = { nounwind } Index: llvm/test/Transforms/LoopIdiom/memmove2.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopIdiom/memmove2.ll @@ -0,0 +1,35 @@ +; RUN: opt -loop-versioning-idiom -S < %s | FileCheck %s + +define void @PR14241(i32* %s, i64 %size) #0 { +; Ensure that we don't form a memcpy for strided loops. Briefly, when we taught +; LoopIdiom about memmove and strided loops, this got miscompiled into a memcpy +; instead of a memmove. If we get the memmove transform back, this will catch +; regressions. +; +; CHECK-LABEL: @PR14241( + +entry: + %end.idx = add i64 %size, -1 + %end.ptr = getelementptr inbounds i32, i32* %s, i64 %end.idx + br label %while.body +; CHECK-NOT: memcpy +; CHECK: memmove + +while.body: + %phi.ptr = phi i32* [ %s, %entry ], [ %next.ptr, %while.body ] + %src.ptr = getelementptr inbounds i32, i32* %phi.ptr, i64 1 + %val = load i32, i32* %src.ptr, align 4 +; CHECK: load + %dst.ptr = getelementptr inbounds i32, i32* %phi.ptr, i64 0 + store i32 %val, i32* %dst.ptr, align 4 +; CHECK: store + %next.ptr = getelementptr inbounds i32, i32* %phi.ptr, i64 1 + %cmp = icmp eq i32* %next.ptr, %end.ptr + br i1 %cmp, label %exit, label %while.body + +exit: + ret void +; CHECK: ret void +} + +attributes #0 = { nounwind } Index: llvm/test/Transforms/LoopIdiom/memset_noidiom.ll =================================================================== --- llvm/test/Transforms/LoopIdiom/memset_noidiom.ll +++ llvm/test/Transforms/LoopIdiom/memset_noidiom.ll @@ -28,3 +28,54 @@ ret i8* %b } +; CHECK-LABEL: @memcpy +; CHECK-NOT: llvm.memcpy +define i8* @memcpy(i8* noalias %dst, i8* noalias %src, i64 %n) nounwind { +entry: + %tobool3 = icmp eq i64 %n, 0 + br i1 %tobool3, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %c2.06 = phi i8* [ %incdec.ptr, %while.body ], [ %src, %entry ] + %c1.05 = phi i8* [ %incdec.ptr1, %while.body ], [ %dst, %entry ] + %n.addr.04 = phi i64 [ %dec, %while.body ], [ %n, %entry ] + %dec = add i64 %n.addr.04, -1 + %incdec.ptr = getelementptr inbounds i8, i8* %c2.06, i64 1 + %0 = load i8, i8* %c2.06, align 1 + %incdec.ptr1 = getelementptr inbounds i8, i8* %c1.05, i64 1 + store i8 %0, i8* %c1.05, align 1 + %tobool = icmp eq i64 %dec, 0 + br i1 %tobool, label %while.end, label %while.body + +while.end: ; preds = %while.body, %entry + ret i8* %dst +} + +; CHECK-LABEL: @memmove +; CHECK-NOT: llvm.memmove +define i8* @memmove(i8* %dst, i8* nocapture %src, i64 %count) nounwind { +entry: + %sub = add i64 %count, -1 + %tobool9 = icmp eq i64 %count, 0 + br i1 %tobool9, label %while.end, label %while.body.lr.ph + +while.body.lr.ph: ; preds = %entry + %add.ptr2 = getelementptr inbounds i8, i8* %src, i64 %sub + %add.ptr = getelementptr inbounds i8, i8* %dst, i64 %sub + br label %while.body + +while.body: ; preds = %while.body.lr.ph, %while.body + %b.012 = phi i8* [ %add.ptr2, %while.body.lr.ph ], [ %incdec.ptr, %while.body ] + %a.011 = phi i8* [ %add.ptr, %while.body.lr.ph ], [ %incdec.ptr3, %while.body ] + %count.addr.010 = phi i64 [ %count, %while.body.lr.ph ], [ %dec, %while.body ] + %dec = add i64 %count.addr.010, -1 + %incdec.ptr = getelementptr inbounds i8, i8* %b.012, i64 -1 + %0 = load i8, i8* %b.012, align 1 + %incdec.ptr3 = getelementptr inbounds i8, i8* %a.011, i64 -1 + store i8 %0, i8* %a.011, align 1 + %tobool = icmp eq i64 %dec, 0 + br i1 %tobool, label %while.end, label %while.body + +while.end: ; preds = %while.body, %entry + ret i8* %dst +} Index: llvm/test/Transforms/LoopIdiom/pr28196.ll =================================================================== --- llvm/test/Transforms/LoopIdiom/pr28196.ll +++ llvm/test/Transforms/LoopIdiom/pr28196.ll @@ -1,8 +1,5 @@ ; RUN: opt -loop-idiom -S < %s | FileCheck %s -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - define void @test1() { entry: br label %for.body.preheader