Index: llvm/include/llvm/InitializePasses.h
===================================================================
--- llvm/include/llvm/InitializePasses.h
+++ llvm/include/llvm/InitializePasses.h
@@ -197,6 +197,7 @@
 void initializeLoopDistributeLegacyPass(PassRegistry&);
 void initializeLoopExtractorPass(PassRegistry&);
 void initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry&);
+void initializeLoopVersioningIdiomRecognizeLegacyPassPass(PassRegistry&);
 void initializeLoopInfoWrapperPassPass(PassRegistry&);
 void initializeLoopInstSimplifyLegacyPassPass(PassRegistry&);
 void initializeLoopInterchangePass(PassRegistry &);
Index: llvm/include/llvm/LinkAllPasses.h
===================================================================
--- llvm/include/llvm/LinkAllPasses.h
+++ llvm/include/llvm/LinkAllPasses.h
@@ -125,6 +125,7 @@
       (void) llvm::createLoopUnswitchPass();
       (void) llvm::createLoopVersioningLICMPass();
       (void) llvm::createLoopIdiomPass();
+      (void) llvm::createLoopVersioningIdiomPass();
       (void) llvm::createLoopRotatePass();
       (void) llvm::createLowerExpectIntrinsicPass();
       (void) llvm::createLowerInvokePass();
Index: llvm/include/llvm/Transforms/Scalar.h
===================================================================
--- llvm/include/llvm/Transforms/Scalar.h
+++ llvm/include/llvm/Transforms/Scalar.h
@@ -207,6 +207,13 @@
 
 //===----------------------------------------------------------------------===//
 //
+// LoopVersioningIdiom - This pass recognizes and replaces idioms in loops with the
+// help of loop versioning for runtime information.
+//
+Pass *createLoopVersioningIdiomPass();
+
+//===----------------------------------------------------------------------===//
+//
 // LoopVersioningLICM - This pass is a loop versioning pass for LICM.
 //
 Pass *createLoopVersioningLICMPass();
Index: llvm/include/llvm/Transforms/Scalar/LoopIdiomRecognize.h
===================================================================
--- llvm/include/llvm/Transforms/Scalar/LoopIdiomRecognize.h
+++ llvm/include/llvm/Transforms/Scalar/LoopIdiomRecognize.h
@@ -28,6 +28,14 @@
   PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
                         LoopStandardAnalysisResults &AR, LPMUpdater &U);
 };
+
+/// Performs Loop Idiom Recognize Pass with Loop Versioning.
+class LoopVersioningIdiomRecognizePass
+    : public PassInfoMixin<LoopVersioningIdiomRecognizePass> {
+public:
+  PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
+                        LoopStandardAnalysisResults &AR, LPMUpdater &U);
+};
 } // end namespace llvm
 
 #endif // LLVM_TRANSFORMS_SCALAR_LOOPIDIOMRECOGNIZE_H
Index: llvm/lib/Passes/PassBuilder.cpp
===================================================================
--- llvm/lib/Passes/PassBuilder.cpp
+++ llvm/lib/Passes/PassBuilder.cpp
@@ -577,6 +577,11 @@
   // llvm.loop.distribute=true or when -enable-loop-distribute is specified.
   OptimizePM.addPass(LoopDistributePass());
 
+  // Recognize loop patterns and use loop versioning to compute at run-time
+  // correctness and cost of the transform.
+  OptimizePM.addPass(
+      createFunctionToLoopPassAdaptor(LoopVersioningIdiomRecognizePass()));
+
   // Now run the core loop vectorizer.
   OptimizePM.addPass(LoopVectorizePass());
 
Index: llvm/lib/Passes/PassRegistry.def
===================================================================
--- llvm/lib/Passes/PassRegistry.def
+++ llvm/lib/Passes/PassRegistry.def
@@ -218,6 +218,7 @@
 LOOP_PASS("invalidate<all>", InvalidateAllAnalysesPass())
 LOOP_PASS("licm", LICMPass())
 LOOP_PASS("loop-idiom", LoopIdiomRecognizePass())
+LOOP_PASS("loop-versioning-idiom", LoopVersioningIdiomRecognizePass())
 LOOP_PASS("loop-instsimplify", LoopInstSimplifyPass())
 LOOP_PASS("rotate", LoopRotatePass())
 LOOP_PASS("no-op-loop", NoOpLoopPass())
Index: llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
===================================================================
--- llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -33,35 +33,6 @@
 
 using namespace llvm;
 
-static cl::opt<bool> DisableMemcpyIdiom("disable-memcpy-idiom",
-  cl::Hidden, cl::init(false),
-  cl::desc("Disable generation of memcpy in loop idiom recognition"));
-
-static cl::opt<bool> DisableMemmoveIdiom("disable-memmove-idiom",
-  cl::Hidden, cl::init(false),
-  cl::desc("Disable generation of memmove in loop idiom recognition"));
-
-static cl::opt<unsigned> RuntimeMemSizeThreshold("runtime-mem-idiom-threshold",
-  cl::Hidden, cl::init(0), cl::desc("Threshold (in bytes) for the runtime "
-  "check guarding the memmove."));
-
-static cl::opt<unsigned> CompileTimeMemSizeThreshold(
-  "compile-time-mem-idiom-threshold", cl::Hidden, cl::init(64),
-  cl::desc("Threshold (in bytes) to perform the transformation, if the "
-    "runtime loop count (mem transfer size) is known at compile-time."));
-
-static cl::opt<bool> OnlyNonNestedMemmove("only-nonnested-memmove-idiom",
-  cl::Hidden, cl::init(true),
-  cl::desc("Only enable generating memmove in non-nested loops"));
-
-cl::opt<bool> HexagonVolatileMemcpy("disable-hexagon-volatile-memcpy",
-  cl::Hidden, cl::init(false),
-  cl::desc("Enable Hexagon-specific memcpy for volatile destination."));
-
-static const char *HexagonVolatileMemcpyName
-  = "hexagon_memcpy_forward_vp4cp4n2";
-
-
 namespace llvm {
   void initializeHexagonLoopIdiomRecognizePass(PassRegistry&);
   Pass *createHexagonLoopIdiomPass();
@@ -93,16 +64,6 @@
     bool runOnLoop(Loop *L, LPPassManager &LPM) override;
 
   private:
-    unsigned getStoreSizeInBytes(StoreInst *SI);
-    int getSCEVStride(const SCEVAddRecExpr *StoreEv);
-    bool isLegalStore(Loop *CurLoop, StoreInst *SI);
-    void collectStores(Loop *CurLoop, BasicBlock *BB,
-        SmallVectorImpl<StoreInst*> &Stores);
-    bool processCopyingStore(Loop *CurLoop, StoreInst *SI, const SCEV *BECount);
-    bool coverLoop(Loop *L, SmallVectorImpl<Instruction*> &Insts) const;
-    bool runOnLoopBlock(Loop *CurLoop, BasicBlock *BB, const SCEV *BECount,
-        SmallVectorImpl<BasicBlock*> &ExitBlocks);
-    bool runOnCountableLoop(Loop *L);
 
     AliasAnalysis *AA;
     const DataLayout *DL;
@@ -110,7 +71,6 @@
     LoopInfo *LF;
     const TargetLibraryInfo *TLI;
     ScalarEvolution *SE;
-    bool HasMemcpy, HasMemmove;
   };
 }
 
@@ -1063,519 +1023,6 @@
   return true;
 }
 
-
-unsigned HexagonLoopIdiomRecognize::getStoreSizeInBytes(StoreInst *SI) {
-  uint64_t SizeInBits = DL->getTypeSizeInBits(SI->getValueOperand()->getType());
-  assert(((SizeInBits & 7) || (SizeInBits >> 32) == 0) &&
-         "Don't overflow unsigned.");
-  return (unsigned)SizeInBits >> 3;
-}
-
-
-int HexagonLoopIdiomRecognize::getSCEVStride(const SCEVAddRecExpr *S) {
-  if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(S->getOperand(1)))
-    return SC->getAPInt().getSExtValue();
-  return 0;
-}
-
-
-bool HexagonLoopIdiomRecognize::isLegalStore(Loop *CurLoop, StoreInst *SI) {
-  // Allow volatile stores if HexagonVolatileMemcpy is enabled.
-  if (!(SI->isVolatile() && HexagonVolatileMemcpy) && !SI->isSimple())
-    return false;
-
-  Value *StoredVal = SI->getValueOperand();
-  Value *StorePtr = SI->getPointerOperand();
-
-  // Reject stores that are so large that they overflow an unsigned.
-  uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
-  if ((SizeInBits & 7) || (SizeInBits >> 32) != 0)
-    return false;
-
-  // See if the pointer expression is an AddRec like {base,+,1} on the current
-  // loop, which indicates a strided store.  If we have something else, it's a
-  // random store we can't handle.
-  auto *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
-  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
-    return false;
-
-  // Check to see if the stride matches the size of the store.  If so, then we
-  // know that every byte is touched in the loop.
-  int Stride = getSCEVStride(StoreEv);
-  if (Stride == 0)
-    return false;
-  unsigned StoreSize = getStoreSizeInBytes(SI);
-  if (StoreSize != unsigned(std::abs(Stride)))
-    return false;
-
-  // The store must be feeding a non-volatile load.
-  LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
-  if (!LI || !LI->isSimple())
-    return false;
-
-  // See if the pointer expression is an AddRec like {base,+,1} on the current
-  // loop, which indicates a strided load.  If we have something else, it's a
-  // random load we can't handle.
-  Value *LoadPtr = LI->getPointerOperand();
-  auto *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));
-  if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
-    return false;
-
-  // The store and load must share the same stride.
-  if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
-    return false;
-
-  // Success.  This store can be converted into a memcpy.
-  return true;
-}
-
-
-/// mayLoopAccessLocation - Return true if the specified loop might access the
-/// specified pointer location, which is a loop-strided access.  The 'Access'
-/// argument specifies what the verboten forms of access are (read or write).
-static bool
-mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
-                      const SCEV *BECount, unsigned StoreSize,
-                      AliasAnalysis &AA,
-                      SmallPtrSetImpl<Instruction *> &Ignored) {
-  // Get the location that may be stored across the loop.  Since the access
-  // is strided positively through memory, we say that the modified location
-  // starts at the pointer and has infinite size.
-  uint64_t AccessSize = MemoryLocation::UnknownSize;
-
-  // If the loop iterates a fixed number of times, we can refine the access
-  // size to be exactly the size of the memset, which is (BECount+1)*StoreSize
-  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
-    AccessSize = (BECst->getValue()->getZExtValue() + 1) * StoreSize;
-
-  // TODO: For this to be really effective, we have to dive into the pointer
-  // operand in the store.  Store to &A[i] of 100 will always return may alias
-  // with store of &A[100], we need to StoreLoc to be "A" with size of 100,
-  // which will then no-alias a store to &A[100].
-  MemoryLocation StoreLoc(Ptr, AccessSize);
-
-  for (auto *B : L->blocks())
-    for (auto &I : *B)
-      if (Ignored.count(&I) == 0 && (AA.getModRefInfo(&I, StoreLoc) & Access))
-        return true;
-
-  return false;
-}
-
-
-void HexagonLoopIdiomRecognize::collectStores(Loop *CurLoop, BasicBlock *BB,
-      SmallVectorImpl<StoreInst*> &Stores) {
-  Stores.clear();
-  for (Instruction &I : *BB)
-    if (StoreInst *SI = dyn_cast<StoreInst>(&I))
-      if (isLegalStore(CurLoop, SI))
-        Stores.push_back(SI);
-}
-
-
-bool HexagonLoopIdiomRecognize::processCopyingStore(Loop *CurLoop,
-      StoreInst *SI, const SCEV *BECount) {
-  assert((SI->isSimple() || (SI->isVolatile() && HexagonVolatileMemcpy)) &&
-         "Expected only non-volatile stores, or Hexagon-specific memcpy"
-         "to volatile destination.");
-
-  Value *StorePtr = SI->getPointerOperand();
-  auto *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
-  unsigned Stride = getSCEVStride(StoreEv);
-  unsigned StoreSize = getStoreSizeInBytes(SI);
-  if (Stride != StoreSize)
-    return false;
-
-  // See if the pointer expression is an AddRec like {base,+,1} on the current
-  // loop, which indicates a strided load.  If we have something else, it's a
-  // random load we can't handle.
-  LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
-  auto *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
-
-  // The trip count of the loop and the base pointer of the addrec SCEV is
-  // guaranteed to be loop invariant, which means that it should dominate the
-  // header.  This allows us to insert code for it in the preheader.
-  BasicBlock *Preheader = CurLoop->getLoopPreheader();
-  Instruction *ExpPt = Preheader->getTerminator();
-  IRBuilder<> Builder(ExpPt);
-  SCEVExpander Expander(*SE, *DL, "hexagon-loop-idiom");
-
-  Type *IntPtrTy = Builder.getIntPtrTy(*DL, SI->getPointerAddressSpace());
-
-  // Okay, we have a strided store "p[i]" of a loaded value.  We can turn
-  // this into a memcpy/memmove in the loop preheader now if we want.  However,
-  // this would be unsafe to do if there is anything else in the loop that may
-  // read or write the memory region we're storing to.  For memcpy, this
-  // includes the load that feeds the stores.  Check for an alias by generating
-  // the base address and checking everything.
-  Value *StoreBasePtr = Expander.expandCodeFor(StoreEv->getStart(),
-      Builder.getInt8PtrTy(SI->getPointerAddressSpace()), ExpPt);
-  Value *LoadBasePtr = nullptr;
-
-  bool Overlap = false;
-  bool DestVolatile = SI->isVolatile();
-  Type *BECountTy = BECount->getType();
-
-  if (DestVolatile) {
-    // The trip count must fit in i32, since it is the type of the "num_words"
-    // argument to hexagon_memcpy_forward_vp4cp4n2.
-    if (StoreSize != 4 || DL->getTypeSizeInBits(BECountTy) > 32) {
-CleanupAndExit:
-      // If we generated new code for the base pointer, clean up.
-      Expander.clear();
-      if (StoreBasePtr && (LoadBasePtr != StoreBasePtr)) {
-        RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
-        StoreBasePtr = nullptr;
-      }
-      if (LoadBasePtr) {
-        RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI);
-        LoadBasePtr = nullptr;
-      }
-      return false;
-    }
-  }
-
-  SmallPtrSet<Instruction*, 2> Ignore1;
-  Ignore1.insert(SI);
-  if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount,
-                            StoreSize, *AA, Ignore1)) {
-    // Check if the load is the offending instruction.
-    Ignore1.insert(LI);
-    if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount,
-                              StoreSize, *AA, Ignore1)) {
-      // Still bad. Nothing we can do.
-      goto CleanupAndExit;
-    }
-    // It worked with the load ignored.
-    Overlap = true;
-  }
-
-  if (!Overlap) {
-    if (DisableMemcpyIdiom || !HasMemcpy)
-      goto CleanupAndExit;
-  } else {
-    // Don't generate memmove if this function will be inlined. This is
-    // because the caller will undergo this transformation after inlining.
-    Function *Func = CurLoop->getHeader()->getParent();
-    if (Func->hasFnAttribute(Attribute::AlwaysInline))
-      goto CleanupAndExit;
-
-    // In case of a memmove, the call to memmove will be executed instead
-    // of the loop, so we need to make sure that there is nothing else in
-    // the loop than the load, store and instructions that these two depend
-    // on.
-    SmallVector<Instruction*,2> Insts;
-    Insts.push_back(SI);
-    Insts.push_back(LI);
-    if (!coverLoop(CurLoop, Insts))
-      goto CleanupAndExit;
-
-    if (DisableMemmoveIdiom || !HasMemmove)
-      goto CleanupAndExit;
-    bool IsNested = CurLoop->getParentLoop() != 0;
-    if (IsNested && OnlyNonNestedMemmove)
-      goto CleanupAndExit;
-  }
-
-  // For a memcpy, we have to make sure that the input array is not being
-  // mutated by the loop.
-  LoadBasePtr = Expander.expandCodeFor(LoadEv->getStart(),
-      Builder.getInt8PtrTy(LI->getPointerAddressSpace()), ExpPt);
-
-  SmallPtrSet<Instruction*, 2> Ignore2;
-  Ignore2.insert(SI);
-  if (mayLoopAccessLocation(LoadBasePtr, MRI_Mod, CurLoop, BECount, StoreSize,
-                            *AA, Ignore2))
-    goto CleanupAndExit;
-
-  // Check the stride.
-  bool StridePos = getSCEVStride(LoadEv) >= 0;
-
-  // Currently, the volatile memcpy only emulates traversing memory forward.
-  if (!StridePos && DestVolatile)
-    goto CleanupAndExit;
-
-  bool RuntimeCheck = (Overlap || DestVolatile);
-
-  BasicBlock *ExitB;
-  if (RuntimeCheck) {
-    // The runtime check needs a single exit block.
-    SmallVector<BasicBlock*, 8> ExitBlocks;
-    CurLoop->getUniqueExitBlocks(ExitBlocks);
-    if (ExitBlocks.size() != 1)
-      goto CleanupAndExit;
-    ExitB = ExitBlocks[0];
-  }
-
-  // The # stored bytes is (BECount+1)*Size.  Expand the trip count out to
-  // pointer size if it isn't already.
-  LLVMContext &Ctx = SI->getContext();
-  BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy);
-  unsigned Alignment = std::min(SI->getAlignment(), LI->getAlignment());
-  DebugLoc DLoc = SI->getDebugLoc();
-
-  const SCEV *NumBytesS =
-      SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW);
-  if (StoreSize != 1)
-    NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize),
-                               SCEV::FlagNUW);
-  Value *NumBytes = Expander.expandCodeFor(NumBytesS, IntPtrTy, ExpPt);
-  if (Instruction *In = dyn_cast<Instruction>(NumBytes))
-    if (Value *Simp = SimplifyInstruction(In, *DL, TLI, DT))
-      NumBytes = Simp;
-
-  CallInst *NewCall;
-
-  if (RuntimeCheck) {
-    unsigned Threshold = RuntimeMemSizeThreshold;
-    if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes)) {
-      uint64_t C = CI->getZExtValue();
-      if (Threshold != 0 && C < Threshold)
-        goto CleanupAndExit;
-      if (C < CompileTimeMemSizeThreshold)
-        goto CleanupAndExit;
-    }
-
-    BasicBlock *Header = CurLoop->getHeader();
-    Function *Func = Header->getParent();
-    Loop *ParentL = LF->getLoopFor(Preheader);
-    StringRef HeaderName = Header->getName();
-
-    // Create a new (empty) preheader, and update the PHI nodes in the
-    // header to use the new preheader.
-    BasicBlock *NewPreheader = BasicBlock::Create(Ctx, HeaderName+".rtli.ph",
-                                                  Func, Header);
-    if (ParentL)
-      ParentL->addBasicBlockToLoop(NewPreheader, *LF);
-    IRBuilder<>(NewPreheader).CreateBr(Header);
-    for (auto &In : *Header) {
-      PHINode *PN = dyn_cast<PHINode>(&In);
-      if (!PN)
-        break;
-      int bx = PN->getBasicBlockIndex(Preheader);
-      if (bx >= 0)
-        PN->setIncomingBlock(bx, NewPreheader);
-    }
-    DT->addNewBlock(NewPreheader, Preheader);
-    DT->changeImmediateDominator(Header, NewPreheader);
-
-    // Check for safe conditions to execute memmove.
-    // If stride is positive, copying things from higher to lower addresses
-    // is equivalent to memmove.  For negative stride, it's the other way
-    // around.  Copying forward in memory with positive stride may not be
-    // same as memmove since we may be copying values that we just stored
-    // in some previous iteration.
-    Value *LA = Builder.CreatePtrToInt(LoadBasePtr, IntPtrTy);
-    Value *SA = Builder.CreatePtrToInt(StoreBasePtr, IntPtrTy);
-    Value *LowA = StridePos ? SA : LA;
-    Value *HighA = StridePos ? LA : SA;
-    Value *CmpA = Builder.CreateICmpULT(LowA, HighA);
-    Value *Cond = CmpA;
-
-    // Check for distance between pointers.
-    Value *Dist = Builder.CreateSub(HighA, LowA);
-    Value *CmpD = Builder.CreateICmpSLT(NumBytes, Dist);
-    Value *CmpEither = Builder.CreateOr(Cond, CmpD);
-    Cond = CmpEither;
-
-    if (Threshold != 0) {
-      Type *Ty = NumBytes->getType();
-      Value *Thr = ConstantInt::get(Ty, Threshold);
-      Value *CmpB = Builder.CreateICmpULT(Thr, NumBytes);
-      Value *CmpBoth = Builder.CreateAnd(Cond, CmpB);
-      Cond = CmpBoth;
-    }
-    BasicBlock *MemmoveB = BasicBlock::Create(Ctx, Header->getName()+".rtli",
-                                              Func, NewPreheader);
-    if (ParentL)
-      ParentL->addBasicBlockToLoop(MemmoveB, *LF);
-    Instruction *OldT = Preheader->getTerminator();
-    Builder.CreateCondBr(Cond, MemmoveB, NewPreheader);
-    OldT->eraseFromParent();
-    Preheader->setName(Preheader->getName()+".old");
-    DT->addNewBlock(MemmoveB, Preheader);
-    // Find the new immediate dominator of the exit block.
-    BasicBlock *ExitD = Preheader;
-    for (auto PI = pred_begin(ExitB), PE = pred_end(ExitB); PI != PE; ++PI) {
-      BasicBlock *PB = *PI;
-      ExitD = DT->findNearestCommonDominator(ExitD, PB);
-      if (!ExitD)
-        break;
-    }
-    // If the prior immediate dominator of ExitB was dominated by the
-    // old preheader, then the old preheader becomes the new immediate
-    // dominator.  Otherwise don't change anything (because the newly
-    // added blocks are dominated by the old preheader).
-    if (ExitD && DT->dominates(Preheader, ExitD)) {
-      DomTreeNode *BN = DT->getNode(ExitB);
-      DomTreeNode *DN = DT->getNode(ExitD);
-      BN->setIDom(DN);
-    }
-
-    // Add a call to memmove to the conditional block.
-    IRBuilder<> CondBuilder(MemmoveB);
-    CondBuilder.CreateBr(ExitB);
-    CondBuilder.SetInsertPoint(MemmoveB->getTerminator());
-
-    if (DestVolatile) {
-      Type *Int32Ty = Type::getInt32Ty(Ctx);
-      Type *Int32PtrTy = Type::getInt32PtrTy(Ctx);
-      Type *VoidTy = Type::getVoidTy(Ctx);
-      Module *M = Func->getParent();
-      Constant *CF = M->getOrInsertFunction(HexagonVolatileMemcpyName, VoidTy,
-                                            Int32PtrTy, Int32PtrTy, Int32Ty,
-                                            nullptr);
-      Function *Fn = cast<Function>(CF);
-      Fn->setLinkage(Function::ExternalLinkage);
-
-      const SCEV *OneS = SE->getConstant(Int32Ty, 1);
-      const SCEV *BECount32 = SE->getTruncateOrZeroExtend(BECount, Int32Ty);
-      const SCEV *NumWordsS = SE->getAddExpr(BECount32, OneS, SCEV::FlagNUW);
-      Value *NumWords = Expander.expandCodeFor(NumWordsS, Int32Ty,
-                                               MemmoveB->getTerminator());
-      if (Instruction *In = dyn_cast<Instruction>(NumWords))
-        if (Value *Simp = SimplifyInstruction(In, *DL, TLI, DT))
-          NumWords = Simp;
-
-      Value *Op0 = (StoreBasePtr->getType() == Int32PtrTy)
-                      ? StoreBasePtr
-                      : CondBuilder.CreateBitCast(StoreBasePtr, Int32PtrTy);
-      Value *Op1 = (LoadBasePtr->getType() == Int32PtrTy)
-                      ? LoadBasePtr
-                      : CondBuilder.CreateBitCast(LoadBasePtr, Int32PtrTy);
-      NewCall = CondBuilder.CreateCall(Fn, {Op0, Op1, NumWords});
-    } else {
-      NewCall = CondBuilder.CreateMemMove(StoreBasePtr, LoadBasePtr,
-                                          NumBytes, Alignment);
-    }
-  } else {
-    NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr,
-                                   NumBytes, Alignment);
-    // Okay, the memcpy has been formed.  Zap the original store and
-    // anything that feeds into it.
-    RecursivelyDeleteTriviallyDeadInstructions(SI, TLI);
-  }
-
-  NewCall->setDebugLoc(DLoc);
-
-  DEBUG(dbgs() << "  Formed " << (Overlap ? "memmove: " : "memcpy: ")
-               << *NewCall << "\n"
-               << "    from load ptr=" << *LoadEv << " at: " << *LI << "\n"
-               << "    from store ptr=" << *StoreEv << " at: " << *SI << "\n");
-
-  return true;
-}
-
-
-// \brief Check if the instructions in Insts, together with their dependencies
-// cover the loop in the sense that the loop could be safely eliminated once
-// the instructions in Insts are removed.
-bool HexagonLoopIdiomRecognize::coverLoop(Loop *L,
-      SmallVectorImpl<Instruction*> &Insts) const {
-  SmallSet<BasicBlock*,8> LoopBlocks;
-  for (auto *B : L->blocks())
-    LoopBlocks.insert(B);
-
-  SetVector<Instruction*> Worklist(Insts.begin(), Insts.end());
-
-  // Collect all instructions from the loop that the instructions in Insts
-  // depend on (plus their dependencies, etc.).  These instructions will
-  // constitute the expression trees that feed those in Insts, but the trees
-  // will be limited only to instructions contained in the loop.
-  for (unsigned i = 0; i < Worklist.size(); ++i) {
-    Instruction *In = Worklist[i];
-    for (auto I = In->op_begin(), E = In->op_end(); I != E; ++I) {
-      Instruction *OpI = dyn_cast<Instruction>(I);
-      if (!OpI)
-        continue;
-      BasicBlock *PB = OpI->getParent();
-      if (!LoopBlocks.count(PB))
-        continue;
-      Worklist.insert(OpI);
-    }
-  }
-
-  // Scan all instructions in the loop, if any of them have a user outside
-  // of the loop, or outside of the expressions collected above, then either
-  // the loop has a side-effect visible outside of it, or there are
-  // instructions in it that are not involved in the original set Insts.
-  for (auto *B : L->blocks()) {
-    for (auto &In : *B) {
-      if (isa<BranchInst>(In) || isa<DbgInfoIntrinsic>(In))
-        continue;
-      if (!Worklist.count(&In) && In.mayHaveSideEffects())
-        return false;
-      for (const auto &K : In.users()) {
-        Instruction *UseI = dyn_cast<Instruction>(K);
-        if (!UseI)
-          continue;
-        BasicBlock *UseB = UseI->getParent();
-        if (LF->getLoopFor(UseB) != L)
-          return false;
-      }
-    }
-  }
-
-  return true;
-}
-
-/// runOnLoopBlock - Process the specified block, which lives in a counted loop
-/// with the specified backedge count.  This block is known to be in the current
-/// loop and not in any subloops.
-bool HexagonLoopIdiomRecognize::runOnLoopBlock(Loop *CurLoop, BasicBlock *BB,
-      const SCEV *BECount, SmallVectorImpl<BasicBlock*> &ExitBlocks) {
-  // We can only promote stores in this block if they are unconditionally
-  // executed in the loop.  For a block to be unconditionally executed, it has
-  // to dominate all the exit blocks of the loop.  Verify this now.
-  auto DominatedByBB = [this,BB] (BasicBlock *EB) -> bool {
-    return DT->dominates(BB, EB);
-  };
-  if (!std::all_of(ExitBlocks.begin(), ExitBlocks.end(), DominatedByBB))
-    return false;
-
-  bool MadeChange = false;
-  // Look for store instructions, which may be optimized to memset/memcpy.
-  SmallVector<StoreInst*,8> Stores;
-  collectStores(CurLoop, BB, Stores);
-
-  // Optimize the store into a memcpy, if it feeds an similarly strided load.
-  for (auto &SI : Stores)
-    MadeChange |= processCopyingStore(CurLoop, SI, BECount);
-
-  return MadeChange;
-}
-
-
-bool HexagonLoopIdiomRecognize::runOnCountableLoop(Loop *L) {
-  PolynomialMultiplyRecognize PMR(L, *DL, *DT, *TLI, *SE);
-  if (PMR.recognize())
-    return true;
-
-  if (!HasMemcpy && !HasMemmove)
-    return false;
-
-  const SCEV *BECount = SE->getBackedgeTakenCount(L);
-  assert(!isa<SCEVCouldNotCompute>(BECount) &&
-         "runOnCountableLoop() called on a loop without a predictable"
-         "backedge-taken count");
-
-  SmallVector<BasicBlock *, 8> ExitBlocks;
-  L->getUniqueExitBlocks(ExitBlocks);
-
-  bool Changed = false;
-
-  // Scan all the blocks in the loop that are not in subloops.
-  for (auto *BB : L->getBlocks()) {
-    // Ignore blocks in subloops.
-    if (LF->getLoopFor(BB) != L)
-      continue;
-    Changed |= runOnLoopBlock(L, BB, BECount, ExitBlocks);
-  }
-
-  return Changed;
-}
-
-
 bool HexagonLoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
   const Module &M = *L->getHeader()->getParent()->getParent();
   if (Triple(M.getTargetTriple()).getArch() != Triple::hexagon)
@@ -1601,11 +1048,12 @@
   TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
   SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
 
-  HasMemcpy = TLI->has(LibFunc_memcpy);
-  HasMemmove = TLI->has(LibFunc_memmove);
+  if (SE->hasLoopInvariantBackedgeTakenCount(L)) {
+    PolynomialMultiplyRecognize PMR(L, *DL, *DT, *TLI, *SE);
+    if (PMR.recognize())
+      return true;
+  }
 
-  if (SE->hasLoopInvariantBackedgeTakenCount(L))
-    return runOnCountableLoop(L);
   return false;
 }
 
Index: llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
===================================================================
--- llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -562,6 +562,8 @@
   // llvm.loop.distribute=true or when -enable-loop-distribute is specified.
   MPM.add(createLoopDistributePass());
 
+  MPM.add(createLoopVersioningIdiomPass());
+
   MPM.add(createLoopVectorizePass(DisableUnrollLoops, LoopVectorize));
 
   // Eliminate loads by forwarding stores from the previous iteration to loads
Index: llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -22,7 +22,7 @@
 // TODO List:
 //
 // Future loop memory idioms to recognize:
-//   memcmp, memmove, strlen, etc.
+//   memcmp, strlen, etc.
 // Future floating point idioms to recognize in -ffast-math mode:
 //   fpowi
 // Future integer operation idioms to recognize:
@@ -44,7 +44,9 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
@@ -70,6 +72,7 @@
 
 STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
 STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
+STATISTIC(NumMemMove, "Number of memmove's formed from loop load+stores");
 
 static cl::opt<bool> UseLIRCodeSizeHeurs(
     "use-lir-code-size-heurs",
@@ -77,6 +80,30 @@
              "with -Os/-Oz"),
     cl::init(true), cl::Hidden);
 
+static cl::opt<unsigned>
+    RuntimeMemSizeThreshold("runtime-mem-idiom-threshold", cl::Hidden,
+                            cl::init(0),
+                            cl::desc("Threshold (in bytes) for the runtime "
+                                     "check guarding the memmove."));
+
+static cl::opt<unsigned> CompileTimeMemSizeThreshold(
+    "compile-time-mem-idiom-threshold", cl::Hidden, cl::init(64),
+    cl::desc(
+        "Threshold (in bytes) to perform the transformation, if the "
+        "runtime loop count (mem transfer size) is known at compile-time."));
+
+static cl::opt<bool> DisableMemcpyIdiom(
+    "disable-memcpy-idiom", cl::Hidden, cl::init(false),
+    cl::desc("Disable generation of memcpy in loop idiom recognition"));
+
+static cl::opt<bool> DisableMemmoveIdiom(
+    "disable-memmove-idiom", cl::Hidden, cl::init(false),
+    cl::desc("Disable generation of memmove in loop idiom recognition"));
+
+static cl::opt<bool> OnlyNonNestedMemmove(
+    "only-nonnested-memmove-idiom", cl::Hidden, cl::init(false),
+    cl::desc("Only enable generating memmove in non-nested loops"));
+
 namespace {
 
 class LoopIdiomRecognize {
@@ -88,6 +115,7 @@
   TargetLibraryInfo *TLI;
   const TargetTransformInfo *TTI;
   const DataLayout *DL;
+  bool UseLoopVersioning;
   bool ApplyCodeSizeHeuristics;
 
 public:
@@ -95,9 +123,9 @@
                               LoopInfo *LI, ScalarEvolution *SE,
                               TargetLibraryInfo *TLI,
                               const TargetTransformInfo *TTI,
-                              const DataLayout *DL)
+                              const DataLayout *DL, bool ULV)
       : CurLoop(nullptr), AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI),
-        DL(DL) {}
+        DL(DL), UseLoopVersioning(ULV) {}
 
   bool runOnLoop(Loop *L);
 
@@ -110,6 +138,7 @@
   bool HasMemset;
   bool HasMemsetPattern;
   bool HasMemcpy;
+  bool HasMemmove;
 
   /// \name Countable Loop Idiom Handling
   /// @{
@@ -131,6 +160,7 @@
                                SmallPtrSetImpl<Instruction *> &Stores,
                                const SCEVAddRecExpr *Ev, const SCEV *BECount,
                                bool NegStride, bool IsLoopMemset = false);
+  bool coverLoop(Loop *L, SmallVectorImpl<Instruction*> &Insts) const;
   bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
   bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
                                  bool IsLoopMemset = false);
@@ -171,7 +201,44 @@
             *L->getHeader()->getParent());
     const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
 
-    LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL);
+    LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL, false);
+    return LIR.runOnLoop(L);
+  }
+
+  /// This transformation requires natural loop information & requires that
+  /// loop preheaders be inserted into the CFG.
+  ///
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    getLoopAnalysisUsage(AU);
+  }
+};
+
+class LoopVersioningIdiomRecognizeLegacyPass : public LoopPass {
+public:
+  static char ID;
+  explicit LoopVersioningIdiomRecognizeLegacyPass() : LoopPass(ID) {
+    initializeLoopVersioningIdiomRecognizeLegacyPassPass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+    if (skipLoop(L))
+      return false;
+
+    AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+    DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+    TargetLibraryInfo *TLI =
+        &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+    const TargetTransformInfo *TTI =
+        &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+            *L->getHeader()->getParent());
+    const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
+
+    LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL, true);
     return LIR.runOnLoop(L);
   }
 
@@ -181,9 +248,14 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<TargetLibraryInfoWrapperPass>();
     AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addRequiredID(LoopSimplifyID);
+    AU.addRequiredID(LCSSAID);
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+    AU.addRequired<DominatorTreeWrapperPass>();
     getLoopAnalysisUsage(AU);
   }
 };
+
 } // End anonymous namespace.
 
 PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
@@ -191,7 +263,22 @@
                                               LPMUpdater &) {
   const auto *DL = &L.getHeader()->getModule()->getDataLayout();
 
-  LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, DL);
+  LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, DL,
+                         false);
+  if (!LIR.runOnLoop(&L))
+    return PreservedAnalyses::all();
+
+  return getLoopPassPreservedAnalyses();
+}
+
+PreservedAnalyses
+LoopVersioningIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
+                                      LoopStandardAnalysisResults &AR,
+                                      LPMUpdater &) {
+  const auto *DL = &L.getHeader()->getModule()->getDataLayout();
+
+  LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, DL,
+                         true);
   if (!LIR.runOnLoop(&L))
     return PreservedAnalyses::all();
 
@@ -207,7 +294,27 @@
 INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",
                     "Recognize loop idioms", false, false)
 
+char LoopVersioningIdiomRecognizeLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopVersioningIdiomRecognizeLegacyPass,
+                      "loop-versioning-idiom",
+                      "Recognize loop idioms with loop versioning", false,
+                      false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(LoopVersioningIdiomRecognizeLegacyPass,
+                    "loop-versioning-idiom",
+                    "Recognize loop idioms with loop versioning", false, false)
+
 Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }
+Pass *llvm::createLoopVersioningIdiomPass() {
+  return new LoopVersioningIdiomRecognizeLegacyPass();
+}
 
 static void deleteDeadInstruction(Instruction *I) {
   I->replaceAllUsesWith(UndefValue::get(I->getType()));
@@ -229,7 +336,7 @@
 
   // Disable loop idiom recognition if the function's name is a common idiom.
   StringRef Name = L->getHeader()->getParent()->getName();
-  if (Name == "memset" || Name == "memcpy")
+  if (Name == "memset" || Name == "memcpy" || Name == "memmove")
     return false;
 
   // Determine if code size heuristics need to be applied.
@@ -239,8 +346,9 @@
   HasMemset = TLI->has(LibFunc_memset);
   HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
   HasMemcpy = TLI->has(LibFunc_memcpy);
+  HasMemmove = TLI->has(LibFunc_memmove);
 
-  if (HasMemset || HasMemsetPattern || HasMemcpy)
+  if (HasMemset || HasMemsetPattern || HasMemcpy || HasMemmove)
     if (SE->hasLoopInvariantBackedgeTakenCount(L))
       return runOnCountableLoop();
 
@@ -397,7 +505,7 @@
   }
 
   // Otherwise, see if the store can be turned into a memcpy.
-  if (HasMemcpy) {
+  if (HasMemcpy || HasMemmove) {
     // Check to see if the stride matches the size of the store.  If so, then we
     // know that every byte is touched in the loop.
     APInt Stride = getStoreStride(StoreEv);
@@ -849,6 +957,58 @@
   return true;
 }
 
+// \brief Check if the instructions in Insts, together with their dependencies
+// cover the loop in the sense that the loop could be safely eliminated once
+// the instructions in Insts are removed.
+bool LoopIdiomRecognize::coverLoop(Loop *L,
+      SmallVectorImpl<Instruction*> &Insts) const {
+  SmallSet<BasicBlock*,8> LoopBlocks;
+  for (auto *B : L->blocks())
+    LoopBlocks.insert(B);
+
+  SetVector<Instruction*> Worklist(Insts.begin(), Insts.end());
+
+  // Collect all instructions from the loop that the instructions in Insts
+  // depend on (plus their dependencies, etc.).  These instructions will
+  // constitute the expression trees that feed those in Insts, but the trees
+  // will be limited only to instructions contained in the loop.
+  for (unsigned i = 0; i < Worklist.size(); ++i) {
+    Instruction *In = Worklist[i];
+    for (auto I = In->op_begin(), E = In->op_end(); I != E; ++I) {
+      Instruction *OpI = dyn_cast<Instruction>(I);
+      if (!OpI)
+        continue;
+      BasicBlock *PB = OpI->getParent();
+      if (!LoopBlocks.count(PB))
+        continue;
+      Worklist.insert(OpI);
+    }
+  }
+
+  // Scan all instructions in the loop, if any of them have a user outside
+  // of the loop, or outside of the expressions collected above, then either
+  // the loop has a side-effect visible outside of it, or there are
+  // instructions in it that are not involved in the original set Insts.
+  for (auto *B : L->blocks()) {
+    for (auto &In : *B) {
+      if (isa<BranchInst>(In) || isa<DbgInfoIntrinsic>(In))
+        continue;
+      if (!Worklist.count(&In) && In.mayHaveSideEffects())
+        return false;
+      for (const auto &K : In.users()) {
+        Instruction *UseI = dyn_cast<Instruction>(K);
+        if (!UseI)
+          continue;
+        BasicBlock *UseB = UseI->getParent();
+        if (LI->getLoopFor(UseB) != L)
+          return false;
+      }
+    }
+  }
+
+  return true;
+}
+
 /// If the stored value is a strided load in the same loop with the same stride
 /// this may be transformable into a memcpy.  This kicks in for stuff like
 ///   for (i) A[i] = B[i];
@@ -857,26 +1017,26 @@
   assert(SI->isSimple() && "Expected only non-volatile stores.");
 
   Value *StorePtr = SI->getPointerOperand();
-  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
+  auto *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
   APInt Stride = getStoreStride(StoreEv);
   unsigned StoreSize = getStoreSizeInBytes(SI, DL);
   bool NegStride = StoreSize == -Stride;
 
   // The store must be feeding a non-volatile load.
-  LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
-  assert(LI->isSimple() && "Expected only non-volatile stores.");
+  LoadInst *LdI = cast<LoadInst>(SI->getValueOperand());
+  assert(LdI->isSimple() && "Expected only non-volatile stores.");
 
   // See if the pointer expression is an AddRec like {base,+,1} on the current
   // loop, which indicates a strided load.  If we have something else, it's a
   // random load we can't handle.
-  const SCEVAddRecExpr *LoadEv =
-      cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
+  auto *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LdI->getPointerOperand()));
 
   // The trip count of the loop and the base pointer of the addrec SCEV is
   // guaranteed to be loop invariant, which means that it should dominate the
   // header.  This allows us to insert code for it in the preheader.
   BasicBlock *Preheader = CurLoop->getLoopPreheader();
-  IRBuilder<> Builder(Preheader->getTerminator());
+  Instruction *ExpPt = Preheader->getTerminator();
+  IRBuilder<> Builder(ExpPt);
   SCEVExpander Expander(*SE, *DL, "loop-idiom");
 
   const SCEV *StrStart = StoreEv->getStart();
@@ -893,21 +1053,61 @@
   // or write the memory region we're storing to.  This includes the load that
   // feeds the stores.  Check for an alias by generating the base address and
   // checking everything.
-  Value *StoreBasePtr = Expander.expandCodeFor(
-      StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
+  Value *StoreBasePtr =
+      Expander.expandCodeFor(StrStart, Builder.getInt8PtrTy(StrAS), ExpPt);
+  Value *LoadBasePtr = nullptr;
 
-  SmallPtrSet<Instruction *, 1> Stores;
-  Stores.insert(SI);
+  bool RuntimeCheck = false;
+  SmallPtrSet<Instruction *, 2> Ignore1;
+  Ignore1.insert(SI);
   if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount,
-                            StoreSize, *AA, Stores)) {
-    Expander.clear();
-    // If we generated new code for the base pointer, clean up.
-    RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
-    return false;
+                            StoreSize, *AA, Ignore1)) {
+    if (!UseLoopVersioning)
+      goto CleanupAndExit;
+
+    // Check if the load is the offending instruction.
+    Ignore1.insert(LdI);
+    if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount,
+                              StoreSize, *AA, Ignore1)) {
+      CleanupAndExit:
+      Expander.clear();
+      // If we generated new code for the base pointer, clean up.
+      if (StoreBasePtr && (LoadBasePtr != StoreBasePtr)) {
+        RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
+        StoreBasePtr = nullptr;
+      }
+      if (LoadBasePtr) {
+        RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI);
+        LoadBasePtr = nullptr;
+      }
+      return false;
+    }
+    // It worked with the load ignored: we need a runtime check to disambiguate
+    // the dependence relation between the load and the store.
+    RuntimeCheck = true;
+
+    // In case of a memmove, the call to memmove will be executed instead
+    // of the loop, so we need to make sure that there is nothing else in
+    // the loop than the load, store and instructions that these two depend
+    // on.
+    SmallVector<Instruction*,2> Insts;
+    Insts.push_back(SI);
+    Insts.push_back(LdI);
+    if (!coverLoop(CurLoop, Insts))
+      goto CleanupAndExit;
   }
 
+  if (RuntimeCheck) {
+    if (DisableMemmoveIdiom)
+      goto CleanupAndExit;
+  } else if (DisableMemcpyIdiom)
+    goto CleanupAndExit;
+
+  if (OnlyNonNestedMemmove && CurLoop->getParentLoop())
+    goto CleanupAndExit;
+
   const SCEV *LdStart = LoadEv->getStart();
-  unsigned LdAS = LI->getPointerAddressSpace();
+  unsigned LdAS = LdI->getPointerAddressSpace();
 
   // Handle negative strided loops.
   if (NegStride)
@@ -915,26 +1115,36 @@
 
   // For a memcpy, we have to make sure that the input array is not being
   // mutated by the loop.
-  Value *LoadBasePtr = Expander.expandCodeFor(
+  LoadBasePtr = Expander.expandCodeFor(
       LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
 
+  SmallPtrSet<Instruction*, 2> Ignore2;
+  Ignore2.insert(SI);
   if (mayLoopAccessLocation(LoadBasePtr, MRI_Mod, CurLoop, BECount, StoreSize,
-                            *AA, Stores)) {
-    Expander.clear();
-    // If we generated new code for the base pointer, clean up.
-    RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI);
-    RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
-    return false;
-  }
+                            *AA, Ignore2))
+    goto CleanupAndExit;
 
   if (avoidLIRForMultiBlockLoop())
-    return false;
-
-  // Okay, everything is safe, we can transform this!
+    goto CleanupAndExit;
+
+  // The runtime check needs a single exit block.
+  BasicBlock *ExitB = nullptr;
+  if (RuntimeCheck) {
+    ExitB = CurLoop->getUniqueExitBlock();
+    if (!ExitB)
+      goto CleanupAndExit;
+
+    // Make sure the loop is in simple form.
+    assert(CurLoop->isLoopSimplifyForm() &&
+           "loop versioning idiom pass should run on simple loops");
+  }
 
   // The # stored bytes is (BECount+1)*Size.  Expand the trip count out to
   // pointer size if it isn't already.
+  LLVMContext &Ctx = SI->getContext();
   BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy);
+  unsigned Align = std::min(SI->getAlignment(), LdI->getAlignment());
+  DebugLoc DLoc = SI->getDebugLoc();
 
   const SCEV *NumBytesS =
       SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW);
@@ -942,22 +1152,104 @@
     NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize),
                                SCEV::FlagNUW);
 
-  Value *NumBytes =
-      Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator());
+  Value *NumBytes = Expander.expandCodeFor(NumBytesS, IntPtrTy, ExpPt);
+  CallInst *NewCall;
 
-  CallInst *NewCall =
-      Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes,
-                           std::min(SI->getAlignment(), LI->getAlignment()));
-  NewCall->setDebugLoc(SI->getDebugLoc());
+  if (!RuntimeCheck) {
+    NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, Align);
+    NewCall->setDebugLoc(SI->getDebugLoc());
+    ++NumMemCpy;
+    // Okay, the memcpy has been formed.  Zap the original store and anything that
+    // feeds into it.
+    deleteDeadInstruction(SI);
+  } else {
+    unsigned Threshold = RuntimeMemSizeThreshold;
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes)) {
+      uint64_t C = CI->getZExtValue();
+      if (Threshold != 0 && C < Threshold)
+        goto CleanupAndExit;
+      if (C < CompileTimeMemSizeThreshold)
+         goto CleanupAndExit;
+    }
 
-  DEBUG(dbgs() << "  Formed memcpy: " << *NewCall << "\n"
-               << "    from load ptr=" << *LoadEv << " at: " << *LI << "\n"
+    BasicBlock *Header = CurLoop->getHeader();
+    Function *Func = Header->getParent();
+    Loop *ParentL = LI->getLoopFor(Preheader);
+    StringRef HeaderName = Header->getName();
+
+    // Create a new (empty) preheader, and update the PHI nodes in the
+    // header to use the new preheader.
+    BasicBlock *NewPreheader = BasicBlock::Create(Ctx, HeaderName+".rtli.ph",
+                                                  Func, Header);
+    if (ParentL)
+      ParentL->addBasicBlockToLoop(NewPreheader, *LI);
+    IRBuilder<>(NewPreheader).CreateBr(Header);
+    for (auto &In : *Header) {
+      PHINode *PN = dyn_cast<PHINode>(&In);
+      if (!PN)
+        break;
+      int bx = PN->getBasicBlockIndex(Preheader);
+      if (bx >= 0)
+        PN->setIncomingBlock(bx, NewPreheader);
+    }
+    DT->addNewBlock(NewPreheader, Preheader);
+    DT->changeImmediateDominator(Header, NewPreheader);
+
+    // Check for safe conditions to execute memmove.
+    // If stride is positive, copying things from higher to lower addresses
+    // is equivalent to memmove.  For negative stride, it's the other way
+    // around.  Copying forward in memory with positive stride may not be
+    // same as memmove since we may be copying values that we just stored
+    // in some previous iteration.
+    Value *LA = Builder.CreatePtrToInt(LoadBasePtr, IntPtrTy);
+    Value *SA = Builder.CreatePtrToInt(StoreBasePtr, IntPtrTy);
+    Value *LowA = NegStride ? LA : SA;
+    Value *HighA = NegStride ? SA : LA;
+    // Check whether the dependence is a Write After Read: a WAR dependence has
+    // the same semantics as memmove.
+    Value *IsWAR = Builder.CreateICmpULT(LowA, HighA);
+
+    // Check for distance between pointers: in the case of a RAW dependence, we
+    // need to make sure that there is no overlap.
+    Value *Dist = Builder.CreateSub(HighA, LowA);
+    Value *CmpD = Builder.CreateICmpSLT(NumBytes, Dist);
+    Value *CmpEither = Builder.CreateOr(IsWAR, CmpD);
+    Value *Cond = CmpEither;
+
+    if (Threshold != 0) {
+      Type *Ty = NumBytes->getType();
+      Value *Thr = ConstantInt::get(Ty, Threshold);
+      Value *CmpB = Builder.CreateICmpULT(Thr, NumBytes);
+      Value *CmpBoth = Builder.CreateAnd(Cond, CmpB);
+      Cond = CmpBoth;
+    }
+
+    BasicBlock *MemmoveB = BasicBlock::Create(Ctx, Header->getName() + ".rtli",
+                                              Func, NewPreheader);
+    if (ParentL)
+      ParentL->addBasicBlockToLoop(MemmoveB, *LI);
+    Instruction *OldT = Preheader->getTerminator();
+    Builder.CreateCondBr(Cond, MemmoveB, NewPreheader);
+    OldT->eraseFromParent();
+    Preheader->setName(Preheader->getName() + ".old");
+    DT->addNewBlock(MemmoveB, Preheader);
+    DT->getNode(ExitB)->setIDom(DT->getNode(Preheader));
+
+    // Add a call to memmove in the conditional block.
+    IRBuilder<> CondBuilder(MemmoveB);
+    CondBuilder.CreateBr(ExitB);
+    CondBuilder.SetInsertPoint(MemmoveB->getTerminator());
+    NewCall =
+        CondBuilder.CreateMemMove(StoreBasePtr, LoadBasePtr, NumBytes, Align);
+    NewCall->setDebugLoc(SI->getDebugLoc());
+    ++NumMemMove;
+  }
+
+  DEBUG(dbgs() << "  Formed " << (!RuntimeCheck ? "memcpy: " : "memmove: ")
+               << *NewCall << "\n"
+               << "    from load ptr=" << *LoadEv << " at: " << *LdI << "\n"
                << "    from store ptr=" << *StoreEv << " at: " << *SI << "\n");
 
-  // Okay, the memcpy has been formed.  Zap the original store and anything that
-  // feeds into it.
-  deleteDeadInstruction(SI);
-  ++NumMemCpy;
   return true;
 }
 
Index: llvm/lib/Transforms/Scalar/Scalar.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/Scalar.cpp
+++ llvm/lib/Transforms/Scalar/Scalar.cpp
@@ -67,6 +67,7 @@
   initializeLoopUnswitchPass(Registry);
   initializeLoopVersioningLICMPass(Registry);
   initializeLoopIdiomRecognizeLegacyPassPass(Registry);
+  initializeLoopVersioningIdiomRecognizeLegacyPassPass(Registry);
   initializeLowerAtomicLegacyPassPass(Registry);
   initializeLowerExpectIntrinsicPass(Registry);
   initializeLowerGuardIntrinsicLegacyPassPass(Registry);
@@ -165,6 +166,10 @@
   unwrap(PM)->add(createLoopIdiomPass());
 }
 
+void LLVMAddLoopVersioningIdiomPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLoopVersioningIdiomPass());
+}
+
 void LLVMAddLoopRotatePass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createLoopRotatePass());
 }
Index: llvm/test/CodeGen/Hexagon/loop-idiom/hexagon-memmove1.ll
===================================================================
--- llvm/test/CodeGen/Hexagon/loop-idiom/hexagon-memmove1.ll
+++ /dev/null
@@ -1,36 +0,0 @@
-; Check for recognizing the "memmove" idiom.
-; RUN: opt -basicaa -hexagon-loop-idiom -S -mtriple hexagon-unknown-elf < %s \
-; RUN:  | FileCheck %s
-; CHECK: call void @llvm.memmove
-
-; Function Attrs: norecurse nounwind
-define void @foo(i32* nocapture %A, i32* nocapture readonly %B, i32 %n) #0 {
-entry:
-  %cmp1 = icmp sgt i32 %n, 0
-  br i1 %cmp1, label %for.body.preheader, label %for.end
-
-for.body.preheader:                               ; preds = %entry
-  %arrayidx.gep = getelementptr i32, i32* %B, i32 0
-  %arrayidx1.gep = getelementptr i32, i32* %A, i32 0
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %arrayidx.phi = phi i32* [ %arrayidx.gep, %for.body.preheader ], [ %arrayidx.inc, %for.body ]
-  %arrayidx1.phi = phi i32* [ %arrayidx1.gep, %for.body.preheader ], [ %arrayidx1.inc, %for.body ]
-  %i.02 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
-  %0 = load i32, i32* %arrayidx.phi, align 4
-  store i32 %0, i32* %arrayidx1.phi, align 4
-  %inc = add nuw nsw i32 %i.02, 1
-  %exitcond = icmp ne i32 %inc, %n
-  %arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1
-  %arrayidx1.inc = getelementptr i32, i32* %arrayidx1.phi, i32 1
-  br i1 %exitcond, label %for.body, label %for.end.loopexit
-
-for.end.loopexit:                                 ; preds = %for.body
-  br label %for.end
-
-for.end:                                          ; preds = %for.end.loopexit, %entry
-  ret void
-}
-
-attributes #0 = { nounwind }
Index: llvm/test/CodeGen/Hexagon/loop-idiom/hexagon-memmove2.ll
===================================================================
--- llvm/test/CodeGen/Hexagon/loop-idiom/hexagon-memmove2.ll
+++ /dev/null
@@ -1,36 +0,0 @@
-; RUN: opt -basicaa -hexagon-loop-idiom -S -mtriple hexagon-unknown-elf < %s \
-; RUN:  | FileCheck %s
-
-define void @PR14241(i32* %s, i64 %size) #0 {
-; Ensure that we don't form a memcpy for strided loops. Briefly, when we taught
-; LoopIdiom about memmove and strided loops, this got miscompiled into a memcpy
-; instead of a memmove. If we get the memmove transform back, this will catch
-; regressions.
-;
-; CHECK-LABEL: @PR14241(
-
-entry:
-  %end.idx = add i64 %size, -1
-  %end.ptr = getelementptr inbounds i32, i32* %s, i64 %end.idx
-  br label %while.body
-; CHECK-NOT: memcpy
-; CHECK: memmove
-
-while.body:
-  %phi.ptr = phi i32* [ %s, %entry ], [ %next.ptr, %while.body ]
-  %src.ptr = getelementptr inbounds i32, i32* %phi.ptr, i64 1
-  %val = load i32, i32* %src.ptr, align 4
-; CHECK: load
-  %dst.ptr = getelementptr inbounds i32, i32* %phi.ptr, i64 0
-  store i32 %val, i32* %dst.ptr, align 4
-; CHECK: store
-  %next.ptr = getelementptr inbounds i32, i32* %phi.ptr, i64 1
-  %cmp = icmp eq i32* %next.ptr, %end.ptr
-  br i1 %cmp, label %exit, label %while.body
-
-exit:
-  ret void
-; CHECK: ret void
-}
-
-attributes #0 = { nounwind }
Index: llvm/test/Other/new-pm-defaults.ll
===================================================================
--- llvm/test/Other/new-pm-defaults.ll
+++ llvm/test/Other/new-pm-defaults.ll
@@ -137,6 +137,7 @@
 ; CHECK-O-NEXT: Running pass: Float2IntPass
 ; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LoopRotatePass
 ; CHECK-O-NEXT: Running pass: LoopDistributePass
+; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LoopVersioningIdiomRecognizePass
 ; CHECK-O-NEXT: Running pass: LoopVectorizePass
 ; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis
 ; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis
Index: llvm/test/Other/pass-pipelines.ll
===================================================================
--- llvm/test/Other/pass-pipelines.ll
+++ llvm/test/Other/pass-pipelines.ll
@@ -68,6 +68,8 @@
 ; We rotate loops prior to vectorization.
 ; CHECK-O2: Loop Pass Manager
 ; CHECK-O2-NEXT: Rotate Loops
+; CHECK-O2: Loop Pass Manager
+; CHECK-O2-NEXT: Recognize loop idioms with loop versioning
 ; CHECK-O2-NOT: Manager
 ; CHECK-O2: Loop Vectorization
 ; CHECK-O2-NOT: Manager
Index: llvm/test/Transforms/LoopIdiom/basic.ll
===================================================================
--- llvm/test/Transforms/LoopIdiom/basic.ll
+++ llvm/test/Transforms/LoopIdiom/basic.ll
@@ -353,8 +353,6 @@
 ; CHECK: ret void
 }
 
-
-
 ; PR9815 - This is a partial overlap case that cannot be safely transformed
 ; into a memcpy.
 @g_50 = global [7 x i32] [i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0], align 16
Index: llvm/test/Transforms/LoopIdiom/memmove.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopIdiom/memmove.ll
@@ -0,0 +1,43 @@
+; RUN: opt -S -basicaa -loop-versioning-idiom < %s | FileCheck %s
+; RUN: opt -S -basicaa -loop-idiom < %s | FileCheck %s -check-prefix=LOOP-IDIOM
+
+declare i64 @foo() nounwind
+
+; CHECK-LABEL: @test1
+; CHECK: call void @llvm.memmove.p0i8.p0i8.i64(
+; LOOP-IDIOM-LABEL: @test1
+; LOOP-IDIOM-NOT: call void @llvm.memmove.p0i8.p0i8.i64(
+
+; Nested loops
+define void @test1(i8* nocapture %A, i64 %n) nounwind {
+entry:
+  %call8 = tail call i64 @foo() nounwind
+  %tobool9 = icmp eq i64 %call8, 0
+  br i1 %tobool9, label %while.end, label %for.cond.preheader.lr.ph
+
+for.cond.preheader.lr.ph:                         ; preds = %entry
+  %cmp6 = icmp eq i64 %n, 0
+  br label %for.cond.preheader
+
+while.cond.loopexit:                              ; preds = %for.body, %for.cond.preheader
+  %call = tail call i64 @foo() nounwind
+  %tobool = icmp eq i64 %call, 0
+  br i1 %tobool, label %while.end, label %for.cond.preheader
+
+for.cond.preheader:                               ; preds = %for.cond.preheader.lr.ph, %while.cond.loopexit
+  br i1 %cmp6, label %while.cond.loopexit, label %for.body
+
+for.body:                                         ; preds = %for.cond.preheader, %for.body
+  %i.07 = phi i64 [ %inc, %for.body ], [ 0, %for.cond.preheader ]
+  %add = add i64 %i.07, 10
+  %arrayidx = getelementptr inbounds i8, i8* %A, i64 %add
+  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, i8* %A, i64 %i.07
+  store i8 %0, i8* %arrayidx1, align 1
+  %inc = add i64 %i.07, 1
+  %exitcond = icmp eq i64 %inc, %n
+  br i1 %exitcond, label %while.cond.loopexit, label %for.body
+
+while.end:                                        ; preds = %while.cond.loopexit, %entry
+  ret void
+}
Index: llvm/test/Transforms/LoopIdiom/memmove1.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopIdiom/memmove1.ll
@@ -0,0 +1,36 @@
+; Check for recognizing the "memmove" idiom.
+; RUN: opt -loop-versioning-idiom -S < %s | FileCheck %s
+
+; CHECK-LABEL: @foo(
+; CHECK: call void @llvm.memmove
+
+define void @foo(i32* nocapture %A, i32* nocapture readonly %B, i32 %n) #0 {
+entry:
+  %cmp1 = icmp sgt i32 %n, 0
+  br i1 %cmp1, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %arrayidx.gep = getelementptr i32, i32* %B, i32 0
+  %arrayidx1.gep = getelementptr i32, i32* %A, i32 0
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %arrayidx.phi = phi i32* [ %arrayidx.gep, %for.body.preheader ], [ %arrayidx.inc, %for.body ]
+  %arrayidx1.phi = phi i32* [ %arrayidx1.gep, %for.body.preheader ], [ %arrayidx1.inc, %for.body ]
+  %i.02 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %0 = load i32, i32* %arrayidx.phi, align 4
+  store i32 %0, i32* %arrayidx1.phi, align 4
+  %inc = add nuw nsw i32 %i.02, 1
+  %exitcond = icmp ne i32 %inc, %n
+  %arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1
+  %arrayidx1.inc = getelementptr i32, i32* %arrayidx1.phi, i32 1
+  br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+attributes #0 = { nounwind }
Index: llvm/test/Transforms/LoopIdiom/memmove2.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopIdiom/memmove2.ll
@@ -0,0 +1,35 @@
+; RUN: opt -loop-versioning-idiom -S < %s | FileCheck %s
+
+define void @PR14241(i32* %s, i64 %size) #0 {
+; Ensure that we don't form a memcpy for strided loops. Briefly, when we taught
+; LoopIdiom about memmove and strided loops, this got miscompiled into a memcpy
+; instead of a memmove. If we get the memmove transform back, this will catch
+; regressions.
+;
+; CHECK-LABEL: @PR14241(
+
+entry:
+  %end.idx = add i64 %size, -1
+  %end.ptr = getelementptr inbounds i32, i32* %s, i64 %end.idx
+  br label %while.body
+; CHECK-NOT: memcpy
+; CHECK: memmove
+
+while.body:
+  %phi.ptr = phi i32* [ %s, %entry ], [ %next.ptr, %while.body ]
+  %src.ptr = getelementptr inbounds i32, i32* %phi.ptr, i64 1
+  %val = load i32, i32* %src.ptr, align 4
+; CHECK: load
+  %dst.ptr = getelementptr inbounds i32, i32* %phi.ptr, i64 0
+  store i32 %val, i32* %dst.ptr, align 4
+; CHECK: store
+  %next.ptr = getelementptr inbounds i32, i32* %phi.ptr, i64 1
+  %cmp = icmp eq i32* %next.ptr, %end.ptr
+  br i1 %cmp, label %exit, label %while.body
+
+exit:
+  ret void
+; CHECK: ret void
+}
+
+attributes #0 = { nounwind }
Index: llvm/test/Transforms/LoopIdiom/memset_noidiom.ll
===================================================================
--- llvm/test/Transforms/LoopIdiom/memset_noidiom.ll
+++ llvm/test/Transforms/LoopIdiom/memset_noidiom.ll
@@ -28,3 +28,54 @@
   ret i8* %b
 }
 
+; CHECK-LABEL: @memcpy
+; CHECK-NOT: llvm.memcpy
+define i8* @memcpy(i8* noalias %dst, i8* noalias %src, i64 %n) nounwind {
+entry:
+  %tobool3 = icmp eq i64 %n, 0
+  br i1 %tobool3, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %c2.06 = phi i8* [ %incdec.ptr, %while.body ], [ %src, %entry ]
+  %c1.05 = phi i8* [ %incdec.ptr1, %while.body ], [ %dst, %entry ]
+  %n.addr.04 = phi i64 [ %dec, %while.body ], [ %n, %entry ]
+  %dec = add i64 %n.addr.04, -1
+  %incdec.ptr = getelementptr inbounds i8, i8* %c2.06, i64 1
+  %0 = load i8, i8* %c2.06, align 1
+  %incdec.ptr1 = getelementptr inbounds i8, i8* %c1.05, i64 1
+  store i8 %0, i8* %c1.05, align 1
+  %tobool = icmp eq i64 %dec, 0
+  br i1 %tobool, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  ret i8* %dst
+}
+
+; CHECK-LABEL: @memmove
+; CHECK-NOT: llvm.memmove
+define i8* @memmove(i8* %dst, i8* nocapture %src, i64 %count) nounwind {
+entry:
+  %sub = add i64 %count, -1
+  %tobool9 = icmp eq i64 %count, 0
+  br i1 %tobool9, label %while.end, label %while.body.lr.ph
+
+while.body.lr.ph:                                 ; preds = %entry
+  %add.ptr2 = getelementptr inbounds i8, i8* %src, i64 %sub
+  %add.ptr = getelementptr inbounds i8, i8* %dst, i64 %sub
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.lr.ph, %while.body
+  %b.012 = phi i8* [ %add.ptr2, %while.body.lr.ph ], [ %incdec.ptr, %while.body ]
+  %a.011 = phi i8* [ %add.ptr, %while.body.lr.ph ], [ %incdec.ptr3, %while.body ]
+  %count.addr.010 = phi i64 [ %count, %while.body.lr.ph ], [ %dec, %while.body ]
+  %dec = add i64 %count.addr.010, -1
+  %incdec.ptr = getelementptr inbounds i8, i8* %b.012, i64 -1
+  %0 = load i8, i8* %b.012, align 1
+  %incdec.ptr3 = getelementptr inbounds i8, i8* %a.011, i64 -1
+  store i8 %0, i8* %a.011, align 1
+  %tobool = icmp eq i64 %dec, 0
+  br i1 %tobool, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  ret i8* %dst
+}
Index: llvm/test/Transforms/LoopIdiom/pr28196.ll
===================================================================
--- llvm/test/Transforms/LoopIdiom/pr28196.ll
+++ llvm/test/Transforms/LoopIdiom/pr28196.ll
@@ -1,8 +1,5 @@
 ; RUN: opt -loop-idiom -S < %s | FileCheck %s
 
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
 define void @test1() {
 entry:
   br label %for.body.preheader