diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
--- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -55,6 +55,14 @@
 } // end namespace slpvectorizer
 
+struct SLPVectorizerResult {
+  bool MadeAnyChange;
+  bool MadeCFGChange;
+
+  SLPVectorizerResult(bool MadeAnyChange, bool MadeCFGChange)
+      : MadeAnyChange(MadeAnyChange), MadeCFGChange(MadeCFGChange) {}
+};
+
 struct SLPVectorizerPass : public PassInfoMixin<SLPVectorizerPass> {
   using StoreList = SmallVector<StoreInst *, 8>;
   using StoreListMap = MapVector<Value *, StoreList>;
@@ -75,10 +83,12 @@
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 
   // Glue for old PM.
-  bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_,
-               TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_,
-               DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_,
-               OptimizationRemarkEmitter *ORE_);
+  SLPVectorizerResult runImpl(Function &F, ScalarEvolution *SE_,
+                              TargetTransformInfo *TTI_,
+                              TargetLibraryInfo *TLI_, AAResults *AA_,
+                              LoopInfo *LI_, DominatorTree *DT_,
+                              AssumptionCache *AC_, DemandedBits *DB_,
+                              OptimizationRemarkEmitter *ORE_);
 
 private:
   /// Collect store and getelementptr instructions and organize them
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -35,6 +35,7 @@
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/DemandedBits.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/IVDescriptors.h"
 #include "llvm/Analysis/LoopAccessAnalysis.h"
@@ -62,6 +63,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/NoFolder.h"
 #include "llvm/IR/Operator.h"
@@ -85,8 +87,11 @@
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
 #include "llvm/Transforms/Vectorize.h"
 #include <algorithm>
 #include <cassert>
@@ -107,6 +112,10 @@
 #define DEBUG_TYPE "SLP"
 
 STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
+STATISTIC(NumVersioningSuccessful,
+          "Number of times versioning was tried and beneficial");
+STATISTIC(NumVersioningFailed,
+          "Number of times versioning was tried but was not beneficial");
 
 cl::opt<bool> RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
                                   cl::desc("Run the SLP vectorization passes"));
@@ -175,6 +184,10 @@
     ViewSLPTree("view-slp-tree", cl::Hidden,
                 cl::desc("Display the SLP trees with Graphviz"));
 
+static cl::opt<bool> EnableMemoryVersioning(
+    "slp-memory-versioning", cl::init(false), cl::Hidden,
+    cl::desc("Enable memory versioning for SLP vectorization."));
+
 // Limit the number of alias checks. The limit is chosen so that
 // it has no negative effect on the llvm benchmarks.
 static const unsigned AliasedCheckLimit = 10;
@@ -581,6 +594,44 @@
   return Index;
 }
 
+// Try to add or extend the runtime pointer checking group for \p I, if it is a
+// memory access.
+static bool
+extendMemBounds(Instruction &I, bool Insert, ScalarEvolution &SE,
+                MapVector<Value *, RuntimeCheckingPtrGroup> &MemBounds) {
+
+  BasicBlock *BB = I.getParent();
+  auto GetPtr = [](Instruction *I) -> Value * {
+    if (auto *L = dyn_cast<LoadInst>(I))
+      return L->getPointerOperand();
+    if (auto *S = dyn_cast<StoreInst>(I))
+      return S->getPointerOperand();
+    return nullptr;
+  };
+  auto *Ptr = GetPtr(&I);
+  if (!Ptr)
+    return false;
+  auto *Start = SE.getSCEV(Ptr);
+
+  Value *Obj = getUnderlyingObject(Ptr);
+  if (!Obj)
+    return false;
+
+  if (!SE.properlyDominates(Start, BB))
+    return false;
+
+  unsigned AS = Ptr->getType()->getPointerAddressSpace();
+  // Runtime checks are generated to ensure this property holds.
+  auto *End = SE.getAddExpr(Start, SE.getOne(Ptr->getType()), SCEV::FlagNUW);
+  if (Insert)
+    MemBounds.insert({Obj, {0, Start, End, AS}});
+  auto BoundsIter = MemBounds.find(Obj);
+  if (BoundsIter == MemBounds.end())
+    return false;
+
+  return BoundsIter->second.addPointer(0, Start, End, AS, SE);
+}
+
 namespace slpvectorizer {
 
 /// Bottom Up SLP Vectorizer.
@@ -589,6 +640,16 @@
   struct ScheduleData;
 
 public:
+  // Map of objects to start & end pointers we need to generate runtime checks
+  // for.
+  MapVector<Value *, RuntimeCheckingPtrGroup> MemBounds;
+  /// Cache for alias results.
+  /// TODO: consider moving this to the AliasAnalysis itself.
+  using AliasCacheKey = std::pair<Instruction *, Instruction *>;
+  DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
+
+  bool CollectMemAccess = false;
+
   using ValueList = SmallVector<Value *, 8>;
   using InstrList = SmallVector<Instruction *, 16>;
   using ValueSet = SmallPtrSet<Value *, 16>;
@@ -667,6 +728,7 @@
     }
     MinBWs.clear();
     InstrElementSize.clear();
+    MemBounds.clear();
   }
 
   unsigned getTreeSize() const { return VectorizableTree.size(); }
@@ -674,6 +736,25 @@
   /// Perform LICM and CSE on the newly generated gather sequences.
   void optimizeGatherSequence();
 
+  /// Remove instructions in DeletedInstructions.
+  void removeDeletedInstructions() {
+    for (const auto &Pair : DeletedInstructions) {
+      // Replace operands of ignored instructions with Undefs in case if they
+      // were marked for deletion.
+      if (Pair.getSecond()) {
+        Value *Undef = UndefValue::get(Pair.getFirst()->getType());
+        Pair.getFirst()->replaceAllUsesWith(Undef);
+      }
+      Pair.getFirst()->dropAllReferences();
+    }
+    for (const auto &Pair : DeletedInstructions) {
+      assert(Pair.getFirst()->use_empty() &&
+             "trying to erase instruction with users.");
+      Pair.getFirst()->eraseFromParent();
+    }
+    DeletedInstructions.clear();
+  }
+
   /// \returns The best order of instructions for vectorization.
   Optional<ArrayRef<unsigned>> bestOrder() const {
     assert(llvm::all_of(
@@ -1976,11 +2057,6 @@
     return aliased;
   }
 
-  using AliasCacheKey = std::pair<Instruction *, Instruction *>;
-
-  /// Cache for alias results.
-  /// TODO: consider moving this to the AliasAnalysis itself.
-  DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
 
   /// Removes an instruction from its block and eventually deletes it.
   /// It's like Instruction::eraseFromParent() except that the actual deletion
@@ -2565,27 +2641,7 @@
 } // end namespace llvm
 
-BoUpSLP::~BoUpSLP() {
-  for (const auto &Pair : DeletedInstructions) {
-    // Replace operands of ignored instructions with Undefs in case if they were
-    // marked for deletion.
-    if (Pair.getSecond()) {
-      Value *Undef = UndefValue::get(Pair.getFirst()->getType());
-      Pair.getFirst()->replaceAllUsesWith(Undef);
-    }
-    Pair.getFirst()->dropAllReferences();
-  }
-  for (const auto &Pair : DeletedInstructions) {
-    assert(Pair.getFirst()->use_empty() &&
-           "trying to erase instruction with users.");
-    Pair.getFirst()->eraseFromParent();
-  }
-#ifdef EXPENSIVE_CHECKS
-  // If we could guarantee that this call is not extremely slow, we could
-  // remove the ifdef limitation (see PR47712).
-  assert(!verifyFunction(*F, &dbgs()));
-#endif
-}
+BoUpSLP::~BoUpSLP() { removeDeletedInstructions(); }
 
 void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) {
   for (auto *V : AV) {
@@ -6213,6 +6269,7 @@
           while (DepDest) {
             assert(isInSchedulingRegion(DepDest));
 
+            ScheduleData *DestBundle = DepDest->FirstInBundle;
             // We have two limits to reduce the complexity:
             // 1) AliasedCheckLimit: It's a small limit to reduce calls to
             //    SLP->isAliased (which is the expensive part in this loop).
@@ -6230,9 +6287,41 @@
              //    balance between reduced runtime and accurate dependencies.
              numAliased++;
 
+              // If this bundle is not scheduled and no versioned code has been
+              // generated yet, try to collect the bounds of the accesses to
+              // generate runtime checks.
+              if (!DestBundle->IsScheduled && SLP->CollectMemAccess) {
+                // FIXME Naming
+                auto GetPtr = [](Instruction *I) -> Value * {
+                  if (auto *L = dyn_cast<LoadInst>(I))
+                    return L->getPointerOperand();
+                  if (auto *S = dyn_cast<StoreInst>(I))
+                    return S->getPointerOperand();
+                  return nullptr;
+                };
+                auto *Src = GetPtr(SrcInst);
+                auto *Dst = GetPtr(DepDest->Inst);
+
+                if (SrcInst->getParent() == DepDest->Inst->getParent() && Src &&
+                    Dst) {
+                  auto GetPtr = [](Instruction *I) -> Value * {
+                    if (auto *L = dyn_cast<LoadInst>(I))
+                      return getUnderlyingObject(L->getPointerOperand());
+                    if (auto *S = dyn_cast<StoreInst>(I))
+                      return getUnderlyingObject(S->getPointerOperand());
+                    return nullptr;
+                  };
+                  bool AddedSrc =
+                      extendMemBounds(*SrcInst, true, *SLP->SE, SLP->MemBounds);
+                  bool AddedDst = extendMemBounds(*DepDest->Inst, true,
+                                                  *SLP->SE, SLP->MemBounds);
+                  if (!AddedSrc || !AddedDst ||
+                      GetPtr(SrcInst) == GetPtr(DepDest->Inst))
+                    SLP->MemBounds.clear();
+                }
+              }
              DepDest->MemoryDependencies.push_back(BundleMember);
              BundleMember->Dependencies++;
-              ScheduleData *DestBundle = DepDest->FirstInBundle;
              if (!DestBundle->IsScheduled) {
                BundleMember->incrementUnscheduledDeps(1);
              }
@@ -6672,7 +6761,7 @@
     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
 
-    return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
+    return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE).MadeAnyChange;
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -6688,9 +6777,7 @@
     AU.addRequired<InjectTLIMappingsLegacy>();
     AU.addPreserved<LoopInfoWrapperPass>();
     AU.addPreserved<DominatorTreeWrapperPass>();
-    AU.addPreserved<AAResultsWrapperPass>();
     AU.addPreserved<GlobalsAAWrapperPass>();
-    AU.setPreservesCFG();
   }
 };
 
@@ -6707,23 +6794,24 @@
   auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
   auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
 
-  bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
-  if (!Changed)
+  auto Result = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
+  if (!Result.MadeAnyChange)
     return PreservedAnalyses::all();
 
   PreservedAnalyses PA;
-  PA.preserveSet<CFGAnalyses>();
+  if (!Result.MadeCFGChange)
+    PA.preserveSet<CFGAnalyses>();
+  PA.preserve<DominatorTreeAnalysis>();
+  PA.preserve<LoopAnalysis>();
   return PA;
 }
 
-bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
-                                TargetTransformInfo *TTI_,
-                                TargetLibraryInfo *TLI_, AAResults *AA_,
-                                LoopInfo *LI_, DominatorTree *DT_,
-                                AssumptionCache *AC_, DemandedBits *DB_,
-                                OptimizationRemarkEmitter *ORE_) {
+SLPVectorizerResult SLPVectorizerPass::runImpl(
+    Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_,
+    TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_,
+    AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_) {
   if (!RunSLPVectorization)
-    return false;
+    return {false, false};
   SE = SE_;
   TTI = TTI_;
   TLI = TLI_;
@@ -6737,15 +6825,16 @@
   Stores.clear();
   GEPs.clear();
   bool Changed = false;
+  bool CFGChanged = false;
 
   // If the target claims to have no vector registers don't attempt
   // vectorization.
   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)))
-    return false;
+    return {false, false};
 
   // Don't vectorize when the attribute NoImplicitFloat is used.
   if (F.hasFnAttribute(Attribute::NoImplicitFloat))
-    return false;
+    return {false, false};
 
   LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
 
@@ -6759,6 +6848,8 @@
   // Update DFS numbers now so that we can use them for ordering.
   DT->updateDFSNumbers();
 
+  SmallVector<BasicBlock *> BlocksToRetry;
+  SmallVector<MapVector<Value *, RuntimeCheckingPtrGroup>, 4> BoundsToUse;
   // Scan the blocks in the function in post order.
   for (auto BB : post_order(&F.getEntryBlock())) {
     collectSeedInstructions(BB);
@@ -6767,7 +6858,43 @@
     if (!Stores.empty()) {
       LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
                         << " underlying objects.\n");
-      Changed |= vectorizeStoreChains(R);
+      R.MemBounds.clear();
+
+      auto NoOrSingleSucc = [](BasicBlock *BB) {
+        return succ_begin(BB) == succ_end(BB) ||
+               std::next(succ_begin(BB)) == succ_end(BB);
+      };
+      auto NoOrSinglePred = [](BasicBlock *BB) {
+        return pred_begin(BB) == pred_end(BB) ||
+               std::next(pred_begin(BB)) == pred_end(BB);
+      };
+
+      auto AllUsesInside = [](BasicBlock *BB) {
+        return all_of(*BB, [BB](Instruction &I) {
+          return all_of(I.users(), [BB](User *U) {
+            return cast<Instruction>(U)->getParent() == BB;
+          });
+        });
+      };
+      auto TermSupported = [](BasicBlock *BB) {
+        auto *RetI = dyn_cast<ReturnInst>(BB->getTerminator());
+        return isa<BranchInst>(BB->getTerminator()) ||
+               (RetI && !RetI->getReturnValue());
+      };
+
+      if (EnableMemoryVersioning)
+        R.CollectMemAccess = BB->size() <= 300 && NoOrSingleSucc(BB) &&
+                             NoOrSinglePred(BB) && AllUsesInside(BB) &&
+                             TermSupported(BB);
+
+      bool VectorizedChains = vectorizeStoreChains(R);
+      if (!VectorizedChains && !R.MemBounds.empty()) {
+        BlocksToRetry.push_back(BB);
+        BoundsToUse.push_back(R.MemBounds);
+      }
+      R.CollectMemAccess = false;
+      R.MemBounds.clear();
+      Changed |= VectorizedChains;
     }
 
     // Vectorize trees that end at reductions.
@@ -6783,11 +6910,170 @@
     }
   }
 
-  if (Changed) {
+  R.AliasCache.clear();
+  for (unsigned I = 0; I != BlocksToRetry.size(); I++) {
+    // First, clean up deleted instructions, so they are not re-used during
+    // SCEV expansion.
+    R.removeDeletedInstructions();
+    BasicBlock *BB = BlocksToRetry[I];
+    auto &MemBounds = BoundsToUse[I];
+
+    SmallVector<RuntimePointerCheck> PointerChecks;
+    CFGChanged = true;
+    // Minimize/maximize the lower/upper bounds of accesses in the block to
+    // version.
+    for (Instruction &I : *BB)
+      extendMemBounds(I, false, *SE, MemBounds);
+
+    SmallVector<RuntimeCheckingPtrGroup *> BoundGroups;
+    for (auto &B : MemBounds)
+      BoundGroups.emplace_back(&B.second);
+
+    // Create a RuntimePointerCheck for all groups in BoundGroups.
+    for (unsigned I = 0, E = BoundGroups.size(); I != E; ++I)
+      for (unsigned J = I + 1; J != E; ++J)
+        PointerChecks.emplace_back(&*BoundGroups[I], &*BoundGroups[J]);
+
+    LLVMContext &Ctx = BB->getContext();
+    DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+    std::string OriginalName = BB->getName().str();
+    auto *CheckBlock = splitBlockBefore(BB, &*BB->getFirstNonPHI(), &DTU, LI,
+                                        nullptr, OriginalName + ".slpmemcheck");
+    auto *MergeBlock = BB;
+    BB = splitBlockBefore(BB, BB->getTerminator(), &DTU, LI, nullptr,
+                          OriginalName + ".slpversioned");
+
+    ValueToValueMapTy VMap;
+    auto *Scalar = CloneBasicBlock(BB, VMap, "", BB->getParent());
+    Scalar->setName(OriginalName + ".scalar");
+    MergeBlock->setName(OriginalName + ".merge");
+    SmallVector<BasicBlock *> Tmp;
+    Tmp.push_back(Scalar);
+    remapInstructionsInBlocks(Tmp, VMap);
+
+    SCEVExpander Exp(*SE, BB->getParent()->getParent()->getDataLayout(),
+                     "memcheck");
+    auto *MemoryRuntimeCheck = addRuntimeChecks(CheckBlock->getTerminator(),
+                                                nullptr, PointerChecks, Exp)
+                                   .second;
+    assert(MemoryRuntimeCheck &&
+           "runtime checks required, but no checks generated in IR?");
+
+    IRBuilder<> ChkBuilder(CheckBlock->getTerminator());
+    Value *NoOverflowCheck = MemoryRuntimeCheck;
+    // Emit checks ensuring that computing the upper bound does not overflow.
+    for (auto &B : MemBounds) {
+      Type *PtrArithTy = Type::getInt8PtrTy(Ctx, B.second.AddressSpace);
+      Value *Low = Exp.expandCodeFor(B.second.Low, PtrArithTy);
+      Value *High = Exp.expandCodeFor(B.second.High, PtrArithTy);
+      NoOverflowCheck = ChkBuilder.CreateAnd(
+          NoOverflowCheck, ChkBuilder.CreateICmpUGT(High, Low, "nowrap"),
+          "check");
+    }
+    ChkBuilder.CreateCondBr(NoOverflowCheck, Scalar, BB);
+    CheckBlock->getTerminator()->eraseFromParent();
+    DTU.applyUpdates({{DT->Insert, CheckBlock, Scalar}});
+    if (auto *L = LI->getLoopFor(CheckBlock))
+      L->addBasicBlockToLoop(Scalar, *LI);
+
+    Changed = true;
+
+    // Add !noalias metadata to memory accesses in the versioned block.
+    MDBuilder MDB(Ctx);
+    MDNode *Domain = MDB.createAnonymousAliasScopeDomain("SLPVerDomain");
+
+    DenseMap<const RuntimeCheckingPtrGroup *, MDNode *> GroupToScope;
+    for (const auto &Group : MemBounds)
+      GroupToScope[&Group.second] = MDB.createAnonymousAliasScope(Domain);
+
+    for (Instruction &I : *BB) {
+      auto GetPtr = [](Instruction *I) -> Value * {
+        if (auto *L = dyn_cast<LoadInst>(I))
+          return L->getPointerOperand();
+        if (auto *S = dyn_cast<StoreInst>(I))
+          return S->getPointerOperand();
+        return nullptr;
+      };
+      auto *Ptr = GetPtr(&I);
+      if (!Ptr)
+        continue;
+      auto *PtrSCEV = SE->getSCEV(Ptr);
+
+      Value *Obj = getUnderlyingObject(Ptr);
+      if (!Obj)
+        continue;
+
+      auto BoundsIter = MemBounds.find(Obj);
+      if (BoundsIter == MemBounds.end())
+        continue;
+      auto *LowerBound = BoundsIter->second.Low;
+      auto *UpperBound = BoundsIter->second.High;
+      auto *Scope = GroupToScope.find(&BoundsIter->second)->second;
+      if (SE->isKnownPredicate(CmpInst::ICMP_UGE, PtrSCEV, LowerBound) &&
+          SE->isKnownPredicate(CmpInst::ICMP_ULE, PtrSCEV, UpperBound)) {
+        I.setMetadata(
+            LLVMContext::MD_alias_scope,
+            MDNode::concatenate(I.getMetadata(LLVMContext::MD_alias_scope),
+                                MDNode::get(Ctx, Scope)));
+
+        SmallVector<Metadata *> NonAliasing;
+        for (auto &KV : GroupToScope) {
+          if (KV.first == &BoundsIter->second)
+            continue;
+          NonAliasing.push_back(KV.second);
+        }
+        I.setMetadata(
+            LLVMContext::MD_noalias,
+            MDNode::concatenate(I.getMetadata(LLVMContext::MD_noalias),
+                                MDNode::get(Ctx, NonAliasing)));
+      }
+    }
+
+    DTU.flush();
+    DT->updateDFSNumbers();
+    collectSeedInstructions(BB);
+
+    // Vectorize trees that end at stores.
+ assert(!Stores.empty() && "should have stores when versioning"); + LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size() + << " underlying objects.\n"); + Changed |= vectorizeStoreChains(R); + + R.removeDeletedInstructions(); + InstructionCost ScalarCost = 0; + for (Instruction &I : *Scalar) + ScalarCost += TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); + InstructionCost SLPCost = 0; + for (Instruction &I : *CheckBlock) + SLPCost += TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); + for (Instruction &I : *BB) + SLPCost += TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); + + if (SLPCost >= ScalarCost) { + Instruction *OldTerm = CheckBlock->getTerminator(); + OldTerm->eraseFromParent(); + IRBuilder<> Builder(CheckBlock); + + Builder.CreateBr(Scalar); + DTU.applyUpdates({{DT->Delete, CheckBlock, BB}}); + LI->removeBlock(BB); + DTU.deleteBB(BB); + DTU.applyUpdates({{DT->Delete, BB, MergeBlock}}); + MergeBlockIntoPredecessor(MergeBlock, &DTU, LI); + MergeBlockIntoPredecessor(Scalar, &DTU, LI); + NumVersioningFailed++; + } else { + NumVersioningSuccessful++; + } + DTU.flush(); + DT->updateDFSNumbers(); + } + + if (Changed && BlocksToRetry.empty()) { R.optimizeGatherSequence(); LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n"); } - return Changed; + return {Changed, CFGChanged}; } /// Order may have elements assigned special value (size) which is out of diff --git a/llvm/test/Other/opt-O2-pipeline.ll b/llvm/test/Other/opt-O2-pipeline.ll --- a/llvm/test/Other/opt-O2-pipeline.ll +++ b/llvm/test/Other/opt-O2-pipeline.ll @@ -252,7 +252,11 @@ ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Inject TLI Mappings ; CHECK-NEXT: SLP Vectorizer +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) +; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Optimize scalar/vector ops +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Combine redundant instructions ; CHECK-NEXT: Canonicalize natural loops diff --git a/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll b/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll --- a/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll +++ b/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll @@ -264,7 +264,11 @@ ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Inject TLI Mappings ; CHECK-NEXT: SLP Vectorizer +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) +; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Optimize scalar/vector ops +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Combine redundant instructions ; CHECK-NEXT: Canonicalize natural loops diff --git a/llvm/test/Other/opt-O3-pipeline.ll b/llvm/test/Other/opt-O3-pipeline.ll --- a/llvm/test/Other/opt-O3-pipeline.ll +++ b/llvm/test/Other/opt-O3-pipeline.ll @@ -257,7 +257,11 @@ ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Inject TLI Mappings ; CHECK-NEXT: SLP Vectorizer +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) +; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Optimize scalar/vector ops +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Combine redundant instructions ; CHECK-NEXT: Canonicalize natural loops diff --git a/llvm/test/Other/opt-Os-pipeline.ll b/llvm/test/Other/opt-Os-pipeline.ll --- 
a/llvm/test/Other/opt-Os-pipeline.ll +++ b/llvm/test/Other/opt-Os-pipeline.ll @@ -238,7 +238,11 @@ ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Inject TLI Mappings ; CHECK-NEXT: SLP Vectorizer +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) +; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Optimize scalar/vector ops +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Combine redundant instructions ; CHECK-NEXT: Canonicalize natural loops diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic < %s | FileCheck %s +; RUN: opt -slp-memory-versioning -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic < %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64" @@ -101,57 +101,67 @@ ; define void @f_alias(i8* nocapture %dst, i8* nocapture readonly %src, %struct.weight_t* nocapture readonly %w) { ; CHECK-LABEL: @f_alias( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[SCALE:%.*]] = getelementptr inbounds [[STRUCT_WEIGHT_T:%.*]], %struct.weight_t* [[W:%.*]], i64 0, i32 0 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SCALE]], align 16 -; CHECK-NEXT: [[OFFSET:%.*]] = getelementptr inbounds [[STRUCT_WEIGHT_T]], %struct.weight_t* [[W]], i64 0, i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OFFSET]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, i8* [[SRC:%.*]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP2]] to i32 -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], [[CONV]] -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[MUL]], [[TMP1]] -; CHECK-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp ult i32 [[ADD]], 256 -; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[ADD]], 0 -; CHECK-NEXT: [[SHR_I:%.*]] = sext i1 [[TMP3]] to i32 -; CHECK-NEXT: [[COND_I:%.*]] = select i1 [[TOBOOL_NOT_I]], i32 [[ADD]], i32 [[SHR_I]] -; CHECK-NEXT: [[CONV_I:%.*]] = trunc i32 [[COND_I]] to i8 -; CHECK-NEXT: store i8 [[CONV_I]], i8* [[DST:%.*]], align 1 -; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[ARRAYIDX_1]], align 1 -; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP4]] to i32 -; CHECK-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[TMP0]], [[CONV_1]] -; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[MUL_1]], [[TMP1]] -; CHECK-NEXT: [[TOBOOL_NOT_I_1:%.*]] = icmp ult i32 [[ADD_1]], 256 -; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[ADD_1]], 0 -; CHECK-NEXT: [[SHR_I_1:%.*]] = sext i1 [[TMP5]] to i32 -; CHECK-NEXT: [[COND_I_1:%.*]] = select i1 [[TOBOOL_NOT_I_1]], i32 [[ADD_1]], i32 [[SHR_I_1]] -; CHECK-NEXT: [[CONV_I_1:%.*]] = trunc i32 [[COND_I_1]] to i8 -; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 1 -; CHECK-NEXT: store i8 [[CONV_I_1]], i8* [[ARRAYIDX2_1]], align 1 -; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[ARRAYIDX_2]], align 1 -; CHECK-NEXT: [[CONV_2:%.*]] = zext i8 [[TMP6]] to i32 -; CHECK-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[TMP0]], [[CONV_2]] -; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[MUL_2]], [[TMP1]] -; CHECK-NEXT: 
[[TOBOOL_NOT_I_2:%.*]] = icmp ult i32 [[ADD_2]], 256 -; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[ADD_2]], 0 -; CHECK-NEXT: [[SHR_I_2:%.*]] = sext i1 [[TMP7]] to i32 -; CHECK-NEXT: [[COND_I_2:%.*]] = select i1 [[TOBOOL_NOT_I_2]], i32 [[ADD_2]], i32 [[SHR_I_2]] -; CHECK-NEXT: [[CONV_I_2:%.*]] = trunc i32 [[COND_I_2]] to i8 -; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 2 -; CHECK-NEXT: store i8 [[CONV_I_2]], i8* [[ARRAYIDX2_2]], align 1 -; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX_3]], align 1 -; CHECK-NEXT: [[CONV_3:%.*]] = zext i8 [[TMP8]] to i32 -; CHECK-NEXT: [[MUL_3:%.*]] = mul nsw i32 [[TMP0]], [[CONV_3]] -; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 [[MUL_3]], [[TMP1]] -; CHECK-NEXT: [[TOBOOL_NOT_I_3:%.*]] = icmp ult i32 [[ADD_3]], 256 -; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[ADD_3]], 0 -; CHECK-NEXT: [[SHR_I_3:%.*]] = sext i1 [[TMP9]] to i32 -; CHECK-NEXT: [[COND_I_3:%.*]] = select i1 [[TOBOOL_NOT_I_3]], i32 [[ADD_3]], i32 [[SHR_I_3]] -; CHECK-NEXT: [[CONV_I_3:%.*]] = trunc i32 [[COND_I_3]] to i8 -; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 3 -; CHECK-NEXT: store i8 [[CONV_I_3]], i8* [[ARRAYIDX2_3]], align 1 +; CHECK-NEXT: entry.slpmemcheck: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, i8* [[DST:%.*]], i64 4 +; CHECK-NEXT: [[SCEVGEP38:%.*]] = getelementptr i8, i8* [[SRC:%.*]], i64 4 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[DST]], [[SCEVGEP38]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: [[NOWRAP:%.*]] = icmp ugt i8* [[SCEVGEP]], [[DST]] +; CHECK-NEXT: [[CHECK:%.*]] = and i1 [[MEMCHECK_CONFLICT]], [[NOWRAP]] +; CHECK-NEXT: [[NOWRAP39:%.*]] = icmp ugt i8* [[SCEVGEP38]], [[SRC]] +; CHECK-NEXT: [[CHECK40:%.*]] = and i1 [[CHECK]], [[NOWRAP39]] +; CHECK-NEXT: [[SCALE2:%.*]] = getelementptr inbounds [[STRUCT_WEIGHT_T:%.*]], %struct.weight_t* [[W:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SCALE2]], align 16 +; CHECK-NEXT: [[OFFSET3:%.*]] = getelementptr inbounds [[STRUCT_WEIGHT_T]], %struct.weight_t* [[W]], i64 0, i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OFFSET3]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, i8* [[SRC]], align 1 +; CHECK-NEXT: [[CONV4:%.*]] = zext i8 [[TMP2]] to i32 +; CHECK-NEXT: [[MUL5:%.*]] = mul nsw i32 [[TMP0]], [[CONV4]] +; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[MUL5]], [[TMP1]] +; CHECK-NEXT: [[TOBOOL_NOT_I7:%.*]] = icmp ult i32 [[ADD6]], 256 +; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[ADD6]], 0 +; CHECK-NEXT: [[SHR_I8:%.*]] = sext i1 [[TMP3]] to i32 +; CHECK-NEXT: [[COND_I9:%.*]] = select i1 [[TOBOOL_NOT_I7]], i32 [[ADD6]], i32 [[SHR_I8]] +; CHECK-NEXT: [[CONV_I10:%.*]] = trunc i32 [[COND_I9]] to i8 +; CHECK-NEXT: store i8 [[CONV_I10]], i8* [[DST]], align 1 +; CHECK-NEXT: [[ARRAYIDX_111:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 1 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[ARRAYIDX_111]], align 1 +; CHECK-NEXT: [[CONV_112:%.*]] = zext i8 [[TMP4]] to i32 +; CHECK-NEXT: [[MUL_113:%.*]] = mul nsw i32 [[TMP0]], [[CONV_112]] +; CHECK-NEXT: [[ADD_114:%.*]] = add nsw i32 [[MUL_113]], [[TMP1]] +; CHECK-NEXT: [[TOBOOL_NOT_I_115:%.*]] = icmp ult i32 [[ADD_114]], 256 +; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[ADD_114]], 0 +; CHECK-NEXT: [[SHR_I_116:%.*]] = sext i1 [[TMP5]] 
to i32 +; CHECK-NEXT: [[COND_I_117:%.*]] = select i1 [[TOBOOL_NOT_I_115]], i32 [[ADD_114]], i32 [[SHR_I_116]] +; CHECK-NEXT: [[CONV_I_118:%.*]] = trunc i32 [[COND_I_117]] to i8 +; CHECK-NEXT: [[ARRAYIDX2_119:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 1 +; CHECK-NEXT: store i8 [[CONV_I_118]], i8* [[ARRAYIDX2_119]], align 1 +; CHECK-NEXT: [[ARRAYIDX_220:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 2 +; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[ARRAYIDX_220]], align 1 +; CHECK-NEXT: [[CONV_221:%.*]] = zext i8 [[TMP6]] to i32 +; CHECK-NEXT: [[MUL_222:%.*]] = mul nsw i32 [[TMP0]], [[CONV_221]] +; CHECK-NEXT: [[ADD_223:%.*]] = add nsw i32 [[MUL_222]], [[TMP1]] +; CHECK-NEXT: [[TOBOOL_NOT_I_224:%.*]] = icmp ult i32 [[ADD_223]], 256 +; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[ADD_223]], 0 +; CHECK-NEXT: [[SHR_I_225:%.*]] = sext i1 [[TMP7]] to i32 +; CHECK-NEXT: [[COND_I_226:%.*]] = select i1 [[TOBOOL_NOT_I_224]], i32 [[ADD_223]], i32 [[SHR_I_225]] +; CHECK-NEXT: [[CONV_I_227:%.*]] = trunc i32 [[COND_I_226]] to i8 +; CHECK-NEXT: [[ARRAYIDX2_228:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 2 +; CHECK-NEXT: store i8 [[CONV_I_227]], i8* [[ARRAYIDX2_228]], align 1 +; CHECK-NEXT: [[ARRAYIDX_329:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 3 +; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX_329]], align 1 +; CHECK-NEXT: [[CONV_330:%.*]] = zext i8 [[TMP8]] to i32 +; CHECK-NEXT: [[MUL_331:%.*]] = mul nsw i32 [[TMP0]], [[CONV_330]] +; CHECK-NEXT: [[ADD_332:%.*]] = add nsw i32 [[MUL_331]], [[TMP1]] +; CHECK-NEXT: [[TOBOOL_NOT_I_333:%.*]] = icmp ult i32 [[ADD_332]], 256 +; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[ADD_332]], 0 +; CHECK-NEXT: [[SHR_I_334:%.*]] = sext i1 [[TMP9]] to i32 +; CHECK-NEXT: [[COND_I_335:%.*]] = select i1 [[TOBOOL_NOT_I_333]], i32 [[ADD_332]], i32 [[SHR_I_334]] +; CHECK-NEXT: [[CONV_I_336:%.*]] = trunc i32 [[COND_I_335]] to i8 +; CHECK-NEXT: [[ARRAYIDX2_337:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 3 +; CHECK-NEXT: store i8 [[CONV_I_336]], i8* [[ARRAYIDX2_337]], align 1 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll @@ -1,17 +1,36 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -scoped-noalias-aa -slp-vectorizer -mtriple=arm64-apple-darwin -enable-new-pm=false -S %s | FileCheck %s +; RUN: opt -slp-memory-versioning -scoped-noalias-aa -slp-vectorizer -mtriple=arm64-apple-darwin -enable-new-pm=false -S %s | FileCheck %s +; RUN: opt -slp-memory-versioning=false -scoped-noalias-aa -slp-vectorizer -mtriple=arm64-apple-darwin -enable-new-pm=false -S %s | FileCheck --check-prefix=NOVERSION %s + +; NOVERSION-NOT: memcheck define void @needs_versioning_not_profitable(i32* %dst, i32* %src) { ; CHECK-LABEL: @needs_versioning_not_profitable( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[SRC_0:%.*]] = load i32, i32* [[SRC:%.*]], align 4 -; CHECK-NEXT: [[R_0:%.*]] = ashr i32 [[SRC_0]], 16 -; CHECK-NEXT: store i32 [[R_0]], i32* [[DST:%.*]], align 4 -; CHECK-NEXT: [[SRC_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 1 -; CHECK-NEXT: [[SRC_1:%.*]] = load i32, i32* [[SRC_GEP_1]], align 4 -; CHECK-NEXT: [[R_1:%.*]] = ashr i32 [[SRC_1]], 16 -; CHECK-NEXT: [[DST_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 1 -; 
CHECK-NEXT: store i32 [[R_1]], i32* [[DST_GEP_1]], align 4 +; CHECK-NEXT: entry.slpmemcheck: +; CHECK-NEXT: [[DST8:%.*]] = bitcast i32* [[DST:%.*]] to i8* +; CHECK-NEXT: [[SRC10:%.*]] = bitcast i32* [[SRC:%.*]] to i8* +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[DST]], i64 1 +; CHECK-NEXT: [[SCEVGEP9:%.*]] = bitcast i32* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[SCEVGEP9]], i64 1 +; CHECK-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[SRC]], i64 1 +; CHECK-NEXT: [[SCEVGEP1112:%.*]] = bitcast i32* [[SCEVGEP11]] to i8* +; CHECK-NEXT: [[UGLYGEP13:%.*]] = getelementptr i8, i8* [[SCEVGEP1112]], i64 1 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[DST8]], [[UGLYGEP13]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SRC10]], [[UGLYGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: [[NOWRAP:%.*]] = icmp ugt i8* [[UGLYGEP]], [[DST8]] +; CHECK-NEXT: [[CHECK:%.*]] = and i1 [[MEMCHECK_CONFLICT]], [[NOWRAP]] +; CHECK-NEXT: [[NOWRAP14:%.*]] = icmp ugt i8* [[UGLYGEP13]], [[SRC10]] +; CHECK-NEXT: [[CHECK15:%.*]] = and i1 [[CHECK]], [[NOWRAP14]] +; CHECK-NEXT: [[SRC_02:%.*]] = load i32, i32* [[SRC]], align 4 +; CHECK-NEXT: [[R_03:%.*]] = ashr i32 [[SRC_02]], 16 +; CHECK-NEXT: store i32 [[R_03]], i32* [[DST]], align 4 +; CHECK-NEXT: [[SRC_GEP_14:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 1 +; CHECK-NEXT: [[SRC_15:%.*]] = load i32, i32* [[SRC_GEP_14]], align 4 +; CHECK-NEXT: [[R_16:%.*]] = ashr i32 [[SRC_15]], 16 +; CHECK-NEXT: [[DST_GEP_17:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 1 +; CHECK-NEXT: store i32 [[R_16]], i32* [[DST_GEP_17]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -28,25 +47,41 @@ define void @needs_versioning_profitable(i32* %dst, i32* %src) { ; CHECK-LABEL: @needs_versioning_profitable( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[SRC_0:%.*]] = load i32, i32* [[SRC:%.*]], align 4 -; CHECK-NEXT: [[R_0:%.*]] = ashr i32 [[SRC_0]], 16 -; CHECK-NEXT: store i32 [[R_0]], i32* [[DST:%.*]], align 4 -; CHECK-NEXT: [[SRC_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 1 -; CHECK-NEXT: [[SRC_1:%.*]] = load i32, i32* [[SRC_GEP_1]], align 4 -; CHECK-NEXT: [[R_1:%.*]] = ashr i32 [[SRC_1]], 16 -; CHECK-NEXT: [[DST_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 1 -; CHECK-NEXT: store i32 [[R_1]], i32* [[DST_GEP_1]], align 4 -; CHECK-NEXT: [[SRC_GEP_2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[SRC_2:%.*]] = load i32, i32* [[SRC_GEP_2]], align 4 -; CHECK-NEXT: [[R_2:%.*]] = ashr i32 [[SRC_2]], 16 -; CHECK-NEXT: [[DST_GEP_2:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[R_2]], i32* [[DST_GEP_2]], align 4 -; CHECK-NEXT: [[SRC_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[SRC_3:%.*]] = load i32, i32* [[SRC_GEP_3]], align 4 -; CHECK-NEXT: [[R_3:%.*]] = ashr i32 [[SRC_3]], 16 -; CHECK-NEXT: [[DST_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[R_3]], i32* [[DST_GEP_3]], align 4 +; CHECK-NEXT: entry.slpmemcheck: +; CHECK-NEXT: [[DST16:%.*]] = bitcast i32* [[DST:%.*]] to i8* +; CHECK-NEXT: [[SRC18:%.*]] = bitcast i32* [[SRC:%.*]] to i8* +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[DST]], i64 3 +; CHECK-NEXT: [[SCEVGEP17:%.*]] = bitcast i32* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[SCEVGEP17]], i64 1 +; CHECK-NEXT: 
[[SCEVGEP19:%.*]] = getelementptr i32, i32* [[SRC]], i64 3 +; CHECK-NEXT: [[SCEVGEP1920:%.*]] = bitcast i32* [[SCEVGEP19]] to i8* +; CHECK-NEXT: [[UGLYGEP21:%.*]] = getelementptr i8, i8* [[SCEVGEP1920]], i64 1 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[DST16]], [[UGLYGEP21]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SRC18]], [[UGLYGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: [[NOWRAP:%.*]] = icmp ugt i8* [[UGLYGEP]], [[DST16]] +; CHECK-NEXT: [[CHECK:%.*]] = and i1 [[MEMCHECK_CONFLICT]], [[NOWRAP]] +; CHECK-NEXT: [[NOWRAP22:%.*]] = icmp ugt i8* [[UGLYGEP21]], [[SRC18]] +; CHECK-NEXT: [[CHECK23:%.*]] = and i1 [[CHECK]], [[NOWRAP22]] +; CHECK-NEXT: [[SRC_02:%.*]] = load i32, i32* [[SRC]], align 4 +; CHECK-NEXT: [[R_03:%.*]] = ashr i32 [[SRC_02]], 16 +; CHECK-NEXT: store i32 [[R_03]], i32* [[DST]], align 4 +; CHECK-NEXT: [[SRC_GEP_14:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 1 +; CHECK-NEXT: [[SRC_15:%.*]] = load i32, i32* [[SRC_GEP_14]], align 4 +; CHECK-NEXT: [[R_16:%.*]] = ashr i32 [[SRC_15]], 16 +; CHECK-NEXT: [[DST_GEP_17:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 1 +; CHECK-NEXT: store i32 [[R_16]], i32* [[DST_GEP_17]], align 4 +; CHECK-NEXT: [[SRC_GEP_28:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 +; CHECK-NEXT: [[SRC_29:%.*]] = load i32, i32* [[SRC_GEP_28]], align 4 +; CHECK-NEXT: [[R_210:%.*]] = ashr i32 [[SRC_29]], 16 +; CHECK-NEXT: [[DST_GEP_211:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 +; CHECK-NEXT: store i32 [[R_210]], i32* [[DST_GEP_211]], align 4 +; CHECK-NEXT: [[SRC_GEP_312:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 +; CHECK-NEXT: [[SRC_313:%.*]] = load i32, i32* [[SRC_GEP_312]], align 4 +; CHECK-NEXT: [[R_314:%.*]] = ashr i32 [[SRC_313]], 16 +; CHECK-NEXT: [[DST_GEP_315:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 +; CHECK-NEXT: store i32 [[R_314]], i32* [[DST_GEP_315]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -99,30 +134,65 @@ define void @version_multiple(i32* nocapture %out_block, i32* nocapture readonly %counter) { ; CHECK-LABEL: @version_multiple( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[COUNTER:%.*]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OUT_BLOCK:%.*]], align 4 -; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP1]], [[TMP0]] -; CHECK-NEXT: store i32 [[XOR]], i32* [[OUT_BLOCK]], align 4 +; CHECK-NEXT: entry.slpmemcheck: +; CHECK-NEXT: [[OUT_BLOCK12:%.*]] = bitcast i32* [[OUT_BLOCK:%.*]] to i8* +; CHECK-NEXT: [[COUNTER14:%.*]] = bitcast i32* [[COUNTER:%.*]] to i8* +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[OUT_BLOCK]], i64 3 +; CHECK-NEXT: [[SCEVGEP13:%.*]] = bitcast i32* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[SCEVGEP13]], i64 1 +; CHECK-NEXT: [[SCEVGEP15:%.*]] = getelementptr i32, i32* [[COUNTER]], i64 3 +; CHECK-NEXT: [[SCEVGEP1516:%.*]] = bitcast i32* [[SCEVGEP15]] to i8* +; CHECK-NEXT: [[UGLYGEP17:%.*]] = getelementptr i8, i8* [[SCEVGEP1516]], i64 1 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[OUT_BLOCK12]], [[UGLYGEP17]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[COUNTER14]], [[UGLYGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: [[NOWRAP:%.*]] = icmp ugt i8* [[UGLYGEP]], [[OUT_BLOCK12]] +; CHECK-NEXT: [[CHECK:%.*]] = and i1 [[MEMCHECK_CONFLICT]], 
[[NOWRAP]] +; CHECK-NEXT: [[NOWRAP18:%.*]] = icmp ugt i8* [[UGLYGEP17]], [[COUNTER14]] +; CHECK-NEXT: [[CHECK19:%.*]] = and i1 [[CHECK]], [[NOWRAP18]] +; CHECK-NEXT: br i1 [[CHECK19]], label [[ENTRY_SCALAR:%.*]], label [[ENTRY_SLPVERSIONED:%.*]] +; CHECK: entry.slpversioned: ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 ; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 1 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX2_1]], align 4 -; CHECK-NEXT: [[XOR_1:%.*]] = xor i32 [[TMP3]], [[TMP2]] -; CHECK-NEXT: store i32 [[XOR_1]], i32* [[ARRAYIDX2_1]], align 4 ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 2 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 ; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 2 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX2_2]], align 4 -; CHECK-NEXT: [[XOR_2:%.*]] = xor i32 [[TMP5]], [[TMP4]] -; CHECK-NEXT: store i32 [[XOR_2]], i32* [[ARRAYIDX2_2]], align 4 ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 3 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[COUNTER]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4, !alias.scope !0, !noalias !3 ; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 3 -; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX2_3]], align 4 -; CHECK-NEXT: [[XOR_3:%.*]] = xor i32 [[TMP7]], [[TMP6]] -; CHECK-NEXT: store i32 [[XOR_3]], i32* [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[OUT_BLOCK]] to <4 x i32>* +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i32> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[OUT_BLOCK]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: br label [[ENTRY_MERGE:%.*]] +; CHECK: entry.merge: ; CHECK-NEXT: ret void +; CHECK: entry.scalar: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[COUNTER]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[OUT_BLOCK]], align 4 +; CHECK-NEXT: [[XOR2:%.*]] = xor i32 [[TMP7]], [[TMP6]] +; CHECK-NEXT: store i32 [[XOR2]], i32* [[OUT_BLOCK]], align 4 +; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 1 +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX_13]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_14:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 1 +; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX2_14]], align 4 +; CHECK-NEXT: [[XOR_15:%.*]] = xor i32 [[TMP9]], [[TMP8]] +; CHECK-NEXT: store i32 [[XOR_15]], i32* [[ARRAYIDX2_14]], align 4 +; CHECK-NEXT: [[ARRAYIDX_26:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 2 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX_26]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_27:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 2 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX2_27]], align 4 +; CHECK-NEXT: [[XOR_28:%.*]] = xor i32 [[TMP11]], [[TMP10]] +; CHECK-NEXT: store i32 [[XOR_28]], i32* [[ARRAYIDX2_27]], align 4 +; CHECK-NEXT: [[ARRAYIDX_39:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 3 +; CHECK-NEXT: 
[[TMP12:%.*]] = load i32, i32* [[ARRAYIDX_39]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_310:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 3 +; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX2_310]], align 4 +; CHECK-NEXT: [[XOR_311:%.*]] = xor i32 [[TMP13]], [[TMP12]] +; CHECK-NEXT: store i32 [[XOR_311]], i32* [[ARRAYIDX2_310]], align 4 +; CHECK-NEXT: br label [[ENTRY_MERGE]] ; entry: %0 = load i32, i32* %counter, align 4 @@ -294,13 +364,31 @@ define void @slp_not_beneficial(i32* %A, i32* %B) { ; CHECK-LABEL: @slp_not_beneficial( -; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 4 -; CHECK-NEXT: store i32 0, i32* [[TMP]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 5 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 4 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 8 -; CHECK-NEXT: store i32 [[TMP5]], i32* [[TMP3]], align 8 +; CHECK-NEXT: bb.slpmemcheck: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 4 +; CHECK-NEXT: [[SCEVGEP6:%.*]] = bitcast i32* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr i32, i32* [[A]], i64 5 +; CHECK-NEXT: [[SCEVGEP78:%.*]] = bitcast i32* [[SCEVGEP7]] to i8* +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[SCEVGEP78]], i64 1 +; CHECK-NEXT: [[SCEVGEP9:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 4 +; CHECK-NEXT: [[SCEVGEP910:%.*]] = bitcast i32* [[SCEVGEP9]] to i8* +; CHECK-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[B]], i64 4 +; CHECK-NEXT: [[SCEVGEP1112:%.*]] = bitcast i32* [[SCEVGEP11]] to i8* +; CHECK-NEXT: [[UGLYGEP13:%.*]] = getelementptr i8, i8* [[SCEVGEP1112]], i64 1 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP6]], [[UGLYGEP13]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP910]], [[UGLYGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: [[NOWRAP:%.*]] = icmp ugt i8* [[UGLYGEP]], [[SCEVGEP6]] +; CHECK-NEXT: [[CHECK:%.*]] = and i1 [[MEMCHECK_CONFLICT]], [[NOWRAP]] +; CHECK-NEXT: [[NOWRAP14:%.*]] = icmp ugt i8* [[UGLYGEP13]], [[SCEVGEP910]] +; CHECK-NEXT: [[CHECK15:%.*]] = and i1 [[CHECK]], [[NOWRAP14]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 4 +; CHECK-NEXT: store i32 0, i32* [[TMP2]], align 8 +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 5 +; CHECK-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 4 +; CHECK-NEXT: [[TMP55:%.*]] = load i32, i32* [[TMP44]], align 8 +; CHECK-NEXT: store i32 [[TMP55]], i32* [[TMP33]], align 8 ; CHECK-NEXT: ret void ; bb: @@ -315,20 +403,34 @@ define void @widget(double* %ptr, double* %ptr.2) { ; CHECK-LABEL: @widget( -; CHECK-NEXT: bb1: -; CHECK-NEXT: [[TMP3:%.*]] = load double, double* null, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = fmul double undef, [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, double* [[PTR:%.*]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = load double, double* [[TMP5]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = fadd double [[TMP6]], [[TMP4]] -; CHECK-NEXT: store double [[TMP7]], double* [[TMP5]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, double* [[PTR_2:%.*]], i64 0 -; CHECK-NEXT: [[TMP9:%.*]] = load double, double* [[TMP8]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = fmul double undef, [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* 
[[PTR]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 8 -; CHECK-NEXT: [[TMP13:%.*]] = fadd double [[TMP12]], [[TMP10]] -; CHECK-NEXT: store double [[TMP13]], double* [[TMP11]], align 8 +; CHECK-NEXT: bb1.slpmemcheck: +; CHECK-NEXT: [[PTR13:%.*]] = bitcast double* [[PTR:%.*]] to i8* +; CHECK-NEXT: [[PTR_215:%.*]] = bitcast double* [[PTR_2:%.*]] to i8* +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[PTR]], i64 1 +; CHECK-NEXT: [[SCEVGEP14:%.*]] = bitcast double* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[SCEVGEP14]], i64 1 +; CHECK-NEXT: [[UGLYGEP16:%.*]] = getelementptr i8, i8* [[PTR_215]], i64 1 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[PTR13]], [[UGLYGEP16]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[PTR_215]], [[UGLYGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: [[NOWRAP:%.*]] = icmp ugt i8* [[UGLYGEP]], [[PTR13]] +; CHECK-NEXT: [[CHECK:%.*]] = and i1 [[MEMCHECK_CONFLICT]], [[NOWRAP]] +; CHECK-NEXT: [[NOWRAP17:%.*]] = icmp ugt i8* [[UGLYGEP16]], [[PTR_215]] +; CHECK-NEXT: [[CHECK18:%.*]] = and i1 [[CHECK]], [[NOWRAP17]] +; CHECK-NEXT: [[TMP32:%.*]] = load double, double* null, align 8 +; CHECK-NEXT: [[TMP43:%.*]] = fmul double undef, [[TMP32]] +; CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds double, double* [[PTR]], i32 0 +; CHECK-NEXT: [[TMP65:%.*]] = load double, double* [[TMP54]], align 8 +; CHECK-NEXT: [[TMP76:%.*]] = fadd double [[TMP65]], [[TMP43]] +; CHECK-NEXT: store double [[TMP76]], double* [[TMP54]], align 8 +; CHECK-NEXT: [[TMP87:%.*]] = getelementptr inbounds double, double* [[PTR_2]], i64 0 +; CHECK-NEXT: [[TMP98:%.*]] = load double, double* [[TMP87]], align 8 +; CHECK-NEXT: [[TMP109:%.*]] = fmul double undef, [[TMP98]] +; CHECK-NEXT: [[TMP1110:%.*]] = getelementptr inbounds double, double* [[PTR]], i32 1 +; CHECK-NEXT: [[TMP1211:%.*]] = load double, double* [[TMP1110]], align 8 +; CHECK-NEXT: [[TMP1312:%.*]] = fadd double [[TMP1211]], [[TMP109]] +; CHECK-NEXT: store double [[TMP1312]], double* [[TMP1110]], align 8 ; CHECK-NEXT: br label [[BB15:%.*]] ; CHECK: bb15: ; CHECK-NEXT: br label [[BB15]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll @@ -1,32 +1,70 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -scoped-noalias-aa -slp-vectorizer -mtriple=x86_64-apple-darwin -enable-new-pm=false -S %s | FileCheck %s +; RUN: opt -slp-memory-versioning -scoped-noalias-aa -slp-vectorizer -mtriple=x86_64-apple-darwin -enable-new-pm=false -S %s | FileCheck %s +; RUN: opt -slp-memory-versioning=false -scoped-noalias-aa -slp-vectorizer -mtriple=x86_64-apple-darwin -enable-new-pm=false -S %s | FileCheck --check-prefix=NOVERSION %s + +; NOVERSION-NOT: memcheck define void @version_multiple(i32* nocapture %out_block, i32* nocapture readonly %counter) { ; CHECK-LABEL: @version_multiple( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[COUNTER:%.*]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OUT_BLOCK:%.*]], align 4 -; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP1]], [[TMP0]] -; CHECK-NEXT: store i32 [[XOR]], i32* [[OUT_BLOCK]], align 4 +; CHECK-NEXT: entry.slpmemcheck: +; CHECK-NEXT: 
[[OUT_BLOCK12:%.*]] = bitcast i32* [[OUT_BLOCK:%.*]] to i8* +; CHECK-NEXT: [[COUNTER14:%.*]] = bitcast i32* [[COUNTER:%.*]] to i8* +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[OUT_BLOCK]], i64 3 +; CHECK-NEXT: [[SCEVGEP13:%.*]] = bitcast i32* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[SCEVGEP13]], i64 1 +; CHECK-NEXT: [[SCEVGEP15:%.*]] = getelementptr i32, i32* [[COUNTER]], i64 3 +; CHECK-NEXT: [[SCEVGEP1516:%.*]] = bitcast i32* [[SCEVGEP15]] to i8* +; CHECK-NEXT: [[UGLYGEP17:%.*]] = getelementptr i8, i8* [[SCEVGEP1516]], i64 1 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[OUT_BLOCK12]], [[UGLYGEP17]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[COUNTER14]], [[UGLYGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: [[NOWRAP:%.*]] = icmp ugt i8* [[UGLYGEP]], [[OUT_BLOCK12]] +; CHECK-NEXT: [[CHECK:%.*]] = and i1 [[MEMCHECK_CONFLICT]], [[NOWRAP]] +; CHECK-NEXT: [[NOWRAP18:%.*]] = icmp ugt i8* [[UGLYGEP17]], [[COUNTER14]] +; CHECK-NEXT: [[CHECK19:%.*]] = and i1 [[CHECK]], [[NOWRAP18]] +; CHECK-NEXT: br i1 [[CHECK19]], label [[ENTRY_SCALAR:%.*]], label [[ENTRY_SLPVERSIONED:%.*]] +; CHECK: entry.slpversioned: ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 ; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 1 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX2_1]], align 4 -; CHECK-NEXT: [[XOR_1:%.*]] = xor i32 [[TMP3]], [[TMP2]] -; CHECK-NEXT: store i32 [[XOR_1]], i32* [[ARRAYIDX2_1]], align 4 ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 2 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 ; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 2 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX2_2]], align 4 -; CHECK-NEXT: [[XOR_2:%.*]] = xor i32 [[TMP5]], [[TMP4]] -; CHECK-NEXT: store i32 [[XOR_2]], i32* [[ARRAYIDX2_2]], align 4 ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 3 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[COUNTER]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4, !alias.scope !0, !noalias !3 ; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 3 -; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX2_3]], align 4 -; CHECK-NEXT: [[XOR_3:%.*]] = xor i32 [[TMP7]], [[TMP6]] -; CHECK-NEXT: store i32 [[XOR_3]], i32* [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[OUT_BLOCK]] to <4 x i32>* +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i32> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[OUT_BLOCK]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: br label [[ENTRY_MERGE:%.*]] +; CHECK: entry.merge: ; CHECK-NEXT: ret void +; CHECK: entry.scalar: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[COUNTER]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[OUT_BLOCK]], align 4 +; CHECK-NEXT: [[XOR2:%.*]] = xor i32 [[TMP7]], [[TMP6]] +; CHECK-NEXT: store i32 [[XOR2]], i32* [[OUT_BLOCK]], align 
4 +; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 1 +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX_13]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_14:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 1 +; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX2_14]], align 4 +; CHECK-NEXT: [[XOR_15:%.*]] = xor i32 [[TMP9]], [[TMP8]] +; CHECK-NEXT: store i32 [[XOR_15]], i32* [[ARRAYIDX2_14]], align 4 +; CHECK-NEXT: [[ARRAYIDX_26:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 2 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX_26]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_27:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 2 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX2_27]], align 4 +; CHECK-NEXT: [[XOR_28:%.*]] = xor i32 [[TMP11]], [[TMP10]] +; CHECK-NEXT: store i32 [[XOR_28]], i32* [[ARRAYIDX2_27]], align 4 +; CHECK-NEXT: [[ARRAYIDX_39:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 3 +; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX_39]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_310:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 3 +; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX2_310]], align 4 +; CHECK-NEXT: [[XOR_311:%.*]] = xor i32 [[TMP13]], [[TMP12]] +; CHECK-NEXT: store i32 [[XOR_311]], i32* [[ARRAYIDX2_310]], align 4 +; CHECK-NEXT: br label [[ENTRY_MERGE]] ; entry: %0 = load i32, i32* %counter, align 4 @@ -61,7 +99,7 @@ ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x float*> poison, float* [[B:%.*]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float*> [[TMP0]], float* [[B]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, <2 x float*> [[TMP1]], <2 x i64> -; CHECK-NEXT: br i1 [[C:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: br i1 [[C:%.*]], label [[THEN_SLPMEMCHECK:%.*]], label [[ELSE:%.*]] ; CHECK: else: ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP2]], i32 4, <2 x i1> , <2 x float> undef) ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> @@ -69,17 +107,35 @@ ; CHECK-NEXT: [[I71:%.*]] = shufflevector <8 x float> undef, <8 x float> [[TMP4]], <8 x i32> ; CHECK-NEXT: call void @use(<8 x float> [[I71]]) ; CHECK-NEXT: ret void -; CHECK: then: -; CHECK-NEXT: [[A_8:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 8 -; CHECK-NEXT: store float 0.000000e+00, float* [[A_8]], align 4 +; CHECK: then.slpmemcheck: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A:%.*]], i64 5 +; CHECK-NEXT: [[SCEVGEP8:%.*]] = bitcast float* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[SCEVGEP9:%.*]] = getelementptr float, float* [[A]], i64 8 +; CHECK-NEXT: [[SCEVGEP910:%.*]] = bitcast float* [[SCEVGEP9]] to i8* +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[SCEVGEP910]], i64 1 +; CHECK-NEXT: [[SCEVGEP11:%.*]] = getelementptr float, float* [[B]], i64 14 +; CHECK-NEXT: [[SCEVGEP1112:%.*]] = bitcast float* [[SCEVGEP11]] to i8* +; CHECK-NEXT: [[SCEVGEP13:%.*]] = getelementptr float, float* [[B]], i64 14 +; CHECK-NEXT: [[SCEVGEP1314:%.*]] = bitcast float* [[SCEVGEP13]] to i8* +; CHECK-NEXT: [[UGLYGEP15:%.*]] = getelementptr i8, i8* [[SCEVGEP1314]], i64 1 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP8]], [[UGLYGEP15]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP1112]], [[UGLYGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; 
CHECK-NEXT: [[NOWRAP:%.*]] = icmp ugt i8* [[UGLYGEP]], [[SCEVGEP8]] +; CHECK-NEXT: [[CHECK:%.*]] = and i1 [[MEMCHECK_CONFLICT]], [[NOWRAP]] +; CHECK-NEXT: [[NOWRAP16:%.*]] = icmp ugt i8* [[UGLYGEP15]], [[SCEVGEP1112]] +; CHECK-NEXT: [[CHECK17:%.*]] = and i1 [[CHECK]], [[NOWRAP16]] +; CHECK-NEXT: [[A_83:%.*]] = getelementptr inbounds float, float* [[A]], i64 8 +; CHECK-NEXT: store float 0.000000e+00, float* [[A_83]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float*> [[TMP2]], i32 1 -; CHECK-NEXT: [[L6:%.*]] = load float, float* [[TMP5]], align 4 -; CHECK-NEXT: [[A_5:%.*]] = getelementptr inbounds float, float* [[A]], i64 5 -; CHECK-NEXT: store float [[L6]], float* [[A_5]], align 4 -; CHECK-NEXT: [[A_6:%.*]] = getelementptr inbounds float, float* [[A]], i64 6 -; CHECK-NEXT: store float 0.000000e+00, float* [[A_6]], align 4 -; CHECK-NEXT: [[A_7:%.*]] = getelementptr inbounds float, float* [[A]], i64 7 -; CHECK-NEXT: store float 0.000000e+00, float* [[A_7]], align 4 +; CHECK-NEXT: [[L64:%.*]] = load float, float* [[TMP5]], align 4 +; CHECK-NEXT: [[A_55:%.*]] = getelementptr inbounds float, float* [[A]], i64 5 +; CHECK-NEXT: store float [[L64]], float* [[A_55]], align 4 +; CHECK-NEXT: [[A_66:%.*]] = getelementptr inbounds float, float* [[A]], i64 6 +; CHECK-NEXT: store float 0.000000e+00, float* [[A_66]], align 4 +; CHECK-NEXT: [[A_77:%.*]] = getelementptr inbounds float, float* [[A]], i64 7 +; CHECK-NEXT: store float 0.000000e+00, float* [[A_77]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -122,6 +178,7 @@ ; CHECK-LABEL: @preserve_loop_info( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP:%.*]] = alloca [3 x double], align 16 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast [3 x double]* [[TMP]] to i8* ; CHECK-NEXT: br label [[OUTER_HEADER:%.*]] ; CHECK: outer.header: ; CHECK-NEXT: br label [[INNER:%.*]] @@ -133,14 +190,30 @@ ; CHECK-NEXT: [[TMP5:%.*]] = load [3 x double]*, [3 x double]** undef, align 8 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x double], [3 x double]* [[TMP]], i64 0, i64 0 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x double], [3 x double]* [[TMP]], i64 0, i64 1 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr [3 x double], [3 x double]* [[TMP]], i64 0, i64 1 +; CHECK-NEXT: [[SCEVGEP5:%.*]] = bitcast double* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[SCEVGEP5]], i64 1 +; CHECK-NEXT: [[SCEVGEP6:%.*]] = getelementptr [3 x double], [3 x double]* [[TMP5]], i64 undef, i64 1 +; CHECK-NEXT: [[SCEVGEP67:%.*]] = bitcast double* [[SCEVGEP6]] to i8* +; CHECK-NEXT: [[SCEVGEP8:%.*]] = getelementptr [3 x double], [3 x double]* [[TMP5]], i64 undef, i64 1 +; CHECK-NEXT: [[SCEVGEP89:%.*]] = bitcast double* [[SCEVGEP8]] to i8* +; CHECK-NEXT: [[UGLYGEP10:%.*]] = getelementptr i8, i8* [[SCEVGEP89]], i64 1 ; CHECK-NEXT: br label [[LOOP_3HEADER:%.*]] ; CHECK: loop.3header: -; CHECK-NEXT: br i1 undef, label [[LOOP_3LATCH:%.*]], label [[BB9:%.*]] -; CHECK: bb9: -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x double], [3 x double]* [[TMP5]], i64 undef, i64 1 +; CHECK-NEXT: br i1 undef, label [[LOOP_3LATCH:%.*]], label [[BB9_SLPMEMCHECK:%.*]] +; CHECK: bb9.slpmemcheck: +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[TMP4]], [[UGLYGEP10]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP67]], [[UGLYGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: [[NOWRAP:%.*]] = icmp ugt i8* [[UGLYGEP]], [[TMP4]] +; CHECK-NEXT: 
[[CHECK:%.*]] = and i1 [[MEMCHECK_CONFLICT]], [[NOWRAP]] +; CHECK-NEXT: [[NOWRAP11:%.*]] = icmp ugt i8* [[UGLYGEP10]], [[SCEVGEP67]] +; CHECK-NEXT: [[CHECK12:%.*]] = and i1 [[CHECK]], [[NOWRAP11]] +; CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds [3 x double], [3 x double]* [[TMP5]], i64 undef, i64 1 ; CHECK-NEXT: store double undef, double* [[TMP6]], align 16 -; CHECK-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8 -; CHECK-NEXT: store double [[TMP12]], double* [[TMP7]], align 8 +; CHECK-NEXT: [[TMP123:%.*]] = load double, double* [[TMP102]], align 8 +; CHECK-NEXT: store double [[TMP123]], double* [[TMP7]], align 8 ; CHECK-NEXT: br label [[LOOP_3LATCH]] ; CHECK: loop.3latch: ; CHECK-NEXT: br i1 undef, label [[BB14:%.*]], label [[LOOP_3HEADER]]
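For orientation (not part of the patch): a minimal LLVM IR sketch of the CFG shape the versioning transform produces, using the block names from the version_multiple tests above. The %conflict flag stands in for the combined alias/no-wrap check the pass materializes via addRuntimeChecks; when it is true, execution falls back to the unmodified scalar clone, otherwise the vectorized, !noalias-annotated block runs.

define void @versioning_shape(i32* %dst, i32* %src, i1 %conflict) {
entry.slpmemcheck:
  ; SCEV-expanded bound and no-wrap checks are emitted here by the pass;
  ; %conflict is a stand-in for the final combined check value.
  br i1 %conflict, label %entry.scalar, label %entry.slpversioned

entry.slpversioned:
  ; vectorized code, annotated with !alias.scope / !noalias metadata
  br label %entry.merge

entry.scalar:
  ; unmodified scalar clone, taken when the accesses may overlap
  br label %entry.merge

entry.merge:
  ret void
}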