diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -335,6 +335,11 @@
   /// Create a new pointer checking group containing a single
   /// pointer, with index \p Index in RtCheck.
   RuntimeCheckingPtrGroup(unsigned Index, RuntimePointerChecking &RtCheck);
+  RuntimeCheckingPtrGroup(unsigned Index, const SCEV *Start, const SCEV *End,
+                          unsigned AS, bool NeedsFreeze)
+      : High(End), Low(Start), AddressSpace(AS), NeedsFreeze(NeedsFreeze) {
+    Members.push_back(Index);
+  }

   /// Tries to add the pointer recorded in RtCheck at index
   /// \p Index to this pointer checking group. We can only add a pointer
diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
--- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -54,6 +54,14 @@
 } // end namespace slpvectorizer

+struct SLPVectorizerResult {
+  bool MadeAnyChange;
+  bool MadeCFGChange;
+
+  SLPVectorizerResult(bool MadeAnyChange, bool MadeCFGChange)
+      : MadeAnyChange(MadeAnyChange), MadeCFGChange(MadeCFGChange) {}
+};
+
 struct SLPVectorizerPass : public PassInfoMixin<SLPVectorizerPass> {
   using StoreList = SmallVector<StoreInst *, 8>;
   using StoreListMap = MapVector<Value *, StoreList>;
@@ -75,10 +83,12 @@
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);

   // Glue for old PM.
-  bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_,
-               TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_,
-               DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_,
-               OptimizationRemarkEmitter *ORE_);
+  SLPVectorizerResult runImpl(Function &F, ScalarEvolution *SE_,
+                              TargetTransformInfo *TTI_,
+                              TargetLibraryInfo *TLI_, AAResults *AA_,
+                              LoopInfo *LI_, DominatorTree *DT_,
+                              AssumptionCache *AC_, DemandedBits *DB_,
+                              OptimizationRemarkEmitter *ORE_);

 private:
   /// Collect store and getelementptr instructions and organize them
@@ -139,6 +149,11 @@
   bool vectorizeStores(ArrayRef<StoreInst *> Stores, slpvectorizer::BoUpSLP &R);

+  SLPVectorizerResult
+  vectorizeBlockWithVersioning(BasicBlock *BB,
+                               const SmallPtrSetImpl<Value *> &TrackedObjects,
+                               slpvectorizer::BoUpSLP &R);
+
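The new `RuntimeCheckingPtrGroup` constructor above, together with the relaxed assert in LoopAccessAnalysis.cpp below, lets the SLP pass build a checking group directly from one pointer's SCEV bounds and then widen it per access. A simplified model of that merging, using plain integers instead of SCEVs (type and member names here are illustrative only, not part of the patch):

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Simplified model of what a checking group tracks: the union of the
// [Low, High) byte ranges of its member pointers, all in one address
// space. Plain integers stand in for the SCEV expressions LAA uses.
struct BoundsGroupModel {
  uint64_t Low;
  uint64_t High;
  unsigned AddressSpace;
  std::vector<unsigned> Members;

  // Counterpart of the new single-pointer constructor.
  BoundsGroupModel(unsigned Index, uint64_t Start, uint64_t End, unsigned AS)
      : Low(Start), High(End), AddressSpace(AS), Members{Index} {}

  // Counterpart of addPointer: widen the tracked range. The relaxed assert
  // in the patch corresponds to letting a group with no members yet adopt
  // the address space of the first pointer it receives.
  bool addPointer(unsigned Index, uint64_t Start, uint64_t End, unsigned AS) {
    if (!Members.empty() && AS != AddressSpace)
      return false; // pointers from different address spaces never merge
    AddressSpace = AS;
    Low = std::min(Low, Start);
    High = std::max(High, End);
    Members.push_back(Index);
    return true;
  }
};
```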
   /// The store instructions in a basic block organized by base pointer.
   StoreListMap Stores;
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -388,7 +388,7 @@
                                          const SCEV *End, unsigned AS,
                                          bool NeedsFreeze,
                                          ScalarEvolution &SE) {
-  assert(AddressSpace == AS &&
+  assert((Members.empty() || AddressSpace == AS) &&
          "all pointers in a checking group must be in the same address space");

   // Compare the starts and ends with the known minimum and maximum
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -36,6 +36,7 @@
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/DemandedBits.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/IVDescriptors.h"
 #include "llvm/Analysis/LoopAccessAnalysis.h"
@@ -62,6 +63,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/IR/PatternMatch.h"
@@ -85,9 +87,12 @@
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
 #include "llvm/Transforms/Vectorize.h"
 #include <algorithm>
 #include <cassert>
@@ -108,6 +113,10 @@
 #define DEBUG_TYPE "SLP"

 STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
+STATISTIC(NumVersioningSuccessful,
+          "Number of times versioning was tried and beneficial");
+STATISTIC(NumVersioningFailed,
+          "Number of times versioning was tried but was not beneficial");

 cl::opt<bool> RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
                                   cl::desc("Run the SLP vectorization passes"));
@@ -177,6 +186,10 @@
     ViewSLPTree("view-slp-tree", cl::Hidden,
                 cl::desc("Display the SLP trees with Graphviz"));

+static cl::opt<bool> EnableMemoryVersioning(
+    "slp-memory-versioning", cl::init(false), cl::Hidden,
+    cl::desc("Enable memory versioning for SLP vectorization."));
+
 // Limit the number of alias checks. The limit is chosen so that
 // it has no negative effect on the llvm benchmarks.
 static const unsigned AliasedCheckLimit = 10;
@@ -833,6 +846,52 @@
          (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
 }

+namespace {
+/// Models a memory access to an underlying object with SCEV pointer expression
+/// and access type.
+struct AccessInfo {
+  Value *UnderlyingObj;
+  const SCEV *PtrSCEV;
+  Type *AccessTy;
+
+  AccessInfo(Value *UnderlyingObj = nullptr, const SCEV *PtrSCEV = nullptr,
+             Type *AccessTy = nullptr)
+      : UnderlyingObj(UnderlyingObj), PtrSCEV(PtrSCEV), AccessTy(AccessTy) {}
+
+  /// Returns the AccessInfo for \p I. If \p I isn't a memory instruction or the
+  /// pointer cannot be converted to a SCEV, return an empty object.
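The classification `AccessInfo::get` performs on each instruction boils down to the following stripped-down sketch (hypothetical helper name; the patch additionally filters the access type through `isValidElementType` and requires a known underlying object whose SCEV start dominates the block):

```cpp
#include "llvm/IR/Instructions.h"
#include <utility>
using namespace llvm;

// Only plain loads and stores are considered: the pointer operand plus the
// type that is actually read or written. Everything else yields nulls and
// is ignored for versioning purposes.
static std::pair<Value *, Type *> getPointerAndAccessType(Instruction *I) {
  if (auto *L = dyn_cast<LoadInst>(I))
    return {L->getPointerOperand(), L->getType()};
  if (auto *S = dyn_cast<StoreInst>(I))
    return {S->getPointerOperand(), S->getValueOperand()->getType()};
  return {nullptr, nullptr};
}
```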
+ static AccessInfo get(Instruction &I, ScalarEvolution &SE, + DominatorTree &DT) { + BasicBlock *BB = I.getParent(); + auto GetPtrAndAccessTy = [](Instruction *I) -> std::pair { + if (auto *L = dyn_cast(I)) { + if (isValidElementType(L->getType())) + return {L->getPointerOperand(), L->getType()}; + } + if (auto *S = dyn_cast(I)) + if (isValidElementType(S->getValueOperand()->getType())) + return {S->getPointerOperand(), S->getValueOperand()->getType()}; + return {nullptr, nullptr}; + }; + Value *Ptr; + Type *AccessTy; + std::tie(Ptr, AccessTy) = GetPtrAndAccessTy(&I); + if (!Ptr) + return {}; + Value *Obj = getUnderlyingObject(Ptr); + if (!Obj) + return {}; + auto *Start = SE.getSCEV(Ptr); + + PHINode *PN = dyn_cast(Obj); + if (!SE.properlyDominates(Start, BB) && + !(PN && DT.dominates(PN->getParent(), BB))) + return {}; + return {Obj, Start, AccessTy}; + } +}; +} // anonymous namespace + namespace slpvectorizer { /// Bottom Up SLP Vectorizer. @@ -841,6 +900,18 @@ struct ScheduleData; public: + /// Set of objects we need to generate runtime checks for. + SmallPtrSet TrackedObjects; + + SmallSet, 8> DepObjs; + + /// Cache for alias results. + /// TODO: consider moving this to the AliasAnalysis itself. + using AliasCacheKey = std::pair; + DenseMap> AliasCache; + + bool CollectMemAccess = false; + using ValueList = SmallVector; using InstrList = SmallVector; using ValueSet = SmallPtrSet; @@ -960,6 +1031,17 @@ /// during analysis. void reorderBottomToTop(bool IgnoreReorder = false); + void removeDeletedInstructions() { + for (auto *I : DeletedInstructions) { + I->dropAllReferences(); + } + for (auto *I : DeletedInstructions) { + assert(I->use_empty() && "trying to erase instruction with users."); + I->eraseFromParent(); + } + DeletedInstructions.clear(); + } + /// \return The vector element size in bits to use when vectorizing the /// expression tree ending at \p V. If V is a store, the size is the width of /// the stored value. Otherwise, the size is the width of the largest loaded @@ -2647,12 +2729,6 @@ return aliased; } - using AliasCacheKey = std::pair; - - /// Cache for alias results. - /// TODO: consider moving this to the AliasAnalysis itself. - DenseMap> AliasCache; - // Cache for pointerMayBeCaptured calls inside AA. This is preserved // globally through SLP because we don't perform any action which // invalidates capture results. @@ -3353,15 +3429,11 @@ } I->dropAllReferences(); } - for (auto *I : DeletedInstructions) { - assert(I->use_empty() && - "trying to erase instruction with users."); - I->eraseFromParent(); - } // Cleanup any dead scalar code feeding the vectorized instructions RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI); + removeDeletedInstructions(); #ifdef EXPENSIVE_CHECKS // If we could guarantee that this call is not extremely slow, we could // remove the ifdef limitation (see PR47712). @@ -9510,9 +9582,37 @@ // balance between reduced runtime and accurate dependencies. numAliased++; + ScheduleData *DestBundle = DepDest->FirstInBundle; + // If this bundle is not scheduled and no versioned code has been + // generated yet, try to collect the bounds of the accesses to + // generate runtime checks. 
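When the scheduler below hits a may-alias dependence it cannot disambiguate, it records the two underlying objects involved. Conceptually the bookkeeping is just an unordered-pair set; a minimal sketch of the canonicalization the following code performs (hypothetical helper, mirroring the swap in the patch):

```cpp
#include "llvm/ADT/SmallSet.h"
#include "llvm/IR/Value.h"
#include <utility>
using namespace llvm;

// Record one entry per unordered pair of underlying objects whose accesses
// may alias. Ordering the pair by pointer value before inserting makes
// {A, B} and {B, A} collapse to a single entry, so each conflicting pair
// yields exactly one runtime check later on.
static void recordDependentObjects(
    Value *SrcObj, Value *DstObj,
    SmallSet<std::pair<Value *, Value *>, 8> &DepObjs) {
  Value *A = SrcObj;
  Value *B = DstObj;
  if (A > B)
    std::swap(A, B);
  DepObjs.insert({A, B});
}
```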
+ if (!DestBundle->IsScheduled && SLP->CollectMemAccess) { + auto *Src = getLoadStorePointerOperand(SrcInst); + auto *Dst = getLoadStorePointerOperand(DepDest->Inst); + + if (SrcInst->getParent() == DepDest->Inst->getParent() && Src && + Dst) { + auto SrcObjAndPtr = AccessInfo::get(*SrcInst, *SLP->SE, *SLP->DT); + auto DstObjAndPtr = + AccessInfo::get(*DepDest->Inst, *SLP->SE, *SLP->DT); + if (!SrcObjAndPtr.UnderlyingObj || !DstObjAndPtr.UnderlyingObj || + SrcObjAndPtr.UnderlyingObj == DstObjAndPtr.UnderlyingObj) + SLP->TrackedObjects.clear(); + else { + SLP->TrackedObjects.insert(SrcObjAndPtr.UnderlyingObj); + SLP->TrackedObjects.insert(DstObjAndPtr.UnderlyingObj); + + Value *A = SrcObjAndPtr.UnderlyingObj; + Value *B = DstObjAndPtr.UnderlyingObj; + if (A > B) + std::swap(A, B); + SLP->DepObjs.insert({A, B}); + } + } + } + DepDest->MemoryDependencies.push_back(BundleMember); BundleMember->Dependencies++; - ScheduleData *DestBundle = DepDest->FirstInBundle; if (!DestBundle->IsScheduled) { BundleMember->incrementUnscheduledDeps(1); } @@ -9958,7 +10058,7 @@ auto *DB = &getAnalysis().getDemandedBits(); auto *ORE = &getAnalysis().getORE(); - return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE); + return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE).MadeAnyChange; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -9974,9 +10074,11 @@ AU.addRequired(); AU.addPreserved(); AU.addPreserved(); - AU.addPreserved(); AU.addPreserved(); - AU.setPreservesCFG(); + if (!EnableMemoryVersioning) { + AU.addPreserved(); + AU.setPreservesCFG(); + } } }; @@ -9993,23 +10095,374 @@ auto *DB = &AM.getResult(F); auto *ORE = &AM.getResult(F); - bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE); - if (!Changed) + auto Result = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE); + if (!Result.MadeAnyChange) return PreservedAnalyses::all(); PreservedAnalyses PA; - PA.preserveSet(); + if (!Result.MadeCFGChange) + PA.preserveSet(); + PA.preserve(); + PA.preserve(); return PA; } -bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, - TargetTransformInfo *TTI_, - TargetLibraryInfo *TLI_, AAResults *AA_, - LoopInfo *LI_, DominatorTree *DT_, - AssumptionCache *AC_, DemandedBits *DB_, - OptimizationRemarkEmitter *ORE_) { +/// Restore the original CFG by removing \p VectorBB and folding \p CheckBB, \p +/// ScalarBB, \p MergeBB and \p Tail into a single block, like in the original +/// IR. +static void undoVersionedBlocks(BasicBlock *CheckBB, BasicBlock *ScalarBB, + DomTreeUpdater &DTU, LoopInfo *LI, + BasicBlock *VectorBB, StringRef OriginalBBName, + BasicBlock *MergeBB, BasicBlock *Tail) { + CheckBB->setName(OriginalBBName); + CheckBB->getTerminator()->eraseFromParent(); + ; + { + IRBuilder<> Builder(CheckBB); + Builder.CreateBr(ScalarBB); + } + DTU.applyUpdates({{DominatorTree::Delete, CheckBB, VectorBB}}); + LI->removeBlock(VectorBB); + VectorBB->getTerminator()->eraseFromParent(); + ; + { + IRBuilder<> Builder(VectorBB); + Builder.CreateUnreachable(); + } + DTU.applyUpdates({{DominatorTree::Delete, VectorBB, MergeBB}}); + DTU.deleteBB(VectorBB); + MergeBlockIntoPredecessor(MergeBB, &DTU, LI); + if (Tail) + MergeBlockIntoPredecessor(Tail, &DTU, LI); + MergeBlockIntoPredecessor(ScalarBB, &DTU, LI); + NumVersioningFailed++; +} + +SLPVectorizerResult SLPVectorizerPass::vectorizeBlockWithVersioning( + BasicBlock *BB, const SmallPtrSetImpl &TrackedObjects, + slpvectorizer::BoUpSLP &R) { + // Try to vectorize BB with versioning. 
+ // + // First, collect all memory bounds for accesses in the block. + // + // Next, split off the region between the first and last tracked memory + // access. + // + // Then, duplicate the split off region, one will remain scalar and one will + // be annotated with noalias metadata. + // + // Then introduce placeholder blocks for the memory runtime checks (branch to + // either scalar or versioned blocks) and a merge block joining the control + // flow from scalar and versioned blocks. + // + // Then, add noalias metadata for memory accessed in the versioned block and + // run SLP vectorization on the versioned block. + // + // Now compare the cost of the scalar block against the cost of the vector + // block + the cost of the runtime checks. If the vector cost is less than the + // scalar cost, generate runtime checks in the check block. Otherwise remove + // all temporary blocks and restore the original IR. + + bool Changed = false; + bool CFGChanged = false; + R.AliasCache.clear(); + + // First, clean up deleted instructions, so they are not re-used during SCEV + // expansion. + R.optimizeGatherSequence(); + R.removeDeletedInstructions(); + + auto &DL = BB->getModule()->getDataLayout(); + // Collect up-to-date memory bounds for tracked objects. Also collect the + // first and last memory instruction using a tracked object. + MapVector MemBounds; + SmallPtrSet WrittenObjs; + // First instruction that accesses an object we collect bounds for. + Instruction *FirstTrackedInst = nullptr; + // Last instruction that accesses an object we collect bounds for. + Instruction *LastTrackedInst = nullptr; + + DenseMap ObjOrder; + unsigned Order = 0; + for (Instruction &I : *BB) { + auto ObjAndStart = AccessInfo::get(I, *SE, *DT); + if (!ObjAndStart.UnderlyingObj) + continue; + auto *Obj = ObjAndStart.UnderlyingObj; + const auto *Start = ObjAndStart.PtrSCEV; + + if (I.mayWriteToMemory()) + WrittenObjs.insert(Obj); + + unsigned AS = Obj->getType()->getPointerAddressSpace(); + + // We know that the Start is dereferenced, hence adding one should not + // overflow: + Type *IdxTy = DL.getIndexType(Obj->getType()); + const SCEV *EltSizeSCEV = + SE->getStoreSizeOfExpr(IdxTy, ObjAndStart.AccessTy); + auto *End = SE->getAddExpr(Start, EltSizeSCEV); + + if (TrackedObjects.find(Obj) != TrackedObjects.end()) + MemBounds.insert({Obj, {0, Start, End, AS, false}}); + auto BoundsIter = MemBounds.find(Obj); + if (BoundsIter == MemBounds.end()) + continue; + BoundsIter->second.addPointer(0, Start, End, AS, false, *SE); + + if (ObjOrder.find(Obj) == ObjOrder.end()) { + ObjOrder[Obj] = Order++; + } + if (!FirstTrackedInst) + FirstTrackedInst = &I; + LastTrackedInst = &I; + } + + // Not enough memory access bounds for runtime checks. + if (MemBounds.size() < 2 || WrittenObjs.empty()) + return {Changed, CFGChanged}; + + // Check if all uses between the first and last tracked instruction are inside + // the region. If that is not the case, PHIs would need to be added when + // duplicating the block. 
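For reference, the per-access range collected into `MemBounds` above reduces to [Start, Start + store-size-of(AccessTy)), expressed as SCEVs so that ranges from different instructions on the same object can be merged. A sketch using the same ScalarEvolution calls (hypothetical helper; assumes the access was already validated by `AccessInfo::get`):

```cpp
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Value.h"
#include <utility>
using namespace llvm;

// Compute the byte range a single access touches, as SCEV expressions.
// Start is the pointer itself; End adds the store size of the accessed
// type, matching the bounds the versioned block is checked against.
static std::pair<const SCEV *, const SCEV *>
getAccessBounds(Value *Ptr, Type *AccessTy, ScalarEvolution &SE,
                const DataLayout &DL) {
  const SCEV *Start = SE.getSCEV(Ptr);
  Type *IdxTy = DL.getIndexType(Ptr->getType());
  const SCEV *EltSize = SE.getStoreSizeOfExpr(IdxTy, AccessTy);
  const SCEV *End = SE.getAddExpr(Start, EltSize);
  return {Start, End};
}
```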
+ auto AllUsesInside = [FirstTrackedInst, LastTrackedInst](BasicBlock *BB) { + return all_of(make_range(FirstTrackedInst->getIterator(), + std::next(LastTrackedInst->getIterator())), + [LastTrackedInst, BB](Instruction &I) { + return all_of(I.users(), [LastTrackedInst, BB](User *U) { + if (auto *UserI = dyn_cast(U)) + return UserI->getParent() == BB && + !isa(UserI) && + (UserI->comesBefore(LastTrackedInst) || + UserI == LastTrackedInst); + return true; + }); + }); + }; + if (!AllUsesInside(BB)) + return {Changed, CFGChanged}; + + SmallVector> BoundGroups; + for (auto &B : MemBounds) + BoundGroups.emplace_back(B.first, &B.second); + + // Create a RuntimePointerCheck for all groups in BoundGroups. + SmallVector PointerChecks; + uint64_t MaxDist = 0; + + for (auto &P : R.DepObjs) { + Value *SrcObj = P.first; + Value *SinkObj = P.second; + if (ObjOrder[SrcObj] > ObjOrder[SinkObj]) + std::swap(SrcObj, SinkObj); + + auto &SrcGroup = MemBounds.find(SrcObj)->second; + auto &SinkGroup = MemBounds.find(SinkObj)->second; + bool SrcWrites = WrittenObjs.contains(SrcObj); + bool SinkWrites = WrittenObjs.contains(SinkObj); + if (!SrcWrites && !SinkWrites) + continue; + const SCEV *CurDist = + SE->getUMaxExpr(SE->getMinusSCEV(SrcGroup.High, SrcGroup.Low), + SE->getMinusSCEV(SinkGroup.High, SinkGroup.Low)); + if (auto *C = dyn_cast(CurDist)) { + MaxDist = std::max(MaxDist, C->getValue()->getZExtValue()); + IntegerType *IntTy = IntegerType::get( + BB->getContext(), DL.getPointerSizeInBits(SinkGroup.AddressSpace)); + const SCEV *SinkStartInt = SE->getPtrToIntExpr(SinkGroup.Low, IntTy); + const SCEV *SrcStartInt = SE->getPtrToIntExpr(SrcGroup.Low, IntTy); + if (isa(SinkStartInt) || + isa(SrcStartInt)) { + return {Changed, CFGChanged}; + } + + PointerChecks.emplace_back(SinkStartInt, SrcStartInt, 1, false); + } else + return {Changed, CFGChanged}; + } + + // Duplicate BB now and set up block and branches for memory checks. 
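The pointer checks assembled above are "diff checks": at run time only the distance between the two start addresses is compared against the largest span either object covers in this block. Roughly, in illustrative C++ rather than the generated IR (parameter names are assumptions; `addDiffRuntimeChecks` emits the equivalent sub/icmp-ult sequence):

```cpp
#include <cstdint>

// What one emitted check computes (sketch): the scalar fallback is taken
// as soon as the unsigned distance between the two start addresses is
// smaller than MaxDist, the largest span either object covers in this
// block. The results of all checks are ORed together to form the branch
// condition in the check block.
static bool needsScalarFallback(uint64_t SinkStart, uint64_t SrcStart,
                                uint64_t MaxDist) {
  return SinkStart - SrcStart < MaxDist;
}
```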
+ std::string OriginalBBName = BB->getName().str(); + IRBuilder<> ChkBuilder(BB->getFirstNonPHI()); + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + + BasicBlock *Tail = nullptr; + if (LastTrackedInst->getNextNode() != BB->getTerminator()) + Tail = SplitBlock(BB, LastTrackedInst->getNextNode(), &DTU, LI, nullptr, + OriginalBBName + ".tail"); + auto *CheckBB = BB; + BB = SplitBlock(BB, FirstTrackedInst, &DTU, LI, nullptr, + OriginalBBName + ".slpmemcheck"); + for (Use &U : make_early_inc_range(BB->uses())) { + BasicBlock *UserBB = cast(U.getUser())->getParent(); + if (UserBB == CheckBB) + continue; + + U.set(CheckBB); + DTU.applyUpdates({{DT->Delete, UserBB, BB}}); + DTU.applyUpdates({{DT->Insert, UserBB, CheckBB}}); + } + CFGChanged = true; + + auto *MergeBB = BB; + BasicBlock *ScalarBB = + splitBlockBefore(BB, BB->getTerminator(), &DTU, LI, nullptr, + OriginalBBName + ".slpversioned"); + + ValueToValueMapTy VMap; + BasicBlock *VectorBB = CloneBasicBlock(ScalarBB, VMap, "", BB->getParent()); + ScalarBB->setName(OriginalBBName + ".scalar"); + MergeBB->setName(OriginalBBName + ".merge"); + SmallVector Tmp; + Tmp.push_back(VectorBB); + remapInstructionsInBlocks(Tmp, VMap); + auto *Term = CheckBB->getTerminator(); + ChkBuilder.SetInsertPoint(CheckBB->getTerminator()); + ChkBuilder.CreateCondBr(ChkBuilder.getTrue(), ScalarBB, VectorBB); + Term->eraseFromParent(); + DTU.applyUpdates({{DT->Insert, CheckBB, VectorBB}}); + if (auto *L = LI->getLoopFor(CheckBB)) + L->addBasicBlockToLoop(VectorBB, *LI); + Changed = true; + + // Add !noalias metadata to memory accesses in the versioned block. + LLVMContext &Ctx = BB->getContext(); + MDBuilder MDB(Ctx); + MDNode *Domain = MDB.createAnonymousAliasScopeDomain("SLPVerDomain"); + + DenseMap GroupToScope; + for (const auto &Group : MemBounds) + GroupToScope[&Group.second] = MDB.createAnonymousAliasScope(Domain); + + for (Instruction &I : *VectorBB) { + auto *Ptr = getLoadStorePointerOperand(&I); + if (!Ptr) + continue; + + auto *PtrSCEV = SE->getSCEV(Ptr); + Value *Obj = getUnderlyingObject(Ptr); + if (!Obj) { + if (auto *GEP = dyn_cast(Ptr)) + Obj = GEP->getOperand(0); + else + continue; + } + + auto BoundsIter = MemBounds.find(Obj); + if (BoundsIter == MemBounds.end()) + continue; + auto *LowerBound = BoundsIter->second.Low; + auto *UpperBound = BoundsIter->second.High; + auto *Scope = GroupToScope.find(&BoundsIter->second)->second; + + auto *LowerSub = SE->getMinusSCEV(PtrSCEV, LowerBound); + auto *UpperSub = SE->getMinusSCEV(UpperBound, PtrSCEV); + if (!isa(LowerSub) && + !isa(UpperSub) && + SE->isKnownNonNegative(LowerSub) && SE->isKnownNonNegative(UpperSub)) { + I.setMetadata( + LLVMContext::MD_alias_scope, + MDNode::concatenate(I.getMetadata(LLVMContext::MD_alias_scope), + MDNode::get(Ctx, Scope))); + + SmallVector NonAliasing; + for (auto &KV : GroupToScope) { + if (KV.first == &BoundsIter->second) + continue; + NonAliasing.push_back(KV.second); + } + I.setMetadata(LLVMContext::MD_noalias, + MDNode::concatenate(I.getMetadata(LLVMContext::MD_noalias), + MDNode::get(Ctx, NonAliasing))); + } + } + + DTU.flush(); + DT->updateDFSNumbers(); + collectSeedInstructions(VectorBB); + + // Vectorize trees that end at stores. 
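The metadata step above gives every bounds group its own anonymous alias scope in a shared domain; each access in the versioned block is then tagged as belonging to its group's scope and as not aliasing all other groups' scopes. The core of that tagging, as a hypothetical helper around the same MDNode APIs the patch uses:

```cpp
#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
using namespace llvm;

// Attach scoped-noalias info to a memory access in the versioned block:
// !alias.scope says "I access memory in OwnScope", !noalias says "I do not
// access memory in any of OtherScopes". Existing metadata is preserved by
// concatenating instead of overwriting.
static void addVersioningScopes(Instruction &I, MDNode *OwnScope,
                                ArrayRef<Metadata *> OtherScopes) {
  LLVMContext &Ctx = I.getContext();
  I.setMetadata(LLVMContext::MD_alias_scope,
                MDNode::concatenate(I.getMetadata(LLVMContext::MD_alias_scope),
                                    MDNode::get(Ctx, OwnScope)));
  I.setMetadata(LLVMContext::MD_noalias,
                MDNode::concatenate(I.getMetadata(LLVMContext::MD_noalias),
                                    MDNode::get(Ctx, OtherScopes)));
}
```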
+ assert(!Stores.empty() && "should have stores when versioning"); + LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size() + << " underlying objects.\n"); + bool AnyVectorized = vectorizeStoreChains(R); + Changed |= AnyVectorized; + + InstructionCost SLPCost = 0; + InstructionCost ScalarCost = 0; + if (AnyVectorized) { + R.optimizeGatherSequence(); + R.removeDeletedInstructions(); + for (Instruction &I : *ScalarBB) + ScalarCost += TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); + for (Instruction &I : make_early_inc_range(reverse(*VectorBB))) { + if (isInstructionTriviallyDead(&I, TLI)) { + I.eraseFromParent(); + continue; + } + SLPCost += TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); + } + + // Estimate the size of the runtime checks, consisting of computing lower & + // upper bounds (2), the overlap checks (2) and the AND/OR to combine the + // checks. + SLPCost += 5 * PointerChecks.size() + MemBounds.size(); + } + + if (!AnyVectorized || SLPCost >= ScalarCost) { + // Vectorization not beneficial or possible. Restore original state by + // removing the introduced blocks. + R.getORE()->emit([&]() { + OptimizationRemarkMissed Rem(SV_NAME, "VersioningNotBeneficial", + &*ScalarBB->begin()); + Rem << "Tried to version block but was not beneficial"; + if (AnyVectorized) { + Rem << ore::NV("VectorCost", SLPCost) + << " >= " << ore::NV("ScalarCost", ScalarCost); + } else + Rem << "(nothing vectorized)"; + return Rem; + }); + Changed = false; + CFGChanged = false; + undoVersionedBlocks(CheckBB, ScalarBB, DTU, LI, VectorBB, OriginalBBName, + MergeBB, Tail); + } else { + R.getORE()->emit( + OptimizationRemark(SV_NAME, "VersioningSuccessful", &*ScalarBB->begin()) + << "SLP vectorization with versioning is beneficial " + << ore::NV("VectorCost", SLPCost) << " < " + << ore::NV("ScalarCost", ScalarCost) + << ore::NV("AnyVectorized", AnyVectorized)); + + ChkBuilder.SetInsertPoint(CheckBB->getTerminator()); + SCEVExpander Exp(*SE, BB->getParent()->getParent()->getDataLayout(), + "memcheck"); + Value *MemoryOverlap = addDiffRuntimeChecks( + CheckBB->getTerminator(), PointerChecks, Exp, + [MaxDist](IRBuilderBase &B, unsigned Bits) { + return B.getIntN(Bits, MaxDist); + }, + 1); + /* Value *MemoryOverlap =*/ + /*addRuntimeChecks(CheckBB->getTerminator(), nullptr, PointerChecks, Exp);*/ + assert(MemoryOverlap && + "runtime checks required, but no checks generated in IR?"); + cast(CheckBB->getTerminator())->setCondition(MemoryOverlap); + NumVersioningSuccessful++; + } + DTU.flush(); + DT->updateDFSNumbers(); + + return {Changed, CFGChanged}; +} + +SLPVectorizerResult SLPVectorizerPass::runImpl( + Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, + TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, + AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_) { if (!RunSLPVectorization) - return false; + return {false, false}; SE = SE_; TTI = TTI_; TLI = TLI_; @@ -10023,18 +10476,19 @@ Stores.clear(); GEPs.clear(); bool Changed = false; + bool CFGChanged = false; // If the target claims to have no vector registers don't attempt // vectorization. if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) { LLVM_DEBUG( dbgs() << "SLP: Didn't find any vector registers for target, abort.\n"); - return false; + return {false, false}; } // Don't vectorize when the attribute NoImplicitFloat is used. 
if (F.hasFnAttribute(Attribute::NoImplicitFloat)) - return false; + return {false, false}; LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n"); @@ -10048,21 +10502,31 @@ // Update DFS numbers now so that we can use them for ordering. DT->updateDFSNumbers(); + SmallVector BlocksToRetry; + SmallVector, 4> BoundsToUse; // Scan the blocks in the function in post order. for (auto BB : post_order(&F.getEntryBlock())) { // Start new block - clear the list of reduction roots. R.clearReductionData(); collectSeedInstructions(BB); + bool VectorizedBlock = false; // Vectorize trees that end at stores. if (!Stores.empty()) { LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size() << " underlying objects.\n"); - Changed |= vectorizeStoreChains(R); + R.TrackedObjects.clear(); + + if (EnableMemoryVersioning) + R.CollectMemAccess = BB->size() <= 300; + + VectorizedBlock = vectorizeStoreChains(R); + + R.CollectMemAccess = false; } // Vectorize trees that end at reductions. - Changed |= vectorizeChainsInBlock(BB, R); + VectorizedBlock |= vectorizeChainsInBlock(BB, R); // Vectorize the index computations of getelementptr instructions. This // is primarily intended to catch gather-like idioms ending at @@ -10070,15 +10534,30 @@ if (!GEPs.empty()) { LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size() << " underlying objects.\n"); - Changed |= vectorizeGEPIndices(BB, R); + VectorizedBlock |= vectorizeGEPIndices(BB, R); } + + if (!VectorizedBlock && !R.TrackedObjects.empty()) { + BlocksToRetry.push_back(BB); + BoundsToUse.push_back(R.TrackedObjects); + } + R.TrackedObjects.clear(); + Changed |= VectorizedBlock; + } + + for (unsigned I = 0; I != BlocksToRetry.size(); I++) { + auto Status = + vectorizeBlockWithVersioning(BlocksToRetry[I], BoundsToUse[I], R); + Changed |= Status.MadeAnyChange; + CFGChanged |= Status.MadeCFGChange; } if (Changed) { R.optimizeGatherSequence(); LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n"); } - return Changed; + + return {Changed, CFGChanged}; } bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic < %s | FileCheck %s +; RUN: opt -slp-memory-versioning -scoped-noalias-aa -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic -enable-new-pm=false < %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64" @@ -92,57 +92,83 @@ define void @f_alias(i8* nocapture %dst, i8* nocapture readonly %src, %struct.weight_t* nocapture readonly %w) { ; CHECK-LABEL: @f_alias( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[DST38:%.*]] = ptrtoint i8* [[DST:%.*]] to i64 +; CHECK-NEXT: [[SRC37:%.*]] = ptrtoint i8* [[SRC:%.*]] to i64 ; CHECK-NEXT: [[SCALE:%.*]] = getelementptr inbounds [[STRUCT_WEIGHT_T:%.*]], %struct.weight_t* [[W:%.*]], i64 0, i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SCALE]], align 16 ; CHECK-NEXT: [[OFFSET:%.*]] = getelementptr inbounds [[STRUCT_WEIGHT_T]], %struct.weight_t* [[W]], i64 0, i32 1 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OFFSET]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, i8* [[SRC:%.*]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = 
zext i8 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[SRC37]], [[DST38]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4 +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[ENTRY_SCALAR:%.*]], label [[ENTRY_SLPVERSIONED1:%.*]] +; CHECK: entry.scalar: +; CHECK-NEXT: [[TMP3:%.*]] = load i8, i8* [[SRC]], align 1 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP3]] to i32 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], [[CONV]] ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[MUL]], [[TMP1]] ; CHECK-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp ult i32 [[ADD]], 256 -; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[ADD]], 0 -; CHECK-NEXT: [[SHR_I:%.*]] = sext i1 [[TMP3]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[ADD]], 0 +; CHECK-NEXT: [[SHR_I:%.*]] = sext i1 [[TMP4]] to i32 ; CHECK-NEXT: [[COND_I:%.*]] = select i1 [[TOBOOL_NOT_I]], i32 [[ADD]], i32 [[SHR_I]] ; CHECK-NEXT: [[CONV_I:%.*]] = trunc i32 [[COND_I]] to i8 -; CHECK-NEXT: store i8 [[CONV_I]], i8* [[DST:%.*]], align 1 +; CHECK-NEXT: store i8 [[CONV_I]], i8* [[DST]], align 1 ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[ARRAYIDX_1]], align 1 -; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP4]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = load i8, i8* [[ARRAYIDX_1]], align 1 +; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP5]] to i32 ; CHECK-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[TMP0]], [[CONV_1]] ; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[MUL_1]], [[TMP1]] ; CHECK-NEXT: [[TOBOOL_NOT_I_1:%.*]] = icmp ult i32 [[ADD_1]], 256 -; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[ADD_1]], 0 -; CHECK-NEXT: [[SHR_I_1:%.*]] = sext i1 [[TMP5]] to i32 +; CHECK-NEXT: [[TMP6:%.*]] = icmp sgt i32 [[ADD_1]], 0 +; CHECK-NEXT: [[SHR_I_1:%.*]] = sext i1 [[TMP6]] to i32 ; CHECK-NEXT: [[COND_I_1:%.*]] = select i1 [[TOBOOL_NOT_I_1]], i32 [[ADD_1]], i32 [[SHR_I_1]] ; CHECK-NEXT: [[CONV_I_1:%.*]] = trunc i32 [[COND_I_1]] to i8 ; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 1 ; CHECK-NEXT: store i8 [[CONV_I_1]], i8* [[ARRAYIDX2_1]], align 1 ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[ARRAYIDX_2]], align 1 -; CHECK-NEXT: [[CONV_2:%.*]] = zext i8 [[TMP6]] to i32 +; CHECK-NEXT: [[TMP7:%.*]] = load i8, i8* [[ARRAYIDX_2]], align 1 +; CHECK-NEXT: [[CONV_2:%.*]] = zext i8 [[TMP7]] to i32 ; CHECK-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[TMP0]], [[CONV_2]] ; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[MUL_2]], [[TMP1]] ; CHECK-NEXT: [[TOBOOL_NOT_I_2:%.*]] = icmp ult i32 [[ADD_2]], 256 -; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[ADD_2]], 0 -; CHECK-NEXT: [[SHR_I_2:%.*]] = sext i1 [[TMP7]] to i32 +; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt i32 [[ADD_2]], 0 +; CHECK-NEXT: [[SHR_I_2:%.*]] = sext i1 [[TMP8]] to i32 ; CHECK-NEXT: [[COND_I_2:%.*]] = select i1 [[TOBOOL_NOT_I_2]], i32 [[ADD_2]], i32 [[SHR_I_2]] ; CHECK-NEXT: [[CONV_I_2:%.*]] = trunc i32 [[COND_I_2]] to i8 ; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 2 ; CHECK-NEXT: store i8 [[CONV_I_2]], i8* [[ARRAYIDX2_2]], align 1 ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX_3]], align 1 -; CHECK-NEXT: [[CONV_3:%.*]] = zext i8 [[TMP8]] to i32 +; CHECK-NEXT: [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX_3]], align 1 +; CHECK-NEXT: [[CONV_3:%.*]] = zext i8 [[TMP9]] to i32 ; CHECK-NEXT: [[MUL_3:%.*]] = mul nsw i32 [[TMP0]], [[CONV_3]] ; CHECK-NEXT: 
[[ADD_3:%.*]] = add nsw i32 [[MUL_3]], [[TMP1]] ; CHECK-NEXT: [[TOBOOL_NOT_I_3:%.*]] = icmp ult i32 [[ADD_3]], 256 -; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[ADD_3]], 0 -; CHECK-NEXT: [[SHR_I_3:%.*]] = sext i1 [[TMP9]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[ADD_3]], 0 +; CHECK-NEXT: [[SHR_I_3:%.*]] = sext i1 [[TMP10]] to i32 ; CHECK-NEXT: [[COND_I_3:%.*]] = select i1 [[TOBOOL_NOT_I_3]], i32 [[ADD_3]], i32 [[SHR_I_3]] ; CHECK-NEXT: [[CONV_I_3:%.*]] = trunc i32 [[COND_I_3]] to i8 ; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 3 ; CHECK-NEXT: store i8 [[CONV_I_3]], i8* [[ARRAYIDX2_3]], align 1 +; CHECK-NEXT: br label [[ENTRY_MERGE:%.*]] +; CHECK: entry.merge: ; CHECK-NEXT: ret void +; CHECK: entry.slpversioned1: +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8* [[SRC]] to <4 x i8>* +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1, !alias.scope !0, !noalias !3 +; CHECK-NEXT: [[TMP13:%.*]] = zext <4 x i8> [[TMP12]] to <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = mul nsw <4 x i32> [[SHUFFLE]], [[TMP13]] +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[SHUFFLE36:%.*]] = shufflevector <4 x i32> [[TMP16]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[TMP15]], [[SHUFFLE36]] +; CHECK-NEXT: [[TMP18:%.*]] = icmp ult <4 x i32> [[TMP17]], +; CHECK-NEXT: [[TMP19:%.*]] = icmp sgt <4 x i32> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = sext <4 x i1> [[TMP19]] to <4 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = select <4 x i1> [[TMP18]], <4 x i32> [[TMP17]], <4 x i32> [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = trunc <4 x i32> [[TMP21]] to <4 x i8> +; CHECK-NEXT: [[TMP23:%.*]] = bitcast i8* [[DST]] to <4 x i8>* +; CHECK-NEXT: store <4 x i8> [[TMP22]], <4 x i8>* [[TMP23]], align 1, !alias.scope !3, !noalias !0 +; CHECK-NEXT: br label [[ENTRY_MERGE]] ; entry: %scale = getelementptr inbounds %struct.weight_t, %struct.weight_t* %w, i64 0, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks-in-loops.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks-in-loops.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks-in-loops.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks-in-loops.ll @@ -1,16 +1,26 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -slp-vectorizer -mtriple=arm64-apple-ios -S %s | FileCheck %s -; RUN: opt -aa-pipeline='basic-aa,scoped-noalias-aa' -passes=slp-vectorizer -mtriple=arm64-apple-darwin -S %s | FileCheck %s +; RUN: opt -scoped-noalias-aa -slp-vectorizer -slp-memory-versioning -enable-new-pm=false -mtriple=arm64-apple-ios -S %s | FileCheck %s +; RUN: opt -aa-pipeline='basic-aa,scoped-noalias-aa' -slp-memory-versioning -passes=slp-vectorizer -mtriple=arm64-apple-darwin -S %s | FileCheck %s define void @loop1(i32* %A, i32* %B, i64 %N) { ; CHECK-LABEL: @loop1( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[A29:%.*]] = ptrtoint i32* [[A:%.*]] to i64 +; CHECK-NEXT: [[B28:%.*]] = ptrtoint i32* [[B:%.*]] to i64 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[B_GEP_0:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], 
i64 [[IV]] +; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[LOOP_TAIL:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP_TAIL]] ] +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[INDVAR]], 6 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[B28]], [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[A29]], [[TMP0]] +; CHECK-NEXT: [[B_GEP_0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[IV]] +; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], 16 +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[LOOP_SCALAR:%.*]], label [[LOOP_SLPVERSIONED1:%.*]] +; CHECK: loop.scalar: ; CHECK-NEXT: [[B_0:%.*]] = load i32, i32* [[B_GEP_0]], align 4 -; CHECK-NEXT: [[A_GEP_0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IV]] +; CHECK-NEXT: [[A_GEP_0:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]] ; CHECK-NEXT: [[A_0:%.*]] = load i32, i32* [[A_GEP_0]], align 4 ; CHECK-NEXT: [[ADD_0:%.*]] = add i32 [[A_0]], 20 ; CHECK-NEXT: [[XOR_0:%.*]] = xor i32 [[ADD_0]], [[B_0]] @@ -39,11 +49,27 @@ ; CHECK-NEXT: [[ADD_3:%.*]] = add i32 [[A_3]], 20 ; CHECK-NEXT: [[XOR_3:%.*]] = xor i32 [[ADD_3]], [[B_3]] ; CHECK-NEXT: store i32 [[XOR_3]], i32* [[A_GEP_3]], align 4 +; CHECK-NEXT: br label [[LOOP_MERGE:%.*]] +; CHECK: loop.merge: +; CHECK-NEXT: br label [[LOOP_TAIL]] +; CHECK: loop.tail: ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 16 ; CHECK-NEXT: [[COND:%.*]] = icmp ult i64 [[IV_NEXT]], [[N:%.*]] +; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 ; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[EXIT:%.*]] ; CHECK: exit: ; CHECK-NEXT: ret void +; CHECK: loop.slpversioned1: +; CHECK-NEXT: [[A_GEP_03:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[B_GEP_0]] to <4 x i32>* +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, !alias.scope !0, !noalias !3 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[A_GEP_03]] to <4 x i32>* +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i32> [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[A_GEP_03]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* [[TMP10]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: br label [[LOOP_MERGE]] ; entry: br label %loop @@ -92,16 +118,22 @@ define void @loop_iv_update_at_start(float* %src, float* %dst) #0 { ; CHECK-LABEL: @loop_iv_update_at_start( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[DST27:%.*]] = ptrtoint float* [[DST:%.*]] to i64 +; CHECK-NEXT: [[SRC26:%.*]] = ptrtoint float* [[SRC:%.*]] to i64 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_MERGE:%.*]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[IV]], 2000 -; CHECK-NEXT: [[SRC_GEP_0:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 0 +; CHECK-NEXT: [[SRC_GEP_0:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[SRC26]], [[DST27]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 20 +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[LOOP_SCALAR:%.*]], label [[LOOP_SLPVERSIONED1:%.*]] +; CHECK: loop.scalar: ; 
CHECK-NEXT: [[SRC_0:%.*]] = load float, float* [[SRC_GEP_0]], align 8 ; CHECK-NEXT: [[ADD_0:%.*]] = fadd float [[SRC_0]], 1.000000e+00 ; CHECK-NEXT: [[MUL_0:%.*]] = fmul float [[ADD_0]], [[SRC_0]] -; CHECK-NEXT: [[DST_GEP_0:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 0 +; CHECK-NEXT: [[DST_GEP_0:%.*]] = getelementptr inbounds float, float* [[DST]], i64 0 ; CHECK-NEXT: store float [[MUL_0]], float* [[DST_GEP_0]], align 8 ; CHECK-NEXT: [[SRC_GEP_1:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 1 ; CHECK-NEXT: [[SRC_1:%.*]] = load float, float* [[SRC_GEP_1]], align 8 @@ -127,9 +159,26 @@ ; CHECK-NEXT: [[MUL_4:%.*]] = fmul float [[ADD_4]], [[SRC_4]] ; CHECK-NEXT: [[DST_GEP_4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 4 ; CHECK-NEXT: store float [[MUL_4]], float* [[DST_GEP_4]], align 8 +; CHECK-NEXT: br label [[LOOP_MERGE]] +; CHECK: loop.merge: ; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[EXIT:%.*]] ; CHECK: exit: ; CHECK-NEXT: ret void +; CHECK: loop.slpversioned1: +; CHECK-NEXT: [[DST_GEP_05:%.*]] = getelementptr inbounds float, float* [[DST]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[SRC_GEP_0]] to <4 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 8, !alias.scope !5, !noalias !8 +; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = fmul <4 x float> [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[DST_GEP_05]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 8, !alias.scope !8, !noalias !5 +; CHECK-NEXT: [[SRC_GEP_421:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 4 +; CHECK-NEXT: [[SRC_422:%.*]] = load float, float* [[SRC_GEP_421]], align 8, !alias.scope !5, !noalias !8 +; CHECK-NEXT: [[ADD_423:%.*]] = fadd float [[SRC_422]], 1.000000e+00 +; CHECK-NEXT: [[MUL_424:%.*]] = fmul float [[ADD_423]], [[SRC_422]] +; CHECK-NEXT: [[DST_GEP_425:%.*]] = getelementptr inbounds float, float* [[DST]], i64 4 +; CHECK-NEXT: store float [[MUL_424]], float* [[DST_GEP_425]], align 8, !alias.scope !8, !noalias !5 +; CHECK-NEXT: br label [[LOOP_MERGE]] ; entry: br label %loop diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -scoped-noalias-aa -slp-vectorizer -mtriple=arm64-apple-darwin -enable-new-pm=false -S %s | FileCheck %s -; RUN: opt -aa-pipeline='basic-aa,scoped-noalias-aa' -passes=slp-vectorizer -mtriple=arm64-apple-darwin -S %s | FileCheck %s +; RUN: opt -aa-pipeline='basic-aa,scoped-noalias-aa' -slp-memory-versioning -passes=slp-vectorizer -mtriple=arm64-apple-darwin -S %s | FileCheck %s +; RUN: opt -aa-pipeline='basic-aa,scoped-noalias-aa' -slp-memory-versioning=false -passes=slp-vectorizer -mtriple=arm64-apple-darwin -S %s | FileCheck --check-prefix=NOVERSION %s + +; NOVERSION-NOT: slpversioned define void @needs_versioning_not_profitable(i32* %dst, i32* %src) { ; CHECK-LABEL: @needs_versioning_not_profitable( @@ -30,9 +32,15 @@ define void @needs_versioning_profitable(i32* %dst, i32* %src) { ; CHECK-LABEL: @needs_versioning_profitable( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[SRC_0:%.*]] = load i32, i32* [[SRC:%.*]], align 4 +; CHECK-NEXT: [[DST17:%.*]] = 
ptrtoint i32* [[DST:%.*]] to i64 +; CHECK-NEXT: [[SRC16:%.*]] = ptrtoint i32* [[SRC:%.*]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[SRC16]], [[DST17]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16 +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[ENTRY_SCALAR:%.*]], label [[ENTRY_SLPVERSIONED1:%.*]] +; CHECK: entry.scalar: +; CHECK-NEXT: [[SRC_0:%.*]] = load i32, i32* [[SRC]], align 4 ; CHECK-NEXT: [[R_0:%.*]] = ashr i32 [[SRC_0]], 16 -; CHECK-NEXT: store i32 [[R_0]], i32* [[DST:%.*]], align 4 +; CHECK-NEXT: store i32 [[R_0]], i32* [[DST]], align 4 ; CHECK-NEXT: [[SRC_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 1 ; CHECK-NEXT: [[SRC_1:%.*]] = load i32, i32* [[SRC_GEP_1]], align 4 ; CHECK-NEXT: [[R_1:%.*]] = ashr i32 [[SRC_1]], 16 @@ -48,7 +56,16 @@ ; CHECK-NEXT: [[R_3:%.*]] = ashr i32 [[SRC_3]], 16 ; CHECK-NEXT: [[DST_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 ; CHECK-NEXT: store i32 [[R_3]], i32* [[DST_GEP_3]], align 4 +; CHECK-NEXT: br label [[ENTRY_MERGE:%.*]] +; CHECK: entry.merge: ; CHECK-NEXT: ret void +; CHECK: entry.slpversioned1: +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4, !alias.scope !0, !noalias !3 +; CHECK-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: br label [[ENTRY_MERGE]] ; entry: %src.0 = load i32, i32* %src, align 4 @@ -76,11 +93,21 @@ define void @needs_versioning_profitable_2_sources(i32* %dst, i32* %A, i32* %B) { ; CHECK-LABEL: @needs_versioning_profitable_2_sources( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[A_0:%.*]] = load i32, i32* [[A:%.*]], align 4 -; CHECK-NEXT: [[B_0:%.*]] = load i32, i32* [[B:%.*]], align 4 +; CHECK-NEXT: [[B29:%.*]] = ptrtoint i32* [[B:%.*]] to i64 +; CHECK-NEXT: [[DST28:%.*]] = ptrtoint i32* [[DST:%.*]] to i64 +; CHECK-NEXT: [[A27:%.*]] = ptrtoint i32* [[A:%.*]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[A27]], [[DST28]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[B29]], [[DST28]] +; CHECK-NEXT: [[DIFF_CHECK30:%.*]] = icmp ult i64 [[TMP1]], 16 +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK30]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[ENTRY_SCALAR:%.*]], label [[ENTRY_SLPVERSIONED1:%.*]] +; CHECK: entry.scalar: +; CHECK-NEXT: [[A_0:%.*]] = load i32, i32* [[A]], align 4 +; CHECK-NEXT: [[B_0:%.*]] = load i32, i32* [[B]], align 4 ; CHECK-NEXT: [[R_0:%.*]] = add i32 [[A_0]], [[B_0]] ; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[R_0]], 2 -; CHECK-NEXT: store i32 [[MUL_0]], i32* [[DST:%.*]], align 4 +; CHECK-NEXT: store i32 [[MUL_0]], i32* [[DST]], align 4 ; CHECK-NEXT: [[A_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1 ; CHECK-NEXT: [[A_1:%.*]] = load i32, i32* [[A_GEP_1]], align 4 ; CHECK-NEXT: [[B_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 1 @@ -105,7 +132,19 @@ ; CHECK-NEXT: [[MUL_3:%.*]] = mul i32 [[R_3]], 2 ; CHECK-NEXT: [[DST_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 ; CHECK-NEXT: store i32 [[MUL_3]], i32* [[DST_GEP_3]], align 4 +; CHECK-NEXT: br label [[ENTRY_MERGE:%.*]] +; CHECK: entry.merge: ; CHECK-NEXT: ret void +; CHECK: entry.slpversioned1: +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[A]] to <4 x i32>* +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4, 
!alias.scope !5, !noalias !8 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[B]] to <4 x i32>* +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, !alias.scope !11, !noalias !12 +; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = mul <4 x i32> [[TMP6]], +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, !alias.scope !13, !noalias !14 +; CHECK-NEXT: br label [[ENTRY_MERGE]] ; entry: %A.0 = load i32, i32* %A, align 4 @@ -148,12 +187,18 @@ define void @needs_versioning_profitable_split_points(i32* %dst, i32* %src) { ; CHECK-LABEL: @needs_versioning_profitable_split_points( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[DST17:%.*]] = ptrtoint i32* [[DST:%.*]] to i64 +; CHECK-NEXT: [[SRC16:%.*]] = ptrtoint i32* [[SRC:%.*]] to i64 ; CHECK-NEXT: call void @bar() ; CHECK-NEXT: call void @bar() ; CHECK-NEXT: call void @bar() -; CHECK-NEXT: [[SRC_0:%.*]] = load i32, i32* [[SRC:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[SRC16]], [[DST17]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16 +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[ENTRY_SCALAR:%.*]], label [[ENTRY_SLPVERSIONED1:%.*]] +; CHECK: entry.scalar: +; CHECK-NEXT: [[SRC_0:%.*]] = load i32, i32* [[SRC]], align 4 ; CHECK-NEXT: [[R_0:%.*]] = ashr i32 [[SRC_0]], 16 -; CHECK-NEXT: store i32 [[R_0]], i32* [[DST:%.*]], align 4 +; CHECK-NEXT: store i32 [[R_0]], i32* [[DST]], align 4 ; CHECK-NEXT: [[SRC_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 1 ; CHECK-NEXT: [[SRC_1:%.*]] = load i32, i32* [[SRC_GEP_1]], align 4 ; CHECK-NEXT: [[R_1:%.*]] = ashr i32 [[SRC_1]], 16 @@ -169,8 +214,19 @@ ; CHECK-NEXT: [[R_3:%.*]] = ashr i32 [[SRC_3]], 16 ; CHECK-NEXT: [[DST_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 ; CHECK-NEXT: store i32 [[R_3]], i32* [[DST_GEP_3]], align 4 +; CHECK-NEXT: br label [[ENTRY_MERGE:%.*]] +; CHECK: entry.merge: +; CHECK-NEXT: br label [[ENTRY_TAIL:%.*]] +; CHECK: entry.tail: ; CHECK-NEXT: call void @bar() ; CHECK-NEXT: ret void +; CHECK: entry.slpversioned1: +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4, !alias.scope !15, !noalias !18 +; CHECK-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4, !alias.scope !18, !noalias !15 +; CHECK-NEXT: br label [[ENTRY_MERGE]] ; entry: call void @bar() @@ -347,29 +403,46 @@ define void @version_multiple(i32* nocapture %out_block, i32* nocapture readonly %counter) { ; CHECK-LABEL: @version_multiple( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[COUNTER:%.*]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OUT_BLOCK:%.*]], align 4 -; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[OUT_BLOCK13:%.*]] = ptrtoint i32* [[OUT_BLOCK:%.*]] to i64 +; CHECK-NEXT: [[COUNTER12:%.*]] = ptrtoint i32* [[COUNTER:%.*]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[COUNTER12]], [[OUT_BLOCK13]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16 +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[ENTRY_SCALAR:%.*]], label [[ENTRY_SLPVERSIONED1:%.*]] +; CHECK: entry.scalar: +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[COUNTER]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[OUT_BLOCK]], align 4 +; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP2]], 
[[TMP1]] ; CHECK-NEXT: store i32 [[XOR]], i32* [[OUT_BLOCK]], align 4 ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 ; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 1 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX2_1]], align 4 -; CHECK-NEXT: [[XOR_1:%.*]] = xor i32 [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX2_1]], align 4 +; CHECK-NEXT: [[XOR_1:%.*]] = xor i32 [[TMP4]], [[TMP3]] ; CHECK-NEXT: store i32 [[XOR_1]], i32* [[ARRAYIDX2_1]], align 4 ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 2 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 ; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 2 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX2_2]], align 4 -; CHECK-NEXT: [[XOR_2:%.*]] = xor i32 [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX2_2]], align 4 +; CHECK-NEXT: [[XOR_2:%.*]] = xor i32 [[TMP6]], [[TMP5]] ; CHECK-NEXT: store i32 [[XOR_2]], i32* [[ARRAYIDX2_2]], align 4 ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 3 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 ; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 3 -; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX2_3]], align 4 -; CHECK-NEXT: [[XOR_3:%.*]] = xor i32 [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: [[XOR_3:%.*]] = xor i32 [[TMP8]], [[TMP7]] ; CHECK-NEXT: store i32 [[XOR_3]], i32* [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: br label [[ENTRY_MERGE:%.*]] +; CHECK: entry.merge: ; CHECK-NEXT: ret void +; CHECK: entry.slpversioned1: +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[COUNTER]] to <4 x i32>* +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !20, !noalias !23 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[OUT_BLOCK]] to <4 x i32>* +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4, !alias.scope !23, !noalias !20 +; CHECK-NEXT: [[TMP13:%.*]] = xor <4 x i32> [[TMP12]], [[TMP10]] +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[OUT_BLOCK]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 4, !alias.scope !23, !noalias !20 +; CHECK-NEXT: br label [[ENTRY_MERGE]] ; entry: %0 = load i32, i32* %counter, align 4 @@ -608,6 +681,7 @@ ; CHECK-LABEL: @test_bounds_removed_before_runtime_checks( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT:%.*]], %struct* [[A:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT]], %struct* [[A]], i64 0, i32 1 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[TMP11]] to <2 x i32>* ; CHECK-NEXT: store <2 x i32> , <2 x i32>* [[TMP0]], align 8 ; CHECK-NEXT: [[TMP13:%.*]] = load i32*, i32** [[B:%.*]], align 8 @@ -710,31 +784,49 @@ ; CHECK-NEXT: [[PTR_PHI:%.*]] = phi %struct.2* [ [[A:%.*]], [[BB:%.*]] ], [ null, [[LOOP]] ] ; CHECK-NEXT: br i1 [[C:%.*]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: -; CHECK-NEXT: [[B_GEP_0:%.*]] = getelementptr inbounds float, float* [[B:%.*]], 
i64 0
+; CHECK-NEXT: [[PTR_PHI_LCSSA:%.*]] = phi %struct.2* [ [[PTR_PHI]], [[LOOP]] ]
+; CHECK-NEXT: [[PTR_PHI_LCSSA22:%.*]] = ptrtoint %struct.2* [[PTR_PHI_LCSSA]] to i64
+; CHECK-NEXT: [[B_GEP_0:%.*]] = getelementptr float, float* [[B:%.*]], i64 0
+; CHECK-NEXT: [[B_GEP_021:%.*]] = ptrtoint float* [[B_GEP_0]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[B_GEP_021]], [[PTR_PHI_LCSSA22]]
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
+; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[EXIT_SCALAR:%.*]], label [[EXIT_SLPVERSIONED1:%.*]]
+; CHECK: exit.scalar:
 ; CHECK-NEXT: [[L_0:%.*]] = load float, float* [[B_GEP_0]], align 8
 ; CHECK-NEXT: [[ADD_0:%.*]] = fadd float [[L_0]], 1.000000e+01
 ; CHECK-NEXT: [[MUL_0:%.*]] = fmul float [[ADD_0]], 3.000000e+01
-; CHECK-NEXT: [[A_GEP_0:%.*]] = getelementptr inbounds [[STRUCT_2:%.*]], %struct.2* [[PTR_PHI]], i64 0, i32 0, i32 0
+; CHECK-NEXT: [[A_GEP_0:%.*]] = getelementptr inbounds [[STRUCT_2:%.*]], %struct.2* [[PTR_PHI_LCSSA]], i64 0, i32 0, i32 0
 ; CHECK-NEXT: store float [[MUL_0]], float* [[A_GEP_0]], align 8
 ; CHECK-NEXT: [[B_GEP_1:%.*]] = getelementptr inbounds float, float* [[B]], i64 1
 ; CHECK-NEXT: [[L_1:%.*]] = load float, float* [[B_GEP_1]], align 8
 ; CHECK-NEXT: [[ADD_1:%.*]] = fadd float [[L_1]], 1.000000e+01
 ; CHECK-NEXT: [[MUL_1:%.*]] = fmul float [[ADD_1]], 3.000000e+01
-; CHECK-NEXT: [[A_GEP_1:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI]], i64 0, i32 0, i32 1
+; CHECK-NEXT: [[A_GEP_1:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI_LCSSA]], i64 0, i32 0, i32 1
 ; CHECK-NEXT: store float [[MUL_1]], float* [[A_GEP_1]], align 8
 ; CHECK-NEXT: [[B_GEP_2:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
 ; CHECK-NEXT: [[L_2:%.*]] = load float, float* [[B_GEP_2]], align 8
 ; CHECK-NEXT: [[ADD_2:%.*]] = fadd float [[L_2]], 1.000000e+01
 ; CHECK-NEXT: [[MUL_2:%.*]] = fmul float [[ADD_2]], 3.000000e+01
-; CHECK-NEXT: [[A_GEP_2:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI]], i64 0, i32 0, i32 2
+; CHECK-NEXT: [[A_GEP_2:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI_LCSSA]], i64 0, i32 0, i32 2
 ; CHECK-NEXT: store float [[MUL_2]], float* [[A_GEP_2]], align 8
 ; CHECK-NEXT: [[B_GEP_3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
 ; CHECK-NEXT: [[L_3:%.*]] = load float, float* [[B_GEP_3]], align 8
 ; CHECK-NEXT: [[ADD_3:%.*]] = fadd float [[L_3]], 1.000000e+01
 ; CHECK-NEXT: [[MUL_3:%.*]] = fmul float [[ADD_3]], 3.000000e+01
-; CHECK-NEXT: [[A_GEP_3:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI]], i64 0, i32 0, i32 3
+; CHECK-NEXT: [[A_GEP_3:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI_LCSSA]], i64 0, i32 0, i32 3
 ; CHECK-NEXT: store float [[MUL_3]], float* [[A_GEP_3]], align 8
+; CHECK-NEXT: br label [[EXIT_MERGE:%.*]]
+; CHECK: exit.merge:
 ; CHECK-NEXT: ret void
+; CHECK: exit.slpversioned1:
+; CHECK-NEXT: [[A_GEP_05:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI_LCSSA]], i64 0, i32 0, i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[B_GEP_0]] to <4 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 8, !alias.scope !25, !noalias !28
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP2]],
+; CHECK-NEXT: [[TMP4:%.*]] = fmul <4 x float> [[TMP3]],
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[A_GEP_05]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 8, !alias.scope !28, !noalias !25
+; CHECK-NEXT: br label [[EXIT_MERGE]]
 ;
 bb:
   br label %loop
@@ -781,7 +873,13 @@
 ; CHECK-NEXT: br i1 [[C:%.*]], label [[EXIT:%.*]], label [[LOOP]]
 ; CHECK: exit:
 ; CHECK-NEXT: [[PTR_PHI_LCSSA:%.*]] = phi %struct.2* [ [[PTR_PHI]], [[LOOP]] ]
-; CHECK-NEXT: [[B_GEP_0:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 0
+; CHECK-NEXT: [[PTR_PHI_LCSSA22:%.*]] = ptrtoint %struct.2* [[PTR_PHI_LCSSA]] to i64
+; CHECK-NEXT: [[B_GEP_0:%.*]] = getelementptr float, float* [[B:%.*]], i64 0
+; CHECK-NEXT: [[B_GEP_021:%.*]] = ptrtoint float* [[B_GEP_0]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[B_GEP_021]], [[PTR_PHI_LCSSA22]]
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
+; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[EXIT_SCALAR:%.*]], label [[EXIT_SLPVERSIONED1:%.*]]
+; CHECK: exit.scalar:
 ; CHECK-NEXT: [[L_0:%.*]] = load float, float* [[B_GEP_0]], align 8
 ; CHECK-NEXT: [[ADD_0:%.*]] = fadd float [[L_0]], 1.000000e+01
 ; CHECK-NEXT: [[MUL_0:%.*]] = fmul float [[ADD_0]], 3.000000e+01
@@ -805,7 +903,18 @@
 ; CHECK-NEXT: [[MUL_3:%.*]] = fmul float [[ADD_3]], 3.000000e+01
 ; CHECK-NEXT: [[A_GEP_3:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI_LCSSA]], i64 0, i32 0, i32 3
 ; CHECK-NEXT: store float [[MUL_3]], float* [[A_GEP_3]], align 8
+; CHECK-NEXT: br label [[EXIT_MERGE:%.*]]
+; CHECK: exit.merge:
 ; CHECK-NEXT: ret void
+; CHECK: exit.slpversioned1:
+; CHECK-NEXT: [[A_GEP_05:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI_LCSSA]], i64 0, i32 0, i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[B_GEP_0]] to <4 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 8, !alias.scope !30, !noalias !33
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP2]],
+; CHECK-NEXT: [[TMP4:%.*]] = fmul <4 x float> [[TMP3]],
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[A_GEP_05]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 8, !alias.scope !33, !noalias !30
+; CHECK-NEXT: br label [[EXIT_MERGE]]
 ;
 bb:
   br label %loop
@@ -1185,10 +1294,13 @@
 define void @crash_instructions_deleted(float* %t, i32* %a, i32** noalias %ptr) {
 ; CHECK-LABEL: @crash_instructions_deleted(
 ; CHECK-NEXT: bb:
+; CHECK-NEXT: [[T42:%.*]] = ptrtoint float* [[T:%.*]] to i64
 ; CHECK-NEXT: [[T15:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 2
+; CHECK-NEXT: [[T16:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 3
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[T15]] to <2 x i32>*
 ; CHECK-NEXT: store <2 x i32> , <2 x i32>* [[TMP0]], align 8
 ; CHECK-NEXT: [[T17:%.*]] = load i32*, i32** [[PTR:%.*]], align 8
+; CHECK-NEXT: [[T1718:%.*]] = ptrtoint i32* [[T17]] to i64
 ; CHECK-NEXT: br label [[BB18:%.*]]
 ; CHECK: bb18:
 ; CHECK-NEXT: [[T19:%.*]] = sext i32 0 to i64
@@ -1198,10 +1310,15 @@
 ; CHECK-NEXT: [[T23:%.*]] = getelementptr inbounds i8, i8* [[T22]], i64 1
 ; CHECK-NEXT: [[T24:%.*]] = getelementptr inbounds i8, i8* [[T22]], i64 2
 ; CHECK-NEXT: [[T25:%.*]] = getelementptr inbounds i8, i8* [[T22]], i64 3
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[T1718]], 4
+; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], [[T42]]
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], 16
+; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[BB18_SCALAR:%.*]], label [[BB18_SLPVERSIONED1:%.*]]
+; CHECK: bb18.scalar:
 ; CHECK-NEXT: [[T26:%.*]] = load i8, i8* [[T22]], align 1
 ; CHECK-NEXT: [[T27:%.*]] = uitofp i8 [[T26]] to float
 ; CHECK-NEXT: [[T28:%.*]] = fdiv float [[T27]], 2.550000e+02
-; CHECK-NEXT: [[T29:%.*]] = getelementptr inbounds float, float* [[T:%.*]], i64 0
+; CHECK-NEXT: [[T29:%.*]] = getelementptr inbounds float, float* [[T]], i64 0
 ; CHECK-NEXT: store float [[T28]], float* [[T29]], align 8
 ; CHECK-NEXT: [[T30:%.*]] = load i8, i8* [[T23]], align 1
 ; CHECK-NEXT: [[T31:%.*]] = uitofp i8 [[T30]] to float
@@ -1218,7 +1335,18 @@
 ; CHECK-NEXT: [[T40:%.*]] = fdiv float [[T39]], 2.550000e+02
 ; CHECK-NEXT: [[T41:%.*]] = getelementptr inbounds float, float* [[T]], i64 3
 ; CHECK-NEXT: store float [[T40]], float* [[T41]], align 4
+; CHECK-NEXT: br label [[BB18_MERGE:%.*]]
+; CHECK: bb18.merge:
 ; CHECK-NEXT: ret void
+; CHECK: bb18.slpversioned1:
+; CHECK-NEXT: [[T295:%.*]] = getelementptr inbounds float, float* [[T]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[T22]] to <4 x i8>*
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1, !alias.scope !35, !noalias !38
+; CHECK-NEXT: [[TMP5:%.*]] = uitofp <4 x i8> [[TMP4]] to <4 x float>
+; CHECK-NEXT: [[TMP6:%.*]] = fdiv <4 x float> [[TMP5]],
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[T295]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP6]], <4 x float>* [[TMP7]], align 8, !alias.scope !38, !noalias !35
+; CHECK-NEXT: br label [[BB18_MERGE]]
 ;
 bb:
   %t6 = icmp slt i32 10, 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll
@@ -1,33 +1,52 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -scoped-noalias-aa -slp-vectorizer -mtriple=x86_64-apple-darwin -enable-new-pm=false -S %s | FileCheck %s
-; RUN: opt -aa-pipeline='basic-aa,scoped-noalias-aa' -passes=slp-vectorizer -mtriple=x86_64-apple-darwin -S %s | FileCheck %s
+; RUN: opt -aa-pipeline='basic-aa,scoped-noalias-aa' -slp-memory-versioning -passes=slp-vectorizer -mtriple=x86_64-apple-darwin -S %s | FileCheck %s
+; RUN: opt -aa-pipeline='basic-aa,scoped-noalias-aa' -slp-memory-versioning=false -passes=slp-vectorizer -mtriple=x86_64-apple-darwin -S %s | FileCheck --check-prefix=NOVERSION %s
+
+; NOVERSION-NOT: memcheck
 define void @version_multiple(i32* nocapture %out_block, i32* nocapture readonly %counter) {
 ; CHECK-LABEL: @version_multiple(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[COUNTER:%.*]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OUT_BLOCK:%.*]], align 4
-; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[OUT_BLOCK13:%.*]] = ptrtoint i32* [[OUT_BLOCK:%.*]] to i64
+; CHECK-NEXT: [[COUNTER12:%.*]] = ptrtoint i32* [[COUNTER:%.*]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[COUNTER12]], [[OUT_BLOCK13]]
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
+; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[ENTRY_SCALAR:%.*]], label [[ENTRY_SLPVERSIONED1:%.*]]
+; CHECK: entry.scalar:
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[COUNTER]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[OUT_BLOCK]], align 4
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP2]], [[TMP1]]
 ; CHECK-NEXT: store i32 [[XOR]], i32* [[OUT_BLOCK]], align 4
 ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 1
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
 ; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 1
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX2_1]], align 4
-; CHECK-NEXT: [[XOR_1:%.*]] = xor i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX2_1]], align 4
+; CHECK-NEXT: [[XOR_1:%.*]] = xor i32 [[TMP4]], [[TMP3]]
 ; CHECK-NEXT: store i32 [[XOR_1]], i32* [[ARRAYIDX2_1]], align 4
 ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 2
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
 ; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 2
-; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX2_2]], align 4
-; CHECK-NEXT: [[XOR_2:%.*]] = xor i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX2_2]], align 4
+; CHECK-NEXT: [[XOR_2:%.*]] = xor i32 [[TMP6]], [[TMP5]]
 ; CHECK-NEXT: store i32 [[XOR_2]], i32* [[ARRAYIDX2_2]], align 4
 ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 3
-; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
 ; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 3
-; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX2_3]], align 4
-; CHECK-NEXT: [[XOR_3:%.*]] = xor i32 [[TMP7]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX2_3]], align 4
+; CHECK-NEXT: [[XOR_3:%.*]] = xor i32 [[TMP8]], [[TMP7]]
 ; CHECK-NEXT: store i32 [[XOR_3]], i32* [[ARRAYIDX2_3]], align 4
+; CHECK-NEXT: br label [[ENTRY_MERGE:%.*]]
+; CHECK: entry.merge:
 ; CHECK-NEXT: ret void
+; CHECK: entry.slpversioned1:
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[COUNTER]] to <4 x i32>*
+; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !0, !noalias !3
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[OUT_BLOCK]] to <4 x i32>*
+; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4, !alias.scope !3, !noalias !0
+; CHECK-NEXT: [[TMP13:%.*]] = xor <4 x i32> [[TMP12]], [[TMP10]]
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[OUT_BLOCK]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 4, !alias.scope !3, !noalias !0
+; CHECK-NEXT: br label [[ENTRY_MERGE]]
 ;
 entry:
   %0 = load i32, i32* %counter, align 4