diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h --- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h +++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h @@ -55,6 +55,14 @@ } // end namespace slpvectorizer +struct SLPVectorizerResult { + bool MadeAnyChange; + bool MadeCFGChange; + + SLPVectorizerResult(bool MadeAnyChange, bool MadeCFGChange) + : MadeAnyChange(MadeAnyChange), MadeCFGChange(MadeCFGChange) {} +}; + struct SLPVectorizerPass : public PassInfoMixin { using StoreList = SmallVector; using StoreListMap = MapVector; @@ -75,10 +83,12 @@ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); // Glue for old PM. - bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, - TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, - DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, - OptimizationRemarkEmitter *ORE_); + SLPVectorizerResult runImpl(Function &F, ScalarEvolution *SE_, + TargetTransformInfo *TTI_, + TargetLibraryInfo *TLI_, AAResults *AA_, + LoopInfo *LI_, DominatorTree *DT_, + AssumptionCache *AC_, DemandedBits *DB_, + OptimizationRemarkEmitter *ORE_); private: /// Collect store and getelementptr instructions and organize them @@ -137,6 +147,11 @@ bool vectorizeStores(ArrayRef Stores, slpvectorizer::BoUpSLP &R); + SLPVectorizerResult + vectorizeBlockWithVersioning(BasicBlock *BB, + const SmallPtrSetImpl &TrackedObjects, + slpvectorizer::BoUpSLP &R); + /// The store instructions in a basic block organized by base pointer. StoreListMap Stores; diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -35,6 +35,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/DemandedBits.h" +#include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/LoopAccessAnalysis.h" @@ -62,6 +63,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/NoFolder.h" #include "llvm/IR/Operator.h" @@ -85,8 +87,11 @@ #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/InjectTLIMappings.h" #include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include "llvm/Transforms/Vectorize.h" #include #include @@ -107,6 +112,10 @@ #define DEBUG_TYPE "SLP" STATISTIC(NumVectorInstructions, "Number of vector instructions generated"); +STATISTIC(NumVersioningSuccessful, + "Number of times versioning was tried and beneficial"); +STATISTIC(NumVersioningFailed, + "Number of times versioning was tried but was not beneficial"); cl::opt RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes")); @@ -175,6 +184,10 @@ ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz")); +static cl::opt EnableMemoryVersioning( + "slp-memory-versioning", cl::init(false), cl::Hidden, + cl::desc("Enable memory versioning for SLP vectorization.")); + // Limit the number of 
alias checks. The limit is chosen so that // it has no negative effect on the llvm benchmarks. static const unsigned AliasedCheckLimit = 10; @@ -581,6 +594,45 @@ return Index; } +struct AccessInfo { + Value *UnderlyingObj; + const SCEV *PtrSCEV; + Type *AccessTy; + + AccessInfo() : UnderlyingObj(nullptr), PtrSCEV(nullptr), AccessTy(nullptr) {} + AccessInfo(Value *UnderlyingObj, const SCEV *PtrSCEV, Type *AccessTy) + : UnderlyingObj(UnderlyingObj), PtrSCEV(PtrSCEV), AccessTy(AccessTy) {} +}; +static AccessInfo getObject(Instruction &I, ScalarEvolution &SE, + DominatorTree &DT) { + BasicBlock *BB = I.getParent(); + auto GetPtrAndAccessTy = [](Instruction *I) -> std::pair { + if (auto *L = dyn_cast(I)) { + if (!L->getType()->isVectorTy()) + return {L->getPointerOperand(), L->getType()}; + } + if (auto *S = dyn_cast(I)) + if (!S->getValueOperand()->getType()->isVectorTy()) + return {S->getPointerOperand(), S->getValueOperand()->getType()}; + return {nullptr, nullptr}; + }; + Value *Ptr; + Type *AccessTy; + std::tie(Ptr, AccessTy) = GetPtrAndAccessTy(&I); + if (!Ptr) + return {}; + Value *Obj = getUnderlyingObject(Ptr); + if (!Obj) + return {}; + auto *Start = SE.getSCEV(Ptr); + + PHINode *PN = dyn_cast(Obj); + if (!SE.properlyDominates(Start, BB) && + !(PN && DT.dominates(PN->getParent(), BB))) + return {}; + return {Obj, Start, AccessTy}; +} + namespace slpvectorizer { /// Bottom Up SLP Vectorizer. @@ -589,6 +641,16 @@ struct ScheduleData; public: + // Map of objects to start & end pointers we need to generate runtime checks + // for. + SmallPtrSet TrackedObjects; + /// Cache for alias results. + /// TODO: consider moving this to the AliasAnalysis itself. + using AliasCacheKey = std::pair; + DenseMap> AliasCache; + + bool CollectMemAccess = false; + using ValueList = SmallVector; using InstrList = SmallVector; using ValueSet = SmallPtrSet; @@ -772,6 +834,24 @@ "All indices must be initialized"); } + void removeDeletedInstructions() { + for (const auto &Pair : DeletedInstructions) { + // Replace operands of ignored instructions with Undefs in case if they + // were marked for deletion. + if (Pair.getSecond()) { + Value *Undef = UndefValue::get(Pair.getFirst()->getType()); + Pair.getFirst()->replaceAllUsesWith(Undef); + } + Pair.getFirst()->dropAllReferences(); + } + for (const auto &Pair : DeletedInstructions) { + assert(Pair.getFirst()->use_empty() && + "trying to erase instruction with users."); + Pair.getFirst()->eraseFromParent(); + } + DeletedInstructions.clear(); + } + /// \return The vector element size in bits to use when vectorizing the /// expression tree ending at \p V. If V is a store, the size is the width of /// the stored value. Otherwise, the size is the width of the largest loaded @@ -1976,11 +2056,6 @@ return aliased; } - using AliasCacheKey = std::pair; - - /// Cache for alias results. - /// TODO: consider moving this to the AliasAnalysis itself. - DenseMap> AliasCache; /// Removes an instruction from its block and eventually deletes it. /// It's like Instruction::eraseFromParent() except that the actual deletion @@ -2565,27 +2640,7 @@ } // end namespace llvm -BoUpSLP::~BoUpSLP() { - for (const auto &Pair : DeletedInstructions) { - // Replace operands of ignored instructions with Undefs in case if they were - // marked for deletion. 
- if (Pair.getSecond()) { - Value *Undef = UndefValue::get(Pair.getFirst()->getType()); - Pair.getFirst()->replaceAllUsesWith(Undef); - } - Pair.getFirst()->dropAllReferences(); - } - for (const auto &Pair : DeletedInstructions) { - assert(Pair.getFirst()->use_empty() && - "trying to erase instruction with users."); - Pair.getFirst()->eraseFromParent(); - } -#ifdef EXPENSIVE_CHECKS - // If we could guarantee that this call is not extremely slow, we could - // remove the ifdef limitation (see PR47712). - assert(!verifyFunction(*F, &dbgs())); -#endif -} +BoUpSLP::~BoUpSLP() { removeDeletedInstructions(); } void BoUpSLP::eraseInstructions(ArrayRef AV) { for (auto *V : AV) { @@ -6242,6 +6297,7 @@ while (DepDest) { assert(isInSchedulingRegion(DepDest)); + ScheduleData *DestBundle = DepDest->FirstInBundle; // We have two limits to reduce the complexity: // 1) AliasedCheckLimit: It's a small limit to reduce calls to // SLP->isAliased (which is the expensive part in this loop). @@ -6259,9 +6315,38 @@ // balance between reduced runtime and accurate dependencies. numAliased++; + // If this bundle is not scheduled and no versioned code has been + // generated yet, try to collect the bounds of the accesses to + // generate runtime checks. + if (!DestBundle->IsScheduled && SLP->CollectMemAccess) { + // FIXME Naming + auto GetPtr = [](Instruction *I) -> Value * { + if (auto *L = dyn_cast(I)) + return L->getPointerOperand(); + if (auto *S = dyn_cast(I)) + return S->getPointerOperand(); + return nullptr; + }; + auto *Src = GetPtr(SrcInst); + auto *Dst = GetPtr(DepDest->Inst); + + if (SrcInst->getParent() == DepDest->Inst->getParent() && Src && + Dst) { + auto SrcObjAndPtr = getObject(*SrcInst, *SLP->SE, *SLP->DT); + auto DstObjAndPtr = + getObject(*DepDest->Inst, *SLP->SE, *SLP->DT); + if (!SrcObjAndPtr.UnderlyingObj || + !DstObjAndPtr.UnderlyingObj || + SrcObjAndPtr.UnderlyingObj == DstObjAndPtr.UnderlyingObj) + SLP->TrackedObjects.clear(); + else { + SLP->TrackedObjects.insert(SrcObjAndPtr.UnderlyingObj); + SLP->TrackedObjects.insert(DstObjAndPtr.UnderlyingObj); + } + } + } DepDest->MemoryDependencies.push_back(BundleMember); BundleMember->Dependencies++; - ScheduleData *DestBundle = DepDest->FirstInBundle; if (!DestBundle->IsScheduled) { BundleMember->incrementUnscheduledDeps(1); } @@ -6701,7 +6786,7 @@ auto *DB = &getAnalysis().getDemandedBits(); auto *ORE = &getAnalysis().getORE(); - return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE); + return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE).MadeAnyChange; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -6719,7 +6804,8 @@ AU.addPreserved(); AU.addPreserved(); AU.addPreserved(); - AU.setPreservesCFG(); + if (!EnableMemoryVersioning) + AU.setPreservesCFG(); } }; @@ -6736,23 +6822,305 @@ auto *DB = &AM.getResult(F); auto *ORE = &AM.getResult(F); - bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE); - if (!Changed) + auto Result = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE); + if (!Result.MadeAnyChange) return PreservedAnalyses::all(); PreservedAnalyses PA; - PA.preserveSet(); + if (!Result.MadeCFGChange) + PA.preserveSet(); + PA.preserve(); + PA.preserve(); return PA; } -bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, - TargetTransformInfo *TTI_, - TargetLibraryInfo *TLI_, AAResults *AA_, - LoopInfo *LI_, DominatorTree *DT_, - AssumptionCache *AC_, DemandedBits *DB_, - OptimizationRemarkEmitter *ORE_) { +SLPVectorizerResult SLPVectorizerPass::vectorizeBlockWithVersioning( + 
BasicBlock *BB, const SmallPtrSetImpl &TrackedObjects, + slpvectorizer::BoUpSLP &R) { + bool Changed = false; + bool CFGChanged = false; + R.AliasCache.clear(); + + // First, clean up delete instructions, so they are not re-used during SCEV + // expansion. + R.removeDeletedInstructions(); + + // Collect up-to-date memory bounds for tracked objects. Also collect the + // first and last memory instruction using a tracked object. + MapVector MemBounds; + SmallPtrSet WrittenObjs; + Instruction *FirstTrackedInst = nullptr; + Instruction *LastTrackedInst = nullptr; + for (Instruction &I : *BB) { + auto ObjAndStart = getObject(I, *SE, *DT); + if (!ObjAndStart.UnderlyingObj) + continue; + auto *Obj = ObjAndStart.UnderlyingObj; + const auto *Start = ObjAndStart.PtrSCEV; + + if (I.mayWriteToMemory()) + WrittenObjs.insert(Obj); + + unsigned AS = Obj->getType()->getPointerAddressSpace(); + // Runtime checks are generated to ensure this property holds. + auto &DL = BB->getModule()->getDataLayout(); + Type *IdxTy = DL.getIndexType(Obj->getType()); + const SCEV *EltSizeSCEV = + SE->getStoreSizeOfExpr(IdxTy, ObjAndStart.AccessTy); + auto *End = SE->getAddExpr(Start, EltSizeSCEV); + + if (TrackedObjects.find(Obj) != TrackedObjects.end()) + MemBounds.insert({Obj, {0, Start, End, AS}}); + auto BoundsIter = MemBounds.find(Obj); + if (BoundsIter == MemBounds.end()) + continue; + BoundsIter->second.addPointer(0, Start, End, AS, *SE); + if (!FirstTrackedInst) + FirstTrackedInst = &I; + LastTrackedInst = &I; + } + + // Not enough memory access bounds for runtime checks. + if (MemBounds.size() < 2 || WrittenObjs.empty()) + return {Changed, CFGChanged}; + + // Check if all uses between the first and last tracked instruction are inside + // the region. If that is not the case, PHIs would need to be added when + // duplicating the block. + auto AllUsesInside = [FirstTrackedInst, LastTrackedInst](BasicBlock *BB) { + return all_of(make_range(FirstTrackedInst->getIterator(), + std::next(LastTrackedInst->getIterator())), + [LastTrackedInst, BB](Instruction &I) { + return all_of(I.users(), [LastTrackedInst, BB](User *U) { + if (auto *UserI = dyn_cast(U)) + return UserI->getParent() == BB && + (UserI->comesBefore(LastTrackedInst) || + UserI == LastTrackedInst); + return true; + }); + }); + }; + if (!AllUsesInside(BB)) + return {Changed, CFGChanged}; + + SmallVector> BoundGroups; + for (auto &B : MemBounds) + BoundGroups.emplace_back(B.first, &B.second); + + // Create a RuntimePointerCheck for all groups in BoundGroups. + SmallVector PointerChecks; + for (unsigned I = 0, E = BoundGroups.size(); I != E; ++I) { + bool AWrites = WrittenObjs.contains(BoundGroups[I].first); + for (unsigned J = I + 1; J != E; ++J) + if (AWrites || WrittenObjs.contains(BoundGroups[J].first)) + PointerChecks.emplace_back(&*BoundGroups[I].second, + &*BoundGroups[J].second); + } + + // Duplicate BB now and set up block and branches for memory checks. 
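+  // The code below creates the following block structure (names derived from
+  // the original block name):
+  //
+  //   CheckBlock (original block up to FirstTrackedInst; receives the
+  //               runtime checks and a conditional branch)
+  //      |      \
+  //   ScalarBB   VectorBB (clone of ScalarBB; vectorized and annotated with
+  //      |      /          !alias.scope/!noalias metadata)
+  //   MergeBlock
+  //      |
+  //   Tail (instructions after LastTrackedInst, if any)
+  //
+  // The conditional branch is created with a placeholder 'true' condition; it
+  // is only replaced with the real overlap check if versioning turns out to
+  // be profitable. Otherwise the split blocks are folded back together.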
+ std::string OriginalBBName = BB->getName().str(); + IRBuilder<> ChkBuilder(BB->getFirstNonPHI()); + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + + BasicBlock *Tail = nullptr; + if (LastTrackedInst->getNextNode() != BB->getTerminator()) + Tail = SplitBlock(BB, LastTrackedInst->getNextNode(), &DTU, LI, nullptr, + OriginalBBName + ".tail"); + auto *CheckBlock = BB; + BB = SplitBlock(BB, FirstTrackedInst, &DTU, LI, nullptr, + OriginalBBName + ".slpmemcheck"); + for (Use &U : make_early_inc_range(BB->uses())) { + BasicBlock *UserBB = cast(U.getUser())->getParent(); + if (UserBB == CheckBlock) + continue; + + U.set(CheckBlock); + DTU.applyUpdates({{DT->Delete, UserBB, BB}}); + DTU.applyUpdates({{DT->Insert, UserBB, CheckBlock}}); + } + CFGChanged = true; + + auto *MergeBlock = BB; + BasicBlock *ScalarBB = + splitBlockBefore(BB, BB->getTerminator(), &DTU, LI, nullptr, + OriginalBBName + ".slpversioned"); + + ValueToValueMapTy VMap; + BasicBlock *VectorBB = CloneBasicBlock(ScalarBB, VMap, "", BB->getParent()); + ScalarBB->setName(OriginalBBName + ".scalar"); + MergeBlock->setName(OriginalBBName + ".merge"); + SmallVector Tmp; + Tmp.push_back(VectorBB); + remapInstructionsInBlocks(Tmp, VMap); + auto *Term = CheckBlock->getTerminator(); + ChkBuilder.SetInsertPoint(CheckBlock->getTerminator()); + ChkBuilder.CreateCondBr(ChkBuilder.getTrue(), ScalarBB, VectorBB); + Term->eraseFromParent(); + DTU.applyUpdates({{DT->Insert, CheckBlock, VectorBB}}); + if (auto *L = LI->getLoopFor(CheckBlock)) + L->addBasicBlockToLoop(VectorBB, *LI); + Changed = true; + + // Add !noalias metadata to memory accesses in the versiond block. + LLVMContext &Ctx = BB->getContext(); + MDBuilder MDB(Ctx); + MDNode *Domain = MDB.createAnonymousAliasScopeDomain("SLPVerDomain"); + + DenseMap GroupToScope; + for (const auto &Group : MemBounds) + GroupToScope[&Group.second] = MDB.createAnonymousAliasScope(Domain); + + for (Instruction &I : *VectorBB) { + auto GetPtr = [](Instruction *I) -> Value * { + if (auto *L = dyn_cast(I)) + return L->getPointerOperand(); + if (auto *S = dyn_cast(I)) + return S->getPointerOperand(); + return nullptr; + }; + auto *Ptr = GetPtr(&I); + if (!Ptr) + continue; + + auto *PtrSCEV = SE->getSCEV(Ptr); + Value *Obj = getUnderlyingObject(Ptr); + if (!Obj) { + if (auto *GEP = dyn_cast(Ptr)) + Obj = GEP->getOperand(0); + else + continue; + } + + auto BoundsIter = MemBounds.find(Obj); + if (BoundsIter == MemBounds.end()) + continue; + auto *LowerBound = BoundsIter->second.Low; + auto *UpperBound = BoundsIter->second.High; + auto *Scope = GroupToScope.find(&BoundsIter->second)->second; + + auto *LowerSub = SE->getMinusSCEV(PtrSCEV, LowerBound); + auto *UpperSub = SE->getMinusSCEV(UpperBound, PtrSCEV); + if (!isa(LowerSub) && + !isa(UpperSub) && + SE->isKnownNonNegative(LowerSub) && SE->isKnownNonNegative(UpperSub)) { + I.setMetadata( + LLVMContext::MD_alias_scope, + MDNode::concatenate(I.getMetadata(LLVMContext::MD_alias_scope), + MDNode::get(Ctx, Scope))); + + SmallVector NonAliasing; + for (auto &KV : GroupToScope) { + if (KV.first == &BoundsIter->second) + continue; + NonAliasing.push_back(KV.second); + } + I.setMetadata(LLVMContext::MD_noalias, + MDNode::concatenate(I.getMetadata(LLVMContext::MD_noalias), + MDNode::get(Ctx, NonAliasing))); + } + } + + DTU.flush(); + DT->updateDFSNumbers(); + collectSeedInstructions(VectorBB); + + // Vectorize trees that end at stores. 
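+  // Seed instructions are re-collected from the cloned block (VectorBB), so
+  // the store chains vectorized below belong to the versioned copy. The cost
+  // of the versioned block plus an estimate for the runtime checks is then
+  // compared against the cost of the scalar block; if versioning is not
+  // beneficial, the CFG changes are undone again.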
+ assert(!Stores.empty() && "should have stores when versioning"); + LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size() + << " underlying objects.\n"); + bool AnyVectorized = vectorizeStoreChains(R); + Changed |= AnyVectorized; + + R.removeDeletedInstructions(); + InstructionCost ScalarCost = 0; + for (Instruction &I : *ScalarBB) + ScalarCost += TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); + InstructionCost SLPCost = 0; + for (Instruction &I : make_early_inc_range(reverse(*VectorBB))) { + if (!I.getType()->isVoidTy() && I.use_empty()) { + I.eraseFromParent(); + continue; + } + SLPCost += TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); + } + + // Estimate the size of the runtime checks, consisting of computing lower & + // upper bounds (2), the overlap checks (2) and the AND/OR to combine the + // checks. + SLPCost += 5 * PointerChecks.size() + MemBounds.size(); + if (SLPCost >= ScalarCost) { + // Vectorization not beneficial or possible. Restore original state by + // removing the introduced blocks. + R.getORE()->emit([&]() { + return OptimizationRemarkMissed(SV_NAME, "VersioningNotBeneficial", + &*ScalarBB->begin()) + << "Tried to version block but was not beneficial" + << ore::NV("VectorCost", SLPCost) + << " >= " << ore::NV("ScalarCost", ScalarCost) + << ore::NV("AnyVectorized", AnyVectorized); + }); + + Changed = false; + CFGChanged = false; + CheckBlock->setName(OriginalBBName); + Instruction *OldTerm = CheckBlock->getTerminator(); + OldTerm->eraseFromParent(); + IRBuilder<> Builder(CheckBlock); + Builder.CreateBr(ScalarBB); + DTU.applyUpdates({{DT->Delete, CheckBlock, VectorBB}}); + LI->removeBlock(VectorBB); + DTU.deleteBB(VectorBB); + DTU.applyUpdates({{DT->Delete, VectorBB, MergeBlock}}); + MergeBlockIntoPredecessor(MergeBlock, &DTU, LI); + if (Tail) + MergeBlockIntoPredecessor(Tail, &DTU, LI); + MergeBlockIntoPredecessor(ScalarBB, &DTU, LI); + NumVersioningFailed++; + } else { + R.getORE()->emit( + OptimizationRemark(SV_NAME, "VersioningSuccessful", &*ScalarBB->begin()) + << "SLP vectorization with versioning is beneficial " + << ore::NV("VectorCost", SLPCost) << " < " + << ore::NV("ScalarCost", ScalarCost) + << ore::NV("AnyVectorized", AnyVectorized)); + + ChkBuilder.SetInsertPoint(CheckBlock->getTerminator()); + SCEVExpander Exp(*SE, BB->getParent()->getParent()->getDataLayout(), + "memcheck"); + Value *MemoryOverlap = addRuntimeChecks(CheckBlock->getTerminator(), + nullptr, PointerChecks, Exp) + .second; + assert(MemoryOverlap && + "runtime checks required, but no checks generated in IR?"); + + Value *NoOverflowCheck = MemoryOverlap; + // https://alive2.llvm.org/ce/z/dTuGLx + // // Emit checks ensuring that computing the upper bound does not + // overflow. 
+ // for (auto &B : MemBounds) { + // Type *PtrArithTy = Type::getInt8PtrTy(Ctx, B.second.AddressSpace); + // Value *Low = Exp.expandCodeFor(B.second.Low, PtrArithTy); + // Value *High = Exp.expandCodeFor(B.second.High, PtrArithTy); + // NoOverflowCheck = ChkBuilder.CreateOr( + // NoOverflowCheck, ChkBuilder.CreateICmpUGT(Low, High, "wrap"), + //"check"); + //} + cast(CheckBlock->getTerminator()) + ->setCondition(NoOverflowCheck); + NumVersioningSuccessful++; + } + DTU.flush(); + DT->updateDFSNumbers(); + + return {Changed, CFGChanged}; +} + +SLPVectorizerResult SLPVectorizerPass::runImpl( + Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, + TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, + AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_) { if (!RunSLPVectorization) - return false; + return {false, false}; SE = SE_; TTI = TTI_; TLI = TLI_; @@ -6766,15 +7134,16 @@ Stores.clear(); GEPs.clear(); bool Changed = false; + bool CFGChanged = false; // If the target claims to have no vector registers don't attempt // vectorization. if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) - return false; + return {false, false}; // Don't vectorize when the attribute NoImplicitFloat is used. if (F.hasFnAttribute(Attribute::NoImplicitFloat)) - return false; + return {false, false}; LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n"); @@ -6788,19 +7157,29 @@ // Update DFS numbers now so that we can use them for ordering. DT->updateDFSNumbers(); + SmallVector BlocksToRetry; + SmallVector, 4> BoundsToUse; // Scan the blocks in the function in post order. for (auto BB : post_order(&F.getEntryBlock())) { collectSeedInstructions(BB); + bool VectorizedBlock = false; // Vectorize trees that end at stores. if (!Stores.empty()) { LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size() << " underlying objects.\n"); - Changed |= vectorizeStoreChains(R); + R.TrackedObjects.clear(); + + if (EnableMemoryVersioning) + R.CollectMemAccess = BB->size() <= 300; + + VectorizedBlock = vectorizeStoreChains(R); + + R.CollectMemAccess = false; } // Vectorize trees that end at reductions. - Changed |= vectorizeChainsInBlock(BB, R); + VectorizedBlock |= vectorizeChainsInBlock(BB, R); // Vectorize the index computations of getelementptr instructions. 
This // is primarily intended to catch gather-like idioms ending at @@ -6808,15 +7187,30 @@ if (!GEPs.empty()) { LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size() << " underlying objects.\n"); - Changed |= vectorizeGEPIndices(BB, R); + VectorizedBlock |= vectorizeGEPIndices(BB, R); + } + + if (!VectorizedBlock && !R.TrackedObjects.empty()) { + BlocksToRetry.push_back(BB); + BoundsToUse.push_back(R.TrackedObjects); } + R.TrackedObjects.clear(); + Changed |= VectorizedBlock; } - if (Changed) { + for (unsigned I = 0; I != BlocksToRetry.size(); I++) { + auto Status = + vectorizeBlockWithVersioning(BlocksToRetry[I], BoundsToUse[I], R); + Changed |= Status.MadeAnyChange; + CFGChanged |= Status.MadeCFGChange; + } + + if (Changed && BlocksToRetry.empty()) { R.optimizeGatherSequence(); LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n"); } - return Changed; + + return {Changed, CFGChanged}; } /// Order may have elements assigned special value (size) which is out of diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic < %s | FileCheck %s +; RUN: opt -slp-memory-versioning -scoped-noalias-aa -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic -enable-new-pm=false < %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64" @@ -102,7 +102,15 @@ ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SCALE]], align 16 ; CHECK-NEXT: [[OFFSET:%.*]] = getelementptr inbounds [[STRUCT_WEIGHT_T]], %struct.weight_t* [[W]], i64 0, i32 1 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OFFSET]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, i8* [[SRC:%.*]], align 1 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, i8* [[SRC:%.*]], i64 4 +; CHECK-NEXT: [[SCEVGEP37:%.*]] = getelementptr i8, i8* [[DST:%.*]], i64 4 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SRC]], [[SCEVGEP37]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[DST]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[ENTRY_SCALAR:%.*]], label [[ENTRY_SLPVERSIONED1:%.*]] +; CHECK: entry.scalar: +; CHECK-NEXT: [[TMP2:%.*]] = load i8, i8* [[SRC]], align 1 ; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP2]] to i32 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], [[CONV]] ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[MUL]], [[TMP1]] @@ -111,7 +119,7 @@ ; CHECK-NEXT: [[SHR_I:%.*]] = sext i1 [[TMP3]] to i32 ; CHECK-NEXT: [[COND_I:%.*]] = select i1 [[TOBOOL_NOT_I]], i32 [[ADD]], i32 [[SHR_I]] ; CHECK-NEXT: [[CONV_I:%.*]] = trunc i32 [[COND_I]] to i8 -; CHECK-NEXT: store i8 [[CONV_I]], i8* [[DST:%.*]], align 1 +; CHECK-NEXT: store i8 [[CONV_I]], i8* [[DST]], align 1 ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 1 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[ARRAYIDX_1]], align 1 ; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP4]] to i32 @@ -148,7 +156,27 @@ ; CHECK-NEXT: [[CONV_I_3:%.*]] = trunc i32 [[COND_I_3]] to i8 ; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 3 ; CHECK-NEXT: store i8 [[CONV_I_3]], i8* [[ARRAYIDX2_3]], 
align 1 +; CHECK-NEXT: br label [[ENTRY_MERGE:%.*]] +; CHECK: entry.merge: ; CHECK-NEXT: ret void +; CHECK: entry.slpversioned1: +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[SRC]] to <4 x i8>* +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, <4 x i8>* [[TMP10]], align 1, !alias.scope !0, !noalias !3 +; CHECK-NEXT: [[TMP12:%.*]] = zext <4 x i8> [[TMP11]] to <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = mul nsw <4 x i32> [[SHUFFLE]], [[TMP12]] +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[SHUFFLE36:%.*]] = shufflevector <4 x i32> [[TMP15]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = add nsw <4 x i32> [[TMP14]], [[SHUFFLE36]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp ult <4 x i32> [[TMP16]], +; CHECK-NEXT: [[TMP18:%.*]] = icmp sgt <4 x i32> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = sext <4 x i1> [[TMP18]] to <4 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = select <4 x i1> [[TMP17]], <4 x i32> [[TMP16]], <4 x i32> [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = trunc <4 x i32> [[TMP20]] to <4 x i8> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8* [[DST]] to <4 x i8>* +; CHECK-NEXT: store <4 x i8> [[TMP21]], <4 x i8>* [[TMP22]], align 1, !alias.scope !3, !noalias !0 +; CHECK-NEXT: br label [[ENTRY_MERGE]] ; entry: %scale = getelementptr inbounds %struct.weight_t, %struct.weight_t* %w, i64 0, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks-in-loops.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks-in-loops.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks-in-loops.ll @@ -0,0 +1,238 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -scoped-noalias-aa -slp-vectorizer -slp-memory-versioning -enable-new-pm=false -mtriple=arm64-apple-ios -S %s | FileCheck %s + +define void @loop1(i32* %A, i32* %B, i64 %N) { +; CHECK-LABEL: @loop1( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[LOOP_TAIL:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP_TAIL]] ] +; CHECK-NEXT: [[TMP0:%.*]] = shl nuw i64 [[INDVAR]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[SCEVGEP28:%.*]] = bitcast i32* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], 4 +; CHECK-NEXT: [[SCEVGEP29:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[SCEVGEP2930:%.*]] = bitcast i32* [[SCEVGEP29]] to i8* +; CHECK-NEXT: [[SCEVGEP31:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[SCEVGEP3132:%.*]] = bitcast i32* [[SCEVGEP31]] to i8* +; CHECK-NEXT: [[SCEVGEP33:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[SCEVGEP3334:%.*]] = bitcast i32* [[SCEVGEP33]] to i8* +; CHECK-NEXT: [[B_GEP_0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[IV]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP28]], [[SCEVGEP3334]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP3132]], [[SCEVGEP2930]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: br i1 
[[MEMCHECK_CONFLICT]], label [[LOOP_SCALAR:%.*]], label [[LOOP_SLPVERSIONED1:%.*]] +; CHECK: loop.scalar: +; CHECK-NEXT: [[B_0:%.*]] = load i32, i32* [[B_GEP_0]], align 4 +; CHECK-NEXT: [[A_GEP_0:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]] +; CHECK-NEXT: [[A_0:%.*]] = load i32, i32* [[A_GEP_0]], align 4 +; CHECK-NEXT: [[ADD_0:%.*]] = add i32 [[A_0]], 20 +; CHECK-NEXT: [[XOR_0:%.*]] = xor i32 [[ADD_0]], [[B_0]] +; CHECK-NEXT: store i32 [[XOR_0]], i32* [[A_GEP_0]], align 4 +; CHECK-NEXT: [[IV_1:%.*]] = or i64 [[IV]], 1 +; CHECK-NEXT: [[B_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[IV_1]] +; CHECK-NEXT: [[B_1:%.*]] = load i32, i32* [[B_GEP_1]], align 4 +; CHECK-NEXT: [[A_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV_1]] +; CHECK-NEXT: [[A_1:%.*]] = load i32, i32* [[A_GEP_1]], align 4 +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[A_1]], 20 +; CHECK-NEXT: [[XOR_1:%.*]] = xor i32 [[ADD_1]], [[B_1]] +; CHECK-NEXT: store i32 [[XOR_1]], i32* [[A_GEP_1]], align 4 +; CHECK-NEXT: [[IV_2:%.*]] = or i64 [[IV]], 2 +; CHECK-NEXT: [[B_GEP_2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[IV_2]] +; CHECK-NEXT: [[B_2:%.*]] = load i32, i32* [[B_GEP_2]], align 4 +; CHECK-NEXT: [[A_GEP_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV_2]] +; CHECK-NEXT: [[A_2:%.*]] = load i32, i32* [[A_GEP_2]], align 4 +; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[A_2]], 20 +; CHECK-NEXT: [[XOR_2:%.*]] = xor i32 [[ADD_2]], [[B_2]] +; CHECK-NEXT: store i32 [[XOR_2]], i32* [[A_GEP_2]], align 4 +; CHECK-NEXT: [[IV_3:%.*]] = or i64 [[IV]], 3 +; CHECK-NEXT: [[B_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[IV_3]] +; CHECK-NEXT: [[B_3:%.*]] = load i32, i32* [[B_GEP_3]], align 4 +; CHECK-NEXT: [[A_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV_3]] +; CHECK-NEXT: [[A_3:%.*]] = load i32, i32* [[A_GEP_3]], align 4 +; CHECK-NEXT: [[ADD_3:%.*]] = add i32 [[A_3]], 20 +; CHECK-NEXT: [[XOR_3:%.*]] = xor i32 [[ADD_3]], [[B_3]] +; CHECK-NEXT: store i32 [[XOR_3]], i32* [[A_GEP_3]], align 4 +; CHECK-NEXT: br label [[LOOP_MERGE:%.*]] +; CHECK: loop.merge: +; CHECK-NEXT: br label [[LOOP_TAIL]] +; CHECK: loop.tail: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 16 +; CHECK-NEXT: [[COND:%.*]] = icmp ult i64 [[IV_NEXT]], [[N:%.*]] +; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 +; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop.slpversioned1: +; CHECK-NEXT: [[A_GEP_03:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[B_GEP_0]] to <4 x i32>* +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4, !alias.scope !0, !noalias !3 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[A_GEP_03]] to <4 x i32>* +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP5]], +; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i32> [[TMP6]], [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[A_GEP_03]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: br label [[LOOP_MERGE]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %B.gep.0 = getelementptr inbounds i32, i32* %B, i64 %iv + %B.0 = load i32, i32* %B.gep.0, align 4 + %A.gep.0 = getelementptr inbounds i32, i32* %A, i64 %iv + %A.0 = load i32, i32* %A.gep.0, align 4 + %add.0 = add i32 
%A.0, 20 + %xor.0 = xor i32 %add.0, %B.0 + store i32 %xor.0, i32* %A.gep.0, align 4 + %iv.1 = or i64 %iv, 1 + %B.gep.1 = getelementptr inbounds i32, i32* %B, i64 %iv.1 + %B.1 = load i32, i32* %B.gep.1, align 4 + %A.gep.1 = getelementptr inbounds i32, i32* %A, i64 %iv.1 + %A.1 = load i32, i32* %A.gep.1, align 4 + %add.1 = add i32 %A.1, 20 + %xor.1 = xor i32 %add.1, %B.1 + store i32 %xor.1, i32* %A.gep.1, align 4 + %iv.2 = or i64 %iv, 2 + %B.gep.2 = getelementptr inbounds i32, i32* %B, i64 %iv.2 + %B.2 = load i32, i32* %B.gep.2, align 4 + %A.gep.2 = getelementptr inbounds i32, i32* %A, i64 %iv.2 + %A.2 = load i32, i32* %A.gep.2, align 4 + %add.2 = add i32 %A.2, 20 + %xor.2 = xor i32 %add.2, %B.2 + store i32 %xor.2, i32* %A.gep.2, align 4 + %iv.3 = or i64 %iv, 3 + %B.gep.3 = getelementptr inbounds i32, i32* %B, i64 %iv.3 + %B.3 = load i32, i32* %B.gep.3, align 4 + %A.gep.3 = getelementptr inbounds i32, i32* %A, i64 %iv.3 + %A.3 = load i32, i32* %A.gep.3, align 4 + %add.3 = add i32 %A.3, 20 + %xor.3 = xor i32 %add.3, %B.3 + store i32 %xor.3, i32* %A.gep.3, align 4 + %iv.next = add nuw nsw i64 %iv, 16 + %cond = icmp ult i64 %iv.next, %N + br i1 %cond, label %loop, label %exit + +exit: + ret void +} + +define void @loop_iv_update_at_start(float* %src, float* %dst) #0 { +; CHECK-LABEL: @loop_iv_update_at_start( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SRC26:%.*]] = bitcast float* [[SRC:%.*]] to i8* +; CHECK-NEXT: [[DST28:%.*]] = bitcast float* [[DST:%.*]] to i8* +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[SRC]], i64 5 +; CHECK-NEXT: [[SCEVGEP27:%.*]] = bitcast float* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[SCEVGEP29:%.*]] = getelementptr float, float* [[DST]], i64 5 +; CHECK-NEXT: [[SCEVGEP2930:%.*]] = bitcast float* [[SCEVGEP29]] to i8* +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_MERGE:%.*]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[IV]], 2000 +; CHECK-NEXT: [[SRC_GEP_0:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 0 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SRC26]], [[SCEVGEP2930]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[DST28]], [[SCEVGEP27]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[LOOP_SCALAR:%.*]], label [[LOOP_SLPVERSIONED1:%.*]] +; CHECK: loop.scalar: +; CHECK-NEXT: [[SRC_0:%.*]] = load float, float* [[SRC_GEP_0]], align 8 +; CHECK-NEXT: [[ADD_0:%.*]] = fadd float [[SRC_0]], 1.000000e+00 +; CHECK-NEXT: [[MUL_0:%.*]] = fmul float [[ADD_0]], [[SRC_0]] +; CHECK-NEXT: [[DST_GEP_0:%.*]] = getelementptr inbounds float, float* [[DST]], i64 0 +; CHECK-NEXT: store float [[MUL_0]], float* [[DST_GEP_0]], align 8 +; CHECK-NEXT: [[SRC_GEP_1:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 1 +; CHECK-NEXT: [[SRC_1:%.*]] = load float, float* [[SRC_GEP_1]], align 8 +; CHECK-NEXT: [[ADD_1:%.*]] = fadd float [[SRC_1]], 1.000000e+00 +; CHECK-NEXT: [[MUL_1:%.*]] = fmul float [[ADD_1]], [[SRC_1]] +; CHECK-NEXT: [[DST_GEP_1:%.*]] = getelementptr inbounds float, float* [[DST]], i64 1 +; CHECK-NEXT: store float [[MUL_1]], float* [[DST_GEP_1]], align 8 +; CHECK-NEXT: [[SRC_GEP_2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 +; CHECK-NEXT: [[SRC_2:%.*]] = load float, float* [[SRC_GEP_2]], align 8 +; CHECK-NEXT: [[ADD_2:%.*]] = fadd float [[SRC_2]], 
1.000000e+00 +; CHECK-NEXT: [[MUL_2:%.*]] = fmul float [[ADD_2]], [[SRC_2]] +; CHECK-NEXT: [[DST_GEP_2:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 +; CHECK-NEXT: store float [[MUL_2]], float* [[DST_GEP_2]], align 8 +; CHECK-NEXT: [[SRC_GEP_3:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 +; CHECK-NEXT: [[SRC_3:%.*]] = load float, float* [[SRC_GEP_3]], align 8 +; CHECK-NEXT: [[ADD_3:%.*]] = fadd float [[SRC_3]], 1.000000e+00 +; CHECK-NEXT: [[MUL_3:%.*]] = fmul float [[ADD_3]], [[SRC_3]] +; CHECK-NEXT: [[DST_GEP_3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 +; CHECK-NEXT: store float [[MUL_3]], float* [[DST_GEP_3]], align 8 +; CHECK-NEXT: [[SRC_GEP_4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 4 +; CHECK-NEXT: [[SRC_4:%.*]] = load float, float* [[SRC_GEP_4]], align 8 +; CHECK-NEXT: [[ADD_4:%.*]] = fadd float [[SRC_4]], 1.000000e+00 +; CHECK-NEXT: [[MUL_4:%.*]] = fmul float [[ADD_4]], [[SRC_4]] +; CHECK-NEXT: [[DST_GEP_4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 4 +; CHECK-NEXT: store float [[MUL_4]], float* [[DST_GEP_4]], align 8 +; CHECK-NEXT: br label [[LOOP_MERGE]] +; CHECK: loop.merge: +; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop.slpversioned1: +; CHECK-NEXT: [[DST_GEP_05:%.*]] = getelementptr inbounds float, float* [[DST]], i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC_GEP_0]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 8, !alias.scope !5, !noalias !8 +; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[DST_GEP_05]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP3]], <4 x float>* [[TMP4]], align 8, !alias.scope !8, !noalias !5 +; CHECK-NEXT: [[SRC_GEP_421:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 4 +; CHECK-NEXT: [[SRC_422:%.*]] = load float, float* [[SRC_GEP_421]], align 8, !alias.scope !5, !noalias !8 +; CHECK-NEXT: [[ADD_423:%.*]] = fadd float [[SRC_422]], 1.000000e+00 +; CHECK-NEXT: [[MUL_424:%.*]] = fmul float [[ADD_423]], [[SRC_422]] +; CHECK-NEXT: [[DST_GEP_425:%.*]] = getelementptr inbounds float, float* [[DST]], i64 4 +; CHECK-NEXT: store float [[MUL_424]], float* [[DST_GEP_425]], align 8, !alias.scope !8, !noalias !5 +; CHECK-NEXT: br label [[LOOP_MERGE]] +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %iv.next = add i32 %iv, 1 + %cond = icmp ult i32 %iv, 2000 + + %src.gep.0 = getelementptr inbounds float, float* %src, i64 0 + %src.0 = load float, float* %src.gep.0, align 8 + %add.0 = fadd float %src.0, 1.0 + %mul.0 = fmul float %add.0, %src.0 + %dst.gep.0 = getelementptr inbounds float, float* %dst, i64 0 + store float %mul.0, float* %dst.gep.0, align 8 + + %src.gep.1 = getelementptr inbounds float, float* %src, i64 1 + %src.1 = load float, float* %src.gep.1, align 8 + %add.1 = fadd float %src.1, 1.0 + %mul.1 = fmul float %add.1, %src.1 + %dst.gep.1 = getelementptr inbounds float, float* %dst, i64 1 + store float %mul.1, float* %dst.gep.1, align 8 + %src.gep.2 = getelementptr inbounds float, float* %src, i64 2 + %src.2 = load float, float* %src.gep.2, align 8 + %add.2 = fadd float %src.2, 1.0 + %mul.2 = fmul float %add.2, %src.2 + %dst.gep.2 = getelementptr inbounds float, float* %dst, i64 2 + store float %mul.2, float* %dst.gep.2, align 8 + %src.gep.3 = getelementptr inbounds float, float* 
%src, i64 3 + %src.3 = load float, float* %src.gep.3, align 8 + %add.3 = fadd float %src.3, 1.0 + %mul.3 = fmul float %add.3, %src.3 + %dst.gep.3 = getelementptr inbounds float, float* %dst, i64 3 + store float %mul.3, float* %dst.gep.3, align 8 + %src.gep.4 = getelementptr inbounds float, float* %src, i64 4 + %src.4 = load float, float* %src.gep.4, align 8 + %add.4 = fadd float %src.4, 1.0 + %mul.4 = fmul float %add.4, %src.4 + %dst.gep.4 = getelementptr inbounds float, float* %dst, i64 4 + store float %mul.4, float* %dst.gep.4, align 8 + br i1 %cond, label %loop, label %exit + +exit: + ret void +} diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll @@ -1,5 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -scoped-noalias-aa -slp-vectorizer -mtriple=arm64-apple-darwin -enable-new-pm=false -S %s | FileCheck %s +; RUN: opt -scoped-noalias-aa -slp-vectorizer -slp-memory-versioning -mtriple=arm64-apple-darwin -enable-new-pm=false -S %s | FileCheck %s +; RUN: opt -slp-memory-versioning=false -scoped-noalias-aa -slp-vectorizer -mtriple=arm64-apple-darwin -enable-new-pm=false -S %s | FileCheck --check-prefix=NOVERSION %s + +; NOVERSION-NOT: slpversioned define void @needs_versioning_not_profitable(i32* %dst, i32* %src) { ; CHECK-LABEL: @needs_versioning_not_profitable( @@ -29,6 +32,317 @@ define void @needs_versioning_profitable(i32* %dst, i32* %src) { ; CHECK-LABEL: @needs_versioning_profitable( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[SRC16:%.*]] = bitcast i32* [[SRC:%.*]] to i8* +; CHECK-NEXT: [[DST18:%.*]] = bitcast i32* [[DST:%.*]] to i8* +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[SRC]], i64 4 +; CHECK-NEXT: [[SCEVGEP17:%.*]] = bitcast i32* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[SCEVGEP19:%.*]] = getelementptr i32, i32* [[DST]], i64 4 +; CHECK-NEXT: [[SCEVGEP1920:%.*]] = bitcast i32* [[SCEVGEP19]] to i8* +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SRC16]], [[SCEVGEP1920]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[DST18]], [[SCEVGEP17]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[ENTRY_SCALAR:%.*]], label [[ENTRY_SLPVERSIONED1:%.*]] +; CHECK: entry.scalar: +; CHECK-NEXT: [[SRC_0:%.*]] = load i32, i32* [[SRC]], align 4 +; CHECK-NEXT: [[R_0:%.*]] = ashr i32 [[SRC_0]], 16 +; CHECK-NEXT: store i32 [[R_0]], i32* [[DST]], align 4 +; CHECK-NEXT: [[SRC_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 1 +; CHECK-NEXT: [[SRC_1:%.*]] = load i32, i32* [[SRC_GEP_1]], align 4 +; CHECK-NEXT: [[R_1:%.*]] = ashr i32 [[SRC_1]], 16 +; CHECK-NEXT: [[DST_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 1 +; CHECK-NEXT: store i32 [[R_1]], i32* [[DST_GEP_1]], align 4 +; CHECK-NEXT: [[SRC_GEP_2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 +; CHECK-NEXT: [[SRC_2:%.*]] = load i32, i32* [[SRC_GEP_2]], align 4 +; CHECK-NEXT: [[R_2:%.*]] = ashr i32 [[SRC_2]], 16 +; CHECK-NEXT: [[DST_GEP_2:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 +; CHECK-NEXT: store i32 [[R_2]], i32* [[DST_GEP_2]], align 4 +; CHECK-NEXT: [[SRC_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 +; CHECK-NEXT: [[SRC_3:%.*]] = load i32, i32* 
[[SRC_GEP_3]], align 4 +; CHECK-NEXT: [[R_3:%.*]] = ashr i32 [[SRC_3]], 16 +; CHECK-NEXT: [[DST_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 +; CHECK-NEXT: store i32 [[R_3]], i32* [[DST_GEP_3]], align 4 +; CHECK-NEXT: br label [[ENTRY_MERGE:%.*]] +; CHECK: entry.merge: +; CHECK-NEXT: ret void +; CHECK: entry.slpversioned1: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4, !alias.scope !0, !noalias !3 +; CHECK-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: br label [[ENTRY_MERGE]] +; +entry: + %src.0 = load i32, i32* %src, align 4 + %r.0 = ashr i32 %src.0, 16 + store i32 %r.0, i32* %dst, align 4 + %src.gep.1 = getelementptr inbounds i32, i32* %src, i64 1 + %src.1 = load i32, i32* %src.gep.1, align 4 + %r.1 = ashr i32 %src.1, 16 + %dst.gep.1 = getelementptr inbounds i32, i32* %dst, i64 1 + store i32 %r.1, i32* %dst.gep.1, align 4 + %src.gep.2 = getelementptr inbounds i32, i32* %src, i64 2 + %src.2 = load i32, i32* %src.gep.2, align 4 + %r.2 = ashr i32 %src.2, 16 + %dst.gep.2 = getelementptr inbounds i32, i32* %dst, i64 2 + store i32 %r.2, i32* %dst.gep.2, align 4 + %src.gep.3 = getelementptr inbounds i32, i32* %src, i64 3 + %src.3 = load i32, i32* %src.gep.3, align 4 + %r.3 = ashr i32 %src.3, 16 + %dst.gep.3 = getelementptr inbounds i32, i32* %dst, i64 3 + store i32 %r.3, i32* %dst.gep.3, align 4 + + ret void +} + +define void @needs_versioning_profitable_2_sources(i32* %dst, i32* %A, i32* %B) { +; CHECK-LABEL: @needs_versioning_profitable_2_sources( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A27:%.*]] = bitcast i32* [[A:%.*]] to i8* +; CHECK-NEXT: [[DST29:%.*]] = bitcast i32* [[DST:%.*]] to i8* +; CHECK-NEXT: [[B32:%.*]] = bitcast i32* [[B:%.*]] to i8* +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A]], i64 4 +; CHECK-NEXT: [[SCEVGEP28:%.*]] = bitcast i32* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[SCEVGEP30:%.*]] = getelementptr i32, i32* [[DST]], i64 4 +; CHECK-NEXT: [[SCEVGEP3031:%.*]] = bitcast i32* [[SCEVGEP30]] to i8* +; CHECK-NEXT: [[SCEVGEP33:%.*]] = getelementptr i32, i32* [[B]], i64 4 +; CHECK-NEXT: [[SCEVGEP3334:%.*]] = bitcast i32* [[SCEVGEP33]] to i8* +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A27]], [[SCEVGEP3031]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[DST29]], [[SCEVGEP28]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND035:%.*]] = icmp ult i8* [[B32]], [[SCEVGEP3031]] +; CHECK-NEXT: [[BOUND136:%.*]] = icmp ult i8* [[DST29]], [[SCEVGEP3334]] +; CHECK-NEXT: [[FOUND_CONFLICT37:%.*]] = and i1 [[BOUND035]], [[BOUND136]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT37]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[CONFLICT_RDX]], true +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[ENTRY_SCALAR:%.*]], label [[ENTRY_SLPVERSIONED1:%.*]] +; CHECK: entry.scalar: +; CHECK-NEXT: [[A_0:%.*]] = load i32, i32* [[A]], align 4 +; CHECK-NEXT: [[B_0:%.*]] = load i32, i32* [[B]], align 4 +; CHECK-NEXT: [[R_0:%.*]] = add i32 [[A_0]], [[B_0]] +; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[R_0]], 2 +; CHECK-NEXT: store i32 [[MUL_0]], i32* [[DST]], align 4 +; CHECK-NEXT: [[A_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1 +; CHECK-NEXT: [[A_1:%.*]] = load i32, i32* [[A_GEP_1]], align 4 +; CHECK-NEXT: 
[[B_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 1 +; CHECK-NEXT: [[B_1:%.*]] = load i32, i32* [[B_GEP_1]], align 4 +; CHECK-NEXT: [[R_1:%.*]] = add i32 [[A_1]], [[B_1]] +; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[R_1]], 2 +; CHECK-NEXT: [[DST_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 1 +; CHECK-NEXT: store i32 [[MUL_1]], i32* [[DST_GEP_1]], align 4 +; CHECK-NEXT: [[A_GEP_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 +; CHECK-NEXT: [[A_2:%.*]] = load i32, i32* [[A_GEP_2]], align 4 +; CHECK-NEXT: [[B_GEP_2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 +; CHECK-NEXT: [[B_2:%.*]] = load i32, i32* [[B_GEP_2]], align 4 +; CHECK-NEXT: [[R_2:%.*]] = add i32 [[A_2]], [[B_2]] +; CHECK-NEXT: [[MUL_2:%.*]] = mul i32 [[R_2]], 2 +; CHECK-NEXT: [[DST_GEP_2:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 +; CHECK-NEXT: store i32 [[MUL_2]], i32* [[DST_GEP_2]], align 4 +; CHECK-NEXT: [[A_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 +; CHECK-NEXT: [[A_3:%.*]] = load i32, i32* [[A_GEP_3]], align 4 +; CHECK-NEXT: [[B_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 +; CHECK-NEXT: [[B_3:%.*]] = load i32, i32* [[B_GEP_3]], align 4 +; CHECK-NEXT: [[R_3:%.*]] = add i32 [[A_3]], [[B_3]] +; CHECK-NEXT: [[MUL_3:%.*]] = mul i32 [[R_3]], 2 +; CHECK-NEXT: [[DST_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 +; CHECK-NEXT: store i32 [[MUL_3]], i32* [[DST_GEP_3]], align 4 +; CHECK-NEXT: br label [[ENTRY_MERGE:%.*]] +; CHECK: entry.merge: +; CHECK-NEXT: ret void +; CHECK: entry.slpversioned1: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4, !alias.scope !5, !noalias !8 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[B]] to <4 x i32>* +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4, !alias.scope !11, !noalias !12 +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !alias.scope !13, !noalias !14 +; CHECK-NEXT: br label [[ENTRY_MERGE]] +; +entry: + %A.0 = load i32, i32* %A, align 4 + %B.0 = load i32, i32* %B, align 4 + %r.0 = add i32 %A.0, %B.0 + %mul.0 = mul i32 %r.0, 2 + store i32 %mul.0, i32* %dst, align 4 + %A.gep.1 = getelementptr inbounds i32, i32* %A, i64 1 + %A.1 = load i32, i32* %A.gep.1, align 4 + %B.gep.1 = getelementptr inbounds i32, i32* %B, i64 1 + %B.1 = load i32, i32* %B.gep.1, align 4 + %r.1 = add i32 %A.1, %B.1 + %mul.1 = mul i32 %r.1, 2 + %dst.gep.1 = getelementptr inbounds i32, i32* %dst, i64 1 + store i32 %mul.1, i32* %dst.gep.1, align 4 + %A.gep.2 = getelementptr inbounds i32, i32* %A, i64 2 + %A.2 = load i32, i32* %A.gep.2, align 4 + %B.gep.2 = getelementptr inbounds i32, i32* %B, i64 2 + %B.2 = load i32, i32* %B.gep.2, align 4 + %r.2 = add i32 %A.2, %B.2 + %mul.2 = mul i32 %r.2, 2 + %dst.gep.2 = getelementptr inbounds i32, i32* %dst, i64 2 + store i32 %mul.2, i32* %dst.gep.2, align 4 + %A.gep.3 = getelementptr inbounds i32, i32* %A, i64 3 + %A.3 = load i32, i32* %A.gep.3, align 4 + %B.gep.3 = getelementptr inbounds i32, i32* %B, i64 3 + %B.3 = load i32, i32* %B.gep.3, align 4 + %r.3 = add i32 %A.3, %B.3 + %mul.3 = mul i32 %r.3, 2 + %dst.gep.3 = getelementptr inbounds i32, i32* %dst, i64 3 + store i32 %mul.3, i32* %dst.gep.3, align 4 + + ret void +} + +declare void @use(i32) + +declare void @bar() + 
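+; In the test below, the calls to @bar() are not part of the versioned region:
+; the runtime checks are emitted after the leading calls and the trailing call
+; is split into the ".tail" block, so only the accesses between the first and
+; last tracked memory instruction are duplicated.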
+define void @needs_versioning_profitable_split_points(i32* %dst, i32* %src) { +; CHECK-LABEL: @needs_versioning_profitable_split_points( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SRC16:%.*]] = bitcast i32* [[SRC:%.*]] to i8* +; CHECK-NEXT: [[DST18:%.*]] = bitcast i32* [[DST:%.*]] to i8* +; CHECK-NEXT: call void @bar() +; CHECK-NEXT: call void @bar() +; CHECK-NEXT: call void @bar() +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[SRC]], i64 4 +; CHECK-NEXT: [[SCEVGEP17:%.*]] = bitcast i32* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[SCEVGEP19:%.*]] = getelementptr i32, i32* [[DST]], i64 4 +; CHECK-NEXT: [[SCEVGEP1920:%.*]] = bitcast i32* [[SCEVGEP19]] to i8* +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SRC16]], [[SCEVGEP1920]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[DST18]], [[SCEVGEP17]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[ENTRY_SCALAR:%.*]], label [[ENTRY_SLPVERSIONED1:%.*]] +; CHECK: entry.scalar: +; CHECK-NEXT: [[SRC_0:%.*]] = load i32, i32* [[SRC]], align 4 +; CHECK-NEXT: [[R_0:%.*]] = ashr i32 [[SRC_0]], 16 +; CHECK-NEXT: store i32 [[R_0]], i32* [[DST]], align 4 +; CHECK-NEXT: [[SRC_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 1 +; CHECK-NEXT: [[SRC_1:%.*]] = load i32, i32* [[SRC_GEP_1]], align 4 +; CHECK-NEXT: [[R_1:%.*]] = ashr i32 [[SRC_1]], 16 +; CHECK-NEXT: [[DST_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 1 +; CHECK-NEXT: store i32 [[R_1]], i32* [[DST_GEP_1]], align 4 +; CHECK-NEXT: [[SRC_GEP_2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 +; CHECK-NEXT: [[SRC_2:%.*]] = load i32, i32* [[SRC_GEP_2]], align 4 +; CHECK-NEXT: [[R_2:%.*]] = ashr i32 [[SRC_2]], 16 +; CHECK-NEXT: [[DST_GEP_2:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 +; CHECK-NEXT: store i32 [[R_2]], i32* [[DST_GEP_2]], align 4 +; CHECK-NEXT: [[SRC_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 +; CHECK-NEXT: [[SRC_3:%.*]] = load i32, i32* [[SRC_GEP_3]], align 4 +; CHECK-NEXT: [[R_3:%.*]] = ashr i32 [[SRC_3]], 16 +; CHECK-NEXT: [[DST_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 +; CHECK-NEXT: store i32 [[R_3]], i32* [[DST_GEP_3]], align 4 +; CHECK-NEXT: br label [[ENTRY_MERGE:%.*]] +; CHECK: entry.merge: +; CHECK-NEXT: br label [[ENTRY_TAIL:%.*]] +; CHECK: entry.tail: +; CHECK-NEXT: call void @bar() +; CHECK-NEXT: ret void +; CHECK: entry.slpversioned1: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4, !alias.scope !15, !noalias !18 +; CHECK-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4, !alias.scope !18, !noalias !15 +; CHECK-NEXT: br label [[ENTRY_MERGE]] +; +entry: + call void @bar() + call void @bar() + call void @bar() + + %src.0 = load i32, i32* %src, align 4 + %r.0 = ashr i32 %src.0, 16 + store i32 %r.0, i32* %dst, align 4 + %src.gep.1 = getelementptr inbounds i32, i32* %src, i64 1 + %src.1 = load i32, i32* %src.gep.1, align 4 + %r.1 = ashr i32 %src.1, 16 + %dst.gep.1 = getelementptr inbounds i32, i32* %dst, i64 1 + store i32 %r.1, i32* %dst.gep.1, align 4 + %src.gep.2 = getelementptr inbounds i32, i32* %src, i64 2 + %src.2 = load i32, i32* %src.gep.2, align 4 + %r.2 = ashr i32 %src.2, 16 + %dst.gep.2 = getelementptr inbounds i32, i32* 
%dst, i64 2 + store i32 %r.2, i32* %dst.gep.2, align 4 + %src.gep.3 = getelementptr inbounds i32, i32* %src, i64 3 + %src.3 = load i32, i32* %src.gep.3, align 4 + %r.3 = ashr i32 %src.3, 16 + %dst.gep.3 = getelementptr inbounds i32, i32* %dst, i64 3 + store i32 %r.3, i32* %dst.gep.3, align 4 + + call void @bar() + ret void +} + +define void @needs_versioning_profitable_load_used_outside_region1(i32* %dst, i32* %src, i1 %c) { +; CHECK-LABEL: @needs_versioning_profitable_load_used_outside_region1( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]] +; CHECK: then: +; CHECK-NEXT: [[SRC_0:%.*]] = load i32, i32* [[SRC:%.*]], align 4 +; CHECK-NEXT: [[R_0:%.*]] = ashr i32 [[SRC_0]], 16 +; CHECK-NEXT: store i32 [[R_0]], i32* [[DST:%.*]], align 4 +; CHECK-NEXT: [[SRC_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 1 +; CHECK-NEXT: [[SRC_1:%.*]] = load i32, i32* [[SRC_GEP_1]], align 4 +; CHECK-NEXT: [[R_1:%.*]] = ashr i32 [[SRC_1]], 16 +; CHECK-NEXT: [[DST_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 1 +; CHECK-NEXT: store i32 [[R_1]], i32* [[DST_GEP_1]], align 4 +; CHECK-NEXT: [[SRC_GEP_2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 +; CHECK-NEXT: [[SRC_2:%.*]] = load i32, i32* [[SRC_GEP_2]], align 4 +; CHECK-NEXT: [[R_2:%.*]] = ashr i32 [[SRC_2]], 16 +; CHECK-NEXT: [[DST_GEP_2:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 +; CHECK-NEXT: store i32 [[R_2]], i32* [[DST_GEP_2]], align 4 +; CHECK-NEXT: [[SRC_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 +; CHECK-NEXT: [[SRC_3:%.*]] = load i32, i32* [[SRC_GEP_3]], align 4 +; CHECK-NEXT: [[R_3:%.*]] = ashr i32 [[SRC_3]], 16 +; CHECK-NEXT: [[DST_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 +; CHECK-NEXT: store i32 [[R_3]], i32* [[DST_GEP_3]], align 4 +; CHECK-NEXT: [[SRC_GEP_5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 5 +; CHECK-NEXT: [[L:%.*]] = load i32, i32* [[SRC_GEP_5]], align 4 +; CHECK-NEXT: call void @use(i32 [[L]]) +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br i1 %c, label %then, label %exit + +then: + %src.0 = load i32, i32* %src, align 4 + %r.0 = ashr i32 %src.0, 16 + store i32 %r.0, i32* %dst, align 4 + %src.gep.1 = getelementptr inbounds i32, i32* %src, i64 1 + %src.1 = load i32, i32* %src.gep.1, align 4 + %r.1 = ashr i32 %src.1, 16 + %dst.gep.1 = getelementptr inbounds i32, i32* %dst, i64 1 + store i32 %r.1, i32* %dst.gep.1, align 4 + %src.gep.2 = getelementptr inbounds i32, i32* %src, i64 2 + %src.2 = load i32, i32* %src.gep.2, align 4 + %r.2 = ashr i32 %src.2, 16 + %dst.gep.2 = getelementptr inbounds i32, i32* %dst, i64 2 + store i32 %r.2, i32* %dst.gep.2, align 4 + %src.gep.3 = getelementptr inbounds i32, i32* %src, i64 3 + %src.3 = load i32, i32* %src.gep.3, align 4 + %r.3 = ashr i32 %src.3, 16 + %dst.gep.3 = getelementptr inbounds i32, i32* %dst, i64 3 + store i32 %r.3, i32* %dst.gep.3, align 4 + %src.gep.5 = getelementptr inbounds i32, i32* %src, i64 5 + %l = load i32, i32* %src.gep.5 + call void @use(i32 %l) + br label %exit + +exit: + ret void +} + +define void @needs_versioning_profitable_load_used_outside_region2(i32* %dst, i32* %src, i1 %c) { +; CHECK-LABEL: @needs_versioning_profitable_load_used_outside_region2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]] +; CHECK: then: ; CHECK-NEXT: [[SRC_0:%.*]] = load i32, i32* [[SRC:%.*]], align 4 ; CHECK-NEXT: [[R_0:%.*]] = ashr i32 [[SRC_0]], 16 ; CHECK-NEXT: store 
i32 [[R_0]], i32* [[DST:%.*]], align 4 @@ -39,6 +353,8 @@ ; CHECK-NEXT: store i32 [[R_1]], i32* [[DST_GEP_1]], align 4 ; CHECK-NEXT: [[SRC_GEP_2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 ; CHECK-NEXT: [[SRC_2:%.*]] = load i32, i32* [[SRC_GEP_2]], align 4 +; CHECK-NEXT: [[SRC_GEP_5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 5 +; CHECK-NEXT: [[L:%.*]] = load i32, i32* [[SRC_GEP_5]], align 4 ; CHECK-NEXT: [[R_2:%.*]] = ashr i32 [[SRC_2]], 16 ; CHECK-NEXT: [[DST_GEP_2:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 ; CHECK-NEXT: store i32 [[R_2]], i32* [[DST_GEP_2]], align 4 @@ -47,9 +363,15 @@ ; CHECK-NEXT: [[R_3:%.*]] = ashr i32 [[SRC_3]], 16 ; CHECK-NEXT: [[DST_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 ; CHECK-NEXT: store i32 [[R_3]], i32* [[DST_GEP_3]], align 4 +; CHECK-NEXT: call void @use(i32 [[L]]) +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: ; CHECK-NEXT: ret void ; entry: + br i1 %c, label %then, label %exit + +then: %src.0 = load i32, i32* %src, align 4 %r.0 = ashr i32 %src.0, 16 store i32 %r.0, i32* %dst, align 4 @@ -60,6 +382,8 @@ store i32 %r.1, i32* %dst.gep.1, align 4 %src.gep.2 = getelementptr inbounds i32, i32* %src, i64 2 %src.2 = load i32, i32* %src.gep.2, align 4 + %src.gep.5 = getelementptr inbounds i32, i32* %src, i64 5 + %l = load i32, i32* %src.gep.5 %r.2 = ashr i32 %src.2, 16 %dst.gep.2 = getelementptr inbounds i32, i32* %dst, i64 2 store i32 %r.2, i32* %dst.gep.2, align 4 @@ -68,10 +392,14 @@ %r.3 = ashr i32 %src.3, 16 %dst.gep.3 = getelementptr inbounds i32, i32* %dst, i64 3 store i32 %r.3, i32* %dst.gep.3, align 4 + call void @use(i32 %l) + br label %exit +exit: ret void } define void @no_version(i32* nocapture %dst, i32* nocapture readonly %src) { ; CHECK-LABEL: @no_version( ; CHECK-NEXT: entry: @@ -100,8 +428,20 @@ define void @version_multiple(i32* nocapture %out_block, i32* nocapture readonly %counter) { ; CHECK-LABEL: @version_multiple( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[COUNTER:%.*]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OUT_BLOCK:%.*]], align 4 +; CHECK-NEXT: [[COUNTER12:%.*]] = bitcast i32* [[COUNTER:%.*]] to i8* +; CHECK-NEXT: [[OUT_BLOCK14:%.*]] = bitcast i32* [[OUT_BLOCK:%.*]] to i8* +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[COUNTER]], i64 4 +; CHECK-NEXT: [[SCEVGEP13:%.*]] = bitcast i32* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[SCEVGEP15:%.*]] = getelementptr i32, i32* [[OUT_BLOCK]], i64 4 +; CHECK-NEXT: [[SCEVGEP1516:%.*]] = bitcast i32* [[SCEVGEP15]] to i8* +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[COUNTER12]], [[SCEVGEP1516]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[OUT_BLOCK14]], [[SCEVGEP13]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[ENTRY_SCALAR:%.*]], label [[ENTRY_SLPVERSIONED1:%.*]] +; CHECK: entry.scalar: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[COUNTER]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OUT_BLOCK]], align 4 ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP1]], [[TMP0]] ; CHECK-NEXT: store i32 [[XOR]], i32* [[OUT_BLOCK]], align 4 ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 1 @@ -122,7 +462,18 @@ ; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX2_3]], align 4 ; CHECK-NEXT: [[XOR_3:%.*]] = xor i32 [[TMP7]], [[TMP6]] ; CHECK-NEXT: store
i32 [[XOR_3]], i32* [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: br label [[ENTRY_MERGE:%.*]] +; CHECK: entry.merge: ; CHECK-NEXT: ret void +; CHECK: entry.slpversioned1: +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[COUNTER]] to <4 x i32>* +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4, !alias.scope !20, !noalias !23 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[OUT_BLOCK]] to <4 x i32>* +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !23, !noalias !20 +; CHECK-NEXT: [[TMP12:%.*]] = xor <4 x i32> [[TMP11]], [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[OUT_BLOCK]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4, !alias.scope !23, !noalias !20 +; CHECK-NEXT: br label [[ENTRY_MERGE]] ; entry: %0 = load i32, i32* %counter, align 4 @@ -446,9 +797,767 @@ br label %bb15 bb15: %tmp16 = fmul double %tmp, 20.0 store double %tmp16, double* %tmp9, align 8 %tmp17 = fmul double %tmp13, 30.0 store double %tmp17, double* %tmp14, align 8 + ret void +} + +%struct.2 = type { [4 x float] } + +; Make sure we do not crash when we encounter a SCEVCouldNotCompute. +define void @no_lcssa_phi(%struct.2* %A, float* %B, i1 %c) { +; CHECK-LABEL: @no_lcssa_phi( +; CHECK-NEXT: bb: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[PTR_PHI:%.*]] = phi %struct.2* [ [[A:%.*]], [[BB:%.*]] ], [ null, [[LOOP]] ] +; CHECK-NEXT: br i1 [[C:%.*]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: [[PTR_PHI_LCSSA:%.*]] = phi %struct.2* [ [[PTR_PHI]], [[LOOP]] ] +; CHECK-NEXT: [[PTR_PHI_LCSSA23:%.*]] = bitcast %struct.2* [[PTR_PHI_LCSSA]] to i8* +; CHECK-NEXT: [[B_GEP_0:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 0 +; CHECK-NEXT: [[B_GEP_021:%.*]] = bitcast float* [[B_GEP_0]] to i8* +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[B_GEP_0]], i64 4 +; CHECK-NEXT: [[SCEVGEP22:%.*]] = bitcast float* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[SCEVGEP24:%.*]] = getelementptr [[STRUCT_2:%.*]], %struct.2* [[PTR_PHI_LCSSA]], i64 1 +; CHECK-NEXT: [[SCEVGEP2425:%.*]] = bitcast %struct.2* [[SCEVGEP24]] to i8* +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[B_GEP_021]], [[SCEVGEP2425]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[PTR_PHI_LCSSA23]], [[SCEVGEP22]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[EXIT_SCALAR:%.*]], label [[EXIT_SLPVERSIONED1:%.*]] +; CHECK: exit.scalar: +; CHECK-NEXT: [[L_0:%.*]] = load float, float* [[B_GEP_0]], align 8 +; CHECK-NEXT: [[ADD_0:%.*]] = fadd float [[L_0]], 1.000000e+01 +; CHECK-NEXT: [[MUL_0:%.*]] = fmul float [[ADD_0]], 3.000000e+01 +; CHECK-NEXT: [[A_GEP_0:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI_LCSSA]], i64 0, i32 0, i32 0 +; CHECK-NEXT: store float [[MUL_0]], float* [[A_GEP_0]], align 8 +; CHECK-NEXT: [[B_GEP_1:%.*]] = getelementptr inbounds float, float* [[B]], i64 1 +; CHECK-NEXT: [[L_1:%.*]] = load float, float* [[B_GEP_1]], align 8 +; CHECK-NEXT: [[ADD_1:%.*]] = fadd float [[L_1]], 1.000000e+01 +; CHECK-NEXT: [[MUL_1:%.*]] = fmul float [[ADD_1]], 3.000000e+01 +; CHECK-NEXT: [[A_GEP_1:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI_LCSSA]], i64 0, i32 0, i32 1 +;
CHECK-NEXT: store float [[MUL_1]], float* [[A_GEP_1]], align 8 +; CHECK-NEXT: [[B_GEP_2:%.*]] = getelementptr inbounds float, float* [[B]], i64 2 +; CHECK-NEXT: [[L_2:%.*]] = load float, float* [[B_GEP_2]], align 8 +; CHECK-NEXT: [[ADD_2:%.*]] = fadd float [[L_2]], 1.000000e+01 +; CHECK-NEXT: [[MUL_2:%.*]] = fmul float [[ADD_2]], 3.000000e+01 +; CHECK-NEXT: [[A_GEP_2:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI_LCSSA]], i64 0, i32 0, i32 2 +; CHECK-NEXT: store float [[MUL_2]], float* [[A_GEP_2]], align 8 +; CHECK-NEXT: [[B_GEP_3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3 +; CHECK-NEXT: [[L_3:%.*]] = load float, float* [[B_GEP_3]], align 8 +; CHECK-NEXT: [[ADD_3:%.*]] = fadd float [[L_3]], 1.000000e+01 +; CHECK-NEXT: [[MUL_3:%.*]] = fmul float [[ADD_3]], 3.000000e+01 +; CHECK-NEXT: [[A_GEP_3:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI_LCSSA]], i64 0, i32 0, i32 3 +; CHECK-NEXT: store float [[MUL_3]], float* [[A_GEP_3]], align 8 +; CHECK-NEXT: br label [[EXIT_MERGE:%.*]] +; CHECK: exit.merge: +; CHECK-NEXT: ret void +; CHECK: exit.slpversioned1: +; CHECK-NEXT: [[A_GEP_05:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI_LCSSA]], i64 0, i32 0, i32 0 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[B_GEP_0]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 8, !alias.scope !25, !noalias !28 +; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[A_GEP_05]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP3]], <4 x float>* [[TMP4]], align 8, !alias.scope !28, !noalias !25 +; CHECK-NEXT: br label [[EXIT_MERGE]] +; +bb: + br label %loop + +loop: + %ptr.phi = phi %struct.2* [ %A, %bb ], [ null, %loop ] + br i1 %c, label %exit, label %loop + +exit: + %B.gep.0 = getelementptr inbounds float, float* %B, i64 0 + %l.0 = load float, float* %B.gep.0, align 8 + %add.0 = fadd float %l.0, 10.0 + %mul.0 = fmul float %add.0, 30.0 + %A.gep.0 = getelementptr inbounds %struct.2, %struct.2* %ptr.phi, i64 0, i32 0, i32 0 + store float %mul.0, float* %A.gep.0, align 8 + %B.gep.1 = getelementptr inbounds float, float* %B, i64 1 + %l.1 = load float, float* %B.gep.1, align 8 + %add.1 = fadd float %l.1, 10.0 + %mul.1 = fmul float %add.1, 30.0 + %A.gep.1 = getelementptr inbounds %struct.2, %struct.2* %ptr.phi, i64 0, i32 0, i32 1 + store float %mul.1, float* %A.gep.1, align 8 + %B.gep.2 = getelementptr inbounds float, float* %B, i64 2 + %l.2 = load float, float* %B.gep.2, align 8 + %add.2 = fadd float %l.2, 10.0 + %mul.2 = fmul float %add.2, 30.0 + %A.gep.2 = getelementptr inbounds %struct.2, %struct.2* %ptr.phi, i64 0, i32 0, i32 2 + store float %mul.2, float* %A.gep.2, align 8 + %B.gep.3 = getelementptr inbounds float, float* %B, i64 3 + %l.3 = load float, float* %B.gep.3, align 8 + %add.3 = fadd float %l.3, 10.0 + %mul.3 = fmul float %add.3, 30.0 + %A.gep.3 = getelementptr inbounds %struct.2, %struct.2* %ptr.phi, i64 0, i32 0, i32 3 + store float %mul.3, float* %A.gep.3, align 8 + ret void +} + +; Make sure lcssa phis as pointer bases are handled properly. 
+define void @lcssa_phi(%struct.2* %A, float* %B, i1 %c) { +; CHECK-LABEL: @lcssa_phi( +; CHECK-NEXT: bb: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[PTR_PHI:%.*]] = phi %struct.2* [ [[A:%.*]], [[BB:%.*]] ], [ null, [[LOOP]] ] +; CHECK-NEXT: br i1 [[C:%.*]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: [[PTR_PHI_LCSSA:%.*]] = phi %struct.2* [ [[PTR_PHI]], [[LOOP]] ] +; CHECK-NEXT: [[PTR_PHI_LCSSA23:%.*]] = bitcast %struct.2* [[PTR_PHI_LCSSA]] to i8* +; CHECK-NEXT: [[B_GEP_0:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 0 +; CHECK-NEXT: [[B_GEP_021:%.*]] = bitcast float* [[B_GEP_0]] to i8* +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[B_GEP_0]], i64 4 +; CHECK-NEXT: [[SCEVGEP22:%.*]] = bitcast float* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[SCEVGEP24:%.*]] = getelementptr [[STRUCT_2:%.*]], %struct.2* [[PTR_PHI_LCSSA]], i64 1 +; CHECK-NEXT: [[SCEVGEP2425:%.*]] = bitcast %struct.2* [[SCEVGEP24]] to i8* +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[B_GEP_021]], [[SCEVGEP2425]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[PTR_PHI_LCSSA23]], [[SCEVGEP22]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[EXIT_SCALAR:%.*]], label [[EXIT_SLPVERSIONED1:%.*]] +; CHECK: exit.scalar: +; CHECK-NEXT: [[L_0:%.*]] = load float, float* [[B_GEP_0]], align 8 +; CHECK-NEXT: [[ADD_0:%.*]] = fadd float [[L_0]], 1.000000e+01 +; CHECK-NEXT: [[MUL_0:%.*]] = fmul float [[ADD_0]], 3.000000e+01 +; CHECK-NEXT: [[A_GEP_0:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI_LCSSA]], i64 0, i32 0, i32 0 +; CHECK-NEXT: store float [[MUL_0]], float* [[A_GEP_0]], align 8 +; CHECK-NEXT: [[B_GEP_1:%.*]] = getelementptr inbounds float, float* [[B]], i64 1 +; CHECK-NEXT: [[L_1:%.*]] = load float, float* [[B_GEP_1]], align 8 +; CHECK-NEXT: [[ADD_1:%.*]] = fadd float [[L_1]], 1.000000e+01 +; CHECK-NEXT: [[MUL_1:%.*]] = fmul float [[ADD_1]], 3.000000e+01 +; CHECK-NEXT: [[A_GEP_1:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI_LCSSA]], i64 0, i32 0, i32 1 +; CHECK-NEXT: store float [[MUL_1]], float* [[A_GEP_1]], align 8 +; CHECK-NEXT: [[B_GEP_2:%.*]] = getelementptr inbounds float, float* [[B]], i64 2 +; CHECK-NEXT: [[L_2:%.*]] = load float, float* [[B_GEP_2]], align 8 +; CHECK-NEXT: [[ADD_2:%.*]] = fadd float [[L_2]], 1.000000e+01 +; CHECK-NEXT: [[MUL_2:%.*]] = fmul float [[ADD_2]], 3.000000e+01 +; CHECK-NEXT: [[A_GEP_2:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI_LCSSA]], i64 0, i32 0, i32 2 +; CHECK-NEXT: store float [[MUL_2]], float* [[A_GEP_2]], align 8 +; CHECK-NEXT: [[B_GEP_3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3 +; CHECK-NEXT: [[L_3:%.*]] = load float, float* [[B_GEP_3]], align 8 +; CHECK-NEXT: [[ADD_3:%.*]] = fadd float [[L_3]], 1.000000e+01 +; CHECK-NEXT: [[MUL_3:%.*]] = fmul float [[ADD_3]], 3.000000e+01 +; CHECK-NEXT: [[A_GEP_3:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI_LCSSA]], i64 0, i32 0, i32 3 +; CHECK-NEXT: store float [[MUL_3]], float* [[A_GEP_3]], align 8 +; CHECK-NEXT: br label [[EXIT_MERGE:%.*]] +; CHECK: exit.merge: +; CHECK-NEXT: ret void +; CHECK: exit.slpversioned1: +; CHECK-NEXT: [[A_GEP_05:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI_LCSSA]], i64 0, i32 0, i32 0 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[B_GEP_0]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 
x float>, <4 x float>* [[TMP0]], align 8, !alias.scope !30, !noalias !33 +; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[A_GEP_05]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP3]], <4 x float>* [[TMP4]], align 8, !alias.scope !33, !noalias !30 +; CHECK-NEXT: br label [[EXIT_MERGE]] +; +bb: + br label %loop + +loop: + %ptr.phi = phi %struct.2* [ %A, %bb ], [ null, %loop ] + br i1 %c, label %exit, label %loop + +exit: + %ptr.phi.lcssa = phi %struct.2* [ %ptr.phi, %loop ] + %B.gep.0 = getelementptr inbounds float, float* %B, i64 0 + %l.0 = load float, float* %B.gep.0, align 8 + %add.0 = fadd float %l.0, 10.0 + %mul.0 = fmul float %add.0, 30.0 + %A.gep.0 = getelementptr inbounds %struct.2, %struct.2* %ptr.phi.lcssa, i64 0, i32 0, i32 0 + store float %mul.0, float* %A.gep.0, align 8 + %B.gep.1 = getelementptr inbounds float, float* %B, i64 1 + %l.1 = load float, float* %B.gep.1, align 8 + %add.1 = fadd float %l.1, 10.0 + %mul.1 = fmul float %add.1, 30.0 + %A.gep.1 = getelementptr inbounds %struct.2, %struct.2* %ptr.phi.lcssa, i64 0, i32 0, i32 1 + store float %mul.1, float* %A.gep.1, align 8 + %B.gep.2 = getelementptr inbounds float, float* %B, i64 2 + %l.2 = load float, float* %B.gep.2, align 8 + %add.2 = fadd float %l.2, 10.0 + %mul.2 = fmul float %add.2, 30.0 + %A.gep.2 = getelementptr inbounds %struct.2, %struct.2* %ptr.phi.lcssa, i64 0, i32 0, i32 2 + store float %mul.2, float* %A.gep.2, align 8 + %B.gep.3 = getelementptr inbounds float, float* %B, i64 3 + %l.3 = load float, float* %B.gep.3, align 8 + %add.3 = fadd float %l.3, 10.0 + %mul.3 = fmul float %add.3, 30.0 + %A.gep.3 = getelementptr inbounds %struct.2, %struct.2* %ptr.phi.lcssa, i64 0, i32 0, i32 3 + store float %mul.3, float* %A.gep.3, align 8 + ret void +} + +%struct.spam = type { [60 x i32], i32, [12 x i8] } + +declare void @foo(i8*) + +; Test case with a basic block where parts can be vectorized without versioning. 
+define i32 @block_partly_vectorized_without_versioning(%struct.spam* readonly %arg, i8* nocapture readonly %arg1, i8* nocapture %arg2, i8* nocapture readonly %arg3, i8* %A, i8* %B) { +; CHECK-LABEL: @block_partly_vectorized_without_versioning( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[T:%.*]] = alloca <16 x i8>, align 16 +; CHECK-NEXT: [[T4:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[T]], i64 0, i64 0 +; CHECK-NEXT: [[T5:%.*]] = getelementptr inbounds i8, i8* [[ARG3:%.*]], i64 1 +; CHECK-NEXT: [[T6:%.*]] = getelementptr inbounds i8, i8* [[ARG3]], i64 2 +; CHECK-NEXT: [[T7:%.*]] = getelementptr inbounds i8, i8* [[ARG3]], i64 3 +; CHECK-NEXT: [[T8:%.*]] = getelementptr inbounds i8, i8* [[ARG3]], i64 4 +; CHECK-NEXT: [[T9:%.*]] = getelementptr inbounds i8, i8* [[ARG3]], i64 5 +; CHECK-NEXT: [[T10:%.*]] = getelementptr inbounds i8, i8* [[ARG3]], i64 6 +; CHECK-NEXT: [[T11:%.*]] = getelementptr inbounds i8, i8* [[ARG3]], i64 7 +; CHECK-NEXT: [[T12:%.*]] = getelementptr inbounds i8, i8* [[ARG3]], i64 8 +; CHECK-NEXT: [[T13:%.*]] = getelementptr inbounds i8, i8* [[ARG3]], i64 9 +; CHECK-NEXT: [[T14:%.*]] = getelementptr inbounds i8, i8* [[ARG3]], i64 10 +; CHECK-NEXT: [[T15:%.*]] = getelementptr inbounds i8, i8* [[ARG3]], i64 11 +; CHECK-NEXT: [[T16:%.*]] = getelementptr inbounds i8, i8* [[ARG3]], i64 12 +; CHECK-NEXT: [[T17:%.*]] = getelementptr inbounds i8, i8* [[ARG3]], i64 13 +; CHECK-NEXT: [[T18:%.*]] = getelementptr inbounds i8, i8* [[ARG3]], i64 14 +; CHECK-NEXT: [[T19:%.*]] = bitcast i8* [[ARG1:%.*]] to <16 x i8>* +; CHECK-NEXT: [[A_GEP_0:%.*]] = getelementptr i8, i8* [[A:%.*]], i64 0 +; CHECK-NEXT: [[B_GEP_0:%.*]] = getelementptr i8, i8* [[B:%.*]], i64 0 +; CHECK-NEXT: [[A_GEP_1:%.*]] = getelementptr i8, i8* [[A]], i64 1 +; CHECK-NEXT: [[B_GEP_1:%.*]] = getelementptr i8, i8* [[B]], i64 1 +; CHECK-NEXT: [[A_GEP_2:%.*]] = getelementptr i8, i8* [[A]], i64 2 +; CHECK-NEXT: [[B_GEP_2:%.*]] = getelementptr i8, i8* [[B]], i64 2 +; CHECK-NEXT: [[A_GEP_3:%.*]] = getelementptr i8, i8* [[A]], i64 3 +; CHECK-NEXT: [[B_GEP_3:%.*]] = getelementptr i8, i8* [[B]], i64 3 +; CHECK-NEXT: [[A_GEP_4:%.*]] = getelementptr i8, i8* [[A]], i64 4 +; CHECK-NEXT: [[B_GEP_4:%.*]] = getelementptr i8, i8* [[B]], i64 4 +; CHECK-NEXT: [[A_GEP_5:%.*]] = getelementptr i8, i8* [[A]], i64 5 +; CHECK-NEXT: [[B_GEP_5:%.*]] = getelementptr i8, i8* [[B]], i64 5 +; CHECK-NEXT: [[A_GEP_6:%.*]] = getelementptr i8, i8* [[A]], i64 6 +; CHECK-NEXT: [[B_GEP_6:%.*]] = getelementptr i8, i8* [[B]], i64 6 +; CHECK-NEXT: [[A_GEP_7:%.*]] = getelementptr i8, i8* [[A]], i64 7 +; CHECK-NEXT: [[B_GEP_7:%.*]] = getelementptr i8, i8* [[B]], i64 7 +; CHECK-NEXT: [[A_GEP_8:%.*]] = getelementptr i8, i8* [[A]], i64 8 +; CHECK-NEXT: [[B_GEP_8:%.*]] = getelementptr i8, i8* [[B]], i64 8 +; CHECK-NEXT: [[A_GEP_9:%.*]] = getelementptr i8, i8* [[A]], i64 9 +; CHECK-NEXT: [[B_GEP_9:%.*]] = getelementptr i8, i8* [[B]], i64 9 +; CHECK-NEXT: [[A_GEP_10:%.*]] = getelementptr i8, i8* [[A]], i64 10 +; CHECK-NEXT: [[B_GEP_10:%.*]] = getelementptr i8, i8* [[B]], i64 10 +; CHECK-NEXT: [[A_GEP_11:%.*]] = getelementptr i8, i8* [[A]], i64 11 +; CHECK-NEXT: [[B_GEP_11:%.*]] = getelementptr i8, i8* [[B]], i64 11 +; CHECK-NEXT: [[A_GEP_12:%.*]] = getelementptr i8, i8* [[A]], i64 12 +; CHECK-NEXT: [[B_GEP_12:%.*]] = getelementptr i8, i8* [[B]], i64 12 +; CHECK-NEXT: [[A_GEP_13:%.*]] = getelementptr i8, i8* [[A]], i64 13 +; CHECK-NEXT: [[B_GEP_13:%.*]] = getelementptr i8, i8* [[B]], i64 13 +; CHECK-NEXT: [[A_GEP_14:%.*]] = getelementptr i8, i8* [[A]], i64 14 +; 
CHECK-NEXT: [[B_GEP_14:%.*]] = getelementptr i8, i8* [[B]], i64 14 +; CHECK-NEXT: [[A_GEP_15:%.*]] = getelementptr i8, i8* [[A]], i64 15 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[A_GEP_0]] to <16 x i8>* +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1 +; CHECK-NEXT: [[B_GEP_15:%.*]] = getelementptr i8, i8* [[B]], i64 15 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[B_GEP_0]] to <16 x i8>* +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i8> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[R_GEP_0:%.*]] = getelementptr i8, i8* [[ARG1]], i64 0 +; CHECK-NEXT: [[R_GEP_1:%.*]] = getelementptr i8, i8* [[ARG1]], i64 1 +; CHECK-NEXT: [[R_GEP_2:%.*]] = getelementptr i8, i8* [[ARG1]], i64 2 +; CHECK-NEXT: [[R_GEP_3:%.*]] = getelementptr i8, i8* [[ARG1]], i64 3 +; CHECK-NEXT: [[R_GEP_4:%.*]] = getelementptr i8, i8* [[ARG1]], i64 4 +; CHECK-NEXT: [[R_GEP_5:%.*]] = getelementptr i8, i8* [[ARG1]], i64 5 +; CHECK-NEXT: [[R_GEP_6:%.*]] = getelementptr i8, i8* [[ARG1]], i64 6 +; CHECK-NEXT: [[R_GEP_7:%.*]] = getelementptr i8, i8* [[ARG1]], i64 7 +; CHECK-NEXT: [[R_GEP_8:%.*]] = getelementptr i8, i8* [[ARG1]], i64 8 +; CHECK-NEXT: [[R_GEP_9:%.*]] = getelementptr i8, i8* [[ARG1]], i64 9 +; CHECK-NEXT: [[R_GEP_10:%.*]] = getelementptr i8, i8* [[ARG1]], i64 10 +; CHECK-NEXT: [[R_GEP_11:%.*]] = getelementptr i8, i8* [[ARG1]], i64 11 +; CHECK-NEXT: [[R_GEP_12:%.*]] = getelementptr i8, i8* [[ARG1]], i64 12 +; CHECK-NEXT: [[R_GEP_13:%.*]] = getelementptr i8, i8* [[ARG1]], i64 13 +; CHECK-NEXT: [[R_GEP_14:%.*]] = getelementptr i8, i8* [[ARG1]], i64 14 +; CHECK-NEXT: [[R_GEP_15:%.*]] = getelementptr i8, i8* [[ARG1]], i64 15 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[R_GEP_0]] to <16 x i8>* +; CHECK-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[TMP5]], align 1 +; CHECK-NEXT: [[T21:%.*]] = getelementptr inbounds i8, i8* [[ARG3]], i64 15 +; CHECK-NEXT: [[T22:%.*]] = bitcast i8* [[ARG3]] to <16 x i8>* +; CHECK-NEXT: call void @foo(i8* nonnull [[T4]]) +; CHECK-NEXT: [[T26:%.*]] = load i8, i8* [[ARG3]], align 1 +; CHECK-NEXT: [[T27:%.*]] = load i8, i8* [[ARG2:%.*]], align 1 +; CHECK-NEXT: [[T28:%.*]] = xor i8 [[T27]], [[T26]] +; CHECK-NEXT: store i8 [[T28]], i8* [[ARG2]], align 1 +; CHECK-NEXT: [[T29:%.*]] = load i8, i8* [[T5]], align 1 +; CHECK-NEXT: [[T30:%.*]] = getelementptr inbounds i8, i8* [[ARG2]], i64 1 +; CHECK-NEXT: [[T31:%.*]] = load i8, i8* [[T30]], align 1 +; CHECK-NEXT: [[T32:%.*]] = xor i8 [[T31]], [[T29]] +; CHECK-NEXT: store i8 [[T32]], i8* [[T30]], align 1 +; CHECK-NEXT: [[T33:%.*]] = load i8, i8* [[T6]], align 1 +; CHECK-NEXT: [[T34:%.*]] = getelementptr inbounds i8, i8* [[ARG2]], i64 2 +; CHECK-NEXT: [[T35:%.*]] = load i8, i8* [[T34]], align 1 +; CHECK-NEXT: [[T36:%.*]] = xor i8 [[T35]], [[T33]] +; CHECK-NEXT: store i8 [[T36]], i8* [[T34]], align 1 +; CHECK-NEXT: [[T37:%.*]] = load i8, i8* [[T7]], align 1 +; CHECK-NEXT: [[T38:%.*]] = getelementptr inbounds i8, i8* [[ARG2]], i64 3 +; CHECK-NEXT: [[T39:%.*]] = load i8, i8* [[T38]], align 1 +; CHECK-NEXT: [[T40:%.*]] = xor i8 [[T39]], [[T37]] +; CHECK-NEXT: store i8 [[T40]], i8* [[T38]], align 1 +; CHECK-NEXT: [[T41:%.*]] = load i8, i8* [[T8]], align 1 +; CHECK-NEXT: [[T42:%.*]] = getelementptr inbounds i8, i8* [[ARG2]], i64 4 +; CHECK-NEXT: [[T43:%.*]] = load i8, i8* [[T42]], align 1 +; CHECK-NEXT: [[T44:%.*]] = xor i8 [[T43]], [[T41]] +; CHECK-NEXT: store i8 [[T44]], i8* [[T42]], align 1 +; CHECK-NEXT: [[T45:%.*]] = load i8, i8* [[T9]], align 1 +; CHECK-NEXT: [[T46:%.*]] = 
getelementptr inbounds i8, i8* [[ARG2]], i64 5 +; CHECK-NEXT: [[T47:%.*]] = load i8, i8* [[T46]], align 1 +; CHECK-NEXT: [[T48:%.*]] = xor i8 [[T47]], [[T45]] +; CHECK-NEXT: store i8 [[T48]], i8* [[T46]], align 1 +; CHECK-NEXT: [[T49:%.*]] = load i8, i8* [[T10]], align 1 +; CHECK-NEXT: [[T50:%.*]] = getelementptr inbounds i8, i8* [[ARG2]], i64 6 +; CHECK-NEXT: [[T51:%.*]] = load i8, i8* [[T50]], align 1 +; CHECK-NEXT: [[T52:%.*]] = xor i8 [[T51]], [[T49]] +; CHECK-NEXT: store i8 [[T52]], i8* [[T50]], align 1 +; CHECK-NEXT: [[T53:%.*]] = load i8, i8* [[T11]], align 1 +; CHECK-NEXT: [[T54:%.*]] = getelementptr inbounds i8, i8* [[ARG2]], i64 7 +; CHECK-NEXT: [[T55:%.*]] = load i8, i8* [[T54]], align 1 +; CHECK-NEXT: [[T56:%.*]] = xor i8 [[T55]], [[T53]] +; CHECK-NEXT: store i8 [[T56]], i8* [[T54]], align 1 +; CHECK-NEXT: [[T57:%.*]] = load i8, i8* [[T12]], align 1 +; CHECK-NEXT: [[T58:%.*]] = getelementptr inbounds i8, i8* [[ARG2]], i64 8 +; CHECK-NEXT: [[T59:%.*]] = load i8, i8* [[T58]], align 1 +; CHECK-NEXT: [[T60:%.*]] = xor i8 [[T59]], [[T57]] +; CHECK-NEXT: store i8 [[T60]], i8* [[T58]], align 1 +; CHECK-NEXT: [[T61:%.*]] = load i8, i8* [[T13]], align 1 +; CHECK-NEXT: [[T62:%.*]] = getelementptr inbounds i8, i8* [[ARG2]], i64 9 +; CHECK-NEXT: [[T63:%.*]] = load i8, i8* [[T62]], align 1 +; CHECK-NEXT: [[T64:%.*]] = xor i8 [[T63]], [[T61]] +; CHECK-NEXT: store i8 [[T64]], i8* [[T62]], align 1 +; CHECK-NEXT: [[T65:%.*]] = load i8, i8* [[T14]], align 1 +; CHECK-NEXT: [[T66:%.*]] = getelementptr inbounds i8, i8* [[ARG2]], i64 10 +; CHECK-NEXT: [[T67:%.*]] = load i8, i8* [[T66]], align 1 +; CHECK-NEXT: [[T68:%.*]] = xor i8 [[T67]], [[T65]] +; CHECK-NEXT: store i8 [[T68]], i8* [[T66]], align 1 +; CHECK-NEXT: [[T69:%.*]] = load i8, i8* [[T15]], align 1 +; CHECK-NEXT: [[T70:%.*]] = getelementptr inbounds i8, i8* [[ARG2]], i64 11 +; CHECK-NEXT: [[T71:%.*]] = load i8, i8* [[T70]], align 1 +; CHECK-NEXT: [[T72:%.*]] = xor i8 [[T71]], [[T69]] +; CHECK-NEXT: store i8 [[T72]], i8* [[T70]], align 1 +; CHECK-NEXT: [[T73:%.*]] = load i8, i8* [[T16]], align 1 +; CHECK-NEXT: [[T74:%.*]] = getelementptr inbounds i8, i8* [[ARG2]], i64 12 +; CHECK-NEXT: [[T75:%.*]] = load i8, i8* [[T74]], align 1 +; CHECK-NEXT: [[T76:%.*]] = xor i8 [[T75]], [[T73]] +; CHECK-NEXT: store i8 [[T76]], i8* [[T74]], align 1 +; CHECK-NEXT: [[T77:%.*]] = load i8, i8* [[T17]], align 1 +; CHECK-NEXT: [[T78:%.*]] = getelementptr inbounds i8, i8* [[ARG2]], i64 13 +; CHECK-NEXT: [[T79:%.*]] = load i8, i8* [[T78]], align 1 +; CHECK-NEXT: [[T80:%.*]] = xor i8 [[T79]], [[T77]] +; CHECK-NEXT: store i8 [[T80]], i8* [[T78]], align 1 +; CHECK-NEXT: [[T81:%.*]] = load i8, i8* [[T18]], align 1 +; CHECK-NEXT: [[T82:%.*]] = getelementptr inbounds i8, i8* [[ARG2]], i64 14 +; CHECK-NEXT: [[T83:%.*]] = load i8, i8* [[T82]], align 1 +; CHECK-NEXT: [[T84:%.*]] = xor i8 [[T83]], [[T81]] +; CHECK-NEXT: store i8 [[T84]], i8* [[T82]], align 1 +; CHECK-NEXT: [[T85:%.*]] = load i8, i8* [[T21]], align 1 +; CHECK-NEXT: [[T86:%.*]] = getelementptr inbounds i8, i8* [[ARG2]], i64 15 +; CHECK-NEXT: [[T87:%.*]] = load i8, i8* [[T86]], align 1 +; CHECK-NEXT: [[T88:%.*]] = xor i8 [[T87]], [[T85]] +; CHECK-NEXT: store i8 [[T88]], i8* [[T86]], align 1 +; CHECK-NEXT: ret i32 1 +; +bb: + %t = alloca <16 x i8>, align 16 + %t4 = getelementptr inbounds <16 x i8>, <16 x i8>* %t, i64 0, i64 0 + %t5 = getelementptr inbounds i8, i8* %arg3, i64 1 + %t6 = getelementptr inbounds i8, i8* %arg3, i64 2 + %t7 = getelementptr inbounds i8, i8* %arg3, i64 3 + %t8 = getelementptr inbounds 
i8, i8* %arg3, i64 4 + %t9 = getelementptr inbounds i8, i8* %arg3, i64 5 + %t10 = getelementptr inbounds i8, i8* %arg3, i64 6 + %t11 = getelementptr inbounds i8, i8* %arg3, i64 7 + %t12 = getelementptr inbounds i8, i8* %arg3, i64 8 + %t13 = getelementptr inbounds i8, i8* %arg3, i64 9 + %t14 = getelementptr inbounds i8, i8* %arg3, i64 10 + %t15 = getelementptr inbounds i8, i8* %arg3, i64 11 + %t16 = getelementptr inbounds i8, i8* %arg3, i64 12 + %t17 = getelementptr inbounds i8, i8* %arg3, i64 13 + %t18 = getelementptr inbounds i8, i8* %arg3, i64 14 + %t19 = bitcast i8* %arg1 to <16 x i8>* + %A.gep.0 = getelementptr i8, i8* %A, i64 0 + %A.0 = load i8, i8* %A.gep.0 + %B.gep.0 = getelementptr i8, i8* %B, i64 0 + %B.0 = load i8, i8* %B.gep.0 + %xor.0 = xor i8 %A.0, %B.0 + %A.gep.1 = getelementptr i8, i8* %A, i64 1 + %A.1 = load i8, i8* %A.gep.1 + %B.gep.1 = getelementptr i8, i8* %B, i64 1 + %B.1 = load i8, i8* %B.gep.1 + %xor.1 = xor i8 %A.1, %B.1 + %A.gep.2 = getelementptr i8, i8* %A, i64 2 + %A.2 = load i8, i8* %A.gep.2 + %B.gep.2 = getelementptr i8, i8* %B, i64 2 + %B.2 = load i8, i8* %B.gep.2 + %xor.2 = xor i8 %A.2, %B.2 + %A.gep.3 = getelementptr i8, i8* %A, i64 3 + %A.3 = load i8, i8* %A.gep.3 + %B.gep.3 = getelementptr i8, i8* %B, i64 3 + %B.3 = load i8, i8* %B.gep.3 + %xor.3 = xor i8 %A.3, %B.3 + %A.gep.4 = getelementptr i8, i8* %A, i64 4 + %A.4 = load i8, i8* %A.gep.4 + %B.gep.4 = getelementptr i8, i8* %B, i64 4 + %B.4 = load i8, i8* %B.gep.4 + %xor.4 = xor i8 %A.4, %B.4 + %A.gep.5 = getelementptr i8, i8* %A, i64 5 + %A.5 = load i8, i8* %A.gep.5 + %B.gep.5 = getelementptr i8, i8* %B, i64 5 + %B.5 = load i8, i8* %B.gep.5 + %xor.5 = xor i8 %A.5, %B.5 + %A.gep.6 = getelementptr i8, i8* %A, i64 6 + %A.6 = load i8, i8* %A.gep.6 + %B.gep.6 = getelementptr i8, i8* %B, i64 6 + %B.6 = load i8, i8* %B.gep.6 + %xor.6 = xor i8 %A.6, %B.6 + %A.gep.7 = getelementptr i8, i8* %A, i64 7 + %A.7 = load i8, i8* %A.gep.7 + %B.gep.7 = getelementptr i8, i8* %B, i64 7 + %B.7 = load i8, i8* %B.gep.7 + %xor.7 = xor i8 %A.7, %B.7 + %A.gep.8 = getelementptr i8, i8* %A, i64 8 + %A.8 = load i8, i8* %A.gep.8 + %B.gep.8 = getelementptr i8, i8* %B, i64 8 + %B.8 = load i8, i8* %B.gep.8 + %xor.8 = xor i8 %A.8, %B.8 + %A.gep.9 = getelementptr i8, i8* %A, i64 9 + %A.9 = load i8, i8* %A.gep.9 + %B.gep.9 = getelementptr i8, i8* %B, i64 9 + %B.9 = load i8, i8* %B.gep.9 + %xor.9 = xor i8 %A.9, %B.9 + %A.gep.10 = getelementptr i8, i8* %A, i64 10 + %A.10 = load i8, i8* %A.gep.10 + %B.gep.10 = getelementptr i8, i8* %B, i64 10 + %B.10 = load i8, i8* %B.gep.10 + %xor.10 = xor i8 %A.10, %B.10 + %A.gep.11 = getelementptr i8, i8* %A, i64 11 + %A.11 = load i8, i8* %A.gep.11 + %B.gep.11 = getelementptr i8, i8* %B, i64 11 + %B.11 = load i8, i8* %B.gep.11 + %xor.11 = xor i8 %A.11, %B.11 + %A.gep.12 = getelementptr i8, i8* %A, i64 12 + %A.12 = load i8, i8* %A.gep.12 + %B.gep.12 = getelementptr i8, i8* %B, i64 12 + %B.12 = load i8, i8* %B.gep.12 + %xor.12 = xor i8 %A.12, %B.12 + %A.gep.13 = getelementptr i8, i8* %A, i64 13 + %A.13 = load i8, i8* %A.gep.13 + %B.gep.13 = getelementptr i8, i8* %B, i64 13 + %B.13 = load i8, i8* %B.gep.13 + %xor.13 = xor i8 %A.13, %B.13 + %A.gep.14 = getelementptr i8, i8* %A, i64 14 + %A.14 = load i8, i8* %A.gep.14 + %B.gep.14 = getelementptr i8, i8* %B, i64 14 + %B.14 = load i8, i8* %B.gep.14 + %xor.14 = xor i8 %A.14, %B.14 + %A.gep.15 = getelementptr i8, i8* %A, i64 15 + %A.15 = load i8, i8* %A.gep.15 + %B.gep.15 = getelementptr i8, i8* %B, i64 15 + %B.15 = load i8, i8* %B.gep.15 + %xor.15 = xor i8 %A.15, 
%B.15 + %R.gep.0 = getelementptr i8, i8* %arg1, i64 0 + store i8 %xor.0, i8* %R.gep.0 + %R.gep.1 = getelementptr i8, i8* %arg1, i64 1 + store i8 %xor.1, i8* %R.gep.1 + %R.gep.2 = getelementptr i8, i8* %arg1, i64 2 + store i8 %xor.2, i8* %R.gep.2 + %R.gep.3 = getelementptr i8, i8* %arg1, i64 3 + store i8 %xor.3, i8* %R.gep.3 + %R.gep.4 = getelementptr i8, i8* %arg1, i64 4 + store i8 %xor.4, i8* %R.gep.4 + %R.gep.5 = getelementptr i8, i8* %arg1, i64 5 + store i8 %xor.5, i8* %R.gep.5 + %R.gep.6 = getelementptr i8, i8* %arg1, i64 6 + store i8 %xor.6, i8* %R.gep.6 + %R.gep.7 = getelementptr i8, i8* %arg1, i64 7 + store i8 %xor.7, i8* %R.gep.7 + %R.gep.8 = getelementptr i8, i8* %arg1, i64 8 + store i8 %xor.8, i8* %R.gep.8 + %R.gep.9 = getelementptr i8, i8* %arg1, i64 9 + store i8 %xor.9, i8* %R.gep.9 + %R.gep.10 = getelementptr i8, i8* %arg1, i64 10 + store i8 %xor.10, i8* %R.gep.10 + %R.gep.11 = getelementptr i8, i8* %arg1, i64 11 + store i8 %xor.11, i8* %R.gep.11 + %R.gep.12 = getelementptr i8, i8* %arg1, i64 12 + store i8 %xor.12, i8* %R.gep.12 + %R.gep.13 = getelementptr i8, i8* %arg1, i64 13 + store i8 %xor.13, i8* %R.gep.13 + %R.gep.14 = getelementptr i8, i8* %arg1, i64 14 + store i8 %xor.14, i8* %R.gep.14 + %R.gep.15 = getelementptr i8, i8* %arg1, i64 15 + store i8 %xor.15, i8* %R.gep.15 + + + %t21 = getelementptr inbounds i8, i8* %arg3, i64 15 + %t22 = bitcast i8* %arg3 to <16 x i8>* + + call void @foo(i8* nonnull %t4) + %t26 = load i8, i8* %arg3, align 1 + %t27 = load i8, i8* %arg2, align 1 + %t28 = xor i8 %t27, %t26 + store i8 %t28, i8* %arg2, align 1 + %t29 = load i8, i8* %t5, align 1 + %t30 = getelementptr inbounds i8, i8* %arg2, i64 1 + %t31 = load i8, i8* %t30, align 1 + %t32 = xor i8 %t31, %t29 + store i8 %t32, i8* %t30, align 1 + %t33 = load i8, i8* %t6, align 1 + %t34 = getelementptr inbounds i8, i8* %arg2, i64 2 + %t35 = load i8, i8* %t34, align 1 + %t36 = xor i8 %t35, %t33 + store i8 %t36, i8* %t34, align 1 + %t37 = load i8, i8* %t7, align 1 + %t38 = getelementptr inbounds i8, i8* %arg2, i64 3 + %t39 = load i8, i8* %t38, align 1 + %t40 = xor i8 %t39, %t37 + store i8 %t40, i8* %t38, align 1 + %t41 = load i8, i8* %t8, align 1 + %t42 = getelementptr inbounds i8, i8* %arg2, i64 4 + %t43 = load i8, i8* %t42, align 1 + %t44 = xor i8 %t43, %t41 + store i8 %t44, i8* %t42, align 1 + %t45 = load i8, i8* %t9, align 1 + %t46 = getelementptr inbounds i8, i8* %arg2, i64 5 + %t47 = load i8, i8* %t46, align 1 + %t48 = xor i8 %t47, %t45 + store i8 %t48, i8* %t46, align 1 + %t49 = load i8, i8* %t10, align 1 + %t50 = getelementptr inbounds i8, i8* %arg2, i64 6 + %t51 = load i8, i8* %t50, align 1 + %t52 = xor i8 %t51, %t49 + store i8 %t52, i8* %t50, align 1 + %t53 = load i8, i8* %t11, align 1 + %t54 = getelementptr inbounds i8, i8* %arg2, i64 7 + %t55 = load i8, i8* %t54, align 1 + %t56 = xor i8 %t55, %t53 + store i8 %t56, i8* %t54, align 1 + %t57 = load i8, i8* %t12, align 1 + %t58 = getelementptr inbounds i8, i8* %arg2, i64 8 + %t59 = load i8, i8* %t58, align 1 + %t60 = xor i8 %t59, %t57 + store i8 %t60, i8* %t58, align 1 + %t61 = load i8, i8* %t13, align 1 + %t62 = getelementptr inbounds i8, i8* %arg2, i64 9 + %t63 = load i8, i8* %t62, align 1 + %t64 = xor i8 %t63, %t61 + store i8 %t64, i8* %t62, align 1 + %t65 = load i8, i8* %t14, align 1 + %t66 = getelementptr inbounds i8, i8* %arg2, i64 10 + %t67 = load i8, i8* %t66, align 1 + %t68 = xor i8 %t67, %t65 + store i8 %t68, i8* %t66, align 1 + %t69 = load i8, i8* %t15, align 1 + %t70 = getelementptr inbounds i8, i8* %arg2, i64 11 + %t71 = load 
i8, i8* %t70, align 1 + %t72 = xor i8 %t71, %t69 + store i8 %t72, i8* %t70, align 1 + %t73 = load i8, i8* %t16, align 1 + %t74 = getelementptr inbounds i8, i8* %arg2, i64 12 + %t75 = load i8, i8* %t74, align 1 + %t76 = xor i8 %t75, %t73 + store i8 %t76, i8* %t74, align 1 + %t77 = load i8, i8* %t17, align 1 + %t78 = getelementptr inbounds i8, i8* %arg2, i64 13 + %t79 = load i8, i8* %t78, align 1 + %t80 = xor i8 %t79, %t77 + store i8 %t80, i8* %t78, align 1 + %t81 = load i8, i8* %t18, align 1 + %t82 = getelementptr inbounds i8, i8* %arg2, i64 14 + %t83 = load i8, i8* %t82, align 1 + %t84 = xor i8 %t83, %t81 + store i8 %t84, i8* %t82, align 1 + %t85 = load i8, i8* %t21, align 1 + %t86 = getelementptr inbounds i8, i8* %arg2, i64 15 + %t87 = load i8, i8* %t86, align 1 + %t88 = xor i8 %t87, %t85 + store i8 %t88, i8* %t86, align 1 + ret i32 1 +} + +; A test case where instructions required to compute the pointer bounds get +; vectorized before versioning. Make sure there is no crash. +define void @crash_instructions_deleted(float* %t, i32* %a, i32** noalias %ptr) { +; CHECK-LABEL: @crash_instructions_deleted( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[T42:%.*]] = bitcast float* [[T:%.*]] to i8* +; CHECK-NEXT: [[T15:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 2 +; CHECK-NEXT: [[T16:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 3 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[T15]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> , <2 x i32>* [[TMP0]], align 8 +; CHECK-NEXT: [[T17:%.*]] = load i32*, i32** [[PTR:%.*]], align 8 +; CHECK-NEXT: br label [[BB18:%.*]] +; CHECK: bb18: +; CHECK-NEXT: [[T19:%.*]] = sext i32 0 to i64 +; CHECK-NEXT: [[T20:%.*]] = add nsw i64 1, [[T19]] +; CHECK-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T17]], i64 [[T20]] +; CHECK-NEXT: [[T22:%.*]] = bitcast i32* [[T21]] to i8* +; CHECK-NEXT: [[T23:%.*]] = getelementptr inbounds i8, i8* [[T22]], i64 1 +; CHECK-NEXT: [[T24:%.*]] = getelementptr inbounds i8, i8* [[T22]], i64 2 +; CHECK-NEXT: [[T25:%.*]] = getelementptr inbounds i8, i8* [[T22]], i64 3 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[T17]], i64 2 +; CHECK-NEXT: [[SCEVGEP18:%.*]] = bitcast i32* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[SCEVGEP43:%.*]] = getelementptr float, float* [[T]], i64 4 +; CHECK-NEXT: [[SCEVGEP4344:%.*]] = bitcast float* [[SCEVGEP43]] to i8* +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[T22]], [[SCEVGEP4344]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[T42]], [[SCEVGEP18]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[BB18_SCALAR:%.*]], label [[BB18_SLPVERSIONED1:%.*]] +; CHECK: bb18.scalar: +; CHECK-NEXT: [[T26:%.*]] = load i8, i8* [[T22]], align 1 +; CHECK-NEXT: [[T27:%.*]] = uitofp i8 [[T26]] to float +; CHECK-NEXT: [[T28:%.*]] = fdiv float [[T27]], 2.550000e+02 +; CHECK-NEXT: [[T29:%.*]] = getelementptr inbounds float, float* [[T]], i64 0 +; CHECK-NEXT: store float [[T28]], float* [[T29]], align 8 +; CHECK-NEXT: [[T30:%.*]] = load i8, i8* [[T23]], align 1 +; CHECK-NEXT: [[T31:%.*]] = uitofp i8 [[T30]] to float +; CHECK-NEXT: [[T32:%.*]] = fdiv float [[T31]], 2.550000e+02 +; CHECK-NEXT: [[T33:%.*]] = getelementptr inbounds float, float* [[T]], i64 1 +; CHECK-NEXT: store float [[T32]], float* [[T33]], align 4 +; CHECK-NEXT: [[T34:%.*]] = load i8, i8* [[T24]], align 1 +; CHECK-NEXT: [[T35:%.*]] = uitofp i8 [[T34]] to float +; CHECK-NEXT: [[T36:%.*]] = fdiv float 
[[T35]], 2.550000e+02 +; CHECK-NEXT: [[T37:%.*]] = getelementptr inbounds float, float* [[T]], i64 2 +; CHECK-NEXT: store float [[T36]], float* [[T37]], align 8 +; CHECK-NEXT: [[T38:%.*]] = load i8, i8* [[T25]], align 1 +; CHECK-NEXT: [[T39:%.*]] = uitofp i8 [[T38]] to float +; CHECK-NEXT: [[T40:%.*]] = fdiv float [[T39]], 2.550000e+02 +; CHECK-NEXT: [[T41:%.*]] = getelementptr inbounds float, float* [[T]], i64 3 +; CHECK-NEXT: store float [[T40]], float* [[T41]], align 4 +; CHECK-NEXT: br label [[BB18_MERGE:%.*]] +; CHECK: bb18.merge: +; CHECK-NEXT: ret void +; CHECK: bb18.slpversioned1: +; CHECK-NEXT: [[T295:%.*]] = getelementptr inbounds float, float* [[T]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[T22]] to <4 x i8>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1, !alias.scope !35, !noalias !38 +; CHECK-NEXT: [[TMP3:%.*]] = uitofp <4 x i8> [[TMP2]] to <4 x float> +; CHECK-NEXT: [[TMP4:%.*]] = fdiv <4 x float> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[T295]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 8, !alias.scope !38, !noalias !35 +; CHECK-NEXT: br label [[BB18_MERGE]] +; +bb: + %t6 = icmp slt i32 10, 0 + %t7 = icmp sgt i32 20, 20 + %t9 = select i1 %t7, i32 5, i32 0 + %t10 = select i1 %t6, i32 0, i32 %t9 + %t11 = icmp slt i32 10, 0 + %t12 = icmp sgt i32 20, 20 + %t13 = select i1 %t12, i32 5, i32 10 + %t14 = select i1 %t11, i32 0, i32 %t13 + %t15 = getelementptr inbounds i32, i32* %a, i32 2 + store i32 %t10, i32* %t15, align 8 + %t16 = getelementptr inbounds i32, i32* %a, i32 3 + store i32 %t14, i32* %t16, align 4 + %t17 = load i32*, i32** %ptr, align 8 + br label %bb18 + +bb18: ; preds = %bb5 + %t19 = sext i32 %t10 to i64 + %t20 = add nsw i64 1, %t19 + %t21 = getelementptr inbounds i32, i32* %t17, i64 %t20 + %t22 = bitcast i32* %t21 to i8* + %t23 = getelementptr inbounds i8, i8* %t22, i64 1 + %t24 = getelementptr inbounds i8, i8* %t22, i64 2 + %t25 = getelementptr inbounds i8, i8* %t22, i64 3 + %t26 = load i8, i8* %t22, align 1 + %t27 = uitofp i8 %t26 to float + %t28 = fdiv float %t27, 2.550000e+02 + %t29 = getelementptr inbounds float, float* %t, i64 0 + store float %t28, float* %t29, align 8 + %t30 = load i8, i8* %t23, align 1 + %t31 = uitofp i8 %t30 to float + %t32 = fdiv float %t31, 2.550000e+02 + %t33 = getelementptr inbounds float, float* %t, i64 1 + store float %t32, float* %t33, align 4 + %t34 = load i8, i8* %t24, align 1 + %t35 = uitofp i8 %t34 to float + %t36 = fdiv float %t35, 2.550000e+02 + %t37 = getelementptr inbounds float, float* %t, i64 2 + store float %t36, float* %t37, align 8 + %t38 = load i8, i8* %t25, align 1 + %t39 = uitofp i8 %t38 to float + %t40 = fdiv float %t39, 2.550000e+02 + %t41 = getelementptr inbounds float, float* %t, i64 3 + store float %t40, float* %t41, align 4 + ret void +} + +; A test case where there are no instructions accessing a tracked object in a +; block for which versioning was requested. 
+define void @crash_no_tracked_instructions(float** %arg, float* %arg.2, float* %arg.3, i1 %c) { +; CHECK-LABEL: @crash_no_tracked_instructions( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[T19:%.*]] = load float*, float** [[ARG:%.*]], align 8 +; CHECK-NEXT: [[T20:%.*]] = load float, float* [[ARG_3:%.*]], align 4 +; CHECK-NEXT: [[T21:%.*]] = getelementptr inbounds float, float* [[ARG_2:%.*]], i64 0 +; CHECK-NEXT: br i1 [[C:%.*]], label [[BB22:%.*]], label [[BB30:%.*]] +; CHECK: bb22: +; CHECK-NEXT: [[T23:%.*]] = fmul float [[T20]], 9.900000e+01 +; CHECK-NEXT: [[T24:%.*]] = fmul float [[T23]], 9.900000e+01 +; CHECK-NEXT: [[T25:%.*]] = getelementptr inbounds float, float* [[T19]], i64 2 +; CHECK-NEXT: [[T26:%.*]] = fmul float [[T23]], 1.000000e+01 +; CHECK-NEXT: store float [[T26]], float* [[T25]], align 4 +; CHECK-NEXT: [[T27:%.*]] = load float, float* [[T21]], align 8 +; CHECK-NEXT: [[T28:%.*]] = fadd float [[T24]], 2.000000e+01 +; CHECK-NEXT: [[T29:%.*]] = fadd float [[T26]], 2.000000e+01 +; CHECK-NEXT: br label [[BB30]] +; CHECK: bb30: +; CHECK-NEXT: [[T31:%.*]] = phi float [ [[T28]], [[BB22]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[T32:%.*]] = phi float [ [[T29]], [[BB22]] ], [ [[T20]], [[ENTRY]] ] +; CHECK-NEXT: br label [[BB36:%.*]] +; CHECK: bb36: +; CHECK-NEXT: [[T37:%.*]] = fmul float [[T31]], 3.000000e+00 +; CHECK-NEXT: [[T38:%.*]] = getelementptr inbounds float, float* [[ARG_3]], i64 0 +; CHECK-NEXT: store float [[T37]], float* [[T38]], align 4 +; CHECK-NEXT: [[T39:%.*]] = fmul float [[T32]], 3.000000e+00 +; CHECK-NEXT: [[T40:%.*]] = getelementptr inbounds float, float* [[ARG_3]], i64 1 +; CHECK-NEXT: store float [[T39]], float* [[T40]], align 4 +; CHECK-NEXT: br label [[BB41:%.*]] +; CHECK: bb41: +; CHECK-NEXT: ret void +; +entry: + %t19 = load float*, float** %arg + %t20 = load float, float* %arg.3, align 4 + %t21 = getelementptr inbounds float, float* %arg.2, i64 0 + br i1 %c, label %bb22, label %bb30 + +bb22: + %t23 = fmul float %t20, 99.0 + %t24 = fmul float %t23, 99.0 + %t25 = getelementptr inbounds float, float* %t19, i64 2 + %t26 = fmul float %t23, 10.0 + store float %t26, float* %t25, align 4 + %t27 = load float, float* %t21, align 8 + %t28 = fadd float %t24, 20.0 + %t29 = fadd float %t26, 20.0 + br label %bb30 + +bb30: + %t31 = phi float [ %t28, %bb22 ], [ 0.0, %entry ] + %t32 = phi float [ %t29, %bb22 ], [ %t20, %entry ] + br label %bb36 + +bb36: + %t37 = fmul float %t31, 3.0 + %t38 = getelementptr inbounds float, float* %arg.3, i64 0 + store float %t37, float* %t38, align 4 + %t39 = fmul float %t32, 3.0 + %t40 = getelementptr inbounds float, float* %arg.3, i64 1 + store float %t39, float* %t40, align 4 + br label %bb41 + +bb41: ret void } diff --git a/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll @@ -1,11 +1,26 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -scoped-noalias-aa -slp-vectorizer -mtriple=x86_64-apple-darwin -enable-new-pm=false -S %s | FileCheck %s +; RUN: opt -slp-memory-versioning -scoped-noalias-aa -slp-vectorizer -mtriple=x86_64-apple-darwin -enable-new-pm=false -S %s | FileCheck %s +; RUN: opt -slp-memory-versioning=false -scoped-noalias-aa -slp-vectorizer -mtriple=x86_64-apple-darwin -enable-new-pm=false -S %s | FileCheck --check-prefix=NOVERSION %s + +; NOVERSION-NOT: memcheck 
define void @version_multiple(i32* nocapture %out_block, i32* nocapture readonly %counter) { ; CHECK-LABEL: @version_multiple( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[COUNTER:%.*]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OUT_BLOCK:%.*]], align 4 +; CHECK-NEXT: [[COUNTER12:%.*]] = bitcast i32* [[COUNTER:%.*]] to i8* +; CHECK-NEXT: [[OUT_BLOCK14:%.*]] = bitcast i32* [[OUT_BLOCK:%.*]] to i8* +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[COUNTER]], i64 4 +; CHECK-NEXT: [[SCEVGEP13:%.*]] = bitcast i32* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[SCEVGEP15:%.*]] = getelementptr i32, i32* [[OUT_BLOCK]], i64 4 +; CHECK-NEXT: [[SCEVGEP1516:%.*]] = bitcast i32* [[SCEVGEP15]] to i8* +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[COUNTER12]], [[SCEVGEP1516]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[OUT_BLOCK14]], [[SCEVGEP13]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[ENTRY_SCALAR:%.*]], label [[ENTRY_SLPVERSIONED1:%.*]] +; CHECK: entry.scalar: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[COUNTER]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OUT_BLOCK]], align 4 ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP1]], [[TMP0]] ; CHECK-NEXT: store i32 [[XOR]], i32* [[OUT_BLOCK]], align 4 ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 1 @@ -26,7 +41,18 @@ ; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX2_3]], align 4 ; CHECK-NEXT: [[XOR_3:%.*]] = xor i32 [[TMP7]], [[TMP6]] ; CHECK-NEXT: store i32 [[XOR_3]], i32* [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: br label [[ENTRY_MERGE:%.*]] +; CHECK: entry.merge: ; CHECK-NEXT: ret void +; CHECK: entry.slpversioned1: +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[COUNTER]] to <4 x i32>* +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4, !alias.scope !0, !noalias !3 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[OUT_BLOCK]] to <4 x i32>* +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[TMP12:%.*]] = xor <4 x i32> [[TMP11]], [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[OUT_BLOCK]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: br label [[ENTRY_MERGE]] ; entry: %0 = load i32, i32* %counter, align 4