diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -35,6 +35,7 @@
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/DemandedBits.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/IVDescriptors.h"
 #include "llvm/Analysis/LoopAccessAnalysis.h"
@@ -62,6 +63,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/NoFolder.h"
 #include "llvm/IR/Operator.h"
@@ -85,8 +87,11 @@
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
 #include "llvm/Transforms/Vectorize.h"
 #include <algorithm>
 #include <cassert>
@@ -578,6 +583,44 @@
   return Index;
 }
 
+static bool extendMemBounds(
+    Instruction &I, bool Insert, ScalarEvolution &SE,
+    DenseMap<Value *, std::pair<const SCEV *, const SCEV *>> &MemBounds) {
+
+  BasicBlock *BB = I.getParent();
+  auto GetPtr = [](Instruction *I) -> Value * {
+    if (auto *L = dyn_cast<LoadInst>(I))
+      return L->getPointerOperand();
+    if (auto *S = dyn_cast<StoreInst>(I))
+      return S->getPointerOperand();
+    return nullptr;
+  };
+  auto *Ptr = GetPtr(&I);
+  if (!Ptr)
+    return false;
+  auto *PtrSCEV = SE.getSCEV(Ptr);
+
+  Value *Obj = getUnderlyingObject(Ptr);
+  if (!Obj)
+    return false;
+
+  if (!SE.properlyDominates(PtrSCEV, BB))
+    return false;
+
+  if (Insert)
+    MemBounds.insert({Obj, {PtrSCEV, PtrSCEV}});
+  auto BoundsIter = MemBounds.find(Obj);
+  if (BoundsIter == MemBounds.end())
+    return false;
+
+  if (SE.isKnownPredicate(CmpInst::ICMP_ULT, PtrSCEV, BoundsIter->second.first))
+    BoundsIter->second.first = PtrSCEV;
+  if (SE.isKnownPredicate(CmpInst::ICMP_UGT, PtrSCEV,
+                          BoundsIter->second.second))
+    BoundsIter->second.second = PtrSCEV;
+
+  return true;
+}
 namespace slpvectorizer {
 
 /// Bottom Up SLP Vectorizer.
@@ -586,6 +629,16 @@
   struct ScheduleData;
 
 public:
+  // Map of objects to start & end pointers we need to generate runtime checks
+  // for.
+  DenseMap<Value *, std::pair<const SCEV *, const SCEV *>> MemBounds;
+  /// Cache for alias results.
+  /// TODO: consider moving this to the AliasAnalysis itself.
+  using AliasCacheKey = std::pair<Instruction *, Instruction *>;
+  DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
+
+  bool CollectMemAccess = false;
+
   using ValueList = SmallVector<Value *, 8>;
   using InstrList = SmallVector<Instruction *, 4>;
   using ValueSet = SmallPtrSet<Value *, 16>;
@@ -664,6 +717,7 @@
     }
     MinBWs.clear();
     InstrElementSize.clear();
+    MemBounds.clear();
   }
 
   unsigned getTreeSize() const { return VectorizableTree.size(); }
@@ -1962,11 +2016,6 @@
     return aliased;
   }
 
-  using AliasCacheKey = std::pair<Instruction *, Instruction *>;
-
-  /// Cache for alias results.
-  /// TODO: consider moving this to the AliasAnalysis itself.
-  DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
 
   /// Removes an instruction from its block and eventually deletes it.
   /// It's like Instruction::eraseFromParent() except that the actual deletion
@@ -2567,11 +2616,9 @@
            "trying to erase instruction with users.");
     Pair.getFirst()->eraseFromParent();
   }
-#ifdef EXPENSIVE_CHECKS
   // If we could guarantee that this call is not extremely slow, we could
   // remove the ifdef limitation (see PR47712).
   assert(!verifyFunction(*F, &dbgs()));
-#endif
 }
 
 void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) {
@@ -6097,6 +6144,7 @@
       while (DepDest) {
         assert(isInSchedulingRegion(DepDest));
+        ScheduleData *DestBundle = DepDest->FirstInBundle;
 
         // We have two limits to reduce the complexity:
         // 1) AliasedCheckLimit: It's a small limit to reduce calls to
         //    SLP->isAliased (which is the expensive part in this loop).
@@ -6114,9 +6162,33 @@
          //    balance between reduced runtime and accurate dependencies.
          numAliased++;
 
+          // If this bundle is not scheduled and no versioned code has been
+          // generated yet, try to collect the bounds of the accesses to
+          // generate runtime checks.
+          if (!DestBundle->IsScheduled && SLP->CollectMemAccess) {
+            // FIXME Naming
+            auto GetPtr = [](Instruction *I) -> Value * {
+              if (auto *L = dyn_cast<LoadInst>(I))
+                return L->getPointerOperand();
+              if (auto *S = dyn_cast<StoreInst>(I))
+                return S->getPointerOperand();
+              return nullptr;
+            };
+            auto *Src = GetPtr(SrcInst);
+            auto *Dst = GetPtr(DepDest->Inst);
+
+            if (SrcInst->getParent() == DepDest->Inst->getParent() && Src &&
+                Dst) {
+              bool AddedSrc =
+                  extendMemBounds(*SrcInst, true, *SLP->SE, SLP->MemBounds);
+              bool AddedDst = extendMemBounds(*DepDest->Inst, true,
+                                              *SLP->SE, SLP->MemBounds);
+              if (!AddedSrc || !AddedDst)
+                SLP->MemBounds.clear();
+            }
+          }
          DepDest->MemoryDependencies.push_back(BundleMember);
          BundleMember->Dependencies++;
-          ScheduleData *DestBundle = DepDest->FirstInBundle;
          if (!DestBundle->IsScheduled) {
            BundleMember->incrementUnscheduledDeps(1);
          }
@@ -6596,7 +6668,7 @@
     return PreservedAnalyses::all();
 
   PreservedAnalyses PA;
-  PA.preserveSet<CFGAnalyses>();
+  // PA.preserveSet<CFGAnalyses>();
   return PA;
 }
 
@@ -6643,6 +6715,9 @@
   // Update DFS numbers now so that we can use them for ordering.
   DT->updateDFSNumbers();
 
+  SmallVector<BasicBlock *, 4> BlocksToRetry;
+  SmallVector<DenseMap<Value *, std::pair<const SCEV *, const SCEV *>>, 4>
+      BoundsToUse;
   // Scan the blocks in the function in post order.
   for (auto BB : post_order(&F.getEntryBlock())) {
     collectSeedInstructions(BB);
@@ -6651,7 +6726,40 @@
     if (!Stores.empty()) {
       LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
                         << " underlying objects.\n");
-      Changed |= vectorizeStoreChains(R);
+      R.MemBounds.clear();
+
+      auto NoOrSingleSucc = [](BasicBlock *BB) {
+        return succ_begin(BB) == succ_end(BB) ||
+               std::next(succ_begin(BB)) == succ_end(BB);
+      };
+      auto NoOrSinglePred = [](BasicBlock *BB) {
+        return pred_begin(BB) == pred_end(BB) ||
+               std::next(pred_begin(BB)) == pred_end(BB);
+      };
+
+      auto AllUsesInside = [](BasicBlock *BB) {
+        return all_of(*BB, [BB](Instruction &I) {
+          return all_of(I.users(), [BB](User *U) {
+            return cast<Instruction>(U)->getParent() == BB;
+          });
+        });
+      };
+      auto TermSupported = [](BasicBlock *BB) {
+        auto *RetI = dyn_cast<ReturnInst>(BB->getTerminator());
+        return isa<BranchInst>(BB->getTerminator()) ||
+               (RetI && !RetI->getReturnValue());
+      };
+      R.CollectMemAccess = NoOrSingleSucc(BB) && NoOrSinglePred(BB) &&
+                           AllUsesInside(BB) && TermSupported(BB);
+
+      bool VectorizedChains = vectorizeStoreChains(R);
+      if (!VectorizedChains && !R.MemBounds.empty()) {
+        BlocksToRetry.push_back(BB);
+        BoundsToUse.push_back(R.MemBounds);
+      }
+      R.CollectMemAccess = false;
+      R.MemBounds.clear();
+      Changed |= VectorizedChains;
     }
 
     // Vectorize trees that end at reductions.
@@ -6667,6 +6775,161 @@
     }
   }
 
+  R.AliasCache.clear();
+  for (unsigned I = 0; I != BlocksToRetry.size(); I++) {
+    BasicBlock *BB = BlocksToRetry[I];
+    auto &MemBounds = BoundsToUse[I];
+
+    for (Instruction &I : *BB)
+      extendMemBounds(I, false, *SE, MemBounds);
+
+    LLVMContext &Ctx = BB->getContext();
+    DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+    std::string OriginalName = BB->getName().str();
+    auto *CheckBlock = splitBlockBefore(BB, &*BB->getFirstNonPHI(), &DTU, LI,
+                                        nullptr, OriginalName + ".slpmemcheck");
+    auto *MergeBlock = BB;
+    BB = splitBlockBefore(BB, BB->getTerminator(), &DTU, LI, nullptr,
+                          OriginalName + ".slpversioned");
+
+    ValueToValueMapTy VMap;
+    auto *Scalar = CloneBasicBlock(BB, VMap, "", BB->getParent());
+    Scalar->setName(OriginalName + ".scalar");
+    MergeBlock->setName(OriginalName + ".merge");
+    SmallVector<BasicBlock *, 4> Tmp;
+    Tmp.push_back(Scalar);
+    remapInstructionsInBlocks(Tmp, VMap);
+
+    Value *MemoryRuntimeCheck = nullptr;
+    Instruction *FirstInst = nullptr;
+    SCEVExpander Exp(*SE, BB->getParent()->getParent()->getDataLayout(),
+                     "memcheck");
+    SmallVector<std::pair<Value *, Value *>, 4> ExpandedBounds;
+    Type *PtrArithTy =
+        Type::getInt8PtrTy(BB->getParent()->getParent()->getContext(), 0);
+    for (auto &KV : MemBounds) {
+      ExpandedBounds.emplace_back(
+          Exp.expandCodeFor(KV.second.first, PtrArithTy,
+                            CheckBlock->getTerminator()),
+          Exp.expandCodeFor(KV.second.second, PtrArithTy,
+                            CheckBlock->getTerminator()));
+    }
+    auto GetFirstInst = [](Instruction *FirstInst, Value *V,
+                           Instruction *Loc) -> Instruction * {
+      if (FirstInst)
+        return FirstInst;
+      if (Instruction *I = dyn_cast<Instruction>(V))
+        return I->getParent() == Loc->getParent() ? I : nullptr;
+      return nullptr;
+    };
+
+    Instruction *Loc = CheckBlock->getTerminator();
+    IRBuilder<> ChkBuilder(CheckBlock->getTerminator());
+    for (unsigned i = 0; i < MemBounds.size(); ++i) {
+      for (unsigned j = i + 1; j < MemBounds.size(); ++j) {
+        Value *ALow = ExpandedBounds[i].first;
+        Value *AHigh = ExpandedBounds[i].second;
+        Value *BLow = ExpandedBounds[j].first;
+        Value *BHigh = ExpandedBounds[j].second;
+
+        unsigned AS0 = ALow->getType()->getPointerAddressSpace();
+        unsigned AS1 = BLow->getType()->getPointerAddressSpace();
+
+        Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0);
+        Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1);
+        Value *Start0 = ChkBuilder.CreateBitCast(ALow, PtrArithTy0, "bc");
+        Value *Start1 = ChkBuilder.CreateBitCast(BLow, PtrArithTy1, "bc");
+        Value *End0 = ChkBuilder.CreateBitCast(AHigh, PtrArithTy1, "bc");
+        Value *End1 = ChkBuilder.CreateBitCast(BHigh, PtrArithTy0, "bc");
+        // [A|B].Start points to the first accessed byte under base [A|B].
+        // [A|B].End points to the last accessed byte, plus one.
+        // There is no conflict when the intervals are disjoint:
+        // NoConflict = (B.Start >= A.End) || (A.Start >= B.End)
+        //
+        // bound0 = (B.Start < A.End)
+        // bound1 = (A.Start < B.End)
+        // IsConflict = bound0 & bound1
+        Value *Cmp0 = ChkBuilder.CreateICmpULT(Start0, End1, "bound0");
+        FirstInst = GetFirstInst(FirstInst, Cmp0, Loc);
+        Value *Cmp1 = ChkBuilder.CreateICmpULT(Start1, End0, "bound1");
+        FirstInst = GetFirstInst(FirstInst, Cmp1, Loc);
+        Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
+        FirstInst = GetFirstInst(FirstInst, IsConflict, Loc);
+        if (MemoryRuntimeCheck) {
+          IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict,
+                                           "conflict.rdx");
+          FirstInst = GetFirstInst(FirstInst, IsConflict, Loc);
+        }
+        MemoryRuntimeCheck = IsConflict;
+      }
+    }
+
+    ChkBuilder.CreateCondBr(MemoryRuntimeCheck, BB, Scalar);
+    CheckBlock->getTerminator()->eraseFromParent();
+    DTU.applyUpdates({{DT->Insert, CheckBlock, Scalar}});
+    Changed = true;
+
+    MDBuilder MDB(Ctx);
+    MDNode *Domain = MDB.createAnonymousAliasScopeDomain("SLPVerDomain");
+
+    DenseMap<const std::pair<const SCEV *, const SCEV *> *, MDNode *>
+        GroupToScope;
+    for (const auto &Group : MemBounds)
+      GroupToScope[&Group.second] = MDB.createAnonymousAliasScope(Domain);
+
+    for (Instruction &I : *BB) {
+      auto GetPtr = [](Instruction *I) -> Value * {
+        if (auto *L = dyn_cast<LoadInst>(I))
+          return L->getPointerOperand();
+        if (auto *S = dyn_cast<StoreInst>(I))
+          return S->getPointerOperand();
+        return nullptr;
+      };
+      auto *Ptr = GetPtr(&I);
+      if (!Ptr)
+        continue;
+      auto *PtrSCEV = SE->getSCEV(Ptr);
+
+      Value *Obj = getUnderlyingObject(Ptr);
+      if (!Obj)
+        continue;
+
+      auto BoundsIter = MemBounds.find(Obj);
+      if (BoundsIter == MemBounds.end())
+        continue;
+      auto *LowerBound = BoundsIter->second.first;
+      auto *UpperBound = BoundsIter->second.second;
+      auto *Scope = GroupToScope.find(&BoundsIter->second)->second;
+      if (SE->isKnownPredicate(CmpInst::ICMP_UGE, PtrSCEV, LowerBound) &&
+          SE->isKnownPredicate(CmpInst::ICMP_ULE, PtrSCEV, UpperBound)) {
+        I.setMetadata(
+            LLVMContext::MD_alias_scope,
+            MDNode::concatenate(I.getMetadata(LLVMContext::MD_alias_scope),
+                                MDNode::get(Ctx, Scope)));
+
+        SmallVector<Metadata *, 4> NonAliasing;
+        for (auto &KV : GroupToScope) {
+          if (KV.first == &BoundsIter->second)
+            continue;
+          NonAliasing.push_back(KV.second);
+        }
+        I.setMetadata(
+            LLVMContext::MD_noalias,
+            MDNode::concatenate(I.getMetadata(LLVMContext::MD_noalias),
+                                MDNode::get(Ctx, NonAliasing)));
+      }
+    }
+
+    collectSeedInstructions(BB);
+
+    // Vectorize trees that end at stores.
+    if (!Stores.empty()) {
+      LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
+                        << " underlying objects.\n");
+      Changed |= vectorizeStoreChains(R);
+    }
+  }
+
   if (Changed) {
     R.optimizeGatherSequence();
     LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
@@ -3,16 +3,38 @@
 
 define void @needs_versioning(i32* %dst, i32* %src) {
 ; CHECK-LABEL: @needs_versioning(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[SRC_0:%.*]] = load i32, i32* [[SRC:%.*]], align 4
-; CHECK-NEXT: [[R_0:%.*]] = ashr i32 [[SRC_0]], 16
-; CHECK-NEXT: store i32 [[R_0]], i32* [[DST:%.*]], align 4
+; CHECK-NEXT: entry.slpmemcheck:
+; CHECK-NEXT: [[SRC8:%.*]] = bitcast i32* [[SRC:%.*]] to i8*
+; CHECK-NEXT: [[DST10:%.*]] = bitcast i32* [[DST:%.*]] to i8*
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[SRC]], i64 1
+; CHECK-NEXT: [[SCEVGEP9:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
+; CHECK-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[DST]], i64 1
+; CHECK-NEXT: [[SCEVGEP1112:%.*]] = bitcast i32* [[SCEVGEP11]] to i8*
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SRC8]], [[SCEVGEP1112]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[DST10]], [[SCEVGEP9]]
+; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[ENTRY_SLPVERSIONED:%.*]], label [[ENTRY_SCALAR:%.*]]
+; CHECK: entry.slpversioned:
 ; CHECK-NEXT: [[SRC_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 1
-; CHECK-NEXT: [[SRC_1:%.*]] = load i32, i32* [[SRC_GEP_1]], align 4
-; CHECK-NEXT: [[R_1:%.*]] = ashr i32 [[SRC_1]], 16
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <2 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4, !alias.scope !0, !noalias !3
+; CHECK-NEXT: [[TMP2:%.*]] = ashr <2 x i32> [[TMP1]], <i32 16, i32 16>
 ; CHECK-NEXT: [[DST_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 1
-; CHECK-NEXT: store i32 [[R_1]], i32* [[DST_GEP_1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[TMP2]], <2 x i32>* [[TMP3]], align 4, !alias.scope !3, !noalias !0
+; CHECK-NEXT: br label [[ENTRY_MERGE:%.*]]
+; CHECK: entry.merge:
 ; CHECK-NEXT: ret void
+; CHECK: entry.scalar:
+; CHECK-NEXT: [[SRC_02:%.*]] = load i32, i32* [[SRC]], align 4
+; CHECK-NEXT: [[R_03:%.*]] = ashr i32 [[SRC_02]], 16
+; CHECK-NEXT: store i32 [[R_03]], i32* [[DST]], align 4
+; CHECK-NEXT: [[SRC_GEP_14:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 1
+; CHECK-NEXT: [[SRC_15:%.*]] = load i32, i32* [[SRC_GEP_14]], align 4
+; CHECK-NEXT: [[R_16:%.*]] = ashr i32 [[SRC_15]], 16
+; CHECK-NEXT: [[DST_GEP_17:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 1
+; CHECK-NEXT: store i32 [[R_16]], i32* [[DST_GEP_17]], align 4
+; CHECK-NEXT: br label [[ENTRY_MERGE]]
 ;
 entry:
   %src.0 = load i32, i32* %src, align 4
@@ -52,11 +74,22 @@
 
 define void @version_multiple(i32* nocapture %out_block, i32* nocapture readonly %counter) {
 ; CHECK-LABEL: @version_multiple(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[COUNTER:%.*]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OUT_BLOCK:%.*]], align 4
+; CHECK-NEXT: entry.slpmemcheck:
+; CHECK-NEXT: [[COUNTER12:%.*]] = bitcast i32* [[COUNTER:%.*]] to i8*
+; CHECK-NEXT: [[OUT_BLOCK14:%.*]] = bitcast i32* [[OUT_BLOCK:%.*]] to i8*
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[COUNTER]], i64 3
+; CHECK-NEXT: [[SCEVGEP13:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
+; CHECK-NEXT: [[SCEVGEP15:%.*]] = getelementptr i32, i32* [[OUT_BLOCK]], i64 2
+; CHECK-NEXT: [[SCEVGEP1516:%.*]] = bitcast i32* [[SCEVGEP15]] to i8*
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[COUNTER12]], [[SCEVGEP1516]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[OUT_BLOCK14]], [[SCEVGEP13]]
+; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[ENTRY_SLPVERSIONED:%.*]], label [[ENTRY_SCALAR:%.*]]
+; CHECK: entry.slpversioned:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[COUNTER]], align 4, !alias.scope !5, !noalias !8
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OUT_BLOCK]], align 4, !alias.scope !8, !noalias !5
 ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP1]], [[TMP0]]
-; CHECK-NEXT: store i32 [[XOR]], i32* [[OUT_BLOCK]], align 4
+; CHECK-NEXT: store i32 [[XOR]], i32* [[OUT_BLOCK]], align 4, !alias.scope !8, !noalias !5
 ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 1
 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
 ; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 1
@@ -64,18 +97,43 @@
 ; CHECK-NEXT: [[XOR_1:%.*]] = xor i32 [[TMP3]], [[TMP2]]
 ; CHECK-NEXT: store i32 [[XOR_1]], i32* [[ARRAYIDX2_1]], align 4
 ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 2
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
 ; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 2
-; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX2_2]], align 4
-; CHECK-NEXT: [[XOR_2:%.*]] = xor i32 [[TMP5]], [[TMP4]]
-; CHECK-NEXT: store i32 [[XOR_2]], i32* [[ARRAYIDX2_2]], align 4
 ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 3
-; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[ARRAYIDX_2]] to <2 x i32>*
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[TMP4]], align 4
 ; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 3
-; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX2_3]], align 4
-; CHECK-NEXT: [[XOR_3:%.*]] = xor i32 [[TMP7]], [[TMP6]]
-; CHECK-NEXT: store i32 [[XOR_3]], i32* [[ARRAYIDX2_3]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[ARRAYIDX2_2]] to <2 x i32>*
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[TMP6]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i32> [[TMP7]], [[TMP5]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[ARRAYIDX2_2]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[TMP8]], <2 x i32>* [[TMP9]], align 4
+; CHECK-NEXT: br label [[ENTRY_MERGE:%.*]]
+; CHECK: entry.merge:
 ; CHECK-NEXT: ret void
+; CHECK: entry.scalar:
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[COUNTER]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[OUT_BLOCK]], align 4
+; CHECK-NEXT: [[XOR2:%.*]] = xor i32 [[TMP11]], [[TMP10]]
+; CHECK-NEXT: store i32 [[XOR2]], i32* [[OUT_BLOCK]], align 4
+; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 1
+; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX_13]], align 4
+; CHECK-NEXT: [[ARRAYIDX2_14:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 1
+; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX2_14]], align 4
+; CHECK-NEXT: [[XOR_15:%.*]] = xor i32 [[TMP13]], [[TMP12]]
+; CHECK-NEXT: store i32 [[XOR_15]], i32* [[ARRAYIDX2_14]], align 4
+; CHECK-NEXT: [[ARRAYIDX_26:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 2
+; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX_26]], align 4
+; CHECK-NEXT: [[ARRAYIDX2_27:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 2
+; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX2_27]], align 4
+; CHECK-NEXT: [[XOR_28:%.*]] = xor i32 [[TMP15]], [[TMP14]]
+; CHECK-NEXT: store i32 [[XOR_28]], i32* [[ARRAYIDX2_27]], align 4
+; CHECK-NEXT: [[ARRAYIDX_39:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 3
+; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX_39]], align 4
+; CHECK-NEXT: [[ARRAYIDX2_310:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 3
+; CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[ARRAYIDX2_310]], align 4
+; CHECK-NEXT: [[XOR_311:%.*]] = xor i32 [[TMP17]], [[TMP16]]
+; CHECK-NEXT: store i32 [[XOR_311]], i32* [[ARRAYIDX2_310]], align 4
+; CHECK-NEXT: br label [[ENTRY_MERGE]]
 ;
 entry:
   %0 = load i32, i32* %counter, align 4
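
Note (not part of the patch): the check block built above compares each pair of expanded bounds with the same interval-overlap test used by the loop vectorizer's runtime checks and OR-reduces the per-pair results into one conflict flag. A minimal standalone C++ sketch of that logic, with illustrative names only:

#include <cstdint>
#include <iostream>
#include <vector>

// One accessed range: Start is the first accessed byte, End is one past the
// last accessed byte (matching the "last accessed byte, plus one" comment).
struct MemRange {
  uintptr_t Start;
  uintptr_t End;
};

// Mirrors the per-pair IR: bound0 = (B.Start < A.End),
// bound1 = (A.Start < B.End), found.conflict = bound0 & bound1.
static bool rangesConflict(const MemRange &A, const MemRange &B) {
  return B.Start < A.End && A.Start < B.End;
}

// OR-reduction over all pairs, like the conflict.rdx chain in the check block.
static bool anyConflict(const std::vector<MemRange> &Ranges) {
  bool Conflict = false;
  for (size_t I = 0; I < Ranges.size(); ++I)
    for (size_t J = I + 1; J < Ranges.size(); ++J)
      Conflict |= rangesConflict(Ranges[I], Ranges[J]);
  return Conflict;
}

int main() {
  std::vector<MemRange> Disjoint = {{0, 8}, {16, 24}};
  std::vector<MemRange> Overlapping = {{0, 8}, {4, 12}};
  std::cout << anyConflict(Disjoint) << "\n";    // prints 0: no conflict
  std::cout << anyConflict(Overlapping) << "\n"; // prints 1: conflict
  return 0;
}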