Index: llvm/include/llvm/Analysis/LoopAccessAnalysis.h
===================================================================
--- llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -328,13 +328,15 @@
 /// two groups.
 struct RuntimeCheckingPtrGroup {
   /// Create a new pointer checking group containing a single
-  /// pointer, with index \p Index in RtCheck.
-  RuntimeCheckingPtrGroup(unsigned Index, RuntimePointerChecking &RtCheck);
+  /// pointer, with index \p Index in RtCheck, using fork 0 or 1 as selected
+  /// by \p IsFork if this represents a forked pointer.
+  RuntimeCheckingPtrGroup(unsigned Index, RuntimePointerChecking &RtCheck,
+                          bool IsFork = false);
 
   RuntimeCheckingPtrGroup(unsigned Index, const SCEV *Start, const SCEV *End,
-                          unsigned AS)
+                          unsigned AS, bool IsFork = false)
       : High(End), Low(Start), AddressSpace(AS) {
-    Members.push_back(Index);
+    Members.push_back(std::make_pair(Index, IsFork ? 1 : 0));
   }
 
   /// Tries to add the pointer recorded in RtCheck at index
@@ -342,9 +344,12 @@
   /// to a checking group if we will still be able to get
   /// the upper and lower bounds of the check. Returns true in case
   /// of success, false otherwise.
-  bool addPointer(unsigned Index, RuntimePointerChecking &RtCheck);
+  /// For forked pointers this will only add one fork at a time, as
+  /// selected by \p IsFork.
+  bool addPointer(unsigned Index, RuntimePointerChecking &RtCheck,
+                  bool IsFork = false);
   bool addPointer(unsigned Index, const SCEV *Start, const SCEV *End,
-                  unsigned AS, ScalarEvolution &SE);
+                  unsigned AS, ScalarEvolution &SE, bool IsFork = false);
 
   /// The SCEV expression which represents the upper bound of all the
   /// pointers in this group.
@@ -352,8 +357,9 @@
   /// The SCEV expression which represents the lower bound of all the
   /// pointers in this group.
   const SCEV *Low;
-  /// Indices of all the pointers that constitute this grouping.
-  SmallVector<unsigned, 2> Members;
+  /// Indices of all the pointers that constitute this grouping, including
+  /// which fork if the pointer is forked.
+  SmallVector<std::pair<unsigned, unsigned>, 2> Members;
   /// Address space of the involved pointers.
   unsigned AddressSpace;
 };
@@ -363,6 +369,11 @@
                   const RuntimeCheckingPtrGroup *>
     RuntimePointerCheck;
 
+/// Type for a possible forked pointer, where a single pointer could be
+/// one of two different pointers determined conditionally via a select
+/// instruction.
+using ForkedPointer = std::pair<const SCEV *, const SCEV *>;
+
 /// Holds information about the memory runtime legality checks to verify
 /// that a group of pointers do not overlap.
 class RuntimePointerChecking {
@@ -372,12 +383,12 @@
   struct PointerInfo {
     /// Holds the pointer value that we need to check.
     TrackingVH<Value> PointerValue;
-    /// Holds the smallest byte address accessed by the pointer throughout all
-    /// iterations of the loop.
-    const SCEV *Start;
-    /// Holds the largest byte address accessed by the pointer throughout all
-    /// iterations of the loop, plus 1.
-    const SCEV *End;
+    /// Holds the smallest byte address(es) accessed by the pointer throughout
+    /// all iterations of the loop.
+    const SCEV *Starts[2];
+    /// Holds the largest byte address(es) accessed by the pointer throughout
+    /// all iterations of the loop, plus 1.
+    const SCEV *Ends[2];
     /// Holds the information if this pointer is used for writing to memory.
     bool IsWritePtr;
     /// Holds the id of the set of pointers that could be dependent because of a
@@ -385,23 +396,39 @@
     unsigned DependencySetId;
     /// Holds the id of the disjoint alias set to which this pointer belongs.
     unsigned AliasSetId;
-    /// SCEV for the access.
-    const SCEV *Expr;
-
-    PointerInfo(Value *PointerValue, const SCEV *Start, const SCEV *End,
-                bool IsWritePtr, unsigned DependencySetId, unsigned AliasSetId,
-                const SCEV *Expr)
-        : PointerValue(PointerValue), Start(Start), End(End),
-          IsWritePtr(IsWritePtr), DependencySetId(DependencySetId),
-          AliasSetId(AliasSetId), Expr(Expr) {}
+    /// SCEV(s) for the access.
+    const SCEV *Exprs[2];
+    /// Determines whether this represents a forked pointer, which will have
+    /// two SCEV expressions to consider for runtime checking instead of one.
+    bool HasFork = false;
+
+    PointerInfo(Value *PointerValue, SmallVectorImpl<const SCEV *> &ScStarts,
+                SmallVectorImpl<const SCEV *> &ScEnds, bool IsWritePtr,
+                unsigned DependencySetId, unsigned AliasSetId,
+                SmallVectorImpl<const SCEV *> &ScExprs)
+        : PointerValue(PointerValue), IsWritePtr(IsWritePtr),
+          DependencySetId(DependencySetId), AliasSetId(AliasSetId) {
+      assert(ScExprs.size() <= 2 && "Too many SCEV expressions for pointer");
+      for (size_t i = 0; i < ScExprs.size(); ++i) {
+        Exprs[i] = ScExprs[i];
+        Starts[i] = ScStarts[i];
+        Ends[i] = ScEnds[i];
+      }
+
+      // If there are two SCEV expressions associated with this pointer,
+      // then we have a fork.
+      HasFork = (ScExprs.size() == 2);
+    }
   };
 
-  RuntimePointerChecking(ScalarEvolution *SE) : Need(false), SE(SE) {}
+  RuntimePointerChecking(ScalarEvolution *SE, bool AllowForkedPtrs)
+      : Need(false), AllowForkedPtrs(AllowForkedPtrs), SE(SE) {}
 
   /// Reset the state of the pointer runtime information.
   void reset() {
     Need = false;
     Pointers.clear();
+    ForkedPtrs.clear();
     Checks.clear();
   }
 
@@ -444,12 +471,28 @@
                    const SmallVectorImpl<RuntimePointerCheck> &Checks,
                    unsigned Depth = 0) const;
 
+  /// Returns true if the pointer can be decomposed into two separate pointers
+  /// through a select.
+  bool isForkedPtr(const Value *Ptr) const {
+    return ForkedPtrs.count(Ptr) != 0;
+  }
+
   /// This flag indicates if we need to add the runtime check.
   bool Need;
 
+  /// This flag indicates that we allow pointers that diverge over a select.
+  bool AllowForkedPtrs;
+
   /// Information about the pointers that may require checking.
  SmallVector<PointerInfo, 2> Pointers;
 
+  /// Mapping between a pointer Value used by memory operations and a pair
+  /// of SCEV expressions. This is used in cases where a 'select' instruction
+  /// is used to form part of an address, and we could form a single
+  /// SCEVAddRecExpr with either side but not with both, so we just calculate
+  /// and store the two expressions to determine whether checks are required.
+  DenseMap<const Value *, ForkedPointer> ForkedPtrs;
+
   /// Holds a partitioning of pointers into "check groups".
   SmallVector<RuntimeCheckingPtrGroup, 2> CheckingGroups;
Index: llvm/lib/Analysis/LoopAccessAnalysis.cpp
===================================================================
--- llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -129,6 +129,15 @@
     cl::desc("Enable conflict detection in loop-access analysis"),
     cl::init(true));
 
+/// Enables the detection of forked pointers when analyzing loops; these
+/// pointers could have two possible values at runtime based on a conditional
+/// select instruction, and we can analyze both possibilities to determine
+/// bounds.
+static cl::opt<bool> EnableForkedPointers(
+    "enable-forked-pointer-detection", cl::Hidden,
+    cl::desc("Enable detection of pointers forked by a select instruction"),
+    cl::init(false));
+
 bool VectorizerParams::isInterleaveForced() {
   return ::VectorizationInterleave.getNumOccurrences() > 0;
 }
@@ -167,13 +176,174 @@
   return Expr;
 }
 
+// Walk back through the IR for a pointer, looking for a select like the
+// following:
+//
+//  %offset = select i1 %cmp, i64 %a, i64 %b
+//  %addr = getelementptr double, double* %base, i64 %offset
+//  %ld = load double, double* %addr, align 8
+//
+// We won't be able to form a single SCEVAddRecExpr from this since the
+// address for each loop iteration depends on %cmp. We could potentially
+// produce multiple valid SCEVAddRecExprs, though, and check all of them for
+// memory safety/aliasing if needed.
+//
+// If we encounter some IR we don't yet handle, or something obviously fine
+// like a constant, then we just add the SCEV for that term to the list passed
+// in by the caller. If we have a node that may potentially yield a valid
+// SCEVAddRecExpr then we decompose it into parts and build the SCEV terms
+// ourselves before adding to the list.
+static void findForkedSCEVs(ScalarEvolution *SE, const Loop *L, Value *Ptr,
+                            SmallVectorImpl<const SCEV *> &ScevList) {
+  const SCEV *Scev = SE->getSCEV(Ptr);
+  if (SE->isLoopInvariant(Scev, L) || isa<SCEVAddRecExpr>(Scev) ||
+      !isa<Instruction>(Ptr)) {
+    ScevList.push_back(Scev);
+    return;
+  }
+
+  auto GetBinOpExpr = [&SE](unsigned Opcode, const SCEV *L, const SCEV *R) {
+    switch (Opcode) {
+    case Instruction::Add:
+      return SE->getAddExpr(L, R);
+    case Instruction::Sub:
+      return SE->getMinusSCEV(L, R);
+    case Instruction::Mul:
+      return SE->getMulExpr(L, R);
+    default:
+      llvm_unreachable("Unexpected binary operator when walking ForkedPtrs");
+    }
+  };
+
+  Instruction *I = cast<Instruction>(Ptr);
+  unsigned Opcode = I->getOpcode();
+  switch (Opcode) {
+  case Instruction::BitCast:
+    findForkedSCEVs(SE, L, I->getOperand(0), ScevList);
+    break;
+  case Instruction::SExt:
+  case Instruction::ZExt: {
+    SmallVector<const SCEV *, 2> ExtScevs;
+    findForkedSCEVs(SE, L, I->getOperand(0), ExtScevs);
+    for (const SCEV *Scev : ExtScevs)
+      if (Opcode == Instruction::SExt)
+        ScevList.push_back(SE->getSignExtendExpr(Scev, I->getType()));
+      else
+        ScevList.push_back(SE->getZeroExtendExpr(Scev, I->getType()));
+    break;
+  }
+  case Instruction::GetElementPtr: {
+    GetElementPtrInst *GEP = cast<GetElementPtrInst>(I);
+    Type *SourceTy = GEP->getSourceElementType();
+    // We only handle base + single offset GEPs here for now.
+    // Not dealing with preexisting gathers yet, so no vectors.
+    if (I->getNumOperands() != 2 || SourceTy->isVectorTy()) {
+      ScevList.push_back(Scev);
+      break;
+    }
+    SmallVector<const SCEV *, 2> BaseScevs;
+    SmallVector<const SCEV *, 2> OffsetScevs;
+    findForkedSCEVs(SE, L, I->getOperand(0), BaseScevs);
+    findForkedSCEVs(SE, L, I->getOperand(1), OffsetScevs);
+
+    // Make sure we get the correct pointer type to extend to, including the
+    // address space.
+    const SCEV *BaseExpr = SE->getSCEV(GEP->getPointerOperand());
+    Type *IntPtrTy = SE->getEffectiveSCEVType(BaseExpr->getType());
+    SCEV::NoWrapFlags Wrap =
+        GEP->isInBounds() ? SCEV::FlagNSW : SCEV::FlagAnyWrap;
+    // Find the size of the type being pointed to. We only have a single
+    // index term (guarded above) so we don't need to index into arrays or
+    // structures, just get the size of the scalar value.
+    const SCEV *Size = SE->getSizeOfExpr(IntPtrTy, SourceTy);
+
+    if (OffsetScevs.size() == 2 && BaseScevs.size() == 1) {
+      const SCEV *Off1 = SE->getTruncateOrSignExtend(OffsetScevs[0], IntPtrTy);
+      const SCEV *Off2 = SE->getTruncateOrSignExtend(OffsetScevs[1], IntPtrTy);
+      const SCEV *Mul1 = SE->getMulExpr(Size, Off1, Wrap);
+      const SCEV *Mul2 = SE->getMulExpr(Size, Off2, Wrap);
+      const SCEV *Add1 = SE->getAddExpr(BaseScevs[0], Mul1, Wrap);
+      const SCEV *Add2 = SE->getAddExpr(BaseScevs[0], Mul2, Wrap);
+      ScevList.push_back(Add1);
+      ScevList.push_back(Add2);
+    } else if (BaseScevs.size() == 2 && OffsetScevs.size() == 1) {
+      const SCEV *Off = SE->getTruncateOrSignExtend(OffsetScevs[0], IntPtrTy);
+      const SCEV *Mul = SE->getMulExpr(Size, Off, Wrap);
+      const SCEV *Add1 = SE->getAddExpr(BaseScevs[0], Mul, Wrap);
+      const SCEV *Add2 = SE->getAddExpr(BaseScevs[1], Mul, Wrap);
+      ScevList.push_back(Add1);
+      ScevList.push_back(Add2);
+    } else
+      ScevList.push_back(Scev);
+    break;
+  }
+  case Instruction::Select: {
+    SmallVector<const SCEV *, 2> ChildScevs;
+    // A select means we've found a forked pointer, but we currently only
+    // support a single select per pointer so if there's another behind this
+    // then we just bail out and return the generic SCEV.
+    findForkedSCEVs(SE, L, I->getOperand(1), ChildScevs);
+    findForkedSCEVs(SE, L, I->getOperand(2), ChildScevs);
+    if (ChildScevs.size() == 2) {
+      ScevList.push_back(ChildScevs[0]);
+      ScevList.push_back(ChildScevs[1]);
+    } else
+      ScevList.push_back(Scev);
+    break;
+  }
+  // If adding another binop to this list, update GetBinOpExpr above.
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul: {
+    SmallVector<const SCEV *, 2> LScevs;
+    SmallVector<const SCEV *, 2> RScevs;
+    findForkedSCEVs(SE, L, I->getOperand(0), LScevs);
+    findForkedSCEVs(SE, L, I->getOperand(1), RScevs);
+    if (LScevs.size() == 2 && RScevs.size() == 1) {
+      const SCEV *Op1 = GetBinOpExpr(Opcode, LScevs[0], RScevs[0]);
+      const SCEV *Op2 = GetBinOpExpr(Opcode, LScevs[1], RScevs[0]);
+      ScevList.push_back(Op1);
+      ScevList.push_back(Op2);
+    } else if (LScevs.size() == 1 && RScevs.size() == 2) {
+      const SCEV *Op1 = GetBinOpExpr(Opcode, LScevs[0], RScevs[0]);
+      const SCEV *Op2 = GetBinOpExpr(Opcode, LScevs[0], RScevs[1]);
+      ScevList.push_back(Op1);
+      ScevList.push_back(Op2);
+    } else
+      ScevList.push_back(Scev);
+    break;
+  }
+  default:
+    // Just return the current SCEV if we haven't handled the instruction yet.
+    LLVM_DEBUG(dbgs() << "ForkedPtr unhandled instruction: " << *I << "\n");
+    ScevList.push_back(Scev);
+    break;
+  }
+
+  return;
+}
+
+static Optional<ForkedPointer> findForkedPointer(ScalarEvolution *SE, Value *V,
+                                                 const Loop *L) {
+  assert(SE->isSCEVable(V->getType()) && "Value is not SCEVable!");
+  SmallVector<const SCEV *, 2> Scevs;
+  findForkedSCEVs(SE, L, V, Scevs);
+
+  // For now, we will only accept a forked pointer with two options.
+  if (Scevs.size() == 2)
+    return std::make_pair(Scevs[0], Scevs[1]);
+
+  return None;
+}
+
 RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup(
-    unsigned Index, RuntimePointerChecking &RtCheck)
-    : High(RtCheck.Pointers[Index].End), Low(RtCheck.Pointers[Index].Start),
+    unsigned Index, RuntimePointerChecking &RtCheck, bool IsFork)
+    : High(RtCheck.Pointers[Index].Ends[IsFork ? 1 : 0]),
+      Low(RtCheck.Pointers[Index].Starts[IsFork ? 1 : 0]),
       AddressSpace(RtCheck.Pointers[Index]
                        .PointerValue->getType()
                        ->getPointerAddressSpace()) {
-  Members.push_back(Index);
+  Members.push_back(std::make_pair(Index, IsFork ? 1 : 0));
 }
 
 /// Calculate Start and End points of memory access.
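The GetElementPtr and binary-operator cases above share one merge rule: exactly one operand may be forked (two candidate SCEVs) while the other must be a single expression; anything else falls back to the plain SCEV for the whole node. A minimal standalone sketch of that rule, using strings in place of SCEVs (the combine helper and names are hypothetical, not part of the patch):

#include <cstdio>
#include <string>
#include <vector>

using ExprList = std::vector<std::string>;

// Combine a possibly-forked LHS with a possibly-forked RHS, mirroring the
// size()==2 / size()==1 cases in findForkedSCEVs above.
static ExprList combine(const ExprList &L, const ExprList &R,
                        const std::string &Op) {
  ExprList Out;
  if (L.size() == 2 && R.size() == 1) {
    Out.push_back("(" + L[0] + Op + R[0] + ")");
    Out.push_back("(" + L[1] + Op + R[0] + ")");
  } else if (L.size() == 1 && R.size() == 2) {
    Out.push_back("(" + L[0] + Op + R[0] + ")");
    Out.push_back("(" + L[0] + Op + R[1] + ")");
  } else {
    // Bail out: keep a single opaque expression for the whole node.
    Out.push_back("<unknown>");
  }
  return Out;
}

int main() {
  // A select over two bases with a shared loop-dependent offset yields two
  // candidate addresses, one per fork.
  ExprList Bases = {"%Base1", "%Base2"}; // forked by the select
  ExprList Offset = {"4*%iv"};           // common offset term
  for (const std::string &E : combine(Bases, Offset, " + "))
    std::printf("candidate address: %s\n", E.c_str());
}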
@@ -200,38 +370,51 @@ const SCEV *ScStart; const SCEV *ScEnd; - if (SE->isLoopInvariant(Sc, Lp)) { - ScStart = ScEnd = Sc; - } else { - const SCEVAddRecExpr *AR = dyn_cast(Sc); - assert(AR && "Invalid addrec expression"); - const SCEV *Ex = PSE.getBackedgeTakenCount(); - - ScStart = AR->getStart(); - ScEnd = AR->evaluateAtIteration(Ex, *SE); - const SCEV *Step = AR->getStepRecurrence(*SE); - - // For expressions with negative step, the upper bound is ScStart and the - // lower bound is ScEnd. - if (const auto *CStep = dyn_cast(Step)) { - if (CStep->getValue()->isNegative()) - std::swap(ScStart, ScEnd); + // See if this was a ForkedPtr. If so, we want to add checks for both sides. + SmallVector Scevs; + SmallVector Starts; + SmallVector Ends; + if (ForkedPtrs.count(Ptr)) { + Scevs.push_back(ForkedPtrs[Ptr].first); + Scevs.push_back(ForkedPtrs[Ptr].second); + } else + Scevs.push_back(Sc); + + for (const SCEV *Sc : Scevs) { + if (SE->isLoopInvariant(Sc, Lp)) { + ScStart = ScEnd = Sc; } else { - // Fallback case: the step is not constant, but we can still - // get the upper and lower bounds of the interval by using min/max - // expressions. - ScStart = SE->getUMinExpr(ScStart, ScEnd); - ScEnd = SE->getUMaxExpr(AR->getStart(), ScEnd); + const SCEVAddRecExpr *AR = dyn_cast(Sc); + assert(AR && "Invalid addrec expression"); + const SCEV *Ex = PSE.getBackedgeTakenCount(); + + ScStart = AR->getStart(); + ScEnd = AR->evaluateAtIteration(Ex, *SE); + const SCEV *Step = AR->getStepRecurrence(*SE); + + // For expressions with negative step, the upper bound is ScStart and the + // lower bound is ScEnd. + if (const auto *CStep = dyn_cast(Step)) { + if (CStep->getValue()->isNegative()) + std::swap(ScStart, ScEnd); + } else { + // Fallback case: the step is not constant, but we can still + // get the upper and lower bounds of the interval by using min/max + // expressions. + ScStart = SE->getUMinExpr(ScStart, ScEnd); + ScEnd = SE->getUMaxExpr(AR->getStart(), ScEnd); + } } + // Add the size of the pointed element to ScEnd. + auto &DL = Lp->getHeader()->getModule()->getDataLayout(); + Type *IdxTy = DL.getIndexType(Ptr->getType()); + const SCEV *EltSizeSCEV = + SE->getStoreSizeOfExpr(IdxTy, Ptr->getType()->getPointerElementType()); + ScEnd = SE->getAddExpr(ScEnd, EltSizeSCEV); + Starts.push_back(ScStart); + Ends.push_back(ScEnd); } - // Add the size of the pointed element to ScEnd. - auto &DL = Lp->getHeader()->getModule()->getDataLayout(); - Type *IdxTy = DL.getIndexType(Ptr->getType()); - const SCEV *EltSizeSCEV = - SE->getStoreSizeOfExpr(IdxTy, Ptr->getType()->getPointerElementType()); - ScEnd = SE->getAddExpr(ScEnd, EltSizeSCEV); - - Pointers.emplace_back(Ptr, ScStart, ScEnd, WritePtr, DepSetId, ASId, Sc); + Pointers.emplace_back(Ptr, Starts, Ends, WritePtr, DepSetId, ASId, Scevs); } SmallVector @@ -261,7 +444,7 @@ const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const { for (unsigned I = 0, EI = M.Members.size(); EI != I; ++I) for (unsigned J = 0, EJ = N.Members.size(); EJ != J; ++J) - if (needsChecking(M.Members[I], N.Members[J])) + if (needsChecking(M.Members[I].first, N.Members[J].first)) return true; return false; } @@ -281,16 +464,19 @@ } bool RuntimeCheckingPtrGroup::addPointer(unsigned Index, - RuntimePointerChecking &RtCheck) { + RuntimePointerChecking &RtCheck, + bool IsFork) { + unsigned Fork = IsFork ? 
1 : 0; return addPointer( - Index, RtCheck.Pointers[Index].Start, RtCheck.Pointers[Index].End, + Index, RtCheck.Pointers[Index].Starts[Fork], + RtCheck.Pointers[Index].Ends[Fork], RtCheck.Pointers[Index].PointerValue->getType()->getPointerAddressSpace(), - *RtCheck.SE); + *RtCheck.SE, Fork); } bool RuntimeCheckingPtrGroup::addPointer(unsigned Index, const SCEV *Start, const SCEV *End, unsigned AS, - ScalarEvolution &SE) { + ScalarEvolution &SE, bool IsFork) { assert(AddressSpace == AS && "all pointers in a checking group must be in the same address space"); @@ -313,7 +499,7 @@ if (Min1 != End) High = End; - Members.push_back(Index); + Members.push_back(std::make_pair(Index, IsFork ? 1 : 0)); return true; } @@ -364,8 +550,14 @@ // for correctness, because in this case we can have checking between // pointers to the same underlying object. if (!UseDependencies) { - for (unsigned I = 0; I < Pointers.size(); ++I) - CheckingGroups.push_back(RuntimeCheckingPtrGroup(I, *this)); + for (unsigned I = 0; I < Pointers.size(); ++I) { + auto Group = CheckingGroups.emplace_back(I, *this, /*Fork=*/0); + // Try to add a fork to the same group first, otherwise establish + // a new one. + if (Pointers[I].HasFork) + if (!Group.addPointer(I, *this, /*Fork=*/1)) + CheckingGroups.emplace_back(I, *this, /*Fork=*/1); + } return; } @@ -405,7 +597,9 @@ assert(PointerI != PositionMap.end() && "pointer in equivalence class not found in PositionMap"); unsigned Pointer = PointerI->second; - bool Merged = false; + bool Merged[2] = {false, false}; + // Two expressions to evaluate for forked pointers. + const unsigned NumExprs = Pointers[Pointer].HasFork ? 2 : 1; // Mark this pointer as seen. Seen.insert(Pointer); @@ -421,17 +615,27 @@ TotalComparisons++; - if (Group.addPointer(Pointer, *this)) { - Merged = true; + // Try to add both forks, if applicable. + for (unsigned J = 0; J < NumExprs; ++J) + if (!Merged[J]) + Merged[J] = Group.addPointer(Pointer, *this, J); + + if (Merged[0] && (Merged[1] || NumExprs == 1)) break; - } } - if (!Merged) - // We couldn't add this pointer to any existing set or the threshold - // for the number of comparisons has been reached. Create a new group - // to hold the current pointer. - Groups.push_back(RuntimeCheckingPtrGroup(Pointer, *this)); + // If we couldn't add the pointer expression(s) to any existing set or + // the threshold for the number of comparisons has been reached, create a + // new group to hold the current pointer expression(s). + RuntimeCheckingPtrGroup *Group = nullptr; + if (!Merged[0]) + Group = &Groups.emplace_back(Pointer, *this); + if (!Merged[1] && NumExprs == 2) { + // Try merging forks first, as they may have the same base address + // but different offsets. + if (!Group || !Group->addPointer(Pointer, *this, /*Fork=*/1)) + Groups.emplace_back(Pointer, *this, /*Fork=*/1); + } } // We've computed the grouped checks for this partition. 
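With the changes above, a checking-group member is no longer a bare pointer index but a (pointer index, fork index) pair, and groupChecks offers a forked pointer to existing groups once per fork before opening a new group for any fork that could not be merged. A small self-contained model of that bookkeeping (toy types, not LLVM code):

#include <cstdio>
#include <utility>
#include <vector>

struct ToyGroup {
  // (PointerIndex, ForkIndex) -- the fork index is 0 for ordinary pointers.
  std::vector<std::pair<unsigned, unsigned>> Members;
};

int main() {
  std::vector<ToyGroup> Groups;
  // Pointer 0 is not forked: one member, fork index 0.
  Groups.push_back({{{0, 0}}});
  // Pointer 1 is forked: assume fork 0 merges into the existing group while
  // fork 1 cannot, so fork 1 gets a group of its own.
  Groups[0].Members.push_back({1, 0});
  Groups.push_back({{{1, 1}}});
  for (unsigned G = 0; G < Groups.size(); ++G)
    for (auto &M : Groups[G].Members)
      std::printf("group %u: pointer %u, fork %u\n", G, M.first, M.second);
}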
@@ -476,12 +680,22 @@ OS.indent(Depth) << "Check " << N++ << ":\n"; OS.indent(Depth + 2) << "Comparing group (" << Check.first << "):\n"; - for (unsigned K = 0; K < First.size(); ++K) - OS.indent(Depth + 2) << *Pointers[First[K]].PointerValue << "\n"; + for (unsigned K = 0; K < First.size(); ++K) { + const PointerInfo &Info = Pointers[First[K].first]; + OS.indent(Depth + 2) << *Info.PointerValue << "\n"; + if (Info.HasFork) + OS.indent(Depth + 6) + << "Fork: " << *Info.Exprs[First[K].second] << "\n"; + } OS.indent(Depth + 2) << "Against group (" << Check.second << "):\n"; - for (unsigned K = 0; K < Second.size(); ++K) - OS.indent(Depth + 2) << *Pointers[Second[K]].PointerValue << "\n"; + for (unsigned K = 0; K < Second.size(); ++K) { + const PointerInfo &Info = Pointers[Second[K].first]; + OS.indent(Depth + 2) << *Info.PointerValue << "\n"; + if (Info.HasFork) + OS.indent(Depth + 6) + << "Fork: " << *Info.Exprs[Second[K].second] << "\n"; + } } } @@ -498,7 +712,8 @@ OS.indent(Depth + 4) << "(Low: " << *CG.Low << " High: " << *CG.High << ")\n"; for (unsigned J = 0; J < CG.Members.size(); ++J) { - OS.indent(Depth + 6) << "Member: " << *Pointers[CG.Members[J]].Expr + const PointerInfo &Info = Pointers[CG.Members[J].first]; + OS.indent(Depth + 6) << "Member: " << *(Info.Exprs[CG.Members[J].second]) << "\n"; } } @@ -633,11 +848,13 @@ /// by adding run-time checks (overflow checks) if necessary. static bool hasComputableBounds(PredicatedScalarEvolution &PSE, const ValueToValueMap &Strides, Value *Ptr, - Loop *L, bool Assume) { + Loop *L, bool Assume, + RuntimePointerChecking &RtCheck) { const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr); + ScalarEvolution *SE = PSE.getSE(); // The bounds for loop-invariant pointer is trivial. - if (PSE.getSE()->isLoopInvariant(PtrScev, L)) + if (SE->isLoopInvariant(PtrScev, L)) return true; const SCEVAddRecExpr *AR = dyn_cast(PtrScev); @@ -645,19 +862,97 @@ if (!AR && Assume) AR = PSE.getAsAddRec(Ptr); - if (!AR) - return false; + if (AR) + return AR->isAffine(); + + // If we can't find a single SCEVAddRecExpr, then maybe we can find two. + // The findForkedPointer function tries walking backwards through IR until + // it finds operands that are either loop invariant or for which a + // SCEVAddRecExpr can be formed. If a select instruction is encountered + // during this walk then we split the walk and try to generate two valid + // SCEVAddRecExprs. Both of those SCEVs can then be considered when + // deciding whether runtime checks are required. + if (RtCheck.AllowForkedPtrs) { + Optional FPtr = findForkedPointer(SE, Ptr, L); + if (!FPtr) + return false; + + const SCEV *A = FPtr->first; + const SCEV *B = FPtr->second; + LLVM_DEBUG(dbgs() << "LAA: ForkedPtr found: " << *Ptr << "\n"); + LLVM_DEBUG(dbgs() << "LAA: SCEV1: " << *A << "\n"); + LLVM_DEBUG(dbgs() << "LAA: SCEV2: " << *B << "\n"); + if (isa(A) && cast(A)->isAffine() && + isa(B) && cast(B)->isAffine()) { + RtCheck.ForkedPtrs[Ptr] = *FPtr; + return true; + } - return AR->isAffine(); + LLVM_DEBUG( + dbgs() << "LAA: Could not determine bounds for forked pointer\n"); + } + + return false; } /// Check whether a pointer address cannot wrap. 
static bool isNoWrap(PredicatedScalarEvolution &PSE, - const ValueToValueMap &Strides, Value *Ptr, Loop *L) { + const ValueToValueMap &Strides, Value *Ptr, Loop *L, + RuntimePointerChecking &RtCheck) { const SCEV *PtrScev = PSE.getSCEV(Ptr); if (PSE.getSE()->isLoopInvariant(PtrScev, L)) return true; + // The SCEV generated directly from a forked pointer is not an AddRecExpr, + // but an unknown SCEV. The PSE.hasNoOverflow method currently assumes + // that it must be an AddRecExpr and just casts. So we just bail out at + // this point, since we can't pass in the SCEVs one at a time -- + // hasNoOverflow takes a Value* as a param instead of a SCEV*. + // TODO: Support overflow checking for ForkedPointers. + // + // We can, however, calculate an effective stride for each side of the fork + // and check if the stride is 1 for both; this is the other way we can + // assume a pointer doesn't wrap. + if (RtCheck.isForkedPtr(Ptr)) { + auto SCEVs = RtCheck.ForkedPtrs[Ptr]; + const SCEVAddRecExpr *LSAR = cast(SCEVs.first); + const SCEVAddRecExpr *RSAR = cast(SCEVs.second); + const SCEV *LStep = LSAR->getStepRecurrence(*PSE.getSE()); + const SCEV *RStep = RSAR->getStepRecurrence(*PSE.getSE()); + + if (LStep != RStep) { + LLVM_DEBUG(dbgs() << "LAA: Forkedptr with mismatched steps\n"); + return false; + } + + auto *PtrTy = dyn_cast(Ptr->getType()); + auto &DL = L->getHeader()->getModule()->getDataLayout(); + int64_t Size = DL.getTypeAllocSize(PtrTy->getElementType()).getFixedSize(); + + const SCEVConstant *C = dyn_cast(LStep); + if (!C) { + LLVM_DEBUG(dbgs() << "LAA: Forkedptr with non-constant steps\n"); + return false; + } + const APInt &APStepVal = C->getAPInt(); + if (APStepVal.getBitWidth() > 64) { + LLVM_DEBUG(dbgs() << "LAA: Forkedptr step is too large\n"); + return false; + } + + int64_t StepVal = APStepVal.getSExtValue(); + + // Strided access. + int64_t Stride = StepVal / Size; + int64_t Rem = StepVal % Size; + if (Rem) { + LLVM_DEBUG(dbgs() << "LAA: Forkedptr step not multiple of elt size\n"); + return false; + } + + return Stride == 1; + } + Type *AccessTy = Ptr->getType()->getPointerElementType(); int64_t Stride = getPtrStride(PSE, AccessTy, Ptr, L, Strides); if (Stride == 1 || PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW)) @@ -698,12 +993,12 @@ bool Assume) { Value *Ptr = Access.getPointer(); - if (!hasComputableBounds(PSE, StridesMap, Ptr, TheLoop, Assume)) + if (!hasComputableBounds(PSE, StridesMap, Ptr, TheLoop, Assume, RtCheck)) return false; // When we run after a failing dependency check we have to make sure // we don't have wrapping pointers. 
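The forked-pointer path added to isNoWrap above, and exercised by the caller change below, reduces to a simple arithmetic rule: both forks must have the same constant step, the step must be an exact multiple of the element size, and the resulting stride must be 1. A standalone sketch of that rule under those assumptions (forksLookNonWrapping is a hypothetical helper, not part of the patch):

#include <cstdint>
#include <cstdio>
#include <optional>

// Returns true when two fork steps (nullopt = non-constant) match and give a
// unit stride for the given element size, mirroring the checks in isNoWrap.
static bool forksLookNonWrapping(std::optional<int64_t> StepA,
                                 std::optional<int64_t> StepB,
                                 int64_t EltSize) {
  if (!StepA || !StepB || *StepA != *StepB) // non-constant or mismatched steps
    return false;
  if (*StepA % EltSize != 0)                // step not a multiple of elt size
    return false;
  return *StepA / EltSize == 1;             // only unit stride is accepted
}

int main() {
  // Two float (4-byte) forks both stepping by 4 bytes per iteration: accepted.
  std::printf("%d\n", forksLookNonWrapping(4, 4, 4)); // 1
  // Mismatched steps (4 vs. 8 bytes): rejected.
  std::printf("%d\n", forksLookNonWrapping(4, 8, 4)); // 0
}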
- if (ShouldCheckWrap && !isNoWrap(PSE, StridesMap, Ptr, TheLoop)) { + if (ShouldCheckWrap && !isNoWrap(PSE, StridesMap, Ptr, TheLoop, RtCheck)) { auto *Expr = PSE.getSCEV(Ptr); if (!Assume || !isa(Expr)) return false; @@ -1862,6 +2157,7 @@ PtrRtChecking->Pointers.clear(); PtrRtChecking->Need = false; + PtrRtChecking->ForkedPtrs.clear(); const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); @@ -2245,7 +2541,8 @@ const TargetLibraryInfo *TLI, AAResults *AA, DominatorTree *DT, LoopInfo *LI) : PSE(std::make_unique(*SE, *L)), - PtrRtChecking(std::make_unique(SE)), + PtrRtChecking( + std::make_unique(SE, EnableForkedPointers)), DepChecker(std::make_unique(*PSE, L)), TheLoop(L), NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1), CanVecMem(false), HasConvergentOp(false), Index: llvm/lib/Transforms/Scalar/LoopDistribute.cpp =================================================================== --- llvm/lib/Transforms/Scalar/LoopDistribute.cpp +++ llvm/lib/Transforms/Scalar/LoopDistribute.cpp @@ -908,29 +908,30 @@ const RuntimePointerChecking *RtPtrChecking) { SmallVector Checks; - copy_if(AllChecks, std::back_inserter(Checks), - [&](const RuntimePointerCheck &Check) { - for (unsigned PtrIdx1 : Check.first->Members) - for (unsigned PtrIdx2 : Check.second->Members) - // Only include this check if there is a pair of pointers - // that require checking and the pointers fall into - // separate partitions. - // - // (Note that we already know at this point that the two - // pointer groups need checking but it doesn't follow - // that each pair of pointers within the two groups need - // checking as well. - // - // In other words we don't want to include a check just - // because there is a pair of pointers between the two - // pointer groups that require checks and a different - // pair whose pointers fall into different partitions.) - if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) && - !RuntimePointerChecking::arePointersInSamePartition( - PtrToPartition, PtrIdx1, PtrIdx2)) - return true; - return false; - }); + copy_if( + AllChecks, std::back_inserter(Checks), + [&](const RuntimePointerCheck &Check) { + for (auto PtrIdx1 : Check.first->Members) + for (auto PtrIdx2 : Check.second->Members) + // Only include this check if there is a pair of pointers + // that require checking and the pointers fall into + // separate partitions. + // + // (Note that we already know at this point that the two + // pointer groups need checking but it doesn't follow + // that each pair of pointers within the two groups need + // checking as well. + // + // In other words we don't want to include a check just + // because there is a pair of pointers between the two + // pointer groups that require checks and a different + // pair whose pointers fall into different partitions.) 
+ if (RtPtrChecking->needsChecking(PtrIdx1.first, PtrIdx2.first) && + !RuntimePointerChecking::arePointersInSamePartition( + PtrToPartition, PtrIdx1.first, PtrIdx2.first)) + return true; + return false; + }); return Checks; } Index: llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp =================================================================== --- llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -393,8 +393,8 @@ [&](const RuntimePointerCheck &Check) { for (auto PtrIdx1 : Check.first->Members) for (auto PtrIdx2 : Check.second->Members) - if (needsChecking(PtrIdx1, PtrIdx2, PtrsWrittenOnFwdingPath, - CandLoadPtrs)) + if (needsChecking(PtrIdx1.first, PtrIdx2.first, + PtrsWrittenOnFwdingPath, CandLoadPtrs)) return true; return false; }); Index: llvm/lib/Transforms/Utils/LoopVersioning.cpp =================================================================== --- llvm/lib/Transforms/Utils/LoopVersioning.cpp +++ llvm/lib/Transforms/Utils/LoopVersioning.cpp @@ -193,8 +193,9 @@ for (const auto &Group : RtPtrChecking->CheckingGroups) { GroupToScope[&Group] = MDB.createAnonymousAliasScope(Domain); - for (unsigned PtrIdx : Group.Members) - PtrToGroup[RtPtrChecking->getPointerInfo(PtrIdx).PointerValue] = &Group; + for (auto PtrIdx : Group.Members) + PtrToGroup[RtPtrChecking->getPointerInfo(PtrIdx.first).PointerValue] = + &Group; } // Go through the checks and for each pointer group, collect the scopes for Index: llvm/test/Analysis/LoopAccessAnalysis/forked-pointers.ll =================================================================== --- llvm/test/Analysis/LoopAccessAnalysis/forked-pointers.ll +++ llvm/test/Analysis/LoopAccessAnalysis/forked-pointers.ll @@ -1,19 +1,48 @@ -; RUN: opt -loop-accesses -analyze -enable-new-pm=0 %s 2>&1 | FileCheck %s -; RUN: opt -disable-output -passes='require,require,loop(print-access-info)' %s 2>&1 | FileCheck %s +; RUN: opt -enable-forked-pointer-detection -loop-accesses -analyze -enable-new-pm=0 %s 2>&1 | FileCheck %s +; RUN: opt -enable-forked-pointer-detection -disable-output -passes='require,require,loop(print-access-info)' %s 2>&1 | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" ; CHECK-LABEL: function 'forked_ptrs_different_base_same_offset': -; CHECK-NEXT: for.body: -; CHECK-NEXT: Report: cannot identify array bounds -; CHECK-NEXT: Dependences: -; CHECK-NEXT: Run-time memory checks: -; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: for.body: +; CHECK-NEXT: Memory dependences are safe with run-time checks +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Check 0: +; CHECK-NEXT: Comparing group +; CHECK-NEXT: %1 = getelementptr inbounds float, float* %Dest, i64 %indvars.iv +; CHECK-NEXT: Against group +; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, i32* %Preds, i64 %indvars.iv +; CHECK-NEXT: Check 1: +; CHECK-NEXT: Comparing group +; CHECK-NEXT: %1 = getelementptr inbounds float, float* %Dest, i64 %indvars.iv +; CHECK-NEXT: Against group +; CHECK-NEXT: %.sink.in = getelementptr inbounds float, float* %spec.select, i64 %indvars.iv +; CHECK-NEXT: Fork: {%Base2,+,4}<%for.body> +; CHECK-NEXT: Check 2: +; CHECK-NEXT: Comparing group +; CHECK-NEXT: %1 = getelementptr inbounds float, float* %Dest, i64 %indvars.iv +; CHECK-NEXT: Against group +; CHECK-NEXT: %.sink.in = getelementptr inbounds float, float* %spec.select, i64 %indvars.iv +; CHECK-NEXT: Fork: {%Base1,+,4}<%for.body> +; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group 
+; CHECK-NEXT: (Low: %Dest High: (400 + %Dest)) +; CHECK-NEXT: Member: {%Dest,+,4}<%for.body> +; CHECK-NEXT: Group +; CHECK-NEXT: (Low: %Preds High: (400 + %Preds)) +; CHECK-NEXT: Member: {%Preds,+,4}<%for.body> +; CHECK-NEXT: Group +; CHECK-NEXT: (Low: %Base2 High: (400 + %Base2)) +; CHECK-NEXT: Member: {%Base2,+,4}<%for.body> +; CHECK-NEXT: Group +; CHECK-NEXT: (Low: %Base1 High: (400 + %Base1)) +; CHECK-NEXT: Member: {%Base1,+,4}<%for.body> ; CHECK-EMPTY: -; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. -; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: ; CHECK-EMPTY: -; CHECK-NEXT: Expressions re-written: +; CHECK-NEXT: Expressions re-written: ;;;; Derived from the following C code ;; void forked_ptrs_different_base_same_offset(float *A, float *B, float *C, int *D) { @@ -49,16 +78,39 @@ } ; CHECK-LABEL: function 'forked_ptrs_same_base_different_offset': -; CHECK-NEXT: for.body: -; CHECK-NEXT: Report: cannot identify array bounds -; CHECK-NEXT: Dependences: -; CHECK-NEXT: Run-time memory checks: -; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: for.body: +; CHECK-NEXT: Memory dependences are safe with run-time checks +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Check 0: +; CHECK-NEXT: Comparing group +; CHECK-NEXT: %arrayidx5 = getelementptr inbounds float, float* %Dest, i64 %indvars.iv +; CHECK-NEXT: Against group +; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, i32* %Preds, i64 %indvars.iv +; CHECK-NEXT: Check 1: +; CHECK-NEXT: Comparing group +; CHECK-NEXT: %arrayidx5 = getelementptr inbounds float, float* %Dest, i64 %indvars.iv +; CHECK-NEXT: Against group +; CHECK-NEXT: %arrayidx3 = getelementptr inbounds float, float* %Base, i64 %idxprom213 +; CHECK-NEXT: Fork: {(4 + %Base),+,4}<%for.body> +; CHECK-NEXT: %arrayidx3 = getelementptr inbounds float, float* %Base, i64 %idxprom213 +; CHECK-NEXT: Fork: {%Base,+,4}<%for.body> +; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group +; CHECK-NEXT: (Low: %Dest High: (400 + %Dest)) +; CHECK-NEXT: Member: {%Dest,+,4}<%for.body> +; CHECK-NEXT: Group +; CHECK-NEXT: (Low: %Preds High: (400 + %Preds)) +; CHECK-NEXT: Member: {%Preds,+,4}<%for.body> +; CHECK-NEXT: Group +; CHECK-NEXT: (Low: %Base High: (404 + %Base)) +; CHECK-NEXT: Member: {(4 + %Base),+,4}<%for.body> +; CHECK-NEXT: Member: {%Base,+,4}<%for.body> ; CHECK-EMPTY: -; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. -; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: ; CHECK-EMPTY: -; CHECK-NEXT: Expressions re-written: +; CHECK-NEXT: Expressions re-written: ;;;; Derived from the following C code ;; void forked_ptrs_same_base_different_offset(float *A, float *B, int *C) { Index: llvm/test/Transforms/LoopVectorize/forked-pointers.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/forked-pointers.ll +++ llvm/test/Transforms/LoopVectorize/forked-pointers.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -loop-vectorize -instcombine -force-vector-width=4 -S < %s 2>&1 | FileCheck %s +; RUN: opt -enable-forked-pointer-detection -loop-vectorize -instcombine -force-vector-width=4 -S < %s 2>&1 | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" @@ -17,22 +17,84 @@ define dso_local void @forked_ptrs_different_base_same_offset(float* nocapture readonly %Base1, float* nocapture readonly %Base2, float* nocapture %Dest, i32* nocapture readonly %Preds) { ; CHECK-LABEL: @forked_ptrs_different_base_same_offset( ; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[DEST:%.*]], i64 100 +; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[PREDS:%.*]], i64 100 +; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr float, float* [[BASE2:%.*]], i64 100 +; CHECK-NEXT: [[SCEVGEP10:%.*]] = getelementptr float, float* [[BASE1:%.*]], i64 100 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP4]] to float* +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt float* [[TMP0]], [[DEST]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[SCEVGEP]] to i32* +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[PREDS]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND012:%.*]] = icmp ugt float* [[SCEVGEP7]], [[DEST]] +; CHECK-NEXT: [[BOUND113:%.*]] = icmp ugt float* [[SCEVGEP]], [[BASE2]] +; CHECK-NEXT: [[FOUND_CONFLICT14:%.*]] = and i1 [[BOUND012]], [[BOUND113]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT14]] +; CHECK-NEXT: [[BOUND015:%.*]] = icmp ugt float* [[SCEVGEP10]], [[DEST]] +; CHECK-NEXT: [[BOUND116:%.*]] = icmp ugt float* [[SCEVGEP]], [[BASE1]] +; CHECK-NEXT: [[FOUND_CONFLICT17:%.*]] = and i1 [[BOUND015]], [[BOUND116]] +; CHECK-NEXT: [[CONFLICT_RDX18:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT17]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX18]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float*> poison, float* [[BASE2]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float*> [[BROADCAST_SPLATINSERT]], <4 x float*> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <4 x float*> poison, float* [[BASE1]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT20:%.*]] = shufflevector <4 x float*> [[BROADCAST_SPLATINSERT19]], <4 x float*> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[PREDS]], i64 
[[INDEX]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4, !alias.scope !0 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x float*> [[BROADCAST_SPLAT]], <4 x float*> [[BROADCAST_SPLAT20]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float*> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP9]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float*> [[TMP8]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP11]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float*> [[TMP8]], i32 2 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP13]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float*> [[TMP8]], i32 3 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP15]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP17:%.*]] = load float, float* [[TMP10]], align 4, !alias.scope !3 +; CHECK-NEXT: [[TMP18:%.*]] = load float, float* [[TMP12]], align 4, !alias.scope !3 +; CHECK-NEXT: [[TMP19:%.*]] = load float, float* [[TMP14]], align 4, !alias.scope !3 +; CHECK-NEXT: [[TMP20:%.*]] = load float, float* [[TMP16]], align 4, !alias.scope !3 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x float> poison, float [[TMP17]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP18]], i32 1 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP19]], i32 2 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP20]], i32 3 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[DEST]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP25]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP24]], <4 x float>* [[TMP26]], align 4, !alias.scope !5, !noalias !7 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[PREDS:%.*]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP1_NOT:%.*]] = icmp eq i32 [[TMP0]], 0 -; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP1_NOT]], float* [[BASE2:%.*]], float* [[BASE1:%.*]] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[PREDS]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP28:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1_NOT:%.*]] = icmp eq i32 [[TMP28]], 0 +; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP1_NOT]], float* [[BASE2]], float* [[BASE1]] ; CHECK-NEXT: [[DOTSINK_IN:%.*]] = getelementptr inbounds float, 
float* [[SPEC_SELECT]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[DOTSINK:%.*]] = load float, float* [[DOTSINK_IN]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store float [[DOTSINK]], float* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[DEST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store float [[DOTSINK]], float* [[TMP29]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; entry: br label %for.body @@ -70,26 +132,83 @@ define dso_local void @forked_ptrs_same_base_different_offset(float* nocapture readonly %Base, float* nocapture %Dest, i32* nocapture readonly %Preds) { ; CHECK-LABEL: @forked_ptrs_same_base_different_offset( ; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[DEST:%.*]], i64 100 +; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[PREDS:%.*]], i64 100 +; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr float, float* [[BASE:%.*]], i64 101 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP4]] to float* +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt float* [[TMP0]], [[DEST]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[SCEVGEP]] to i32* +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[PREDS]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND09:%.*]] = icmp ugt float* [[SCEVGEP7]], [[DEST]] +; CHECK-NEXT: [[BOUND110:%.*]] = icmp ugt float* [[SCEVGEP]], [[BASE]] +; CHECK-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND13:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT14:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND15:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT16:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[PREDS]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !alias.scope !12 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw <4 x i32> [[VEC_IND13]], +; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[VEC_IND15]] +; CHECK-NEXT: [[TMP7:%.*]] = zext <4 x i32> [[TMP6]] to <4 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[BASE]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[BASE]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 +; CHECK-NEXT: 
[[TMP13:%.*]] = getelementptr inbounds float, float* [[BASE]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[BASE]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = load float, float* [[TMP9]], align 4, !alias.scope !15 +; CHECK-NEXT: [[TMP17:%.*]] = load float, float* [[TMP11]], align 4, !alias.scope !15 +; CHECK-NEXT: [[TMP18:%.*]] = load float, float* [[TMP13]], align 4, !alias.scope !15 +; CHECK-NEXT: [[TMP19:%.*]] = load float, float* [[TMP15]], align 4, !alias.scope !15 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x float> poison, float [[TMP16]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP17]], i32 1 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP18]], i32 2 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP19]], i32 3 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[DEST]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP23]], <4 x float>* [[TMP25]], align 4, !alias.scope !17, !noalias !19 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT14]] = add <4 x i32> [[VEC_IND13]], +; CHECK-NEXT: [[VEC_IND_NEXT16]] = add <4 x i32> [[VEC_IND15]], +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL12:%.*]] = phi i32 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[I_014:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[PREDS:%.*]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP1_NOT:%.*]] = icmp eq i32 [[TMP0]], 0 +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL12]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[PREDS]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP27:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1_NOT:%.*]] = icmp eq i32 [[TMP27]], 0 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[ADD]] = add nuw nsw i32 [[I_014]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32 -; CHECK-NEXT: [[OFFSET_0:%.*]] = select i1 [[CMP1_NOT]], i32 [[ADD]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[OFFSET_0:%.*]] = select i1 [[CMP1_NOT]], i32 [[ADD]], i32 [[TMP28]] ; CHECK-NEXT: [[IDXPROM213:%.*]] = zext i32 [[OFFSET_0]] to i64 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[BASE:%.*]], i64 [[IDXPROM213]] -; CHECK-NEXT: [[TMP2:%.*]] = 
load float, float* [[ARRAYIDX3]], align 4 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store float [[TMP2]], float* [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[BASE]], i64 [[IDXPROM213]] +; CHECK-NEXT: [[TMP29:%.*]] = load float, float* [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DEST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store float [[TMP29]], float* [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; entry: br label %for.body