Index: llvm/include/llvm/Analysis/LoopAccessAnalysis.h
===================================================================
--- llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -516,7 +516,8 @@
 class LoopAccessInfo {
 public:
   LoopAccessInfo(Loop *L, ScalarEvolution *SE, const TargetLibraryInfo *TLI,
-                 AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI);
+                 const TargetTransformInfo *TTI, AliasAnalysis *AA,
+                 DominatorTree *DT, LoopInfo *LI);
 
   /// Return true we can analyze the memory accesses in the loop and there are
   /// no memory dependence cycles.
@@ -608,7 +609,8 @@
 private:
   /// Analyze the loop.
   void analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
-                   const TargetLibraryInfo *TLI, DominatorTree *DT);
+                   const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI,
+                   DominatorTree *DT);
 
   /// Check if the structure of the loop allows it to be analyzed by this
   /// pass.
@@ -626,7 +628,8 @@
   ///
   /// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop
   /// invariant.
-  void collectStridedAccess(Value *LoadOrStoreInst);
+  void collectStridedAccess(Value *LoadOrStoreInst,
+                            const TargetTransformInfo *TTI);
 
   std::unique_ptr<PredicatedScalarEvolution> PSE;
 
@@ -750,6 +753,7 @@
   // The used analysis passes.
   ScalarEvolution *SE = nullptr;
   const TargetLibraryInfo *TLI = nullptr;
+  const TargetTransformInfo *TTI = nullptr;
   AliasAnalysis *AA = nullptr;
   DominatorTree *DT = nullptr;
   LoopInfo *LI = nullptr;
Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -606,6 +606,10 @@
   /// Return true if the target supports masked expand load.
   bool isLegalMaskedExpandLoad(Type *DataType) const;
 
+  /// Returns true if the target machine can represent a vectorized version
+  /// of \p V as a masked gather or scatter operation.
+  bool isLegalGatherOrScatter(Value *V) const;
+
   /// Return true if the target has a unified operation to calculate division
   /// and remainder. If so, the additional implicit multiplication and
   /// subtraction required to calculate a remainder from division are free. This
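The new hook takes the scalar memory instruction itself rather than a (type, alignment) pair, so a client can query a load or store directly. As a rough illustration of how a caller might use it (this helper is a sketch written for this review, not code from the patch):

  // Sketch: collect the accesses in a block that the target could widen to a
  // masked gather (loads) or masked scatter (stores).
  static SmallVector<Instruction *, 8>
  collectWidenableAccesses(BasicBlock &BB, const TargetTransformInfo &TTI) {
    SmallVector<Instruction *, 8> Widenable;
    for (Instruction &I : BB)
      if ((isa<LoadInst>(&I) || isa<StoreInst>(&I)) &&
          TTI.isLegalGatherOrScatter(&I))
        Widenable.push_back(&I);
    return Widenable;
  }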
Index: llvm/lib/Analysis/LoopAccessAnalysis.cpp
===================================================================
--- llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -124,6 +124,10 @@
     "enable-mem-access-versioning", cl::init(true), cl::Hidden,
     cl::desc("Enable symbolic stride memory access versioning"));
 
+static cl::opt<bool> PreferGatherOverStrideCheck(
+    "prefer-gather-over-stride-check", cl::init(true), cl::Hidden,
+    cl::desc("Prefer Gather/Scatter over symbolic stride versioning"));
+
 /// Enable store-to-load forwarding conflict detection. This option can
 /// be disabled for correctness testing.
 static cl::opt<bool> EnableForwardingConflictDetection(
@@ -1789,6 +1793,7 @@
 
 void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
                                  const TargetLibraryInfo *TLI,
+                                 const TargetTransformInfo *TTI,
                                  DominatorTree *DT) {
   typedef SmallPtrSet<Value *, 16> ValueSet;
 
@@ -1866,7 +1871,7 @@
       Loads.push_back(Ld);
       DepChecker->addAccess(Ld);
       if (EnableMemAccessVersioning)
-        collectStridedAccess(Ld);
+        collectStridedAccess(Ld, TTI);
       continue;
     }
 
@@ -1890,7 +1895,7 @@
       Stores.push_back(St);
      DepChecker->addAccess(St);
       if (EnableMemAccessVersioning)
-        collectStridedAccess(St);
+        collectStridedAccess(St, TTI);
     }
   } // Next instr.
 } // Next block.
@@ -2279,7 +2284,8 @@
   return addRuntimeChecks(Loc, PtrRtChecking->getChecks());
 }
 
-void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
+void LoopAccessInfo::collectStridedAccess(Value *MemAccess,
+                                          const TargetTransformInfo *TTI) {
   Value *Ptr = nullptr;
   if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess))
     Ptr = LI->getPointerOperand();
@@ -2296,18 +2302,28 @@
                        "versioning:");
   LLVM_DEBUG(dbgs() << "  Ptr: " << *Ptr << " Stride: " << *Stride << "\n");
 
-  // Avoid adding the "Stride == 1" predicate when we know that
-  // Stride >= Trip-Count. Such a predicate will effectively optimize a single
-  // or zero iteration loop, as Trip-Count <= Stride == 1.
+  // If this load/store could equally be represented as a gather/scatter, as
+  // opposed to adding a unit stride runtime check, the gather/scatter is
+  // likely to be useful in more cases (even if it might be slower than a
+  // sequential load).
   //
   // TODO: We are currently not making a very informed decision on when it is
   // beneficial to apply stride versioning. It might make more sense that the
   // users of this analysis (such as the vectorizer) will trigger it, based on
   // their specific cost considerations; For example, in cases where stride
-  // versioning does not help resolving memory accesses/dependences, the
+  // versioning does not help resolving memory accesses/dependences, the
   // vectorizer should evaluate the cost of the runtime test, and the benefit
   // of various possible stride specializations, considering the alternatives
   // of using gather/scatters (if available).
+  if (PreferGatherOverStrideCheck && TTI &&
+      TTI->isLegalGatherOrScatter(MemAccess)) {
+    LLVM_DEBUG(dbgs() << "LAA: But leaving as a gather/scatter instead.\n");
+    return;
+  }
+
+  // Avoid adding the "Stride == 1" predicate when we know that
+  // Stride >= Trip-Count. Such a predicate will effectively optimize a single
+  // or zero iteration loop, as Trip-Count <= Stride == 1.
 
   const SCEV *StrideExpr = PSE->getSCEV(Stride);
   const SCEV *BETakenCount = PSE->getBackedgeTakenCount();
@@ -2343,8 +2359,10 @@
 }
 
 LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
-                               const TargetLibraryInfo *TLI, AliasAnalysis *AA,
-                               DominatorTree *DT, LoopInfo *LI)
+                               const TargetLibraryInfo *TLI,
+                               const TargetTransformInfo *TTI,
+                               AliasAnalysis *AA, DominatorTree *DT,
+                               LoopInfo *LI)
     : PSE(std::make_unique<PredicatedScalarEvolution>(*SE, *L)),
       PtrRtChecking(std::make_unique<RuntimePointerChecking>(SE)),
       DepChecker(std::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L),
@@ -2352,7 +2370,7 @@
       HasConvergentOp(false),
       HasDependenceInvolvingLoopInvariantAddress(false) {
   if (canAnalyzeLoop())
-    analyzeLoop(AA, LI, TLI, DT);
+    analyzeLoop(AA, LI, TLI, TTI, DT);
 }
 
 void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
@@ -2406,7 +2424,7 @@
   auto &LAI = LoopAccessInfoMap[L];
 
   if (!LAI)
-    LAI = std::make_unique<LoopAccessInfo>(L, SE, TLI, AA, DT, LI);
+    LAI = std::make_unique<LoopAccessInfo>(L, SE, TLI, TTI, AA, DT, LI);
 
   return *LAI.get();
 }
@@ -2426,6 +2444,8 @@
   SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
   auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
   TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
+  auto *TTIP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>();
+  TTI = TTIP ? &TTIP->getTTI(F) : nullptr;
   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
@@ -2457,7 +2477,7 @@
 
 LoopAccessInfo LoopAccessAnalysis::run(Loop &L, LoopAnalysisManager &AM,
                                        LoopStandardAnalysisResults &AR) {
-  return LoopAccessInfo(&L, &AR.SE, &AR.TLI, &AR.AA, &AR.DT, &AR.LI);
+  return LoopAccessInfo(&L, &AR.SE, &AR.TLI, &AR.TTI, &AR.AA, &AR.DT, &AR.LI);
 }
 
 namespace llvm {
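To make the trade-off in collectStridedAccess() concrete: the affected pattern is a loop whose stride is only known symbolically, as in the hand-written example below (illustrative only; it mirrors the @scev4stride1_32 test added further down, and the function name is made up):

  // b[i * k] has a symbolic stride k. LAA used to emit a runtime "k == 1"
  // check and version the loop; if the target has a legal masked gather for
  // this access, the access is now left to be widened as a gather and no
  // stride check is added.
  void copy_strided(int *__restrict a, const int *__restrict b, int k) {
    for (int i = 0; i < 256; ++i)
      a[i] = b[i * k];
  }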
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -333,6 +333,17 @@
   return TTIImpl->isLegalMaskedExpandLoad(DataType);
 }
 
+bool TargetTransformInfo::isLegalGatherOrScatter(Value *V) const {
+  LoadInst *LI = dyn_cast<LoadInst>(V);
+  StoreInst *SI = dyn_cast<StoreInst>(V);
+  if (!LI && !SI)
+    return false;
+  Type *Ty = LI ? LI->getType() : SI->getValueOperand()->getType();
+  MaybeAlign Align = getLoadStoreAlignment(V);
+  return (LI && isLegalMaskedGather(Ty, Align)) ||
+         (SI && isLegalMaskedScatter(Ty, Align));
+}
+
 bool TargetTransformInfo::hasDivRemOp(Type *DataType, bool IsSigned) const {
   return TTIImpl->hasDivRemOp(DataType, IsSigned);
 }
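For a scalar access the helper above reduces to the existing per-type hooks. An equivalent explicit spelling, assuming a LoadInst *L and a StoreInst *S are at hand (illustrative only, not part of the patch):

  // What isLegalGatherOrScatter() ends up asking the target underneath:
  bool LoadOK = TTI.isLegalMaskedGather(L->getType(),
                                        getLoadStoreAlignment(L));
  bool StoreOK = TTI.isLegalMaskedScatter(S->getValueOperand()->getType(),
                                          getLoadStoreAlignment(S));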
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1207,31 +1207,6 @@
            TTI.isLegalMaskedLoad(DataType, Alignment);
   }
 
-  /// Returns true if the target machine supports masked scatter operation
-  /// for the given \p DataType.
-  bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
-    return TTI.isLegalMaskedScatter(DataType, Alignment);
-  }
-
-  /// Returns true if the target machine supports masked gather operation
-  /// for the given \p DataType.
-  bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
-    return TTI.isLegalMaskedGather(DataType, Alignment);
-  }
-
-  /// Returns true if the target machine can represent \p V as a masked gather
-  /// or scatter operation.
-  bool isLegalGatherOrScatter(Value *V) {
-    bool LI = isa<LoadInst>(V);
-    bool SI = isa<StoreInst>(V);
-    if (!LI && !SI)
-      return false;
-    auto *Ty = getMemInstValueType(V);
-    MaybeAlign Align = getLoadStoreAlignment(V);
-    return (LI && isLegalMaskedGather(Ty, Align)) ||
-           (SI && isLegalMaskedScatter(Ty, Align));
-  }
-
   /// Returns true if \p I is an instruction that will be scalarized with
   /// predication. Such instructions include conditional stores and
   /// instructions that may divide by zero.
@@ -4618,10 +4593,10 @@
       return WideningDecision == CM_Scalarize;
     }
     const MaybeAlign Alignment = getLoadStoreAlignment(I);
-    return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
-                                isLegalMaskedGather(Ty, Alignment))
-                            : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
-                                isLegalMaskedScatter(Ty, Alignment));
+    bool LegalGather = TTI.isLegalGatherOrScatter(I);
+    return !(LegalGather || (isa<LoadInst>(I)
+                                 ? isLegalMaskedLoad(Ty, Ptr, Alignment)
+                                 : isLegalMaskedStore(Ty, Ptr, Alignment)));
   }
   case Instruction::UDiv:
   case Instruction::SDiv:
@@ -5169,7 +5144,7 @@
     // optimization to non-pointer types.
     //
     if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
-        !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
+        !isAccessInterleaved(&I) && !TTI.isLegalGatherOrScatter(&I))
       continue;
 
     MinWidth = std::min(MinWidth,
@@ -6058,7 +6033,7 @@
     }
 
     unsigned GatherScatterCost =
-        isLegalGatherOrScatter(&I)
+        TTI.isLegalGatherOrScatter(&I)
            ? getGatherScatterCost(&I, VF) * NumAccesses
            : std::numeric_limits<unsigned>::max();
 
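The ARM MVE test below is the inner loop of a matrix multiply. Reconstructed from its CHECK lines, the scalar source is roughly the following (variable names are illustrative): the A access is consecutive and stays a widened vector load, while the B access strides by M and is now emitted as a masked gather instead of requiring an "M == 1" stride check.

  // Approximate scalar form of the vectorized inner loop in mve-mat-mul.ll.
  int dot_row_col(const int *A, const int *B, int RowOffset, int j, int L,
                  int M) {
    int Sum = 0;
    for (int i = 0; i < L; ++i)
      Sum += A[RowOffset + i] * B[i * M + j];
    return Sum;
  }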
Index: llvm/test/Transforms/LoopVectorize/ARM/mve-mat-mul.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/ARM/mve-mat-mul.ll
+++ llvm/test/Transforms/LoopVectorize/ARM/mve-mat-mul.ll
@@ -27,36 +27,36 @@
 ; CHECK:       for.cond8.preheader.us.us:
 ; CHECK-NEXT:    [[J_051_US_US:%.*]] = phi i32 [ [[INC21_US_US:%.*]], [[FOR_COND8_FOR_COND_CLEANUP10_CRIT_EDGE_US_US:%.*]] ], [ 0, [[FOR_COND8_PREHEADER_US_US_PREHEADER]] ]
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[L]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
-; CHECK:       vector.scevcheck:
-; CHECK-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i32 [[M]], 1
-; CHECK-NEXT:    [[TMP0:%.*]] = or i1 false, [[IDENT_CHECK]]
-; CHECK-NEXT:    br i1 [[TMP0]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[L]], 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[L]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[M]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[J_051_US_US]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], [[MUL_US]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP1]], [[M]]
-; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP6]], [[J_051_US_US]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP12]] = add nsw <4 x i32> [[TMP11]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP0]], [[MUL_US]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = mul <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], <4 x i32> [[TMP9]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP10]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT:    [[TMP11:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_GATHER]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP12]] = add <4 x i32> [[TMP11]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
 ; CHECK:       middle.block:
@@ -64,8 +64,8 @@
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[L]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND8_FOR_COND_CLEANUP10_CRIT_EDGE_US_US]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_COND8_PREHEADER_US_US]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_COND8_PREHEADER_US_US]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_COND8_PREHEADER_US_US]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_COND8_PREHEADER_US_US]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY11_US_US:%.*]]
 ; CHECK:       for.cond8.for.cond.cleanup10_crit_edge.us.us:
 ; CHECK-NEXT:    [[ADD16_US_US_LCSSA:%.*]] = phi i32 [ [[ADD16_US_US:%.*]], [[FOR_BODY11_US_US]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
-; CHECK-LABEL: @scev4stride1 +; CHECK-LABEL: @scev4stride1_16 ; CHECK-NOT: vector.scevcheck ; CHECK-NOT: vector.body: ; CHECK-LABEL: for.body: -; AUTOVF-LABEL: @scev4stride1 +; AUTOVF-LABEL: @scev4stride1_16 ; AUTOVF-NOT: vector.scevcheck ; AUTOVF-NOT: vector.body: ; AUTOVF-LABEL: for.body: -define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %k) #2 { +define void @scev4stride1_16(i16* noalias nocapture %a, i16* noalias nocapture readonly %b, i32 %k) #2 { +for.body.preheader: + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %mul = mul nsw i32 %i.07, %k + %arrayidx = getelementptr inbounds i16, i16* %b, i32 %mul + %0 = load i16, i16* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i16, i16* %a, i32 %i.07 + store i16 %0, i16* %arrayidx1, align 4 + %inc = add nuw nsw i32 %i.07, 1 + %exitcond = icmp eq i32 %inc, 256 + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + ret void +} + +; We can vectorize this one because we can instead use gather loads without needing runtime checks. +; These checks make sure that the scalar remainder loop will not be called. +; CHECK-LABEL: @scev4stride1_32 +; CHECK-NOT: vector.scevcheck +; CHECK: br i1 false, label %scalar.ph, label %vector.ph +; CHECK: %cmp.n = icmp eq i32 256, 256 +; CHECK: br i1 %cmp.n, label %for.end.loopexit, label %scalar.ph +; AUTOVF-LABEL: @scev4stride1_32 +; AUTOVF-NOT: vector.scevcheck +; AUTOVF: br i1 false, label %scalar.ph, label %vector.ph +; AUTOVF: %cmp.n = icmp eq i32 256, 256 +; AUTOVF: br i1 %cmp.n, label %for.end.loopexit, label %scalar.ph +define void @scev4stride1_32(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %k) #2 { for.body.preheader: br label %for.body Index: llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp =================================================================== --- llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp +++ llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp @@ -43,7 +43,7 @@ AARes.reset(new AAResults(TLI)); AARes->addAAResult(*BasicAA); PSE.reset(new PredicatedScalarEvolution(*SE, *L)); - LAI.reset(new LoopAccessInfo(L, &*SE, &TLI, &*AARes, &*DT, &*LI)); + LAI.reset(new LoopAccessInfo(L, &*SE, &TLI, nullptr, &*AARes, &*DT, &*LI)); IAI.reset(new InterleavedAccessInfo(*PSE, L, &*DT, &*LI, &*LAI)); IAI->analyzeInterleaving(false); return {Plan, *IAI};