Index: llvm/include/llvm/Analysis/LoopAccessAnalysis.h
===================================================================
--- llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -516,7 +516,8 @@ class LoopAccessInfo {
 public:
   LoopAccessInfo(Loop *L, ScalarEvolution *SE, const TargetLibraryInfo *TLI,
-                 AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI);
+                 const TargetTransformInfo *TTI, AliasAnalysis *AA,
+                 DominatorTree *DT, LoopInfo *LI);

   /// Return true we can analyze the memory accesses in the loop and there are
   /// no memory dependence cycles.
@@ -608,7 +609,8 @@ private:
   /// Analyze the loop.
   void analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
-                   const TargetLibraryInfo *TLI, DominatorTree *DT);
+                   const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI,
+                   DominatorTree *DT);

   /// Check if the structure of the loop allows it to be analyzed by this
   /// pass.
@@ -626,7 +628,8 @@
   ///
   /// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop
   /// invariant.
-  void collectStridedAccess(Value *LoadOrStoreInst);
+  void collectStridedAccess(Value *LoadOrStoreInst,
+                            const TargetTransformInfo *TTI);

   std::unique_ptr<PredicatedScalarEvolution> PSE;
@@ -750,6 +753,7 @@
   // The used analysis passes.
   ScalarEvolution *SE = nullptr;
   const TargetLibraryInfo *TLI = nullptr;
+  const TargetTransformInfo *TTI = nullptr;
   AliasAnalysis *AA = nullptr;
   DominatorTree *DT = nullptr;
   LoopInfo *LI = nullptr;
Index: llvm/lib/Analysis/LoopAccessAnalysis.cpp
===================================================================
--- llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1789,6 +1789,7 @@
 void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
                                  const TargetLibraryInfo *TLI,
+                                 const TargetTransformInfo *TTI,
                                  DominatorTree *DT) {
   typedef SmallPtrSet<Value *, 16> ValueSet;
@@ -1866,7 +1867,7 @@
         Loads.push_back(Ld);
         DepChecker->addAccess(Ld);
         if (EnableMemAccessVersioning)
-          collectStridedAccess(Ld);
+          collectStridedAccess(Ld, TTI);
         continue;
       }
@@ -1890,7 +1891,7 @@
       Stores.push_back(St);
      DepChecker->addAccess(St);
       if (EnableMemAccessVersioning)
-        collectStridedAccess(St);
+        collectStridedAccess(St, TTI);
     }
   } // Next instr.
 } // Next block.
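The functional core of the patch is the early return added to collectStridedAccess in the hunk below: when the target can legally emit a masked gather (for a load) or masked scatter (for a store) of the accessed type at its alignment, LAA skips symbolic-stride speculation for that access, so no stride-versioning runtime check (vector.scevcheck) is generated and the vectorizer is free to use the gather/scatter directly. Distilled into a standalone sketch for clarity; the helper name is illustrative, not part of the patch:

  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Sketch of the new bail-out condition: returns true when versioning on
  // the symbolic stride is unnecessary because the access could instead be
  // vectorized as a masked gather/scatter.
  static bool preferGatherScatter(Value *MemAccess,
                                  const TargetTransformInfo *TTI) {
    if (!TTI)
      return false; // Null TTI: fall back to the old versioning behaviour.
    if (auto *Ld = dyn_cast<LoadInst>(MemAccess))
      return TTI->isLegalMaskedGather(Ld->getType(),
                                      getLoadStoreAlignment(MemAccess));
    if (auto *St = dyn_cast<StoreInst>(MemAccess))
      return TTI->isLegalMaskedScatter(St->getValueOperand()->getType(),
                                       getLoadStoreAlignment(MemAccess));
    return false;
  }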
@@ -2279,13 +2280,20 @@
   return addRuntimeChecks(Loc, PtrRtChecking->getChecks());
 }

-void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
+void LoopAccessInfo::collectStridedAccess(Value *MemAccess,
+                                          const TargetTransformInfo *TTI) {
   Value *Ptr = nullptr;
-  if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess))
+  if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess)) {
     Ptr = LI->getPointerOperand();
-  else if (StoreInst *SI = dyn_cast<StoreInst>(MemAccess))
+    if (TTI && TTI->isLegalMaskedGather(LI->getType(),
+                                        getLoadStoreAlignment(MemAccess)))
+      return;
+  } else if (StoreInst *SI = dyn_cast<StoreInst>(MemAccess)) {
     Ptr = SI->getPointerOperand();
-  else
+    if (TTI && TTI->isLegalMaskedScatter(SI->getValueOperand()->getType(),
+                                         getLoadStoreAlignment(MemAccess)))
+      return;
+  } else
     return;

   Value *Stride = getStrideFromPointer(Ptr, PSE->getSE(), TheLoop);
@@ -2343,8 +2351,10 @@
 }

 LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
-                               const TargetLibraryInfo *TLI, AliasAnalysis *AA,
-                               DominatorTree *DT, LoopInfo *LI)
+                               const TargetLibraryInfo *TLI,
+                               const TargetTransformInfo *TTI,
+                               AliasAnalysis *AA, DominatorTree *DT,
+                               LoopInfo *LI)
     : PSE(std::make_unique<PredicatedScalarEvolution>(*SE, *L)),
       PtrRtChecking(std::make_unique<RuntimePointerChecking>(SE)),
       DepChecker(std::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L),
@@ -2352,7 +2362,7 @@
       HasConvergentOp(false),
       HasDependenceInvolvingLoopInvariantAddress(false) {
   if (canAnalyzeLoop())
-    analyzeLoop(AA, LI, TLI, DT);
+    analyzeLoop(AA, LI, TLI, TTI, DT);
 }

 void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
@@ -2406,7 +2416,7 @@
   auto &LAI = LoopAccessInfoMap[L];

   if (!LAI)
-    LAI = std::make_unique<LoopAccessInfo>(L, SE, TLI, AA, DT, LI);
+    LAI = std::make_unique<LoopAccessInfo>(L, SE, TLI, TTI, AA, DT, LI);

   return *LAI.get();
 }
@@ -2426,6 +2436,8 @@
   SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
   auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
   TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
+  auto *TTIP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>();
+  TTI = TTIP ? &TTIP->getTTI(F) : nullptr;
   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
@@ -2457,7 +2469,7 @@
 LoopAccessInfo LoopAccessAnalysis::run(Loop &L, LoopAnalysisManager &AM,
                                        LoopStandardAnalysisResults &AR) {
-  return LoopAccessInfo(&L, &AR.SE, &AR.TLI, &AR.AA, &AR.DT, &AR.LI);
+  return LoopAccessInfo(&L, &AR.SE, &AR.TLI, &AR.TTI, &AR.AA, &AR.DT, &AR.LI);
 }

 namespace llvm {
Index: llvm/test/Transforms/LoopVectorize/ARM/mve-mat-mul.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/ARM/mve-mat-mul.ll
+++ llvm/test/Transforms/LoopVectorize/ARM/mve-mat-mul.ll
@@ -27,36 +27,36 @@
 ; CHECK: for.cond8.preheader.us.us:
 ; CHECK-NEXT: [[J_051_US_US:%.*]] = phi i32 [ [[INC21_US_US:%.*]], [[FOR_COND8_FOR_COND_CLEANUP10_CRIT_EDGE_US_US:%.*]] ], [ 0, [[FOR_COND8_PREHEADER_US_US_PREHEADER]] ]
 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[L]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
-; CHECK: vector.scevcheck:
-; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[M]], 1
-; CHECK-NEXT: [[TMP0:%.*]] = or i1 false, [[IDENT_CHECK]]
-; CHECK-NEXT: br i1 [[TMP0]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[L]], 4
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[L]], [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[M]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[J_051_US_US]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], [[MUL_US]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[TMP1]], [[M]]
-; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], [[J_051_US_US]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4
-; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 3
+; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], [[MUL_US]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], <4 x i32> [[TMP9]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP10]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_GATHER]], [[WIDE_LOAD]]
 ; CHECK-NEXT: [[TMP12]] = add nsw <4 x i32> [[TMP11]], [[VEC_PHI]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
 ; CHECK: middle.block:
@@ -64,8 +64,8 @@
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[L]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND8_FOR_COND_CLEANUP10_CRIT_EDGE_US_US]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_COND8_PREHEADER_US_US]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_COND8_PREHEADER_US_US]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_COND8_PREHEADER_US_US]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_COND8_PREHEADER_US_US]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[FOR_BODY11_US_US:%.*]]
 ; CHECK: for.cond8.for.cond.cleanup10_crit_edge.us.us:
 ; CHECK-NEXT: [[ADD16_US_US_LCSSA:%.*]] = phi i32 [ [[ADD16_US_US:%.*]], [[FOR_BODY11_US_US]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
Index: llvm/test/Transforms/LoopVectorize/X86/optsize.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/optsize.ll
+++ llvm/test/Transforms/LoopVectorize/X86/optsize.ll
@@ -148,17 +148,17 @@
 ; AUTOVF-NOT: vector.scevcheck
 ; AUTOVF-NOT: vector.body:
 ; AUTOVF-LABEL: for.body:
-define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %k) #2 {
+define void @scev4stride1(i16* noalias nocapture %a, i16* noalias nocapture readonly %b, i32 %k) #2 {
 for.body.preheader:
   br label %for.body

 for.body:                                         ; preds = %for.body.preheader, %for.body
   %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
   %mul = mul nsw i32 %i.07, %k
-  %arrayidx = getelementptr inbounds i32, i32* %b, i32 %mul
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds i32, i32* %a, i32 %i.07
-  store i32 %0, i32* %arrayidx1, align 4
+  %arrayidx = getelementptr inbounds i16, i16* %b, i32 %mul
+  %0 = load i16, i16* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i16, i16* %a, i32 %i.07
+  store i16 %0, i16* %arrayidx1, align 4
   %inc = add nuw nsw i32 %i.07, 1
   %exitcond = icmp eq i32 %inc, 256
   br i1 %exitcond, label %for.end.loopexit, label %for.body
Index: llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp
===================================================================
--- llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp
+++ llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp
@@ -43,7 +43,7 @@
     AARes.reset(new AAResults(TLI));
     AARes->addAAResult(*BasicAA);
     PSE.reset(new PredicatedScalarEvolution(*SE, *L));
-    LAI.reset(new LoopAccessInfo(L, &*SE, &TLI, &*AARes, &*DT, &*LI));
+    LAI.reset(new LoopAccessInfo(L, &*SE, &TLI, nullptr, &*AARes, &*DT, &*LI));
     IAI.reset(new InterleavedAccessInfo(*PSE, L, &*DT, &*LI, &*LAI));
     IAI->analyzeInterleaving(false);
     return {Plan, *IAI};
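Two notes on the updated call sites and tests. The new checks are guarded by "if (TTI && ...)", so callers without target information, such as VPlanSlpTest above, pass a null TTI and keep the previous versioning behaviour unchanged:

  // Illustrative only; mirrors the VPlanSlpTest update above.
  LoopAccessInfo LAI(L, SE, TLI, /*TTI=*/nullptr, AA, DT, LI);

In mve-mat-mul.ll the vector.scevcheck block (the runtime M == 1 stride check) disappears because MVE can legally gather <4 x i32>, so the strided access to B is now emitted as an @llvm.masked.gather instead of a versioned unit-stride load. In optsize.ll the element type changes from i32 to i16, presumably so the access stays gather-illegal on the target and the test keeps verifying that no stride versioning is performed under optsize.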