diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -656,10 +656,17 @@
   /// Return true if the target supports nontemporal load.
   bool isLegalNTLoad(Type *DataType, Align Alignment) const;
 
-  /// Return true if the target supports masked scatter.
-  bool isLegalMaskedScatter(Type *DataType, Align Alignment) const;
+  /// Return true if masked gather should be used for vectorization.
+  bool shouldUseMaskedGatherForVectorization(Type *DataType, bool VariableMask,
+                                             Align Alignment) const;
+  /// Return true if masked scatter should be used for vectorization.
+  bool shouldUseMaskedScatterForVectorization(Type *DataType, bool VariableMask,
+                                              Align Alignment) const;
+
   /// Return true if the target supports masked gather.
   bool isLegalMaskedGather(Type *DataType, Align Alignment) const;
+  /// Return true if the target supports masked scatter.
+  bool isLegalMaskedScatter(Type *DataType, Align Alignment) const;
 
   /// Return true if the target supports masked compress store.
   bool isLegalMaskedCompressStore(Type *DataType) const;
@@ -1513,8 +1520,14 @@
   virtual bool isLegalMaskedLoad(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalNTStore(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalNTLoad(Type *DataType, Align Alignment) = 0;
-  virtual bool isLegalMaskedScatter(Type *DataType, Align Alignment) = 0;
+  virtual bool shouldUseMaskedGatherForVectorization(Type *DataType,
+                                                     bool VariableMask,
+                                                     Align Alignment) = 0;
+  virtual bool shouldUseMaskedScatterForVectorization(Type *DataType,
+                                                      bool VariableMask,
+                                                      Align Alignment) = 0;
   virtual bool isLegalMaskedGather(Type *DataType, Align Alignment) = 0;
+  virtual bool isLegalMaskedScatter(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalMaskedCompressStore(Type *DataType) = 0;
   virtual bool isLegalMaskedExpandLoad(Type *DataType) = 0;
   virtual bool enableOrderedReductions() = 0;
@@ -1893,12 +1906,22 @@
   bool isLegalNTLoad(Type *DataType, Align Alignment) override {
     return Impl.isLegalNTLoad(DataType, Alignment);
   }
-  bool isLegalMaskedScatter(Type *DataType, Align Alignment) override {
-    return Impl.isLegalMaskedScatter(DataType, Alignment);
+  bool shouldUseMaskedGatherForVectorization(Type *DataType, bool VariableMask,
+                                             Align Alignment) override {
+    return Impl.shouldUseMaskedGatherForVectorization(DataType, VariableMask,
+                                                      Alignment);
+  }
+  bool shouldUseMaskedScatterForVectorization(Type *DataType, bool VariableMask,
+                                              Align Alignment) override {
+    return Impl.shouldUseMaskedScatterForVectorization(DataType, VariableMask,
+                                                       Alignment);
   }
   bool isLegalMaskedGather(Type *DataType, Align Alignment) override {
     return Impl.isLegalMaskedGather(DataType, Alignment);
   }
+  bool isLegalMaskedScatter(Type *DataType, Align Alignment) override {
+    return Impl.isLegalMaskedScatter(DataType, Alignment);
+  }
   bool isLegalMaskedCompressStore(Type *DataType) override {
     return Impl.isLegalMaskedCompressStore(DataType);
   }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -862,6 +862,16 @@
 public:
   using BaseT::getGEPCost;
 
+  bool shouldUseMaskedGatherForVectorization(Type *DataType, bool VariableMask,
+                                             Align Alignment) {
+    return static_cast<T *>(this)->isLegalMaskedGather(DataType, Alignment);
+  }
+
+  bool shouldUseMaskedScatterForVectorization(Type *DataType, bool VariableMask,
+                                              Align Alignment) {
+    return static_cast<T *>(this)->isLegalMaskedScatter(DataType, Alignment);
+  }
+
   InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
                              ArrayRef<const Value *> Operands,
                              TTI::TargetCostKind CostKind) {
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -393,6 +393,18 @@
   return TTIImpl->isLegalNTLoad(DataType, Alignment);
 }
 
+bool TargetTransformInfo::shouldUseMaskedGatherForVectorization(
+    Type *DataType, bool VariableMask, Align Alignment) const {
+  return TTIImpl->shouldUseMaskedGatherForVectorization(DataType, VariableMask,
+                                                        Alignment);
+}
+
+bool TargetTransformInfo::shouldUseMaskedScatterForVectorization(
+    Type *DataType, bool VariableMask, Align Alignment) const {
+  return TTIImpl->shouldUseMaskedScatterForVectorization(DataType, VariableMask,
+                                                         Alignment);
+}
+
 bool TargetTransformInfo::isLegalMaskedGather(Type *DataType,
                                               Align Alignment) const {
   return TTIImpl->isLegalMaskedGather(DataType, Alignment);
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -227,6 +227,10 @@
   bool isLegalMaskedStore(Type *DataType, Align Alignment);
   bool isLegalNTLoad(Type *DataType, Align Alignment);
   bool isLegalNTStore(Type *DataType, Align Alignment);
+  bool shouldUseMaskedGatherForVectorization(Type *DataType, bool VariableMask,
+                                             Align Alignment);
+  bool shouldUseMaskedScatterForVectorization(Type *DataType, bool VariableMask,
+                                              Align Alignment);
   bool isLegalMaskedGather(Type *DataType, Align Alignment);
   bool isLegalMaskedScatter(Type *DataType, Align Alignment);
   bool isLegalMaskedExpandLoad(Type *DataType);
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -4871,6 +4871,21 @@
   return isLegalMaskedExpandLoad(DataTy);
 }
 
+bool X86TTIImpl::shouldUseMaskedGatherForVectorization(Type *DataType,
+                                                       bool VariableMask,
+                                                       Align Alignment) {
+  if (!VariableMask)
+    return true;
+  return isLegalMaskedGather(DataType, Alignment);
+}
+bool X86TTIImpl::shouldUseMaskedScatterForVectorization(Type *DataType,
+                                                        bool VariableMask,
+                                                        Align Alignment) {
+  if (!VariableMask)
+    return true;
+  return isLegalMaskedScatter(DataType, Alignment);
+}
+
 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
   // Some CPUs have better gather performance than others.
   // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1504,15 +1504,18 @@
 
   /// Returns true if the target machine can represent \p V as a masked gather
   /// or scatter operation.
-  bool isLegalGatherOrScatter(Value *V) {
+  bool shouldUseGatherOrScatterForVectorization(Value *V) {
+    auto *I = cast<Instruction>(V);
     bool LI = isa<LoadInst>(V);
     bool SI = isa<StoreInst>(V);
     if (!LI && !SI)
       return false;
     auto *Ty = getLoadStoreType(V);
     Align Align = getLoadStoreAlignment(V);
-    return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
-           (SI && TTI.isLegalMaskedScatter(Ty, Align));
+    return (LI && TTI.shouldUseMaskedGatherForVectorization(
+                      Ty, Legal->isMaskRequired(I), Align)) ||
+           (SI && TTI.shouldUseMaskedScatterForVectorization(
+                      Ty, Legal->isMaskRequired(I), Align));
   }
 
   /// Returns true if the target machine supports all of the reduction
@@ -6342,7 +6345,8 @@
       //        optimization to non-pointer types.
       //
       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
-          !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
+          !isAccessInterleaved(&I) &&
+          !shouldUseGatherOrScatterForVectorization(&I))
         continue;
 
       ElementTypesInLoop.insert(T);
@@ -7454,7 +7458,7 @@
         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
         InstructionCost Cost;
         if (isa<StoreInst>(&I) && VF.isScalable() &&
-            isLegalGatherOrScatter(&I)) {
+            shouldUseGatherOrScatterForVectorization(&I)) {
           Cost = getGatherScatterCost(&I, VF);
           setWideningDecision(&I, VF, CM_GatherScatter, Cost);
         } else {
@@ -7496,7 +7500,7 @@
       }
 
       InstructionCost GatherScatterCost =
-          isLegalGatherOrScatter(&I)
+          shouldUseGatherOrScatterForVectorization(&I)
               ? getGatherScatterCost(&I, VF) * NumAccesses
               : InstructionCost::getInvalid();
 
diff --git a/llvm/test/Analysis/CostModel/X86/gather-i16-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/gather-i16-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/gather-i16-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/gather-i16-with-i8-index.ll
@@ -17,45 +17,45 @@
 ; CHECK: LV: Checking a loop in "test"
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i16, i16* %inB, align 2
-; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction:   %valB = load i16, i16* %inB, align 2
-; SSE2: LV: Found an estimated cost of 56 for VF 4 For instruction:   %valB = load i16, i16* %inB, align 2
-; SSE2: LV: Found an estimated cost of 112 for VF 8 For instruction:   %valB = load i16, i16* %inB, align 2
-; SSE2: LV: Found an estimated cost of 224 for VF 16 For instruction:   %valB = load i16, i16* %inB, align 2
+; SSE2: LV: Found an estimated cost of 8 for VF 2 For instruction:   %valB = load i16, i16* %inB, align 2
+; SSE2: LV: Found an estimated cost of 16 for VF 4 For instruction:   %valB = load i16, i16* %inB, align 2
+; SSE2: LV: Found an estimated cost of 32 for VF 8 For instruction:   %valB = load i16, i16* %inB, align 2
+; SSE2: LV: Found an estimated cost of 64 for VF 16 For instruction:   %valB = load i16, i16* %inB, align 2
 ;
 ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i16, i16* %inB, align 2
-; SSE42: LV: Found an estimated cost of 28 for VF 2 For instruction:   %valB = load i16, i16* %inB, align 2
-; SSE42: LV: Found an estimated cost of 56 for VF 4 For instruction:   %valB = load i16, i16* %inB, align 2
-; SSE42: LV: Found an estimated cost of 112 for VF 8 For instruction:   %valB = load i16, i16* %inB, align 2
-; SSE42: LV: Found an estimated cost of 224 for VF 16 For instruction:   %valB = load i16, i16* %inB, align 2
+; SSE42: LV: Found an estimated cost of 8 for VF 2 For instruction:   %valB = load i16, i16* %inB, align 2
+; SSE42: LV: Found an estimated cost of 16 for VF 4 For instruction:   %valB = load i16, i16* %inB, align 2
+; SSE42: LV: Found an estimated cost of 32 for VF 8 For instruction:   %valB = load i16, i16* %inB, align 2
+; SSE42: LV: Found an estimated cost of 64 for VF 16 For instruction:   %valB = load i16, i16* %inB, align 2
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX1: LV: Found an estimated cost of 108 for VF 8 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX1: LV: Found an estimated cost of 218 for VF 16 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX1: LV: Found an estimated cost of 436 for VF 32 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX1: LV: Found an estimated cost of 6 for VF 2 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX1: LV: Found an estimated cost of 14 for VF 4 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX1: LV: Found an estimated cost of 28 for VF 8 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX1: LV: Found an estimated cost of 58 for VF 16 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX1: LV: Found an estimated cost of 116 for VF 32 For instruction:   %valB = load i16, i16* %inB, align 2
 ;
 ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 218 for VF 16 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 436 for VF 32 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 14 for VF 4 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 28 for VF 8 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 58 for VF 16 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 116 for VF 32 For instruction:   %valB = load i16, i16* %inB, align 2
 ;
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX2-FASTGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX2-FASTGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX2-FASTGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX2-FASTGATHER: LV: Found an estimated cost of 218 for VF 16 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX2-FASTGATHER: LV: Found an estimated cost of 436 for VF 32 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX2-FASTGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX2-FASTGATHER: LV: Found an estimated cost of 14 for VF 4 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX2-FASTGATHER: LV: Found an estimated cost of 28 for VF 8 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX2-FASTGATHER: LV: Found an estimated cost of 58 for VF 16 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX2-FASTGATHER: LV: Found an estimated cost of 116 for VF 32 For instruction:   %valB = load i16, i16* %inB, align 2
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX512: LV: Found an estimated cost of 26 for VF 2 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX512: LV: Found an estimated cost of 54 for VF 4 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX512: LV: Found an estimated cost of 110 for VF 8 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX512: LV: Found an estimated cost of 222 for VF 16 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX512: LV: Found an estimated cost of 444 for VF 32 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX512: LV: Found an estimated cost of 888 for VF 64 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX512: LV: Found an estimated cost of 6 for VF 2 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX512: LV: Found an estimated cost of 14 for VF 4 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX512: LV: Found an estimated cost of 30 for VF 8 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX512: LV: Found an estimated cost of 62 for VF 16 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX512: LV: Found an estimated cost of 124 for VF 32 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX512: LV: Found an estimated cost of 248 for VF 64 For instruction:   %valB = load i16, i16* %inB, align 2
 ;
 ; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   %valB = load i16, i16* %inB, align 4
 define void @test() {
diff --git a/llvm/test/Analysis/CostModel/X86/gather-i32-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/gather-i32-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/gather-i32-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/gather-i32-with-i8-index.ll
@@ -17,30 +17,30 @@
 ; CHECK: LV: Checking a loop in "test"
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i32, i32* %inB, align 4
-; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction:   %valB = load i32, i32* %inB, align 4
-; SSE2: LV: Found an estimated cost of 59 for VF 4 For instruction:   %valB = load i32, i32* %inB, align 4
-; SSE2: LV: Found an estimated cost of 118 for VF 8 For instruction:   %valB = load i32, i32* %inB, align 4
-; SSE2: LV: Found an estimated cost of 236 for VF 16 For instruction:   %valB = load i32, i32* %inB, align 4
+; SSE2: LV: Found an estimated cost of 9 for VF 2 For instruction:   %valB = load i32, i32* %inB, align 4
+; SSE2: LV: Found an estimated cost of 19 for VF 4 For instruction:   %valB = load i32, i32* %inB, align 4
+; SSE2: LV: Found an estimated cost of 38 for VF 8 For instruction:   %valB = load i32, i32* %inB, align 4
+; SSE2: LV: Found an estimated cost of 76 for VF 16 For instruction:   %valB = load i32, i32* %inB, align 4
 ;
 ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i32, i32* %inB, align 4
-; SSE42: LV: Found an estimated cost of 29 for VF 2 For instruction:   %valB = load i32, i32* %inB, align 4
-; SSE42: LV: Found an estimated cost of 59 for VF 4 For instruction:   %valB = load i32, i32* %inB, align 4
-; SSE42: LV: Found an estimated cost of 118 for VF 8 For instruction:   %valB = load i32, i32* %inB, align 4
-; SSE42: LV: Found an estimated cost of 236 for VF 16 For instruction:   %valB = load i32, i32* %inB, align 4
+; SSE42: LV: Found an estimated cost of 9 for VF 2 For instruction:   %valB = load i32, i32* %inB, align 4
+; SSE42: LV: Found an estimated cost of 19 for VF 4 For instruction:   %valB = load i32, i32* %inB, align 4
+; SSE42: LV: Found an estimated cost of 38 for VF 8 For instruction:   %valB = load i32, i32* %inB, align 4
+; SSE42: LV: Found an estimated cost of 76 for VF 16 For instruction:   %valB = load i32, i32* %inB, align 4
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i32, i32* %inB, align 4
-; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction:   %valB = load i32, i32* %inB, align 4
-; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction:   %valB = load i32, i32* %inB, align 4
-; AVX1: LV: Found an estimated cost of 110 for VF 8 For instruction:   %valB = load i32, i32* %inB, align 4
-; AVX1: LV: Found an estimated cost of 220 for VF 16 For instruction:   %valB = load i32, i32* %inB, align 4
-; AVX1: LV: Found an estimated cost of 440 for VF 32 For instruction:   %valB = load i32, i32* %inB, align 4
+; AVX1: LV: Found an estimated cost of 6 for VF 2 For instruction:   %valB = load i32, i32* %inB, align 4
+; AVX1: LV: Found an estimated cost of 14 for VF 4 For instruction:   %valB = load i32, i32* %inB, align 4
+; AVX1: LV: Found an estimated cost of 30 for VF 8 For instruction:   %valB = load i32, i32* %inB, align 4
+; AVX1: LV: Found an estimated cost of 60 for VF 16 For instruction:   %valB = load i32, i32* %inB, align 4
+; AVX1: LV: Found an estimated cost of 120 for VF 32 For instruction:   %valB = load i32, i32* %inB, align 4
 ;
 ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i32, i32* %inB, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction:   %valB = load i32, i32* %inB, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction:   %valB = load i32, i32* %inB, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 110 for VF 8 For instruction:   %valB = load i32, i32* %inB, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 220 for VF 16 For instruction:   %valB = load i32, i32* %inB, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 440 for VF 32 For instruction:   %valB = load i32, i32* %inB, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction:   %valB = load i32, i32* %inB, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 14 for VF 4 For instruction:   %valB = load i32, i32* %inB, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 30 for VF 8 For instruction:   %valB = load i32, i32* %inB, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 60 for VF 16 For instruction:   %valB = load i32, i32* %inB, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 120 for VF 32 For instruction:   %valB = load i32, i32* %inB, align 4
 ;
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i32, i32* %inB, align 4
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction:   %valB = load i32, i32* %inB, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/gather-i64-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/gather-i64-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/gather-i64-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/gather-i64-with-i8-index.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse42 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse42 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE42
 ; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
 ; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2,-fast-gather --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2-SLOWGATHER
 ; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2,+fast-gather --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2-FASTGATHER
@@ -17,30 +17,30 @@
 ; CHECK: LV: Checking a loop in "test"
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i64, i64* %inB, align 8
-; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction:   %valB = load i64, i64* %inB, align 8
-; SSE2: LV: Found an estimated cost of 58 for VF 4 For instruction:   %valB = load i64, i64* %inB, align 8
-; SSE2: LV: Found an estimated cost of 116 for VF 8 For instruction:   %valB = load i64, i64* %inB, align 8
-; SSE2: LV: Found an estimated cost of 232 for VF 16 For instruction:   %valB = load i64, i64* %inB, align 8
+; SSE2: LV: Found an estimated cost of 9 for VF 2 For instruction:   %valB = load i64, i64* %inB, align 8
+; SSE2: LV: Found an estimated cost of 18 for VF 4 For instruction:   %valB = load i64, i64* %inB, align 8
+; SSE2: LV: Found an estimated cost of 36 for VF 8 For instruction:   %valB = load i64, i64* %inB, align 8
+; SSE2: LV: Found an estimated cost of 72 for VF 16 For instruction:   %valB = load i64, i64* %inB, align 8
 ;
 ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i64, i64* %inB, align 8
-; SSE42: LV: Found an estimated cost of 29 for VF 2 For instruction:   %valB = load i64, i64* %inB, align 8
-; SSE42: LV: Found an estimated cost of 58 for VF 4 For instruction:   %valB = load i64, i64* %inB, align 8
-; SSE42: LV: Found an estimated cost of 116 for VF 8 For instruction:   %valB = load i64, i64* %inB, align 8
-; SSE42: LV: Found an estimated cost of 232 for VF 16 For instruction:   %valB = load i64, i64* %inB, align 8
+; SSE42: LV: Found an estimated cost of 9 for VF 2 For instruction:   %valB = load i64, i64* %inB, align 8
+; SSE42: LV: Found an estimated cost of 18 for VF 4 For instruction:   %valB = load i64, i64* %inB, align 8
+; SSE42: LV: Found an estimated cost of 36 for VF 8 For instruction:   %valB = load i64, i64* %inB, align 8
+; SSE42: LV: Found an estimated cost of 72 for VF 16 For instruction:   %valB = load i64, i64* %inB, align 8
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i64, i64* %inB, align 8
-; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction:   %valB = load i64, i64* %inB, align 8
-; AVX1: LV: Found an estimated cost of 56 for VF 4 For instruction:   %valB = load i64, i64* %inB, align 8
-; AVX1: LV: Found an estimated cost of 112 for VF 8 For instruction:   %valB = load i64, i64* %inB, align 8
-; AVX1: LV: Found an estimated cost of 224 for VF 16 For instruction:   %valB = load i64, i64* %inB, align 8
-; AVX1: LV: Found an estimated cost of 448 for VF 32 For instruction:   %valB = load i64, i64* %inB, align 8
+; AVX1: LV: Found an estimated cost of 6 for VF 2 For instruction:   %valB = load i64, i64* %inB, align 8
+; AVX1: LV: Found an estimated cost of 16 for VF 4 For instruction:   %valB = load i64, i64* %inB, align 8
+; AVX1: LV: Found an estimated cost of 32 for VF 8 For instruction:   %valB = load i64, i64* %inB, align 8
+; AVX1: LV: Found an estimated cost of 64 for VF 16 For instruction:   %valB = load i64, i64* %inB, align 8
+; AVX1: LV: Found an estimated cost of 128 for VF 32 For instruction:   %valB = load i64, i64* %inB, align 8
 ;
 ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i64, i64* %inB, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction:   %valB = load i64, i64* %inB, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 56 for VF 4 For instruction:   %valB = load i64, i64* %inB, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 112 for VF 8 For instruction:   %valB = load i64, i64* %inB, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 224 for VF 16 For instruction:   %valB = load i64, i64* %inB, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction:   %valB = load i64, i64* %inB, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction:   %valB = load i64, i64* %inB, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 16 for VF 4 For instruction:   %valB = load i64, i64* %inB, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 32 for VF 8 For instruction:   %valB = load i64, i64* %inB, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 64 for VF 16 For instruction:   %valB = load i64, i64* %inB, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 128 for VF 32 For instruction:   %valB = load i64, i64* %inB, align 8
 ;
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i64, i64* %inB, align 8
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction:   %valB = load i64, i64* %inB, align 8
diff --git a/llvm/test/Analysis/CostModel/X86/gather-i8-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/gather-i8-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/gather-i8-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/gather-i8-with-i8-index.ll
@@ -17,45 +17,45 @@
 ; CHECK: LV: Checking a loop in "test"
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i8, i8* %inB, align 1
-; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction:   %valB = load i8, i8* %inB, align 1
-; SSE2: LV: Found an estimated cost of 59 for VF 4 For instruction:   %valB = load i8, i8* %inB, align 1
-; SSE2: LV: Found an estimated cost of 119 for VF 8 For instruction:   %valB = load i8, i8* %inB, align 1
-; SSE2: LV: Found an estimated cost of 239 for VF 16 For instruction:   %valB = load i8, i8* %inB, align 1
+; SSE2: LV: Found an estimated cost of 9 for VF 2 For instruction:   %valB = load i8, i8* %inB, align 1
+; SSE2: LV: Found an estimated cost of 19 for VF 4 For instruction:   %valB = load i8, i8* %inB, align 1
+; SSE2: LV: Found an estimated cost of 39 for VF 8 For instruction:   %valB = load i8, i8* %inB, align 1
+; SSE2: LV: Found an estimated cost of 79 for VF 16 For instruction:   %valB = load i8, i8* %inB, align 1
 ;
 ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i8, i8* %inB, align 1
-; SSE42: LV: Found an estimated cost of 29 for VF 2 For instruction:   %valB = load i8, i8* %inB, align 1
-; SSE42: LV: Found an estimated cost of 59 for VF 4 For instruction:   %valB = load i8, i8* %inB, align 1
-; SSE42: LV: Found an estimated cost of 119 for VF 8 For instruction:   %valB = load i8, i8* %inB, align 1
-; SSE42: LV: Found an estimated cost of 239 for VF 16 For instruction:   %valB = load i8, i8* %inB, align 1
+; SSE42: LV: Found an estimated cost of 9 for VF 2 For instruction:   %valB = load i8, i8* %inB, align 1
+; SSE42: LV: Found an estimated cost of 19 for VF 4 For instruction:   %valB = load i8, i8* %inB, align 1
+; SSE42: LV: Found an estimated cost of 39 for VF 8 For instruction:   %valB = load i8, i8* %inB, align 1
+; SSE42: LV: Found an estimated cost of 79 for VF 16 For instruction:   %valB = load i8, i8* %inB, align 1
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX1: LV: Found an estimated cost of 108 for VF 8 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX1: LV: Found an estimated cost of 216 for VF 16 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX1: LV: Found an estimated cost of 434 for VF 32 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX1: LV: Found an estimated cost of 6 for VF 2 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX1: LV: Found an estimated cost of 14 for VF 4 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX1: LV: Found an estimated cost of 28 for VF 8 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX1: LV: Found an estimated cost of 56 for VF 16 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX1: LV: Found an estimated cost of 114 for VF 32 For instruction:   %valB = load i8, i8* %inB, align 1
 ;
 ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 216 for VF 16 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 434 for VF 32 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 14 for VF 4 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 28 for VF 8 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 56 for VF 16 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 114 for VF 32 For instruction:   %valB = load i8, i8* %inB, align 1
 ;
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX2-FASTGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX2-FASTGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX2-FASTGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX2-FASTGATHER: LV: Found an estimated cost of 216 for VF 16 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX2-FASTGATHER: LV: Found an estimated cost of 434 for VF 32 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX2-FASTGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX2-FASTGATHER: LV: Found an estimated cost of 14 for VF 4 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX2-FASTGATHER: LV: Found an estimated cost of 28 for VF 8 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX2-FASTGATHER: LV: Found an estimated cost of 56 for VF 16 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX2-FASTGATHER: LV: Found an estimated cost of 114 for VF 32 For instruction:   %valB = load i8, i8* %inB, align 1
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX512: LV: Found an estimated cost of 26 for VF 2 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX512: LV: Found an estimated cost of 54 for VF 4 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX512: LV: Found an estimated cost of 110 for VF 8 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX512: LV: Found an estimated cost of 220 for VF 16 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX512: LV: Found an estimated cost of 442 for VF 32 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX512: LV: Found an estimated cost of 884 for VF 64 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX512: LV: Found an estimated cost of 6 for VF 2 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX512: LV: Found an estimated cost of 14 for VF 4 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX512: LV: Found an estimated cost of 30 for VF 8 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX512: LV: Found an estimated cost of 60 for VF 16 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX512: LV: Found an estimated cost of 122 for VF 32 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX512: LV: Found an estimated cost of 244 for VF 64 For instruction:   %valB = load i8, i8* %inB, align 1
 ;
 ; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   %valB = load i8, i8* %inB, align 4
 define void @test() {
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll
@@ -18,13 +18,13 @@
 ; SSE2: LV: Found an estimated cost of 34 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX1: LV: Found an estimated cost of 7 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX1: LV: Found an estimated cost of 6 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 11 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 25 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 50 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 7 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 11 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 25 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 50 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/scatter-i16-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/scatter-i16-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/scatter-i16-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/scatter-i16-with-i8-index.ll
@@ -17,45 +17,45 @@
 ; CHECK: LV: Checking a loop in "test"
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %valB, i16* %out, align 2
-; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction:   store i16 %valB, i16* %out, align 2
-; SSE2: LV: Found an estimated cost of 56 for VF 4 For instruction:   store i16 %valB, i16* %out, align 2
-; SSE2: LV: Found an estimated cost of 112 for VF 8 For instruction:   store i16 %valB, i16* %out, align 2
-; SSE2: LV: Found an estimated cost of 224 for VF 16 For instruction:   store i16 %valB, i16* %out, align 2
+; SSE2: LV: Found an estimated cost of 8 for VF 2 For instruction:   store i16 %valB, i16* %out, align 2
+; SSE2: LV: Found an estimated cost of 16 for VF 4 For instruction:   store i16 %valB, i16* %out, align 2
+; SSE2: LV: Found an estimated cost of 32 for VF 8 For instruction:   store i16 %valB, i16* %out, align 2
+; SSE2: LV: Found an estimated cost of 64 for VF 16 For instruction:   store i16 %valB, i16* %out, align 2
 ;
 ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %valB, i16* %out, align 2
-; SSE42: LV: Found an estimated cost of 28 for VF 2 For instruction:   store i16 %valB, i16* %out, align 2
-; SSE42: LV: Found an estimated cost of 56 for VF 4 For instruction:   store i16 %valB, i16* %out, align 2
-; SSE42: LV: Found an estimated cost of 112 for VF 8 For instruction:   store i16 %valB, i16* %out, align 2
-; SSE42: LV: Found an estimated cost of 224 for VF 16 For instruction:   store i16 %valB, i16* %out, align 2
+; SSE42: LV: Found an estimated cost of 8 for VF 2 For instruction:   store i16 %valB, i16* %out, align 2
+; SSE42: LV: Found an estimated cost of 16 for VF 4 For instruction:   store i16 %valB, i16* %out, align 2
+; SSE42: LV: Found an estimated cost of 32 for VF 8 For instruction:   store i16 %valB, i16* %out, align 2
+; SSE42: LV: Found an estimated cost of 64 for VF 16 For instruction:   store i16 %valB, i16* %out, align 2
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX1: LV: Found an estimated cost of 108 for VF 8 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX1: LV: Found an estimated cost of 224 for VF 16 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX1: LV: Found an estimated cost of 448 for VF 32 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX1: LV: Found an estimated cost of 6 for VF 2 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX1: LV: Found an estimated cost of 14 for VF 4 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX1: LV: Found an estimated cost of 28 for VF 8 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX1: LV: Found an estimated cost of 64 for VF 16 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX1: LV: Found an estimated cost of 128 for VF 32 For instruction:   store i16 %valB, i16* %out, align 2
 ;
 ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 224 for VF 16 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 14 for VF 4 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 28 for VF 8 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 64 for VF 16 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 128 for VF 32 For instruction:   store i16 %valB, i16* %out, align 2
 ;
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX2-FASTGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX2-FASTGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX2-FASTGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX2-FASTGATHER: LV: Found an estimated cost of 224 for VF 16 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX2-FASTGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX2-FASTGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX2-FASTGATHER: LV: Found an estimated cost of 14 for VF 4 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX2-FASTGATHER: LV: Found an estimated cost of 28 for VF 8 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX2-FASTGATHER: LV: Found an estimated cost of 64 for VF 16 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX2-FASTGATHER: LV: Found an estimated cost of 128 for VF 32 For instruction:   store i16 %valB, i16* %out, align 2
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX512: LV: Found an estimated cost of 26 for VF 2 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX512: LV: Found an estimated cost of 54 for VF 4 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX512: LV: Found an estimated cost of 110 for VF 8 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX512: LV: Found an estimated cost of 228 for VF 16 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX512: LV: Found an estimated cost of 464 for VF 32 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX512: LV: Found an estimated cost of 928 for VF 64 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX512: LV: Found an estimated cost of 6 for VF 2 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX512: LV: Found an estimated cost of 14 for VF 4 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX512: LV: Found an estimated cost of 30 for VF 8 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX512: LV: Found an estimated cost of 68 for VF 16 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX512: LV: Found an estimated cost of 144 for VF 32 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX512: LV: Found an estimated cost of 288 for VF 64 For instruction:   store i16 %valB, i16* %out, align 2
 ;
 ; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   store i16 %valB, i16* %out
 define void @test() {
diff --git a/llvm/test/Analysis/CostModel/X86/scatter-i32-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/scatter-i32-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/scatter-i32-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/scatter-i32-with-i8-index.ll
@@ -17,37 +17,37 @@
 ; CHECK: LV: Checking a loop in "test"
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %valB, i32* %out, align 4
-; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction:   store i32 %valB, i32* %out, align 4
-; SSE2: LV: Found an estimated cost of 59 for VF 4 For instruction:   store i32 %valB, i32* %out, align 4
-; SSE2: LV: Found an estimated cost of 118 for VF 8 For instruction:   store i32 %valB, i32* %out, align 4
-; SSE2: LV: Found an estimated cost of 236 for VF 16 For instruction:   store i32 %valB, i32* %out, align 4
+; SSE2: LV: Found an estimated cost of 9 for VF 2 For instruction:   store i32 %valB, i32* %out, align 4
+; SSE2: LV: Found an estimated cost of 19 for VF 4 For instruction:   store i32 %valB, i32* %out, align 4
+; SSE2: LV: Found an estimated cost of 38 for VF 8 For instruction:   store i32 %valB, i32* %out, align 4
+; SSE2: LV: Found an estimated cost of 76 for VF 16 For instruction:   store i32 %valB, i32* %out, align 4
 ;
 ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %valB, i32* %out, align 4
-; SSE42: LV: Found an estimated cost of 29 for VF 2 For instruction:   store i32 %valB, i32* %out, align 4
-; SSE42: LV: Found an estimated cost of 59 for VF 4 For instruction:   store i32 %valB, i32* %out, align 4
-; SSE42: LV: Found an estimated cost of 118 for VF 8 For instruction:   store i32 %valB, i32* %out, align 4
-; SSE42: LV: Found an estimated cost of 236 for VF 16 For instruction:   store i32 %valB, i32* %out, align 4
+; SSE42: LV: Found an estimated cost of 9 for VF 2 For instruction:   store i32 %valB, i32* %out, align 4
+; SSE42: LV: Found an estimated cost of 19 for VF 4 For instruction:   store i32 %valB, i32* %out, align 4
+; SSE42: LV: Found an estimated cost of 38 for VF 8 For instruction:   store i32 %valB, i32* %out, align 4
+; SSE42: LV: Found an estimated cost of 76 for VF 16 For instruction:   store i32 %valB, i32* %out, align 4
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %valB, i32* %out, align 4
-; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction:   store i32 %valB, i32* %out, align 4
-; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction:   store i32 %valB, i32* %out, align 4
-; AVX1: LV: Found an estimated cost of 112 for VF 8 For instruction:   store i32 %valB, i32* %out, align 4
-; AVX1: LV: Found an estimated cost of 224 for VF 16 For instruction:   store i32 %valB, i32* %out, align 4
-; AVX1: LV: Found an estimated cost of 448 for VF 32 For instruction:   store i32 %valB, i32* %out, align 4
+; AVX1: LV: Found an estimated cost of 6 for VF 2 For instruction:   store i32 %valB, i32* %out, align 4
+; AVX1: LV: Found an estimated cost of 14 for VF 4 For instruction:   store i32 %valB, i32* %out, align 4
+; AVX1: LV: Found an estimated cost of 32 for VF 8 For instruction:   store i32 %valB, i32* %out, align 4
+; AVX1: LV: Found an estimated cost of 64 for VF 16 For instruction:   store i32 %valB, i32* %out, align 4
+; AVX1: LV: Found an estimated cost of 128 for VF 32 For instruction:   store i32 %valB, i32* %out, align 4
 ;
 ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %valB, i32* %out, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction:   store i32 %valB, i32* %out, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction:   store i32 %valB, i32* %out, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 112 for VF 8 For instruction:   store i32 %valB, i32* %out, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 224 for VF 16 For instruction:   store i32 %valB, i32* %out, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction:   store i32 %valB, i32* %out, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction:   store i32 %valB, i32* %out, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 14 for VF 4 For instruction:   store i32 %valB, i32* %out, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 32 for VF 8 For instruction:   store i32 %valB, i32* %out, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 64 for VF 16 For instruction:   store i32 %valB, i32* %out, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 128 for VF 32 For instruction:   store i32 %valB, i32* %out, align 4
 ;
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %valB, i32* %out, align 4
-; AVX2-FASTGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction:   store i32 %valB, i32* %out, align 4
-; AVX2-FASTGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction:   store i32 %valB, i32* %out, align 4
-; AVX2-FASTGATHER: LV: Found an estimated cost of 112 for VF 8 For instruction:   store i32 %valB, i32* %out, align 4
-; AVX2-FASTGATHER: LV: Found an estimated cost of 224 for VF 16 For instruction:   store i32 %valB, i32* %out, align 4
-; AVX2-FASTGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction:   store i32 %valB, i32* %out, align 4
+; AVX2-FASTGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction:   store i32 %valB, i32* %out, align 4
+; AVX2-FASTGATHER: LV: Found an estimated cost of 14 for VF 4 For instruction:   store i32 %valB, i32* %out, align 4
+; AVX2-FASTGATHER: LV: Found an estimated cost of 32 for VF 8 For instruction:   store i32 %valB, i32* %out, align 4
+; AVX2-FASTGATHER: LV: Found an estimated cost of 64 for VF 16 For instruction:   store i32 %valB, i32* %out, align 4
+; AVX2-FASTGATHER: LV: Found an estimated cost of 128 for VF 32 For instruction:   store i32 %valB, i32* %out, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %valB, i32* %out
 ; AVX512: LV: Found an estimated cost of 6 for VF 2 For instruction:   store i32 %valB, i32* %out
diff --git a/llvm/test/Analysis/CostModel/X86/scatter-i64-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/scatter-i64-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/scatter-i64-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/scatter-i64-with-i8-index.ll
@@ -17,37 +17,37 @@
 ; CHECK: LV: Checking a loop in "test"
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %valB, i64* %out, align 8
-; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction:   store i64 %valB, i64* %out, align 8
-; SSE2: LV: Found an estimated cost of 58 for VF 4 For instruction:   store i64 %valB, i64* %out, align 8
-; SSE2: LV: Found an estimated cost of 116 for VF 8 For instruction:   store i64 %valB, i64* %out, align 8
-; SSE2: LV: Found an estimated cost of 232 for VF 16 For instruction:   store i64 %valB, i64* %out, align 8
+; SSE2: LV: Found an estimated cost of 9 for VF 2 For instruction:   store i64 %valB, i64* %out, align 8
+; SSE2: LV: Found an estimated cost of 18 for VF 4 For instruction:   store i64 %valB, i64* %out, align 8
+; SSE2: LV: Found an estimated cost of 36 for VF 8 For instruction:   store i64 %valB, i64* %out, align 8
+; SSE2: LV: Found an estimated cost of 72 for VF 16 For instruction:   store i64 %valB, i64* %out, align 8
 ;
 ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %valB, i64* %out, align 8
-; SSE42: LV: Found an estimated cost of 29 for VF 2 For instruction:   store i64 %valB, i64* %out, align 8
-; SSE42: LV: Found an estimated cost of 58 for VF 4 For instruction:   store i64 %valB, i64* %out, align 8
-; SSE42: LV: Found an estimated cost of 116 for VF 8 For instruction:   store i64 %valB, i64* %out, align 8
-; SSE42: LV: Found an estimated cost of 232 for VF 16 For instruction:   store i64 %valB, i64* %out, align 8
+; SSE42: LV: Found an estimated cost of 9 for VF 2 For instruction:   store i64 %valB, i64* %out, align 8
+; SSE42: LV: Found an estimated cost of 18 for VF 4 For instruction:   store i64 %valB, i64* %out, align 8
+; SSE42: LV: Found an estimated cost of 36 for VF 8 For instruction:   store i64 %valB, i64* %out, align 8
+; SSE42: LV: Found an estimated cost of 72 for VF 16 For instruction:   store i64 %valB, i64* %out, align 8
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %valB, i64* %out, align 8
-; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction:   store i64 %valB, i64* %out, align 8
-; AVX1: LV: Found an estimated cost of 56 for VF 4 For instruction:   store i64 %valB, i64* %out, align 8
-; AVX1: LV: Found an estimated cost of 112 for VF 8 For instruction:   store i64 %valB, i64* %out, align 8
-; AVX1: LV: Found an estimated cost of 224 for VF 16 For instruction:   store i64 %valB, i64* %out, align 8
-; AVX1: LV: Found an estimated cost of 448 for VF 32 For instruction:   store i64 %valB, i64* %out, align 8
+; AVX1: LV: Found an estimated cost of 6 for VF 2 For instruction:   store i64 %valB, i64* %out, align 8
+; AVX1: LV: Found an estimated cost of 16 for VF 4 For instruction:   store i64 %valB, i64* %out, align 8
+; AVX1: LV: Found an estimated cost of 32 for VF 8 For instruction:   store i64 %valB, i64* %out, align 8
+; AVX1: LV: Found an estimated cost of 64 for VF 16 For instruction:   store i64 %valB, i64* %out, align 8
+; AVX1: LV: Found an estimated cost of 128 for VF 32 For instruction:   store i64 %valB, i64* %out, align 8
 ;
 ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %valB, i64* %out, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction:   store i64 %valB, i64* %out, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 56 for VF 4 For instruction:   store i64 %valB, i64* %out, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 112 for VF 8 For instruction:   store i64 %valB, i64* %out, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 224 for VF 16 For instruction:   store i64 %valB, i64* %out, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction:   store i64 %valB, i64* %out, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction:   store i64 %valB, i64* %out, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 16 for VF 4 For instruction:   store i64 %valB, i64* %out, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 32 for VF 8 For instruction:   store i64 %valB, i64* %out, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 64 for VF 16 For instruction:   store i64 %valB, i64* %out, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 128 for VF 32 For instruction:   store i64 %valB, i64* %out, align 8
 ;
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %valB, i64* %out, align 8
-; AVX2-FASTGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction:   store i64 %valB, i64* %out, align 8
-; AVX2-FASTGATHER: LV: Found an estimated cost of 56 for VF 4 For instruction:   store i64 %valB, i64* %out, align 8
-; AVX2-FASTGATHER: LV: Found an estimated cost of 112 for VF 8 For instruction:   store i64 %valB, i64* %out, align 8
-; AVX2-FASTGATHER: LV: Found an estimated cost of 224 for VF 16 For instruction:   store i64 %valB, i64* %out, align 8
-; AVX2-FASTGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction:   store i64 %valB, i64* %out, align 8
+; AVX2-FASTGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction:   store i64 %valB, i64* %out, align 8
+; AVX2-FASTGATHER: LV: Found an estimated cost of 16 for VF 4 For instruction:   store i64 %valB, i64* %out, align 8
+; AVX2-FASTGATHER: LV: Found an estimated cost of 32 for VF 8 For instruction:   store i64 %valB, i64* %out, align 8
+; AVX2-FASTGATHER: LV: Found an estimated cost of 64 for VF 16 For instruction:   store i64 %valB, i64* %out, align 8
+; AVX2-FASTGATHER: LV: Found an estimated cost of 128 for VF 32 For instruction:   store i64 %valB, i64* %out, align 8
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %valB, i64* %out, align 8
 ; AVX512: LV: Found an estimated cost of 6 for VF 2 For instruction:   store i64 %valB, i64* %out, align 8
diff --git a/llvm/test/Analysis/CostModel/X86/scatter-i8-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/scatter-i8-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/scatter-i8-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/scatter-i8-with-i8-index.ll
@@ -17,45 +17,45 @@
 ; CHECK: LV: Checking a loop in "test"
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %valB, i8* %out, align 1
-; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction:   store i8 %valB, i8* %out, align 1
-; SSE2: LV: Found an estimated cost of 59 for VF 4 For instruction:   store i8 %valB, i8* %out, align 1
-; SSE2: LV: Found an estimated cost of 119 for VF 8 For instruction:   store i8 %valB, i8* %out, align 1
-; SSE2: LV: Found an estimated cost of 239 for VF 16 For instruction:   store i8 %valB, i8* %out, align 1
+; SSE2: LV: Found an estimated cost of 9 for VF 2 For instruction:   store i8 %valB, i8* %out, align 1
+; SSE2: LV: Found an estimated cost of 19 for VF 4 For instruction:   store i8 %valB, i8* %out, align 1
+; SSE2: LV: Found an estimated cost of 39 for VF 8 For instruction:   store i8 %valB, i8* %out, align 1
+; SSE2: LV: Found an estimated cost of 79 for VF 16 For instruction:   store i8 %valB, i8* %out, align 1
 ;
 ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %valB, i8* %out, align 1
-; SSE42: LV: Found an estimated cost of 29 for VF 2 For instruction:   store i8 %valB, i8* %out, align 1
-; SSE42: LV: Found an estimated cost of 59 for VF 4 For instruction:   store i8 %valB, i8* %out, align 1
-; SSE42: LV: Found an estimated cost of 119 for VF 8 For instruction:   store i8 %valB, i8* %out, align 1
-; SSE42: LV: Found an estimated cost of 239 for VF 16 For instruction:   store i8 %valB, i8* %out, align 1
+; SSE42: LV: Found an estimated cost of 9 for VF 2 For instruction:   store i8 %valB, i8* %out, align 1
+; SSE42: LV: Found an estimated cost of 19 for VF 4 For instruction:   store i8 %valB, i8* %out, align 1
+; SSE42: LV: Found an estimated cost of 39 for VF 8 For instruction:   store i8 %valB, i8* %out, align 1
+; SSE42: LV: Found an estimated cost of 79 for VF 16 For instruction:   store i8 %valB, i8* %out, align 1
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX1: LV: Found an estimated cost of 108 for VF 8 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX1: LV: Found an estimated cost of 216 for VF 16 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX1: LV: Found an estimated cost of 448 for VF 32 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX1: LV: Found an estimated cost of 6 for VF 2 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX1: LV: Found an estimated cost of 14 for VF 4 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX1: LV: Found an estimated cost of 28 for VF 8 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX1: LV: Found an estimated cost of 56 for VF 16 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX1: LV: Found an estimated cost of 128 for VF 32 For instruction:   store i8 %valB, i8* %out, align 1
 ;
 ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 216 for VF 16 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 14 for VF 4 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 28 for VF 8 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 56 for VF 16 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 128 for VF 32 For instruction:   store i8 %valB, i8* %out, align 1
 ;
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX2-FASTGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX2-FASTGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX2-FASTGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX2-FASTGATHER: LV: Found an estimated cost of 216 for VF 16 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX2-FASTGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX2-FASTGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX2-FASTGATHER: LV: Found an estimated cost of 14 for VF 4 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX2-FASTGATHER: LV: Found an estimated cost of 28 for VF 8 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX2-FASTGATHER: LV: Found an estimated cost of 56 for VF 16 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX2-FASTGATHER: LV: Found an estimated cost of 128 for VF 32 For instruction:   store i8 %valB, i8* %out, align 1
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX512: LV: Found an estimated cost of 26 for VF 2 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX512: LV: Found an estimated cost of 54 for VF 4 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX512: LV: Found an estimated cost of 110 for VF 8 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX512: LV: Found an estimated cost of 220 for VF 16 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX512: LV: Found an estimated cost of 456 for VF 32 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX512: LV: Found an estimated cost of 928 for VF 64 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX512: LV: Found an estimated cost of 6 for VF 2 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX512: LV: Found an estimated cost of 14 for VF 4 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX512: LV: Found an estimated cost of 30 for VF 8 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX512: LV: Found an estimated cost of 60 for VF 16 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX512: LV: Found an estimated cost of 136 for VF 32 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX512: LV: Found an estimated cost of 288 for VF 64 For instruction:   store i8 %valB, i8* %out, align 1
 ;
 ; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   store i8 %valB, i8* %out
 define void @test() {
diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/gather-cost.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/gather-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/gather-cost.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -mtriple=x86_64-apple-macosx -S -mcpu=corei7-avx -enable-interleaved-mem-accesses=false < %s | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
@@ -17,6 +18,144 @@
 ; CHECK-NOT: x float>
 
 define void @_Z4testmm(i64 %size, i64 %offset) {
+; CHECK-LABEL: @_Z4testmm(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP53:%.*]] = icmp eq i64 [[SIZE:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP53]], label [[FOR_END:%.*]], label [[FOR_BODY_LR_PH:%.*]]
+; CHECK:       for.body.lr.ph:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SIZE]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[SIZE]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[SIZE]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[OFFSET:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i64> [[TMP4]], <i64 3, i64 3, i64 3, i64 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, <4 x i64> [[TMP5]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP6]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel, i64 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = fmul fast <4 x float> [[WIDE_MASKED_GATHER]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel2, i64 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x float>, <4 x float>* [[TMP13]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = fmul fast <4 x float> [[TMP10]], [[WIDE_LOAD3]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel3, i64 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP15]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast float* [[TMP16]] to <4 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>* [[TMP17]], align 4
+; CHECK-NEXT:    [[TMP18:%.*]] = fmul fast <4 x float> [[TMP14]], [[WIDE_LOAD4]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel4, i64 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP19]], i32 0
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast float* [[TMP20]] to <4 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x float>, <4 x float>* [[TMP21]], align 4
+; CHECK-NEXT:    [[TMP22:%.*]] = fmul fast <4 x float> [[TMP18]], [[WIDE_LOAD5]]
+; CHECK-NEXT:    [[TMP23]] = fadd fast <4 x float> [[VEC_PHI]], [[TMP22]]
+; CHECK-NEXT:    [[TMP24:%.*]] = add <4 x i64> [[TMP5]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, <4 x i64> [[TMP24]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP25]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; CHECK-NEXT:    [[TMP26:%.*]] = fmul fast <4 x float> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER6]]
+; CHECK-NEXT:    [[TMP27:%.*]] = fmul fast <4 x float> [[WIDE_LOAD3]], [[TMP26]]
+; CHECK-NEXT:    [[TMP28:%.*]] = fmul fast <4 x float> [[WIDE_LOAD4]], [[TMP27]]
+; CHECK-NEXT:    [[TMP29:%.*]] = fmul fast <4 x float> [[WIDE_LOAD5]], [[TMP28]]
+; CHECK-NEXT:    [[TMP30]] = fadd fast <4 x float> [[VEC_PHI1]], [[TMP29]]
+; CHECK-NEXT:    [[TMP31:%.*]] = add <4 x i64> [[TMP5]], <i64 2, i64 2, i64 2, i64 2>
+; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, <4 x i64> [[TMP31]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER7:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP32]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; CHECK-NEXT:    [[TMP33:%.*]] = fmul fast <4 x float> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER7]]
+; CHECK-NEXT:    [[TMP34:%.*]] = fmul fast <4 x float> [[WIDE_LOAD3]], [[TMP33]]
+; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast <4 x float> [[WIDE_LOAD4]], [[TMP34]]
+; CHECK-NEXT:    [[TMP36:%.*]] = fmul fast <4 x float> [[WIDE_LOAD5]], [[TMP35]]
+; CHECK-NEXT:    [[TMP37]] = fadd fast <4 x float> [[VEC_PHI2]], [[TMP36]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT:    [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP39:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP37]])
+; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP30]])
+; CHECK-NEXT:    [[TMP41:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP23]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SIZE]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX8:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[TMP40]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX9:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[R_057:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD10:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[G_056:%.*]] = phi float [ [[BC_MERGE_RDX8]], [[SCALAR_PH]] ], [ [[ADD20:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[V_055:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[B_054:%.*]] = phi float [ [[BC_MERGE_RDX9]], [[SCALAR_PH]] ], [ [[ADD30:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ADD:%.*]] = add i64 [[V_055]], [[OFFSET]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i64 [[ADD]], 3
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[MUL]]
+; CHECK-NEXT:    [[TMP42:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel, i64 0, i64 [[V_055]]
+; CHECK-NEXT:    [[TMP43:%.*]] = load float, float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[MUL3:%.*]] = fmul fast float [[TMP42]], [[TMP43]]
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel2, i64 0, i64 [[V_055]]
+; CHECK-NEXT:    [[TMP44:%.*]] = load float, float* [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    [[MUL5:%.*]] = fmul fast float [[MUL3]], [[TMP44]]
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel3, i64 0, i64 [[V_055]]
+; CHECK-NEXT:    [[TMP45:%.*]] = load float, float* [[ARRAYIDX6]], align 4
+; CHECK-NEXT:    [[MUL7:%.*]] = fmul fast float [[MUL5]], [[TMP45]]
+; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel4, i64 0, i64 [[V_055]]
+; CHECK-NEXT:    [[TMP46:%.*]] = load float, float* [[ARRAYIDX8]], align 4
+; CHECK-NEXT:    [[MUL9:%.*]] = fmul fast float [[MUL7]], [[TMP46]]
+; CHECK-NEXT:    [[ADD10]] = fadd fast float [[R_057]], [[MUL9]]
+; CHECK-NEXT:    [[ARRAYIDX_SUM:%.*]] = add i64 [[MUL]], 1
+; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[ARRAYIDX_SUM]]
+; CHECK-NEXT:    [[TMP47:%.*]] = load float, float* [[ARRAYIDX11]], align 4
+; CHECK-NEXT:    [[MUL13:%.*]] = fmul fast float [[TMP43]], [[TMP47]]
+; CHECK-NEXT:    [[MUL15:%.*]] = fmul fast float [[TMP44]], [[MUL13]]
+; CHECK-NEXT:    [[MUL17:%.*]] = fmul fast float [[TMP45]], [[MUL15]]
+; CHECK-NEXT:    [[MUL19:%.*]] = fmul fast float [[TMP46]], [[MUL17]]
+; CHECK-NEXT:    [[ADD20]] = fadd fast float [[G_056]], [[MUL19]]
+; CHECK-NEXT:    [[ARRAYIDX_SUM52:%.*]] = add i64 [[MUL]], 2
+; CHECK-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[ARRAYIDX_SUM52]]
+; CHECK-NEXT:    [[TMP48:%.*]] = load float, float* [[ARRAYIDX21]], align 4
+; CHECK-NEXT:    [[MUL23:%.*]] = fmul fast float [[TMP43]], [[TMP48]]
+; CHECK-NEXT:    [[MUL25:%.*]] = fmul fast float [[TMP44]], [[MUL23]]
+; CHECK-NEXT:    [[MUL27:%.*]] = fmul fast float [[TMP45]], [[MUL25]]
+; CHECK-NEXT:    [[MUL29:%.*]] = fmul fast float [[TMP46]], [[MUL27]]
+; CHECK-NEXT:    [[ADD30]] = fadd fast float [[B_054]], [[MUL29]]
+; CHECK-NEXT:    [[INC]] = add i64 [[V_055]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INC]], [[SIZE]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_COND_FOR_END_CRIT_EDGE]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK:       for.cond.for.end_crit_edge:
+; CHECK-NEXT:    [[ADD30_LCSSA:%.*]] = phi float [ [[ADD30]], [[FOR_BODY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD20_LCSSA:%.*]] = phi float [ [[ADD20]], [[FOR_BODY]] ], [ [[TMP40]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD10_LCSSA:%.*]] = phi float [ [[ADD10]], [[FOR_BODY]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[PHITMP:%.*]] = fptoui float [[ADD10_LCSSA]] to i8
+; CHECK-NEXT:    [[PHITMP60:%.*]] = fptoui float [[ADD20_LCSSA]] to i8
+; CHECK-NEXT:    [[PHITMP61:%.*]] = fptoui float [[ADD30_LCSSA]] to i8
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i8 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[G_0_LCSSA:%.*]] = phi i8 [ [[PHITMP60]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[B_0_LCSSA:%.*]] = phi i8 [ [[PHITMP61]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    store i8 [[R_0_LCSSA]], i8* @r_, align 1
+; CHECK-NEXT:    store i8 [[G_0_LCSSA]], i8* @g_, align 1
+; CHECK-NEXT:    store i8 [[B_0_LCSSA]], i8* @b_, align 1
+; CHECK-NEXT:    ret void
+;
 entry:
   %cmp53 = icmp eq i64 %size, 0
   br i1 %cmp53, label %for.end, label %for.body.lr.ph
diff --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
@@ -328,13 +328,21 @@
 ; CHECK-NEXT:    call void @init(i32* [[BASE]])
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32*> poison, i32* [[BASE]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32*> [[BROADCAST_SPLATINSERT]], <4 x i32*> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32*> poison, i32* [[BASE]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32*> [[BROADCAST_SPLATINSERT4]], <4 x i32*> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i32*> poison, i32* [[BASE]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i32*> [[BROADCAST_SPLATINSERT7]], <4 x i32*> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <4 x i32*> poison, i32* [[BASE]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT11:%.*]] = shufflevector <4 x i32*> [[BROADCAST_SPLATINSERT10]], <4 x i32*> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP100:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP101:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP102:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP103:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP68:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP69:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP70:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP71:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
@@ -399,63 +407,35 @@
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
 ; CHECK-NEXT:    [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3
-; CHECK-NEXT:    [[TMP64:%.*]] = load i32, i32* [[BASE]], align 4
-; CHECK-NEXT:    [[TMP65:%.*]] = load i32, i32* [[BASE]], align 4
-; CHECK-NEXT:    [[TMP66:%.*]] = load i32, i32* [[BASE]], align 4
-; CHECK-NEXT:    [[TMP67:%.*]] = load i32, i32* [[BASE]], align 4
-; CHECK-NEXT:    [[TMP68:%.*]] = insertelement <4 x i32> poison, i32 [[TMP64]], i32 0
-; CHECK-NEXT:    [[TMP69:%.*]] = insertelement <4 x i32> [[TMP68]], i32 [[TMP65]], i32 1
-; CHECK-NEXT:    [[TMP70:%.*]] = insertelement <4 x i32> [[TMP69]], i32 [[TMP66]], i32 2
-; CHECK-NEXT:    [[TMP71:%.*]] = insertelement <4 x i32> [[TMP70]], i32 [[TMP67]], i32 3
-; CHECK-NEXT:    [[TMP72:%.*]] = load i32, i32* [[BASE]], align 4
-; CHECK-NEXT:    [[TMP73:%.*]] = load i32, i32* [[BASE]], align 4
-; CHECK-NEXT:    [[TMP74:%.*]] = load i32, i32* [[BASE]], align 4
-; CHECK-NEXT:    [[TMP75:%.*]] = load i32, i32* [[BASE]], align 4
-; CHECK-NEXT:    [[TMP76:%.*]] = insertelement <4 x i32> poison, i32 [[TMP72]], i32 0
-; CHECK-NEXT:    [[TMP77:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP73]], i32 1
-; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <4 x i32> [[TMP77]], i32 [[TMP74]], i32 2
-; CHECK-NEXT:    [[TMP79:%.*]] = insertelement <4 x i32> [[TMP78]], i32 [[TMP75]], i32 3
-; CHECK-NEXT:    [[TMP80:%.*]] = load i32, i32* [[BASE]], align 4
-; CHECK-NEXT:    [[TMP81:%.*]] = load i32, i32* [[BASE]], align 4
-; CHECK-NEXT:    [[TMP82:%.*]] = load i32, i32* [[BASE]], align 4
-; CHECK-NEXT:    [[TMP83:%.*]] = load i32, i32* [[BASE]], align 4
-; CHECK-NEXT:    [[TMP84:%.*]] = insertelement <4 x i32> poison, i32 [[TMP80]], i32 0
-; CHECK-NEXT:    [[TMP85:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP81]], i32 1
-; CHECK-NEXT:    [[TMP86:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP82]], i32 2
-; CHECK-NEXT:    [[TMP87:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP83]], i32 3
-; CHECK-NEXT:    [[TMP88:%.*]] = load i32, i32* [[BASE]], align 4
-; CHECK-NEXT:    [[TMP89:%.*]] = load i32, i32* [[BASE]], align 4
-; CHECK-NEXT:    [[TMP90:%.*]] = load i32, i32* [[BASE]], align 4
-; CHECK-NEXT:    [[TMP91:%.*]] = load i32, i32* [[BASE]], align 4
-; CHECK-NEXT:    [[TMP92:%.*]] = insertelement <4 x i32> poison, i32 [[TMP88]], i32 0
-; CHECK-NEXT:    [[TMP93:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP89]], i32 1
-; CHECK-NEXT:    [[TMP94:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP90]], i32 2
-; CHECK-NEXT:    [[TMP95:%.*]] = insertelement <4 x i32> [[TMP94]], i32 [[TMP91]], i32 3
-; CHECK-NEXT:    [[TMP96:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP97:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP98:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP99:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[TMP71]], <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[PREDPHI4:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP79]], <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[PREDPHI5:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP87]], <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[PREDPHI6:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP95]], <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP100]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
-; CHECK-NEXT:    [[TMP101]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI4]]
-; CHECK-NEXT:    [[TMP102]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI5]]
-; CHECK-NEXT:    [[TMP103]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI6]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[BROADCAST_SPLAT]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[BROADCAST_SPLAT5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[BROADCAST_SPLAT8]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER12:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[BROADCAST_SPLAT11]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT:    [[TMP64:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP65:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP66:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP67:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_GATHER]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[PREDPHI13:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_GATHER6]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[PREDPHI14:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_GATHER9]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[PREDPHI15:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_GATHER12]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP68]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT:    [[TMP69]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI13]]
+; CHECK-NEXT:    [[TMP70]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI14]]
+; CHECK-NEXT:    [[TMP71]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI15]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP104:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP104]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[TMP72:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP72]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP101]], [[TMP100]]
-; CHECK-NEXT:    [[BIN_RDX7:%.*]] = add <4 x i32> [[TMP102]], [[BIN_RDX]]
-; CHECK-NEXT:    [[BIN_RDX8:%.*]] = add <4 x i32> [[TMP103]], [[BIN_RDX7]]
-; CHECK-NEXT:    [[TMP105:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX8]])
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP69]], [[TMP68]]
+; CHECK-NEXT:    [[BIN_RDX16:%.*]] = add <4 x i32> [[TMP70]], [[BIN_RDX]]
+; CHECK-NEXT:    [[BIN_RDX17:%.*]] = add <4 x i32> [[TMP71]], [[BIN_RDX16]]
+; CHECK-NEXT:    [[TMP73:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX17]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP105]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP73]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
@@ -473,7 +453,7 @@
 ; CHECK-NEXT:    [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094
 ; CHECK-NEXT:    br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       loop_exit:
-; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP105]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP73]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[ACCUM_NEXT_LCSSA]]
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll b/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -12,9 +13,29 @@
 ;    }
 ;}
 
-;CHECK-LABEL: @loop(
-;CHECK-NOT: <4 x i32>
 define void @loop(i32* nocapture %a, i32* nocapture %b) nounwind uwtable {
+; CHECK-LABEL: @loop(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[IDXPROM3:%.*]] = sext i32 [[TMP1]] to i64
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IDXPROM3]]
+; CHECK-NEXT:    store i32 [[TMP0]], i32* [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 512
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
@@ -42,9 +63,41 @@
 ; The same loop with parallel loop metadata added to the loop branch
 ; and the memory instructions.
 
-;CHECK-LABEL: @parallel_loop(
-;CHECK: <4 x i32>
 define void @parallel_loop(i32* nocapture %a, i32* nocapture %b) nounwind uwtable {
+; CHECK-LABEL: @parallel_loop(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4, !llvm.access.group !0
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !llvm.access.group !0
+; CHECK-NEXT:    [[TMP4:%.*]] = sext <4 x i32> [[WIDE_LOAD1]] to <4 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A]], <4 x i64> [[TMP4]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> [[WIDE_LOAD]], <4 x i32*> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>), !llvm.access.group !1
+; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4, !llvm.access.group !0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[WIDE_LOAD2]], <4 x i32>* [[TMP9]], align 4, !llvm.access.group !0
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
@@ -74,10 +127,30 @@
 ; The same loop with an illegal parallel loop metadata: the memory
 ; accesses refer to a different loop's identifier.
 
-;CHECK-LABEL: @mixed_metadata(
-;CHECK-NOT: <4 x i32>
 
 define void @mixed_metadata(i32* nocapture %a, i32* nocapture %b) nounwind uwtable {
+; CHECK-LABEL: @mixed_metadata(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4, !llvm.access.group !7
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4, !llvm.access.group !7
+; CHECK-NEXT:    [[IDXPROM3:%.*]] = sext i32 [[TMP1]] to i64
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IDXPROM3]]
+; CHECK-NEXT:    store i32 [[TMP0]], i32* [[ARRAYIDX4]], align 4, !llvm.access.group !8
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4, !llvm.access.group !7
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[ARRAYIDX2]], align 4, !llvm.access.group !7
+; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 512
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
@@ -480,8 +480,8 @@
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP52:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP53:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
@@ -493,62 +493,34 @@
 ; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 7
 ; CHECK-NEXT:    [[TMP8:%.*]] = udiv <4 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8>
 ; CHECK-NEXT:    [[TMP9:%.*]] = udiv <4 x i64> [[STEP_ADD]], <i64 8, i64 8, i64 8, i64 8>
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE:%.*]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i64> [[TMP8]], i32 1
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i64> [[TMP8]], i32 2
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i64> [[TMP8]], i32 3
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP16]]
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i64> [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i64> [[TMP9]], i32 1
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP20]]
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i64> [[TMP9]], i32 2
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP22]]
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP9]], i32 3
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP24]]
-; CHECK-NEXT:    [[TMP26:%.*]] = load i8, i8* [[TMP11]], align 1
-; CHECK-NEXT:    [[TMP27:%.*]] = load i8, i8* [[TMP13]], align 1
-; CHECK-NEXT:    [[TMP28:%.*]] = load i8, i8* [[TMP15]], align 1
-; CHECK-NEXT:    [[TMP29:%.*]] = load i8, i8* [[TMP17]], align 1
-; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <4 x i8> poison, i8 [[TMP26]], i32 0
-; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <4 x i8> [[TMP30]], i8 [[TMP27]], i32 1
-; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <4 x i8> [[TMP31]], i8 [[TMP28]], i32 2
-; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <4 x i8> [[TMP32]], i8 [[TMP29]], i32 3
-; CHECK-NEXT:    [[TMP34:%.*]] = load i8, i8* [[TMP19]], align 1
-; CHECK-NEXT:    [[TMP35:%.*]] = load i8, i8* [[TMP21]], align 1
-; CHECK-NEXT:    [[TMP36:%.*]] = load i8, i8* [[TMP23]], align 1
-; CHECK-NEXT:    [[TMP37:%.*]] = load i8, i8* [[TMP25]], align 1
-; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i8> poison, i8 [[TMP34]], i32 0
-; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i8> [[TMP38]], i8 [[TMP35]], i32 1
-; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <4 x i8> [[TMP39]], i8 [[TMP36]], i32 2
-; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <4 x i8> [[TMP40]], i8 [[TMP37]], i32 3
-; CHECK-NEXT:    [[TMP42:%.*]] = urem <4 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8>
-; CHECK-NEXT:    [[TMP43:%.*]] = urem <4 x i64> [[STEP_ADD]], <i64 8, i64 8, i64 8, i64 8>
-; CHECK-NEXT:    [[TMP44:%.*]] = trunc <4 x i64> [[TMP42]] to <4 x i8>
-; CHECK-NEXT:    [[TMP45:%.*]] = trunc <4 x i64> [[TMP43]] to <4 x i8>
-; CHECK-NEXT:    [[TMP46:%.*]] = lshr <4 x i8> [[TMP33]], [[TMP44]]
-; CHECK-NEXT:    [[TMP47:%.*]] = lshr <4 x i8> [[TMP41]], [[TMP45]]
-; CHECK-NEXT:    [[TMP48:%.*]] = and <4 x i8> [[TMP46]], <i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT:    [[TMP49:%.*]] = and <4 x i8> [[TMP47]], <i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT:    [[TMP50:%.*]] = zext <4 x i8> [[TMP48]] to <4 x i32>
-; CHECK-NEXT:    [[TMP51:%.*]] = zext <4 x i8> [[TMP49]] to <4 x i32>
-; CHECK-NEXT:    [[TMP52]] = add <4 x i32> [[VEC_PHI]], [[TMP50]]
-; CHECK-NEXT:    [[TMP53]] = add <4 x i32> [[VEC_PHI2]], [[TMP51]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE:%.*]], <4 x i64> [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], <4 x i64> [[TMP9]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> [[TMP10]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> [[TMP11]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
+; CHECK-NEXT:    [[TMP12:%.*]] = urem <4 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8>
+; CHECK-NEXT:    [[TMP13:%.*]] = urem <4 x i64> [[STEP_ADD]], <i64 8, i64 8, i64 8, i64 8>
+; CHECK-NEXT:    [[TMP14:%.*]] = trunc <4 x i64> [[TMP12]] to <4 x i8>
+; CHECK-NEXT:    [[TMP15:%.*]] = trunc <4 x i64> [[TMP13]] to <4 x i8>
+; CHECK-NEXT:    [[TMP16:%.*]] = lshr <4 x i8> [[WIDE_MASKED_GATHER]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = lshr <4 x i8> [[WIDE_MASKED_GATHER3]], [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = and <4 x i8> [[TMP16]], <i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[TMP19:%.*]] = and <4 x i8> [[TMP17]], <i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[TMP20:%.*]] = zext <4 x i8> [[TMP18]] to <4 x i32>
+; CHECK-NEXT:    [[TMP21:%.*]] = zext <4 x i8> [[TMP19]] to <4 x i32>
+; CHECK-NEXT:    [[TMP22]] = add <4 x i32> [[VEC_PHI]], [[TMP20]]
+; CHECK-NEXT:    [[TMP23]] = add <4 x i32> [[VEC_PHI2]], [[TMP21]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT:    [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP54]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP53]], [[TMP52]]
-; CHECK-NEXT:    [[TMP55:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP23]], [[TMP22]]
+; CHECK-NEXT:    [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP55]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -566,7 +538,7 @@
 ; CHECK-NEXT:    [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094
 ; CHECK-NEXT:    br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK:       loop_exit:
-; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LOOP]] ], [ [[TMP55]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LOOP]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[ACCUM_NEXT_LCSSA]]
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll
@@ -51,7 +51,7 @@
 ;       p[i][y] = (int*) (1 + q[i]);
 ;     }
 ; CHECK: test_nonconsecutive_store
-; CHECK: The Smallest and Widest types: 16 / 16 bits.
+; CHECK: The Smallest and Widest types: 16 / 64 bits.
 define void @test_nonconsecutive_store() nounwind ssp uwtable {
   br label %1
 
@@ -117,7 +117,7 @@
 
 ;; However, we should not take unconsecutive loads of pointers into account.
 ; CHECK: test_nonconsecutive_ptr_load
-; CHECK: LV: The Smallest and Widest types: 16 / 16 bits.
+; CHECK: LV: The Smallest and Widest types: 16 / 64 bits.
 define void @test_nonconsecutive_ptr_load() nounwind ssp uwtable {
   br label %1
 
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
@@ -805,45 +805,15 @@
 ; DISABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = extractelement <8 x i32> [[TMP0]], i32 0
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[TMP0]], i32 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP3]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP0]], i32 2
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP5]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP0]], i32 3
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP7]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP0]], i32 4
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP9]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP0]], i32 5
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP11]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[TMP0]], i32 6
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP13]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP15:%.*]] = extractelement <8 x i32> [[TMP0]], i32 7
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP15]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP17:%.*]] = load i8, i8* [[TMP2]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP18:%.*]] = load i8, i8* [[TMP4]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP19:%.*]] = load i8, i8* [[TMP6]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP20:%.*]] = load i8, i8* [[TMP8]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP21:%.*]] = load i8, i8* [[TMP10]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP22:%.*]] = load i8, i8* [[TMP12]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP23:%.*]] = load i8, i8* [[TMP14]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP24:%.*]] = load i8, i8* [[TMP16]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP25:%.*]] = insertelement <8 x i8> poison, i8 [[TMP17]], i32 0
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP26:%.*]] = insertelement <8 x i8> [[TMP25]], i8 [[TMP18]], i32 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP27:%.*]] = insertelement <8 x i8> [[TMP26]], i8 [[TMP19]], i32 2
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP28:%.*]] = insertelement <8 x i8> [[TMP27]], i8 [[TMP20]], i32 3
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP29:%.*]] = insertelement <8 x i8> [[TMP28]], i8 [[TMP21]], i32 4
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP30:%.*]] = insertelement <8 x i8> [[TMP29]], i8 [[TMP22]], i32 5
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP31:%.*]] = insertelement <8 x i8> [[TMP30]], i8 [[TMP23]], i32 6
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP32:%.*]] = insertelement <8 x i8> [[TMP31]], i8 [[TMP24]], i32 7
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP34:%.*]] = bitcast i8* [[TMP33]] to <8 x i8>*
-; DISABLED_MASKED_STRIDED-NEXT:    store <8 x i8> [[TMP32]], <8 x i8>* [[TMP34]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], <8 x i32> [[TMP0]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> [[TMP1]], i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <8 x i8>*
+; DISABLED_MASKED_STRIDED-NEXT:    store <8 x i8> [[WIDE_MASKED_GATHER]], <8 x i8>* [[TMP3]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; DISABLED_MASKED_STRIDED-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP35:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
-; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP35]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP4]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; DISABLED_MASKED_STRIDED:       for.end:
 ; DISABLED_MASKED_STRIDED-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll
@@ -29,46 +29,18 @@
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <4 x i16>*
 ; DISABLED_MASKED_STRIDED-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = shl nuw nsw <4 x i64> [[VEC_IND]], <i64 2, i64 2, i64 2, i64 2>
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[POINTS:%.*]], i64 [[TMP3]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP5]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP7]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP9]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP11:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 0
-; DISABLED_MASKED_STRIDED-NEXT:    store i16 [[TMP11]], i16* [[TMP4]], align 2
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP12:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 1
-; DISABLED_MASKED_STRIDED-NEXT:    store i16 [[TMP12]], i16* [[TMP6]], align 2
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP13:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
-; DISABLED_MASKED_STRIDED-NEXT:    store i16 [[TMP13]], i16* [[TMP8]], align 2
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP14:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
-; DISABLED_MASKED_STRIDED-NEXT:    store i16 [[TMP14]], i16* [[TMP10]], align 2
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i64 [[INDEX]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP16:%.*]] = bitcast i16* [[TMP15]] to <4 x i16>*
-; DISABLED_MASKED_STRIDED-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP16]], align 2
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP17:%.*]] = or <4 x i64> [[TMP2]], <i64 1, i64 1, i64 1, i64 1>
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP18:%.*]] = extractelement <4 x i64> [[TMP17]], i32 0
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP18]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP20:%.*]] = extractelement <4 x i64> [[TMP17]], i32 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP20]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP22:%.*]] = extractelement <4 x i64> [[TMP17]], i32 2
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP22]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP17]], i32 3
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP24]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP26:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i32 0
-; DISABLED_MASKED_STRIDED-NEXT:    store i16 [[TMP26]], i16* [[TMP19]], align 2
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP27:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i32 1
-; DISABLED_MASKED_STRIDED-NEXT:    store i16 [[TMP27]], i16* [[TMP21]], align 2
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP28:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i32 2
-; DISABLED_MASKED_STRIDED-NEXT:    store i16 [[TMP28]], i16* [[TMP23]], align 2
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP29:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i32 3
-; DISABLED_MASKED_STRIDED-NEXT:    store i16 [[TMP29]], i16* [[TMP25]], align 2
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[POINTS:%.*]], <4 x i64> [[TMP2]]
+; DISABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> [[WIDE_LOAD]], <4 x i16*> [[TMP3]], i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i64 [[INDEX]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <4 x i16>*
+; DISABLED_MASKED_STRIDED-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = or <4 x i64> [[TMP2]], <i64 1, i64 1, i64 1, i64 1>
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], <4 x i64> [[TMP6]]
+; DISABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> [[WIDE_LOAD1]], <4 x i16*> [[TMP7]], i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
 ; DISABLED_MASKED_STRIDED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; DISABLED_MASKED_STRIDED-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP30]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP8]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; DISABLED_MASKED_STRIDED:       for.end:
 ; DISABLED_MASKED_STRIDED-NEXT:    ret void
 ;