diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -656,10 +656,17 @@ /// Return true if the target supports nontemporal load. bool isLegalNTLoad(Type *DataType, Align Alignment) const; - /// Return true if the target supports masked scatter. - bool isLegalMaskedScatter(Type *DataType, Align Alignment) const; + /// Return true if masked gather should be used for vectorization. + bool shouldUseMaskedGatherForVectorization(Type *DataType, bool VariableMask, + Align Alignment) const; + /// Return true if masked scatter should be used for vectorization. + bool shouldUseMaskedScatterForVectorization(Type *DataType, bool VariableMask, + Align Alignment) const; + /// Return true if the target supports masked gather. bool isLegalMaskedGather(Type *DataType, Align Alignment) const; + /// Return true if the target supports masked scatter. + bool isLegalMaskedScatter(Type *DataType, Align Alignment) const; /// Return true if the target supports masked compress store. 
bool isLegalMaskedCompressStore(Type *DataType) const; @@ -1513,8 +1520,14 @@ virtual bool isLegalMaskedLoad(Type *DataType, Align Alignment) = 0; virtual bool isLegalNTStore(Type *DataType, Align Alignment) = 0; virtual bool isLegalNTLoad(Type *DataType, Align Alignment) = 0; - virtual bool isLegalMaskedScatter(Type *DataType, Align Alignment) = 0; + virtual bool shouldUseMaskedGatherForVectorization(Type *DataType, + bool VariableMask, + Align Alignment) = 0; + virtual bool shouldUseMaskedScatterForVectorization(Type *DataType, + bool VariableMask, + Align Alignment) = 0; virtual bool isLegalMaskedGather(Type *DataType, Align Alignment) = 0; + virtual bool isLegalMaskedScatter(Type *DataType, Align Alignment) = 0; virtual bool isLegalMaskedCompressStore(Type *DataType) = 0; virtual bool isLegalMaskedExpandLoad(Type *DataType) = 0; virtual bool enableOrderedReductions() = 0; @@ -1893,12 +1906,22 @@ bool isLegalNTLoad(Type *DataType, Align Alignment) override { return Impl.isLegalNTLoad(DataType, Alignment); } - bool isLegalMaskedScatter(Type *DataType, Align Alignment) override { - return Impl.isLegalMaskedScatter(DataType, Alignment); + bool shouldUseMaskedGatherForVectorization(Type *DataType, bool VariableMask, + Align Alignment) override { + return Impl.shouldUseMaskedGatherForVectorization(DataType, VariableMask, + Alignment); + } + bool shouldUseMaskedScatterForVectorization(Type *DataType, bool VariableMask, + Align Alignment) override { + return Impl.shouldUseMaskedScatterForVectorization(DataType, VariableMask, + Alignment); } bool isLegalMaskedGather(Type *DataType, Align Alignment) override { return Impl.isLegalMaskedGather(DataType, Alignment); } + bool isLegalMaskedScatter(Type *DataType, Align Alignment) override { + return Impl.isLegalMaskedScatter(DataType, Alignment); + } bool isLegalMaskedCompressStore(Type *DataType) override { return Impl.isLegalMaskedCompressStore(DataType); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h 
b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -862,6 +862,16 @@ public: using BaseT::getGEPCost; + bool shouldUseMaskedGatherForVectorization(Type *DataType, bool VariableMask, + Align Alignment) { + return static_cast<T *>(this)->isLegalMaskedGather(DataType, Alignment); + } + + bool shouldUseMaskedScatterForVectorization(Type *DataType, bool VariableMask, + Align Alignment) { + return static_cast<T *>(this)->isLegalMaskedScatter(DataType, Alignment); + } + InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef<const Value *> Operands, TTI::TargetCostKind CostKind) { diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -393,6 +393,18 @@ return TTIImpl->isLegalNTLoad(DataType, Alignment); } +bool TargetTransformInfo::shouldUseMaskedGatherForVectorization( + Type *DataType, bool VariableMask, Align Alignment) const { + return TTIImpl->shouldUseMaskedGatherForVectorization(DataType, VariableMask, + Alignment); +} + +bool TargetTransformInfo::shouldUseMaskedScatterForVectorization( + Type *DataType, bool VariableMask, Align Alignment) const { + return TTIImpl->shouldUseMaskedScatterForVectorization(DataType, VariableMask, + Alignment); +} + bool TargetTransformInfo::isLegalMaskedGather(Type *DataType, Align Alignment) const { return TTIImpl->isLegalMaskedGather(DataType, Alignment); diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -227,6 +227,10 @@ bool isLegalMaskedStore(Type *DataType, Align Alignment); bool isLegalNTLoad(Type *DataType, Align Alignment); bool isLegalNTStore(Type *DataType, Align Alignment); + bool 
shouldUseMaskedGatherForVectorization(Type *DataType, bool VariableMask, + Align Alignment); + bool shouldUseMaskedScatterForVectorization(Type *DataType, bool VariableMask, + Align Alignment); bool isLegalMaskedGather(Type *DataType, Align Alignment); bool isLegalMaskedScatter(Type *DataType, Align Alignment); bool isLegalMaskedExpandLoad(Type *DataType); diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -4871,6 +4871,21 @@ return isLegalMaskedExpandLoad(DataTy); } +bool X86TTIImpl::shouldUseMaskedGatherForVectorization(Type *DataType, + bool VariableMask, + Align Alignment) { + if (!VariableMask) + return true; + return isLegalMaskedGather(DataType, Alignment); +} +bool X86TTIImpl::shouldUseMaskedScatterForVectorization(Type *DataType, + bool VariableMask, + Align Alignment) { + if (!VariableMask) + return true; + return isLegalMaskedScatter(DataType, Alignment); +} + bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) { // Some CPUs have better gather performance than others. // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1504,15 +1504,18 @@ /// Returns true if the target machine can represent \p V as a masked gather /// or scatter operation. 
- bool isLegalGatherOrScatter(Value *V) { + bool shouldUseGatherOrScatterForVectorization(Value *V) { + auto *I = cast<Instruction>(V); bool LI = isa<LoadInst>(V); bool SI = isa<StoreInst>(V); if (!LI && !SI) return false; auto *Ty = getLoadStoreType(V); Align Align = getLoadStoreAlignment(V); - return (LI && TTI.isLegalMaskedGather(Ty, Align)) || - (SI && TTI.isLegalMaskedScatter(Ty, Align)); + return (LI && TTI.shouldUseMaskedGatherForVectorization( + Ty, Legal->isMaskRequired(I), Align)) || + (SI && TTI.shouldUseMaskedScatterForVectorization( + Ty, Legal->isMaskRequired(I), Align)); } /// Returns true if the target machine supports all of the reduction @@ -6342,7 +6345,8 @@ // optimization to non-pointer types. // if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && - !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) + !isAccessInterleaved(&I) && + !shouldUseGatherOrScatterForVectorization(&I)) continue; ElementTypesInLoop.insert(T); @@ -7454,7 +7458,7 @@ // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract InstructionCost Cost; if (isa<StoreInst>(&I) && VF.isScalable() && - isLegalGatherOrScatter(&I)) { + shouldUseGatherOrScatterForVectorization(&I)) { Cost = getGatherScatterCost(&I, VF); setWideningDecision(&I, VF, CM_GatherScatter, Cost); } else { @@ -7496,7 +7500,7 @@ } InstructionCost GatherScatterCost = - isLegalGatherOrScatter(&I) + shouldUseGatherOrScatterForVectorization(&I) ? 
getGatherScatterCost(&I, VF) * NumAccesses : InstructionCost::getInvalid(); diff --git a/llvm/test/Analysis/CostModel/X86/gather-i16-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/gather-i16-with-i8-index.ll --- a/llvm/test/Analysis/CostModel/X86/gather-i16-with-i8-index.ll +++ b/llvm/test/Analysis/CostModel/X86/gather-i16-with-i8-index.ll @@ -17,45 +17,45 @@ ; CHECK: LV: Checking a loop in "test" ; ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i16, i16* %inB, align 2 -; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2 -; SSE2: LV: Found an estimated cost of 56 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2 -; SSE2: LV: Found an estimated cost of 112 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2 -; SSE2: LV: Found an estimated cost of 224 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2 +; SSE2: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2 +; SSE2: LV: Found an estimated cost of 8 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2 +; SSE2: LV: Found an estimated cost of 16 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2 +; SSE2: LV: Found an estimated cost of 32 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2 ; ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i16, i16* %inB, align 2 -; SSE42: LV: Found an estimated cost of 28 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2 -; SSE42: LV: Found an estimated cost of 56 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2 -; SSE42: LV: Found an estimated cost of 112 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2 -; SSE42: LV: Found an estimated cost of 224 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2 +; SSE42: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i16, i16* %inB, 
align 2 +; SSE42: LV: Found an estimated cost of 8 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2 +; SSE42: LV: Found an estimated cost of 16 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2 +; SSE42: LV: Found an estimated cost of 32 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2 ; ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX1: LV: Found an estimated cost of 108 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX1: LV: Found an estimated cost of 218 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX1: LV: Found an estimated cost of 436 for VF 32 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX1: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX1: LV: Found an estimated cost of 8 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX1: LV: Found an estimated cost of 16 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX1: LV: Found an estimated cost of 48 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX1: LV: Found an estimated cost of 96 for VF 32 For instruction: %valB = load i16, i16* %inB, align 2 ; ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX2-SLOWGATHER: LV: Found 
an estimated cost of 218 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 436 for VF 32 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 8 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 16 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 48 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 96 for VF 32 For instruction: %valB = load i16, i16* %inB, align 2 ; ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX2-FASTGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX2-FASTGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX2-FASTGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX2-FASTGATHER: LV: Found an estimated cost of 218 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX2-FASTGATHER: LV: Found an estimated cost of 436 for VF 32 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX2-FASTGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX2-FASTGATHER: LV: Found an estimated cost of 8 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX2-FASTGATHER: LV: Found an estimated cost of 16 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX2-FASTGATHER: LV: Found an estimated cost of 48 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX2-FASTGATHER: LV: Found an 
estimated cost of 96 for VF 32 For instruction: %valB = load i16, i16* %inB, align 2 ; ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX512: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX512: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX512: LV: Found an estimated cost of 110 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX512: LV: Found an estimated cost of 222 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX512: LV: Found an estimated cost of 444 for VF 32 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX512: LV: Found an estimated cost of 888 for VF 64 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX512: LV: Found an estimated cost of 8 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX512: LV: Found an estimated cost of 16 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX512: LV: Found an estimated cost of 48 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX512: LV: Found an estimated cost of 112 for VF 32 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX512: LV: Found an estimated cost of 224 for VF 64 For instruction: %valB = load i16, i16* %inB, align 2 ; ; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: %valB = load i16, i16* %inB, align 4 define void @test() { diff --git a/llvm/test/Analysis/CostModel/X86/gather-i32-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/gather-i32-with-i8-index.ll --- a/llvm/test/Analysis/CostModel/X86/gather-i32-with-i8-index.ll +++ b/llvm/test/Analysis/CostModel/X86/gather-i32-with-i8-index.ll @@ -17,30 +17,30 @@ ; CHECK: LV: Checking a loop in "test" ; ; SSE2: LV: Found 
an estimated cost of 1 for VF 1 For instruction: %valB = load i32, i32* %inB, align 4 -; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4 -; SSE2: LV: Found an estimated cost of 59 for VF 4 For instruction: %valB = load i32, i32* %inB, align 4 -; SSE2: LV: Found an estimated cost of 118 for VF 8 For instruction: %valB = load i32, i32* %inB, align 4 -; SSE2: LV: Found an estimated cost of 236 for VF 16 For instruction: %valB = load i32, i32* %inB, align 4 +; SSE2: LV: Found an estimated cost of 8 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4 +; SSE2: LV: Found an estimated cost of 16 for VF 4 For instruction: %valB = load i32, i32* %inB, align 4 +; SSE2: LV: Found an estimated cost of 32 for VF 8 For instruction: %valB = load i32, i32* %inB, align 4 +; SSE2: LV: Found an estimated cost of 64 for VF 16 For instruction: %valB = load i32, i32* %inB, align 4 ; ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i32, i32* %inB, align 4 -; SSE42: LV: Found an estimated cost of 29 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4 -; SSE42: LV: Found an estimated cost of 59 for VF 4 For instruction: %valB = load i32, i32* %inB, align 4 -; SSE42: LV: Found an estimated cost of 118 for VF 8 For instruction: %valB = load i32, i32* %inB, align 4 -; SSE42: LV: Found an estimated cost of 236 for VF 16 For instruction: %valB = load i32, i32* %inB, align 4 +; SSE42: LV: Found an estimated cost of 8 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4 +; SSE42: LV: Found an estimated cost of 16 for VF 4 For instruction: %valB = load i32, i32* %inB, align 4 +; SSE42: LV: Found an estimated cost of 32 for VF 8 For instruction: %valB = load i32, i32* %inB, align 4 +; SSE42: LV: Found an estimated cost of 64 for VF 16 For instruction: %valB = load i32, i32* %inB, align 4 ; ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i32, i32* 
%inB, align 4 -; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4 -; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i32, i32* %inB, align 4 -; AVX1: LV: Found an estimated cost of 110 for VF 8 For instruction: %valB = load i32, i32* %inB, align 4 -; AVX1: LV: Found an estimated cost of 220 for VF 16 For instruction: %valB = load i32, i32* %inB, align 4 -; AVX1: LV: Found an estimated cost of 440 for VF 32 For instruction: %valB = load i32, i32* %inB, align 4 +; AVX1: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4 +; AVX1: LV: Found an estimated cost of 8 for VF 4 For instruction: %valB = load i32, i32* %inB, align 4 +; AVX1: LV: Found an estimated cost of 24 for VF 8 For instruction: %valB = load i32, i32* %inB, align 4 +; AVX1: LV: Found an estimated cost of 48 for VF 16 For instruction: %valB = load i32, i32* %inB, align 4 +; AVX1: LV: Found an estimated cost of 96 for VF 32 For instruction: %valB = load i32, i32* %inB, align 4 ; ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i32, i32* %inB, align 4 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i32, i32* %inB, align 4 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 110 for VF 8 For instruction: %valB = load i32, i32* %inB, align 4 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 220 for VF 16 For instruction: %valB = load i32, i32* %inB, align 4 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 440 for VF 32 For instruction: %valB = load i32, i32* %inB, align 4 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 8 for VF 4 For instruction: %valB = load i32, 
i32* %inB, align 4 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 24 for VF 8 For instruction: %valB = load i32, i32* %inB, align 4 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 48 for VF 16 For instruction: %valB = load i32, i32* %inB, align 4 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 96 for VF 32 For instruction: %valB = load i32, i32* %inB, align 4 ; ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i32, i32* %inB, align 4 ; AVX2-FASTGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4 diff --git a/llvm/test/Analysis/CostModel/X86/gather-i64-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/gather-i64-with-i8-index.ll --- a/llvm/test/Analysis/CostModel/X86/gather-i64-with-i8-index.ll +++ b/llvm/test/Analysis/CostModel/X86/gather-i64-with-i8-index.ll @@ -17,30 +17,30 @@ ; CHECK: LV: Checking a loop in "test" ; ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i64, i64* %inB, align 8 -; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8 -; SSE2: LV: Found an estimated cost of 58 for VF 4 For instruction: %valB = load i64, i64* %inB, align 8 -; SSE2: LV: Found an estimated cost of 116 for VF 8 For instruction: %valB = load i64, i64* %inB, align 8 -; SSE2: LV: Found an estimated cost of 232 for VF 16 For instruction: %valB = load i64, i64* %inB, align 8 +; SSE2: LV: Found an estimated cost of 6 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8 +; SSE2: LV: Found an estimated cost of 12 for VF 4 For instruction: %valB = load i64, i64* %inB, align 8 +; SSE2: LV: Found an estimated cost of 24 for VF 8 For instruction: %valB = load i64, i64* %inB, align 8 +; SSE2: LV: Found an estimated cost of 48 for VF 16 For instruction: %valB = load i64, i64* %inB, align 8 ; ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i64, i64* %inB, align 8 
-; SSE42: LV: Found an estimated cost of 29 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8 -; SSE42: LV: Found an estimated cost of 58 for VF 4 For instruction: %valB = load i64, i64* %inB, align 8 -; SSE42: LV: Found an estimated cost of 116 for VF 8 For instruction: %valB = load i64, i64* %inB, align 8 -; SSE42: LV: Found an estimated cost of 232 for VF 16 For instruction: %valB = load i64, i64* %inB, align 8 +; SSE42: LV: Found an estimated cost of 6 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8 +; SSE42: LV: Found an estimated cost of 12 for VF 4 For instruction: %valB = load i64, i64* %inB, align 8 +; SSE42: LV: Found an estimated cost of 24 for VF 8 For instruction: %valB = load i64, i64* %inB, align 8 +; SSE42: LV: Found an estimated cost of 48 for VF 16 For instruction: %valB = load i64, i64* %inB, align 8 ; ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i64, i64* %inB, align 8 -; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8 -; AVX1: LV: Found an estimated cost of 56 for VF 4 For instruction: %valB = load i64, i64* %inB, align 8 -; AVX1: LV: Found an estimated cost of 112 for VF 8 For instruction: %valB = load i64, i64* %inB, align 8 -; AVX1: LV: Found an estimated cost of 224 for VF 16 For instruction: %valB = load i64, i64* %inB, align 8 -; AVX1: LV: Found an estimated cost of 448 for VF 32 For instruction: %valB = load i64, i64* %inB, align 8 +; AVX1: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8 +; AVX1: LV: Found an estimated cost of 12 for VF 4 For instruction: %valB = load i64, i64* %inB, align 8 +; AVX1: LV: Found an estimated cost of 24 for VF 8 For instruction: %valB = load i64, i64* %inB, align 8 +; AVX1: LV: Found an estimated cost of 48 for VF 16 For instruction: %valB = load i64, i64* %inB, align 8 +; AVX1: LV: Found an estimated cost of 96 for VF 32 For instruction: %valB 
= load i64, i64* %inB, align 8 ; ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i64, i64* %inB, align 8 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 56 for VF 4 For instruction: %valB = load i64, i64* %inB, align 8 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 112 for VF 8 For instruction: %valB = load i64, i64* %inB, align 8 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 224 for VF 16 For instruction: %valB = load i64, i64* %inB, align 8 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction: %valB = load i64, i64* %inB, align 8 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 12 for VF 4 For instruction: %valB = load i64, i64* %inB, align 8 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 24 for VF 8 For instruction: %valB = load i64, i64* %inB, align 8 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 48 for VF 16 For instruction: %valB = load i64, i64* %inB, align 8 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 96 for VF 32 For instruction: %valB = load i64, i64* %inB, align 8 ; ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i64, i64* %inB, align 8 ; AVX2-FASTGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8 diff --git a/llvm/test/Analysis/CostModel/X86/gather-i8-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/gather-i8-with-i8-index.ll --- a/llvm/test/Analysis/CostModel/X86/gather-i8-with-i8-index.ll +++ b/llvm/test/Analysis/CostModel/X86/gather-i8-with-i8-index.ll @@ -17,45 +17,45 @@ ; CHECK: LV: Checking a loop in "test" ; ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i8, i8* %inB, align 1 -; SSE2: LV: 
Found an estimated cost of 29 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1 -; SSE2: LV: Found an estimated cost of 59 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1 -; SSE2: LV: Found an estimated cost of 119 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1 +; SSE2: LV: Found an estimated cost of 8 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1 +; SSE2: LV: Found an estimated cost of 24 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1 +; SSE2: LV: Found an estimated cost of 72 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1 ; SSE2: LV: Found an estimated cost of 239 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1 ; ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i8, i8* %inB, align 1 -; SSE42: LV: Found an estimated cost of 29 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1 -; SSE42: LV: Found an estimated cost of 59 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1 -; SSE42: LV: Found an estimated cost of 119 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1 +; SSE42: LV: Found an estimated cost of 8 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1 +; SSE42: LV: Found an estimated cost of 24 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1 +; SSE42: LV: Found an estimated cost of 72 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1 ; SSE42: LV: Found an estimated cost of 239 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1 ; ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX1: LV: Found an estimated cost of 108 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX1: LV: Found an 
estimated cost of 216 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX1: LV: Found an estimated cost of 434 for VF 32 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX1: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX1: LV: Found an estimated cost of 8 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX1: LV: Found an estimated cost of 16 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX1: LV: Found an estimated cost of 32 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX1: LV: Found an estimated cost of 96 for VF 32 For instruction: %valB = load i8, i8* %inB, align 1 ; ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 216 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 434 for VF 32 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 8 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 16 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 32 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 96 for VF 32 For instruction: %valB = load i8, i8* %inB, align 1 ; ; 
AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX2-FASTGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX2-FASTGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX2-FASTGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX2-FASTGATHER: LV: Found an estimated cost of 216 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX2-FASTGATHER: LV: Found an estimated cost of 434 for VF 32 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX2-FASTGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX2-FASTGATHER: LV: Found an estimated cost of 8 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX2-FASTGATHER: LV: Found an estimated cost of 16 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX2-FASTGATHER: LV: Found an estimated cost of 32 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX2-FASTGATHER: LV: Found an estimated cost of 96 for VF 32 For instruction: %valB = load i8, i8* %inB, align 1 ; ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX512: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX512: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX512: LV: Found an estimated cost of 110 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX512: LV: Found an estimated cost of 220 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX512: LV: Found an estimated cost of 442 for VF 32 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX512: LV: Found an estimated cost of 884 for VF 64 For instruction: %valB = load i8, 
i8* %inB, align 1 +; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX512: LV: Found an estimated cost of 8 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX512: LV: Found an estimated cost of 16 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX512: LV: Found an estimated cost of 32 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX512: LV: Found an estimated cost of 96 for VF 32 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX512: LV: Found an estimated cost of 224 for VF 64 For instruction: %valB = load i8, i8* %inB, align 1 ; ; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: %valB = load i8, i8* %inB, align 4 define void @test() { diff --git a/llvm/test/Analysis/CostModel/X86/scatter-i16-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/scatter-i16-with-i8-index.ll --- a/llvm/test/Analysis/CostModel/X86/scatter-i16-with-i8-index.ll +++ b/llvm/test/Analysis/CostModel/X86/scatter-i16-with-i8-index.ll @@ -17,45 +17,45 @@ ; CHECK: LV: Checking a loop in "test" ; ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %valB, i16* %out, align 2 -; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction: store i16 %valB, i16* %out, align 2 -; SSE2: LV: Found an estimated cost of 56 for VF 4 For instruction: store i16 %valB, i16* %out, align 2 -; SSE2: LV: Found an estimated cost of 112 for VF 8 For instruction: store i16 %valB, i16* %out, align 2 -; SSE2: LV: Found an estimated cost of 224 for VF 16 For instruction: store i16 %valB, i16* %out, align 2 +; SSE2: LV: Found an estimated cost of 4 for VF 2 For instruction: store i16 %valB, i16* %out, align 2 +; SSE2: LV: Found an estimated cost of 8 for VF 4 For instruction: store i16 %valB, i16* %out, align 2 +; SSE2: LV: Found an estimated cost of 16 for VF 8 For instruction: store i16 %valB, i16* %out, align 2 +; SSE2: LV: Found 
an estimated cost of 32 for VF 16 For instruction: store i16 %valB, i16* %out, align 2 ; ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %valB, i16* %out, align 2 -; SSE42: LV: Found an estimated cost of 28 for VF 2 For instruction: store i16 %valB, i16* %out, align 2 -; SSE42: LV: Found an estimated cost of 56 for VF 4 For instruction: store i16 %valB, i16* %out, align 2 -; SSE42: LV: Found an estimated cost of 112 for VF 8 For instruction: store i16 %valB, i16* %out, align 2 -; SSE42: LV: Found an estimated cost of 224 for VF 16 For instruction: store i16 %valB, i16* %out, align 2 +; SSE42: LV: Found an estimated cost of 4 for VF 2 For instruction: store i16 %valB, i16* %out, align 2 +; SSE42: LV: Found an estimated cost of 8 for VF 4 For instruction: store i16 %valB, i16* %out, align 2 +; SSE42: LV: Found an estimated cost of 16 for VF 8 For instruction: store i16 %valB, i16* %out, align 2 +; SSE42: LV: Found an estimated cost of 32 for VF 16 For instruction: store i16 %valB, i16* %out, align 2 ; ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %valB, i16* %out, align 2 -; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: store i16 %valB, i16* %out, align 2 -; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction: store i16 %valB, i16* %out, align 2 -; AVX1: LV: Found an estimated cost of 108 for VF 8 For instruction: store i16 %valB, i16* %out, align 2 -; AVX1: LV: Found an estimated cost of 224 for VF 16 For instruction: store i16 %valB, i16* %out, align 2 -; AVX1: LV: Found an estimated cost of 448 for VF 32 For instruction: store i16 %valB, i16* %out, align 2 +; AVX1: LV: Found an estimated cost of 4 for VF 2 For instruction: store i16 %valB, i16* %out, align 2 +; AVX1: LV: Found an estimated cost of 8 for VF 4 For instruction: store i16 %valB, i16* %out, align 2 +; AVX1: LV: Found an estimated cost of 16 for VF 8 For instruction: store i16 %valB, i16* %out, align 2 +; AVX1: 
LV: Found an estimated cost of 40 for VF 16 For instruction: store i16 %valB, i16* %out, align 2 +; AVX1: LV: Found an estimated cost of 80 for VF 32 For instruction: store i16 %valB, i16* %out, align 2 ; ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %valB, i16* %out, align 2 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: store i16 %valB, i16* %out, align 2 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction: store i16 %valB, i16* %out, align 2 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction: store i16 %valB, i16* %out, align 2 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 224 for VF 16 For instruction: store i16 %valB, i16* %out, align 2 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction: store i16 %valB, i16* %out, align 2 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: store i16 %valB, i16* %out, align 2 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 8 for VF 4 For instruction: store i16 %valB, i16* %out, align 2 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 16 for VF 8 For instruction: store i16 %valB, i16* %out, align 2 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 40 for VF 16 For instruction: store i16 %valB, i16* %out, align 2 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 80 for VF 32 For instruction: store i16 %valB, i16* %out, align 2 ; ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %valB, i16* %out, align 2 -; AVX2-FASTGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: store i16 %valB, i16* %out, align 2 -; AVX2-FASTGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction: store i16 %valB, i16* %out, align 2 -; AVX2-FASTGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction: store i16 %valB, i16* %out, align 2 -; AVX2-FASTGATHER: LV: Found an estimated cost of 
224 for VF 16 For instruction: store i16 %valB, i16* %out, align 2 -; AVX2-FASTGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction: store i16 %valB, i16* %out, align 2 +; AVX2-FASTGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: store i16 %valB, i16* %out, align 2 +; AVX2-FASTGATHER: LV: Found an estimated cost of 8 for VF 4 For instruction: store i16 %valB, i16* %out, align 2 +; AVX2-FASTGATHER: LV: Found an estimated cost of 16 for VF 8 For instruction: store i16 %valB, i16* %out, align 2 +; AVX2-FASTGATHER: LV: Found an estimated cost of 40 for VF 16 For instruction: store i16 %valB, i16* %out, align 2 +; AVX2-FASTGATHER: LV: Found an estimated cost of 80 for VF 32 For instruction: store i16 %valB, i16* %out, align 2 ; ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %valB, i16* %out, align 2 -; AVX512: LV: Found an estimated cost of 26 for VF 2 For instruction: store i16 %valB, i16* %out, align 2 -; AVX512: LV: Found an estimated cost of 54 for VF 4 For instruction: store i16 %valB, i16* %out, align 2 -; AVX512: LV: Found an estimated cost of 110 for VF 8 For instruction: store i16 %valB, i16* %out, align 2 -; AVX512: LV: Found an estimated cost of 228 for VF 16 For instruction: store i16 %valB, i16* %out, align 2 -; AVX512: LV: Found an estimated cost of 464 for VF 32 For instruction: store i16 %valB, i16* %out, align 2 -; AVX512: LV: Found an estimated cost of 928 for VF 64 For instruction: store i16 %valB, i16* %out, align 2 +; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: store i16 %valB, i16* %out, align 2 +; AVX512: LV: Found an estimated cost of 8 for VF 4 For instruction: store i16 %valB, i16* %out, align 2 +; AVX512: LV: Found an estimated cost of 16 for VF 8 For instruction: store i16 %valB, i16* %out, align 2 +; AVX512: LV: Found an estimated cost of 40 for VF 16 For instruction: store i16 %valB, i16* %out, align 2 +; AVX512: LV: Found an estimated cost of 88 for VF 
32 For instruction: store i16 %valB, i16* %out, align 2 +; AVX512: LV: Found an estimated cost of 176 for VF 64 For instruction: store i16 %valB, i16* %out, align 2 ; ; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: store i16 %valB, i16* %out define void @test() { diff --git a/llvm/test/Analysis/CostModel/X86/scatter-i32-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/scatter-i32-with-i8-index.ll --- a/llvm/test/Analysis/CostModel/X86/scatter-i32-with-i8-index.ll +++ b/llvm/test/Analysis/CostModel/X86/scatter-i32-with-i8-index.ll @@ -17,37 +17,37 @@ ; CHECK: LV: Checking a loop in "test" ; ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, i32* %out, align 4 -; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction: store i32 %valB, i32* %out, align 4 -; SSE2: LV: Found an estimated cost of 59 for VF 4 For instruction: store i32 %valB, i32* %out, align 4 -; SSE2: LV: Found an estimated cost of 118 for VF 8 For instruction: store i32 %valB, i32* %out, align 4 -; SSE2: LV: Found an estimated cost of 236 for VF 16 For instruction: store i32 %valB, i32* %out, align 4 +; SSE2: LV: Found an estimated cost of 5 for VF 2 For instruction: store i32 %valB, i32* %out, align 4 +; SSE2: LV: Found an estimated cost of 11 for VF 4 For instruction: store i32 %valB, i32* %out, align 4 +; SSE2: LV: Found an estimated cost of 22 for VF 8 For instruction: store i32 %valB, i32* %out, align 4 +; SSE2: LV: Found an estimated cost of 44 for VF 16 For instruction: store i32 %valB, i32* %out, align 4 ; ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, i32* %out, align 4 -; SSE42: LV: Found an estimated cost of 29 for VF 2 For instruction: store i32 %valB, i32* %out, align 4 -; SSE42: LV: Found an estimated cost of 59 for VF 4 For instruction: store i32 %valB, i32* %out, align 4 -; SSE42: LV: Found an estimated cost of 118 for VF 8 For instruction: store i32 %valB, i32* 
%out, align 4 -; SSE42: LV: Found an estimated cost of 236 for VF 16 For instruction: store i32 %valB, i32* %out, align 4 +; SSE42: LV: Found an estimated cost of 5 for VF 2 For instruction: store i32 %valB, i32* %out, align 4 +; SSE42: LV: Found an estimated cost of 11 for VF 4 For instruction: store i32 %valB, i32* %out, align 4 +; SSE42: LV: Found an estimated cost of 22 for VF 8 For instruction: store i32 %valB, i32* %out, align 4 +; SSE42: LV: Found an estimated cost of 44 for VF 16 For instruction: store i32 %valB, i32* %out, align 4 ; ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, i32* %out, align 4 -; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: store i32 %valB, i32* %out, align 4 -; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction: store i32 %valB, i32* %out, align 4 -; AVX1: LV: Found an estimated cost of 112 for VF 8 For instruction: store i32 %valB, i32* %out, align 4 -; AVX1: LV: Found an estimated cost of 224 for VF 16 For instruction: store i32 %valB, i32* %out, align 4 -; AVX1: LV: Found an estimated cost of 448 for VF 32 For instruction: store i32 %valB, i32* %out, align 4 +; AVX1: LV: Found an estimated cost of 4 for VF 2 For instruction: store i32 %valB, i32* %out, align 4 +; AVX1: LV: Found an estimated cost of 8 for VF 4 For instruction: store i32 %valB, i32* %out, align 4 +; AVX1: LV: Found an estimated cost of 20 for VF 8 For instruction: store i32 %valB, i32* %out, align 4 +; AVX1: LV: Found an estimated cost of 40 for VF 16 For instruction: store i32 %valB, i32* %out, align 4 +; AVX1: LV: Found an estimated cost of 80 for VF 32 For instruction: store i32 %valB, i32* %out, align 4 ; ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, i32* %out, align 4 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: store i32 %valB, i32* %out, align 4 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 54 for VF 
4 For instruction: store i32 %valB, i32* %out, align 4 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 112 for VF 8 For instruction: store i32 %valB, i32* %out, align 4 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 224 for VF 16 For instruction: store i32 %valB, i32* %out, align 4 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction: store i32 %valB, i32* %out, align 4 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: store i32 %valB, i32* %out, align 4 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 8 for VF 4 For instruction: store i32 %valB, i32* %out, align 4 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 20 for VF 8 For instruction: store i32 %valB, i32* %out, align 4 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 40 for VF 16 For instruction: store i32 %valB, i32* %out, align 4 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 80 for VF 32 For instruction: store i32 %valB, i32* %out, align 4 ; ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, i32* %out, align 4 -; AVX2-FASTGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: store i32 %valB, i32* %out, align 4 -; AVX2-FASTGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction: store i32 %valB, i32* %out, align 4 -; AVX2-FASTGATHER: LV: Found an estimated cost of 112 for VF 8 For instruction: store i32 %valB, i32* %out, align 4 -; AVX2-FASTGATHER: LV: Found an estimated cost of 224 for VF 16 For instruction: store i32 %valB, i32* %out, align 4 -; AVX2-FASTGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction: store i32 %valB, i32* %out, align 4 +; AVX2-FASTGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: store i32 %valB, i32* %out, align 4 +; AVX2-FASTGATHER: LV: Found an estimated cost of 8 for VF 4 For instruction: store i32 %valB, i32* %out, align 4 +; AVX2-FASTGATHER: LV: Found an estimated cost of 20 for VF 8 For instruction: 
store i32 %valB, i32* %out, align 4 +; AVX2-FASTGATHER: LV: Found an estimated cost of 40 for VF 16 For instruction: store i32 %valB, i32* %out, align 4 +; AVX2-FASTGATHER: LV: Found an estimated cost of 80 for VF 32 For instruction: store i32 %valB, i32* %out, align 4 ; ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, i32* %out ; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: store i32 %valB, i32* %out diff --git a/llvm/test/Analysis/CostModel/X86/scatter-i64-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/scatter-i64-with-i8-index.ll --- a/llvm/test/Analysis/CostModel/X86/scatter-i64-with-i8-index.ll +++ b/llvm/test/Analysis/CostModel/X86/scatter-i64-with-i8-index.ll @@ -17,37 +17,37 @@ ; CHECK: LV: Checking a loop in "test" ; ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, i64* %out, align 8 -; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction: store i64 %valB, i64* %out, align 8 -; SSE2: LV: Found an estimated cost of 58 for VF 4 For instruction: store i64 %valB, i64* %out, align 8 -; SSE2: LV: Found an estimated cost of 116 for VF 8 For instruction: store i64 %valB, i64* %out, align 8 -; SSE2: LV: Found an estimated cost of 232 for VF 16 For instruction: store i64 %valB, i64* %out, align 8 +; SSE2: LV: Found an estimated cost of 5 for VF 2 For instruction: store i64 %valB, i64* %out, align 8 +; SSE2: LV: Found an estimated cost of 10 for VF 4 For instruction: store i64 %valB, i64* %out, align 8 +; SSE2: LV: Found an estimated cost of 20 for VF 8 For instruction: store i64 %valB, i64* %out, align 8 +; SSE2: LV: Found an estimated cost of 40 for VF 16 For instruction: store i64 %valB, i64* %out, align 8 ; ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, i64* %out, align 8 -; SSE42: LV: Found an estimated cost of 29 for VF 2 For instruction: store i64 %valB, i64* %out, align 8 -; SSE42: LV: Found an estimated cost of 
58 for VF 4 For instruction: store i64 %valB, i64* %out, align 8 -; SSE42: LV: Found an estimated cost of 116 for VF 8 For instruction: store i64 %valB, i64* %out, align 8 -; SSE42: LV: Found an estimated cost of 232 for VF 16 For instruction: store i64 %valB, i64* %out, align 8 +; SSE42: LV: Found an estimated cost of 5 for VF 2 For instruction: store i64 %valB, i64* %out, align 8 +; SSE42: LV: Found an estimated cost of 10 for VF 4 For instruction: store i64 %valB, i64* %out, align 8 +; SSE42: LV: Found an estimated cost of 20 for VF 8 For instruction: store i64 %valB, i64* %out, align 8 +; SSE42: LV: Found an estimated cost of 40 for VF 16 For instruction: store i64 %valB, i64* %out, align 8 ; ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, i64* %out, align 8 -; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: store i64 %valB, i64* %out, align 8 -; AVX1: LV: Found an estimated cost of 56 for VF 4 For instruction: store i64 %valB, i64* %out, align 8 -; AVX1: LV: Found an estimated cost of 112 for VF 8 For instruction: store i64 %valB, i64* %out, align 8 -; AVX1: LV: Found an estimated cost of 224 for VF 16 For instruction: store i64 %valB, i64* %out, align 8 -; AVX1: LV: Found an estimated cost of 448 for VF 32 For instruction: store i64 %valB, i64* %out, align 8 +; AVX1: LV: Found an estimated cost of 4 for VF 2 For instruction: store i64 %valB, i64* %out, align 8 +; AVX1: LV: Found an estimated cost of 10 for VF 4 For instruction: store i64 %valB, i64* %out, align 8 +; AVX1: LV: Found an estimated cost of 20 for VF 8 For instruction: store i64 %valB, i64* %out, align 8 +; AVX1: LV: Found an estimated cost of 40 for VF 16 For instruction: store i64 %valB, i64* %out, align 8 +; AVX1: LV: Found an estimated cost of 80 for VF 32 For instruction: store i64 %valB, i64* %out, align 8 ; ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, i64* %out, align 8 -; 
AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: store i64 %valB, i64* %out, align 8 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 56 for VF 4 For instruction: store i64 %valB, i64* %out, align 8 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 112 for VF 8 For instruction: store i64 %valB, i64* %out, align 8 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 224 for VF 16 For instruction: store i64 %valB, i64* %out, align 8 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction: store i64 %valB, i64* %out, align 8 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: store i64 %valB, i64* %out, align 8 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 10 for VF 4 For instruction: store i64 %valB, i64* %out, align 8 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 20 for VF 8 For instruction: store i64 %valB, i64* %out, align 8 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 40 for VF 16 For instruction: store i64 %valB, i64* %out, align 8 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 80 for VF 32 For instruction: store i64 %valB, i64* %out, align 8 ; ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, i64* %out, align 8 -; AVX2-FASTGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: store i64 %valB, i64* %out, align 8 -; AVX2-FASTGATHER: LV: Found an estimated cost of 56 for VF 4 For instruction: store i64 %valB, i64* %out, align 8 -; AVX2-FASTGATHER: LV: Found an estimated cost of 112 for VF 8 For instruction: store i64 %valB, i64* %out, align 8 -; AVX2-FASTGATHER: LV: Found an estimated cost of 224 for VF 16 For instruction: store i64 %valB, i64* %out, align 8 -; AVX2-FASTGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction: store i64 %valB, i64* %out, align 8 +; AVX2-FASTGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: store i64 %valB, i64* %out, align 8 +; AVX2-FASTGATHER: 
LV: Found an estimated cost of 10 for VF 4 For instruction: store i64 %valB, i64* %out, align 8 +; AVX2-FASTGATHER: LV: Found an estimated cost of 20 for VF 8 For instruction: store i64 %valB, i64* %out, align 8 +; AVX2-FASTGATHER: LV: Found an estimated cost of 40 for VF 16 For instruction: store i64 %valB, i64* %out, align 8 +; AVX2-FASTGATHER: LV: Found an estimated cost of 80 for VF 32 For instruction: store i64 %valB, i64* %out, align 8 ; ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, i64* %out, align 8 ; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: store i64 %valB, i64* %out, align 8 diff --git a/llvm/test/Analysis/CostModel/X86/scatter-i8-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/scatter-i8-with-i8-index.ll --- a/llvm/test/Analysis/CostModel/X86/scatter-i8-with-i8-index.ll +++ b/llvm/test/Analysis/CostModel/X86/scatter-i8-with-i8-index.ll @@ -17,45 +17,45 @@ ; CHECK: LV: Checking a loop in "test" ; ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, i8* %out, align 1 -; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction: store i8 %valB, i8* %out, align 1 -; SSE2: LV: Found an estimated cost of 59 for VF 4 For instruction: store i8 %valB, i8* %out, align 1 -; SSE2: LV: Found an estimated cost of 119 for VF 8 For instruction: store i8 %valB, i8* %out, align 1 -; SSE2: LV: Found an estimated cost of 239 for VF 16 For instruction: store i8 %valB, i8* %out, align 1 +; SSE2: LV: Found an estimated cost of 5 for VF 2 For instruction: store i8 %valB, i8* %out, align 1 +; SSE2: LV: Found an estimated cost of 11 for VF 4 For instruction: store i8 %valB, i8* %out, align 1 +; SSE2: LV: Found an estimated cost of 23 for VF 8 For instruction: store i8 %valB, i8* %out, align 1 +; SSE2: LV: Found an estimated cost of 47 for VF 16 For instruction: store i8 %valB, i8* %out, align 1 ; ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 
%valB, i8* %out, align 1 -; SSE42: LV: Found an estimated cost of 29 for VF 2 For instruction: store i8 %valB, i8* %out, align 1 -; SSE42: LV: Found an estimated cost of 59 for VF 4 For instruction: store i8 %valB, i8* %out, align 1 -; SSE42: LV: Found an estimated cost of 119 for VF 8 For instruction: store i8 %valB, i8* %out, align 1 -; SSE42: LV: Found an estimated cost of 239 for VF 16 For instruction: store i8 %valB, i8* %out, align 1 +; SSE42: LV: Found an estimated cost of 5 for VF 2 For instruction: store i8 %valB, i8* %out, align 1 +; SSE42: LV: Found an estimated cost of 11 for VF 4 For instruction: store i8 %valB, i8* %out, align 1 +; SSE42: LV: Found an estimated cost of 23 for VF 8 For instruction: store i8 %valB, i8* %out, align 1 +; SSE42: LV: Found an estimated cost of 47 for VF 16 For instruction: store i8 %valB, i8* %out, align 1 ; ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, i8* %out, align 1 -; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: store i8 %valB, i8* %out, align 1 -; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction: store i8 %valB, i8* %out, align 1 -; AVX1: LV: Found an estimated cost of 108 for VF 8 For instruction: store i8 %valB, i8* %out, align 1 -; AVX1: LV: Found an estimated cost of 216 for VF 16 For instruction: store i8 %valB, i8* %out, align 1 -; AVX1: LV: Found an estimated cost of 448 for VF 32 For instruction: store i8 %valB, i8* %out, align 1 +; AVX1: LV: Found an estimated cost of 4 for VF 2 For instruction: store i8 %valB, i8* %out, align 1 +; AVX1: LV: Found an estimated cost of 8 for VF 4 For instruction: store i8 %valB, i8* %out, align 1 +; AVX1: LV: Found an estimated cost of 16 for VF 8 For instruction: store i8 %valB, i8* %out, align 1 +; AVX1: LV: Found an estimated cost of 32 for VF 16 For instruction: store i8 %valB, i8* %out, align 1 +; AVX1: LV: Found an estimated cost of 80 for VF 32 For instruction: store i8 %valB, i8* %out, align 1 ; 
; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, i8* %out, align 1 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: store i8 %valB, i8* %out, align 1 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction: store i8 %valB, i8* %out, align 1 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction: store i8 %valB, i8* %out, align 1 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 216 for VF 16 For instruction: store i8 %valB, i8* %out, align 1 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction: store i8 %valB, i8* %out, align 1 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: store i8 %valB, i8* %out, align 1 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 8 for VF 4 For instruction: store i8 %valB, i8* %out, align 1 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 16 for VF 8 For instruction: store i8 %valB, i8* %out, align 1 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 32 for VF 16 For instruction: store i8 %valB, i8* %out, align 1 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 80 for VF 32 For instruction: store i8 %valB, i8* %out, align 1 ; ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, i8* %out, align 1 -; AVX2-FASTGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: store i8 %valB, i8* %out, align 1 -; AVX2-FASTGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction: store i8 %valB, i8* %out, align 1 -; AVX2-FASTGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction: store i8 %valB, i8* %out, align 1 -; AVX2-FASTGATHER: LV: Found an estimated cost of 216 for VF 16 For instruction: store i8 %valB, i8* %out, align 1 -; AVX2-FASTGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction: store i8 %valB, i8* %out, align 1 +; AVX2-FASTGATHER: LV: Found an estimated cost of 4 
for VF 2 For instruction: store i8 %valB, i8* %out, align 1 +; AVX2-FASTGATHER: LV: Found an estimated cost of 8 for VF 4 For instruction: store i8 %valB, i8* %out, align 1 +; AVX2-FASTGATHER: LV: Found an estimated cost of 16 for VF 8 For instruction: store i8 %valB, i8* %out, align 1 +; AVX2-FASTGATHER: LV: Found an estimated cost of 32 for VF 16 For instruction: store i8 %valB, i8* %out, align 1 +; AVX2-FASTGATHER: LV: Found an estimated cost of 80 for VF 32 For instruction: store i8 %valB, i8* %out, align 1 ; ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, i8* %out, align 1 -; AVX512: LV: Found an estimated cost of 26 for VF 2 For instruction: store i8 %valB, i8* %out, align 1 -; AVX512: LV: Found an estimated cost of 54 for VF 4 For instruction: store i8 %valB, i8* %out, align 1 -; AVX512: LV: Found an estimated cost of 110 for VF 8 For instruction: store i8 %valB, i8* %out, align 1 -; AVX512: LV: Found an estimated cost of 220 for VF 16 For instruction: store i8 %valB, i8* %out, align 1 -; AVX512: LV: Found an estimated cost of 456 for VF 32 For instruction: store i8 %valB, i8* %out, align 1 -; AVX512: LV: Found an estimated cost of 928 for VF 64 For instruction: store i8 %valB, i8* %out, align 1 +; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: store i8 %valB, i8* %out, align 1 +; AVX512: LV: Found an estimated cost of 8 for VF 4 For instruction: store i8 %valB, i8* %out, align 1 +; AVX512: LV: Found an estimated cost of 16 for VF 8 For instruction: store i8 %valB, i8* %out, align 1 +; AVX512: LV: Found an estimated cost of 32 for VF 16 For instruction: store i8 %valB, i8* %out, align 1 +; AVX512: LV: Found an estimated cost of 80 for VF 32 For instruction: store i8 %valB, i8* %out, align 1 +; AVX512: LV: Found an estimated cost of 176 for VF 64 For instruction: store i8 %valB, i8* %out, align 1 ; ; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: store i8 %valB, 
i8* %out define void @test() { diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/gather-cost.ll --- a/llvm/test/Transforms/LoopVectorize/X86/gather-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/gather-cost.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -loop-vectorize -mtriple=x86_64-apple-macosx -S -mcpu=corei7-avx -enable-interleaved-mem-accesses=false < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" @@ -17,6 +18,144 @@ ; CHECK-NOT: x float> define void @_Z4testmm(i64 %size, i64 %offset) { +; CHECK-LABEL: @_Z4testmm( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP53:%.*]] = icmp eq i64 [[SIZE:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP53]], label [[FOR_END:%.*]], label [[FOR_BODY_LR_PH:%.*]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SIZE]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[SIZE]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[SIZE]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[OFFSET:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ 
[[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, <4 x i64> [[TMP5]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP6]], i32 4, <4 x i1> , <4 x float> undef) +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel, i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP9]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x float> [[WIDE_MASKED_GATHER]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel2, i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, <4 x float>* [[TMP13]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = fmul fast <4 x float> [[TMP10]], [[WIDE_LOAD3]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel3, i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP15]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>* [[TMP17]], 
align 4 +; CHECK-NEXT: [[TMP18:%.*]] = fmul fast <4 x float> [[TMP14]], [[WIDE_LOAD4]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel4, i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP19]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, <4 x float>* [[TMP21]], align 4 +; CHECK-NEXT: [[TMP22:%.*]] = fmul fast <4 x float> [[TMP18]], [[WIDE_LOAD5]] +; CHECK-NEXT: [[TMP23]] = fadd fast <4 x float> [[VEC_PHI]], [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = add <4 x i64> [[TMP5]], +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, <4 x i64> [[TMP24]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP25]], i32 4, <4 x i1> , <4 x float> undef) +; CHECK-NEXT: [[TMP26:%.*]] = fmul fast <4 x float> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER6]] +; CHECK-NEXT: [[TMP27:%.*]] = fmul fast <4 x float> [[WIDE_LOAD3]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = fmul fast <4 x float> [[WIDE_LOAD4]], [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = fmul fast <4 x float> [[WIDE_LOAD5]], [[TMP28]] +; CHECK-NEXT: [[TMP30]] = fadd fast <4 x float> [[VEC_PHI1]], [[TMP29]] +; CHECK-NEXT: [[TMP31:%.*]] = add <4 x i64> [[TMP5]], +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, <4 x i64> [[TMP31]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP32]], i32 4, <4 x i1> , <4 x float> undef) +; CHECK-NEXT: [[TMP33:%.*]] = fmul fast <4 x float> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER7]] +; CHECK-NEXT: [[TMP34:%.*]] = fmul fast <4 x float> [[WIDE_LOAD3]], [[TMP33]] +; CHECK-NEXT: [[TMP35:%.*]] = fmul fast <4 x float> [[WIDE_LOAD4]], [[TMP34]] +; CHECK-NEXT: [[TMP36:%.*]] = fmul fast <4 x float> [[WIDE_LOAD5]], [[TMP35]] 
+; CHECK-NEXT: [[TMP37]] = fadd fast <4 x float> [[VEC_PHI2]], [[TMP36]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP39:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP37]]) +; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP30]]) +; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP23]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SIZE]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_LR_PH]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX8:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[TMP40]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX9:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[R_057:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD10:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[G_056:%.*]] = phi float [ [[BC_MERGE_RDX8]], [[SCALAR_PH]] ], [ [[ADD20:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[V_055:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[B_054:%.*]] = phi float [ [[BC_MERGE_RDX9]], [[SCALAR_PH]] ], [ [[ADD30:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ADD:%.*]] = add i64 [[V_055]], [[OFFSET]] +; CHECK-NEXT: [[MUL:%.*]] = mul i64 
[[ADD]], 3 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[MUL]] +; CHECK-NEXT: [[TMP42:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel, i64 0, i64 [[V_055]] +; CHECK-NEXT: [[TMP43:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[MUL3:%.*]] = fmul fast float [[TMP42]], [[TMP43]] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel2, i64 0, i64 [[V_055]] +; CHECK-NEXT: [[TMP44:%.*]] = load float, float* [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[MUL5:%.*]] = fmul fast float [[MUL3]], [[TMP44]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel3, i64 0, i64 [[V_055]] +; CHECK-NEXT: [[TMP45:%.*]] = load float, float* [[ARRAYIDX6]], align 4 +; CHECK-NEXT: [[MUL7:%.*]] = fmul fast float [[MUL5]], [[TMP45]] +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel4, i64 0, i64 [[V_055]] +; CHECK-NEXT: [[TMP46:%.*]] = load float, float* [[ARRAYIDX8]], align 4 +; CHECK-NEXT: [[MUL9:%.*]] = fmul fast float [[MUL7]], [[TMP46]] +; CHECK-NEXT: [[ADD10]] = fadd fast float [[R_057]], [[MUL9]] +; CHECK-NEXT: [[ARRAYIDX_SUM:%.*]] = add i64 [[MUL]], 1 +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[ARRAYIDX_SUM]] +; CHECK-NEXT: [[TMP47:%.*]] = load float, float* [[ARRAYIDX11]], align 4 +; CHECK-NEXT: [[MUL13:%.*]] = fmul fast float [[TMP43]], [[TMP47]] +; CHECK-NEXT: [[MUL15:%.*]] = fmul fast float [[TMP44]], [[MUL13]] +; CHECK-NEXT: [[MUL17:%.*]] = fmul fast float [[TMP45]], [[MUL15]] +; CHECK-NEXT: [[MUL19:%.*]] = fmul fast float [[TMP46]], [[MUL17]] +; CHECK-NEXT: [[ADD20]] = fadd fast float [[G_056]], [[MUL19]] +; CHECK-NEXT: [[ARRAYIDX_SUM52:%.*]] = add i64 [[MUL]], 2 +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = 
getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[ARRAYIDX_SUM52]] +; CHECK-NEXT: [[TMP48:%.*]] = load float, float* [[ARRAYIDX21]], align 4 +; CHECK-NEXT: [[MUL23:%.*]] = fmul fast float [[TMP43]], [[TMP48]] +; CHECK-NEXT: [[MUL25:%.*]] = fmul fast float [[TMP44]], [[MUL23]] +; CHECK-NEXT: [[MUL27:%.*]] = fmul fast float [[TMP45]], [[MUL25]] +; CHECK-NEXT: [[MUL29:%.*]] = fmul fast float [[TMP46]], [[MUL27]] +; CHECK-NEXT: [[ADD30]] = fadd fast float [[B_054]], [[MUL29]] +; CHECK-NEXT: [[INC]] = add i64 [[V_055]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INC]], [[SIZE]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_COND_FOR_END_CRIT_EDGE]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: for.cond.for.end_crit_edge: +; CHECK-NEXT: [[ADD30_LCSSA:%.*]] = phi float [ [[ADD30]], [[FOR_BODY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD20_LCSSA:%.*]] = phi float [ [[ADD20]], [[FOR_BODY]] ], [ [[TMP40]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD10_LCSSA:%.*]] = phi float [ [[ADD10]], [[FOR_BODY]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[PHITMP:%.*]] = fptoui float [[ADD10_LCSSA]] to i8 +; CHECK-NEXT: [[PHITMP60:%.*]] = fptoui float [[ADD20_LCSSA]] to i8 +; CHECK-NEXT: [[PHITMP61:%.*]] = fptoui float [[ADD30_LCSSA]] to i8 +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i8 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[G_0_LCSSA:%.*]] = phi i8 [ [[PHITMP60]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[B_0_LCSSA:%.*]] = phi i8 [ [[PHITMP61]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: store i8 [[R_0_LCSSA]], i8* @r_, align 1 +; CHECK-NEXT: store i8 [[G_0_LCSSA]], i8* @g_, align 1 +; CHECK-NEXT: store i8 [[B_0_LCSSA]], i8* @b_, align 1 +; CHECK-NEXT: ret void +; entry: %cmp53 = icmp eq i64 %size, 0 br i1 %cmp53, label %for.end, label %for.body.lr.ph diff --git 
a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll --- a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll @@ -2,7 +2,7 @@ ; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine < %s | FileCheck %s --check-prefix=SSE ; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=sandybridge < %s | FileCheck %s --check-prefix=AVX1 ; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=haswell < %s | FileCheck %s --check-prefix=AVX2 -; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=slm < %s | FileCheck %s --check-prefix=SSE +; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=slm < %s | FileCheck %s --check-prefix=SLM ; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=atom < %s | FileCheck %s --check-prefix=SSE define void @foo(i32* noalias nocapture %a, i32* noalias nocapture readonly %b) { @@ -33,19 +33,54 @@ ; AVX1-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX1: vector.body: ; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX1-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 -; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <8 x i32>* -; AVX1-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4 -; AVX1-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> -; AVX1-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]] -; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] -; AVX1-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>* -; AVX1-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* 
[[TMP5]], align 4 -; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; AVX1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; AVX1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; AVX1-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX1-NEXT: [[TMP0:%.*]] = shl nsw <2 x i64> [[VEC_IND]], +; AVX1-NEXT: [[STEP_ADD:%.*]] = shl <2 x i64> [[VEC_IND]], +; AVX1-NEXT: [[TMP1:%.*]] = add <2 x i64> [[STEP_ADD]], +; AVX1-NEXT: [[STEP_ADD1:%.*]] = shl <2 x i64> [[VEC_IND]], +; AVX1-NEXT: [[TMP2:%.*]] = add <2 x i64> [[STEP_ADD1]], +; AVX1-NEXT: [[STEP_ADD2:%.*]] = shl <2 x i64> [[VEC_IND]], +; AVX1-NEXT: [[TMP3:%.*]] = add <2 x i64> [[STEP_ADD2]], +; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], <2 x i64> [[TMP0]] +; AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B]], <2 x i64> [[TMP1]] +; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[B]], <2 x i64> [[TMP2]] +; AVX1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B]], <2 x i64> [[TMP3]] +; AVX1-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP4]], i32 4, <2 x i1> , <2 x i32> undef) +; AVX1-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP5]], i32 4, <2 x i1> , <2 x i32> undef) +; AVX1-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP6]], i32 4, <2 x i1> , <2 x i32> undef) +; AVX1-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP7]], i32 4, <2 x i1> , <2 x i32> undef) +; AVX1-NEXT: [[TMP8:%.*]] = or <2 x i64> [[TMP0]], +; AVX1-NEXT: [[TMP9:%.*]] = or <2 x i64> [[TMP1]], +; AVX1-NEXT: [[TMP10:%.*]] = or <2 x i64> [[TMP2]], +; AVX1-NEXT: [[TMP11:%.*]] = or <2 x i64> [[TMP3]], +; AVX1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* 
[[B]], <2 x i64> [[TMP8]] +; AVX1-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], <2 x i64> [[TMP9]] +; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[B]], <2 x i64> [[TMP10]] +; AVX1-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[B]], <2 x i64> [[TMP11]] +; AVX1-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP12]], i32 4, <2 x i1> , <2 x i32> undef) +; AVX1-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP13]], i32 4, <2 x i1> , <2 x i32> undef) +; AVX1-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP14]], i32 4, <2 x i1> , <2 x i32> undef) +; AVX1-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP15]], i32 4, <2 x i1> , <2 x i32> undef) +; AVX1-NEXT: [[TMP16:%.*]] = add nsw <2 x i32> [[WIDE_MASKED_GATHER7]], [[WIDE_MASKED_GATHER]] +; AVX1-NEXT: [[TMP17:%.*]] = add nsw <2 x i32> [[WIDE_MASKED_GATHER8]], [[WIDE_MASKED_GATHER4]] +; AVX1-NEXT: [[TMP18:%.*]] = add nsw <2 x i32> [[WIDE_MASKED_GATHER9]], [[WIDE_MASKED_GATHER5]] +; AVX1-NEXT: [[TMP19:%.*]] = add nsw <2 x i32> [[WIDE_MASKED_GATHER10]], [[WIDE_MASKED_GATHER6]] +; AVX1-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; AVX1-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <2 x i32>* +; AVX1-NEXT: store <2 x i32> [[TMP16]], <2 x i32>* [[TMP21]], align 4 +; AVX1-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i64 2 +; AVX1-NEXT: [[TMP23:%.*]] = bitcast i32* [[TMP22]] to <2 x i32>* +; AVX1-NEXT: store <2 x i32> [[TMP17]], <2 x i32>* [[TMP23]], align 4 +; AVX1-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i64 4 +; AVX1-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <2 x i32>* +; AVX1-NEXT: store <2 x i32> [[TMP18]], <2 x i32>* [[TMP25]], align 4 +; AVX1-NEXT: [[TMP26:%.*]] = getelementptr inbounds 
i32, i32* [[TMP20]], i64 6 +; AVX1-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <2 x i32>* +; AVX1-NEXT: store <2 x i32> [[TMP19]], <2 x i32>* [[TMP27]], align 4 +; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; AVX1-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], +; AVX1-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; AVX1-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AVX1: middle.block: ; AVX1-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; AVX1: scalar.ph: @@ -117,6 +152,37 @@ ; AVX2: for.body: ; AVX2-NEXT: br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; +; SLM-LABEL: @foo( +; SLM-NEXT: entry: +; SLM-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SLM: vector.ph: +; SLM-NEXT: br label [[VECTOR_BODY:%.*]] +; SLM: vector.body: +; SLM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; SLM-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SLM-NEXT: [[TMP0:%.*]] = shl nsw <4 x i64> [[VEC_IND]], +; SLM-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], <4 x i64> [[TMP0]] +; SLM-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP1]], i32 4, <4 x i1> , <4 x i32> undef) +; SLM-NEXT: [[TMP2:%.*]] = or <4 x i64> [[TMP0]], +; SLM-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], <4 x i64> [[TMP2]] +; SLM-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> , <4 x i32> undef) +; SLM-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER1]], [[WIDE_MASKED_GATHER]] +; SLM-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; SLM-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; SLM-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* 
[[TMP6]], align 4 +; SLM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; SLM-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; SLM-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; SLM-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; SLM: middle.block: +; SLM-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; SLM: scalar.ph: +; SLM-NEXT: br label [[FOR_BODY:%.*]] +; SLM: for.cond.cleanup: +; SLM-NEXT: ret void +; SLM: for.body: +; SLM-NEXT: br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll --- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll @@ -160,10 +160,14 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, 
[[VECTOR_PH]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], +; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -180,95 +184,56 @@ ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr 
inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 -; CHECK-NEXT: [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 -; CHECK-NEXT: [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: [[TMP57:%.*]] = 
load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; CHECK-NEXT: [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0 -; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4 -; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP71]], align 4 -; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8 -; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP73]], align 4 -; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12 -; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP75]], align 4 -; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], -; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], -; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], -; 
CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_LOAD]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_LOAD4]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_LOAD5]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_LOAD6]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]] -; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]] -; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD1]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD2]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP16]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP17]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP18]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP19]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP23:%.*]] = 
getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0 +; CHECK-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP25]], align 4 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 4 +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP27]], align 4 +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 8 +; CHECK-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4 +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 12 +; CHECK-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP31]], align 4 +; CHECK-NEXT: [[TMP32:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER]], +; CHECK-NEXT: [[TMP33:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER7]], +; CHECK-NEXT: [[TMP34:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER8]], +; CHECK-NEXT: [[TMP35:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER9]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> [[WIDE_LOAD]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI13:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> [[WIDE_LOAD10]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI14:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> [[WIDE_LOAD11]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI15:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> [[WIDE_LOAD12]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP36]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP37]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI13]] +; CHECK-NEXT: [[TMP38]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI14]] +; CHECK-NEXT: [[TMP39]] 
= add <4 x i32> [[VEC_PHI6]], [[PREDPHI15]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], +; CHECK-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; CHECK-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]] -; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP37]], [[TMP36]] +; CHECK-NEXT: [[BIN_RDX16:%.*]] = add <4 x i32> [[TMP38]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX17:%.*]] = add <4 x i32> [[TMP39]], [[BIN_RDX16]] +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX17]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -287,7 +252,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ 
[[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -328,13 +293,25 @@ ; CHECK-NEXT: call void @init(i32* [[BASE]]) ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32*> poison, i32* [[BASE]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32*> [[BROADCAST_SPLATINSERT]], <4 x i32*> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <4 x i32*> poison, i32* [[BASE]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <4 x i32*> [[BROADCAST_SPLATINSERT11]], <4 x i32*> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <4 x i32*> poison, i32* [[BASE]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <4 x i32*> [[BROADCAST_SPLATINSERT14]], <4 x i32*> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <4 x i32*> poison, i32* [[BASE]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT18:%.*]] = shufflevector <4 x i32*> [[BROADCAST_SPLATINSERT17]], <4 x i32*> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP100:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP101:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP102:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP103:%.*]], 
[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], +; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -351,111 +328,44 @@ ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* 
[[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 -; CHECK-NEXT: [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], 
align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 -; CHECK-NEXT: [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; CHECK-NEXT: [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP65:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP66:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP67:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP68:%.*]] = insertelement <4 x i32> poison, i32 [[TMP64]], i32 0 -; CHECK-NEXT: [[TMP69:%.*]] = insertelement <4 x i32> [[TMP68]], i32 [[TMP65]], i32 1 -; CHECK-NEXT: [[TMP70:%.*]] = insertelement <4 x i32> [[TMP69]], i32 [[TMP66]], i32 2 -; CHECK-NEXT: [[TMP71:%.*]] = insertelement <4 x i32> [[TMP70]], i32 [[TMP67]], i32 3 -; CHECK-NEXT: [[TMP72:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP73:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP74:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP75:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP76:%.*]] = insertelement <4 x i32> poison, i32 [[TMP72]], i32 0 -; CHECK-NEXT: [[TMP77:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP73]], i32 1 -; CHECK-NEXT: [[TMP78:%.*]] = 
insertelement <4 x i32> [[TMP77]], i32 [[TMP74]], i32 2 -; CHECK-NEXT: [[TMP79:%.*]] = insertelement <4 x i32> [[TMP78]], i32 [[TMP75]], i32 3 -; CHECK-NEXT: [[TMP80:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP81:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP82:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP83:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP84:%.*]] = insertelement <4 x i32> poison, i32 [[TMP80]], i32 0 -; CHECK-NEXT: [[TMP85:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP81]], i32 1 -; CHECK-NEXT: [[TMP86:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP82]], i32 2 -; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP83]], i32 3 -; CHECK-NEXT: [[TMP88:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP89:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP90:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP91:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP92:%.*]] = insertelement <4 x i32> poison, i32 [[TMP88]], i32 0 -; CHECK-NEXT: [[TMP93:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP89]], i32 1 -; CHECK-NEXT: [[TMP94:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP90]], i32 2 -; CHECK-NEXT: [[TMP95:%.*]] = insertelement <4 x i32> [[TMP94]], i32 [[TMP91]], i32 3 -; CHECK-NEXT: [[TMP96:%.*]] = xor <4 x i1> [[TMP39]], -; CHECK-NEXT: [[TMP97:%.*]] = xor <4 x i1> [[TMP47]], -; CHECK-NEXT: [[TMP98:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP99:%.*]] = xor <4 x i1> [[TMP63]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[TMP71]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI4:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP79]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI5:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP87]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI6:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP95]], <4 x i32> zeroinitializer -; 
CHECK-NEXT: [[TMP100]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP101]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI4]] -; CHECK-NEXT: [[TMP102]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI5]] -; CHECK-NEXT: [[TMP103]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI6]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD1]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD2]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP16]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP17]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP18]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP19]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[BROADCAST_SPLAT]], i32 4, <4 x i1> , <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[BROADCAST_SPLAT12]], i32 4, <4 x i1> , <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER16:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[BROADCAST_SPLAT15]], i32 4, <4 x i1> , <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER19:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[BROADCAST_SPLAT18]], i32 4, <4 x i1> , <4 x i32> undef) +; CHECK-NEXT: [[TMP20:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER]], +; CHECK-NEXT: 
[[TMP21:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER7]], +; CHECK-NEXT: [[TMP22:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER8]], +; CHECK-NEXT: [[TMP23:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER9]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> [[WIDE_MASKED_GATHER10]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI20:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> [[WIDE_MASKED_GATHER13]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI21:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> [[WIDE_MASKED_GATHER16]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI22:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> [[WIDE_MASKED_GATHER19]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP25]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI20]] +; CHECK-NEXT: [[TMP26]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI21]] +; CHECK-NEXT: [[TMP27]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI22]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP104:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; CHECK-NEXT: br i1 [[TMP104]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], +; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP101]], [[TMP100]] -; CHECK-NEXT: [[BIN_RDX7:%.*]] = add <4 x i32> [[TMP102]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <4 x i32> [[TMP103]], [[BIN_RDX7]] -; CHECK-NEXT: [[TMP105:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX8]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP25]], [[TMP24]] +; CHECK-NEXT: [[BIN_RDX23:%.*]] = add <4 x i32> [[TMP26]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX24:%.*]] = add 
<4 x i32> [[TMP27]], [[BIN_RDX23]] +; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX24]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP105]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -473,7 +383,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP105]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -512,276 +422,241 @@ ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE33:%.*]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP180:%.*]], [[PRED_LOAD_CONTINUE33]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP181:%.*]], [[PRED_LOAD_CONTINUE33]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP182:%.*]], [[PRED_LOAD_CONTINUE33]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP183:%.*]], [[PRED_LOAD_CONTINUE33]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: 
[[TMP1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 -; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 -; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 -; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 -; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, 
i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 -; CHECK-NEXT: [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 -; CHECK-NEXT: 
[[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; CHECK-NEXT: [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = extractelement <4 x i1> [[TMP39]], i32 0 -; CHECK-NEXT: br i1 [[TMP64]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE39:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE39]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP136:%.*]], [[PRED_LOAD_CONTINUE39]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP137:%.*]], [[PRED_LOAD_CONTINUE39]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP138:%.*]], [[PRED_LOAD_CONTINUE39]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP139:%.*]], [[PRED_LOAD_CONTINUE39]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], +; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD1]] +; CHECK-NEXT: 
[[TMP3:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD2]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP0]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP1]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP2]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP3]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER]], i32 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: -; CHECK-NEXT: [[TMP65:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i16, i16* [[TMP65]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP67:%.*]] = bitcast i16* [[TMP66]] to i32* -; CHECK-NEXT: [[TMP68:%.*]] = load i32, i32* [[TMP67]], align 4 -; CHECK-NEXT: [[TMP69:%.*]] = insertelement <4 x i32> poison, i32 [[TMP68]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[BASE]] to i16* +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16* [[TMP7]] to i32* +; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP9]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP70:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP69]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP71:%.*]] = extractelement <4 x i1> [[TMP39]], i32 1 -; CHECK-NEXT: br i1 [[TMP71]], label [[PRED_LOAD_IF4:%.*]], label [[PRED_LOAD_CONTINUE5:%.*]] -; CHECK: pred.load.if4: 
-; CHECK-NEXT: [[TMP72:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds i16, i16* [[TMP72]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP74:%.*]] = bitcast i16* [[TMP73]] to i32* -; CHECK-NEXT: [[TMP75:%.*]] = load i32, i32* [[TMP74]], align 4 -; CHECK-NEXT: [[TMP76:%.*]] = insertelement <4 x i32> [[TMP70]], i32 [[TMP75]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE5]] -; CHECK: pred.load.continue5: -; CHECK-NEXT: [[TMP77:%.*]] = phi <4 x i32> [ [[TMP70]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP76]], [[PRED_LOAD_IF4]] ] -; CHECK-NEXT: [[TMP78:%.*]] = extractelement <4 x i1> [[TMP39]], i32 2 -; CHECK-NEXT: br i1 [[TMP78]], label [[PRED_LOAD_IF6:%.*]], label [[PRED_LOAD_CONTINUE7:%.*]] -; CHECK: pred.load.if6: -; CHECK-NEXT: [[TMP79:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds i16, i16* [[TMP79]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP81:%.*]] = bitcast i16* [[TMP80]] to i32* -; CHECK-NEXT: [[TMP82:%.*]] = load i32, i32* [[TMP81]], align 4 -; CHECK-NEXT: [[TMP83:%.*]] = insertelement <4 x i32> [[TMP77]], i32 [[TMP82]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE7]] -; CHECK: pred.load.continue7: -; CHECK-NEXT: [[TMP84:%.*]] = phi <4 x i32> [ [[TMP77]], [[PRED_LOAD_CONTINUE5]] ], [ [[TMP83]], [[PRED_LOAD_IF6]] ] -; CHECK-NEXT: [[TMP85:%.*]] = extractelement <4 x i1> [[TMP39]], i32 3 -; CHECK-NEXT: br i1 [[TMP85]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]] -; CHECK: pred.load.if8: -; CHECK-NEXT: [[TMP86:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP87:%.*]] = getelementptr inbounds i16, i16* [[TMP86]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP88:%.*]] = bitcast i16* [[TMP87]] to i32* -; CHECK-NEXT: [[TMP89:%.*]] = load i32, i32* [[TMP88]], align 4 -; CHECK-NEXT: [[TMP90:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP89]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE9]] -; CHECK: pred.load.continue9: -; CHECK-NEXT: [[TMP91:%.*]] = phi <4 x i32> [ 
[[TMP84]], [[PRED_LOAD_CONTINUE7]] ], [ [[TMP90]], [[PRED_LOAD_IF8]] ] -; CHECK-NEXT: [[TMP92:%.*]] = extractelement <4 x i1> [[TMP47]], i32 0 -; CHECK-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF10:%.*]], label [[PRED_LOAD_CONTINUE11:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP10]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER]], i32 1 +; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_LOAD_IF10:%.*]], label [[PRED_LOAD_CONTINUE11:%.*]] ; CHECK: pred.load.if10: -; CHECK-NEXT: [[TMP93:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds i16, i16* [[TMP93]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP95:%.*]] = bitcast i16* [[TMP94]] to i32* -; CHECK-NEXT: [[TMP96:%.*]] = load i32, i32* [[TMP95]], align 4 -; CHECK-NEXT: [[TMP97:%.*]] = insertelement <4 x i32> poison, i32 [[TMP96]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[BASE]] to i16* +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, i16* [[TMP14]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i16* [[TMP15]] to i32* +; CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP17]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE11]] ; CHECK: pred.load.continue11: -; CHECK-NEXT: [[TMP98:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE9]] ], [ [[TMP97]], [[PRED_LOAD_IF10]] ] -; CHECK-NEXT: [[TMP99:%.*]] = extractelement <4 x i1> [[TMP47]], i32 1 -; CHECK-NEXT: br i1 [[TMP99]], label [[PRED_LOAD_IF12:%.*]], label [[PRED_LOAD_CONTINUE13:%.*]] +; CHECK-NEXT: [[TMP19:%.*]] = phi <4 x i32> [ [[TMP11]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP18]], [[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER]], i32 2 +; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_LOAD_IF12:%.*]], label [[PRED_LOAD_CONTINUE13:%.*]] ; 
CHECK: pred.load.if12: -; CHECK-NEXT: [[TMP100:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP101:%.*]] = getelementptr inbounds i16, i16* [[TMP100]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP102:%.*]] = bitcast i16* [[TMP101]] to i32* -; CHECK-NEXT: [[TMP103:%.*]] = load i32, i32* [[TMP102]], align 4 -; CHECK-NEXT: [[TMP104:%.*]] = insertelement <4 x i32> [[TMP98]], i32 [[TMP103]], i32 1 +; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i32* [[BASE]] to i16* +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, i16* [[TMP22]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = bitcast i16* [[TMP23]] to i32* +; CHECK-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP25]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE13]] ; CHECK: pred.load.continue13: -; CHECK-NEXT: [[TMP105:%.*]] = phi <4 x i32> [ [[TMP98]], [[PRED_LOAD_CONTINUE11]] ], [ [[TMP104]], [[PRED_LOAD_IF12]] ] -; CHECK-NEXT: [[TMP106:%.*]] = extractelement <4 x i1> [[TMP47]], i32 2 -; CHECK-NEXT: br i1 [[TMP106]], label [[PRED_LOAD_IF14:%.*]], label [[PRED_LOAD_CONTINUE15:%.*]] +; CHECK-NEXT: [[TMP27:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_LOAD_CONTINUE11]] ], [ [[TMP26]], [[PRED_LOAD_IF12]] ] +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER]], i32 3 +; CHECK-NEXT: br i1 [[TMP28]], label [[PRED_LOAD_IF14:%.*]], label [[PRED_LOAD_CONTINUE15:%.*]] ; CHECK: pred.load.if14: -; CHECK-NEXT: [[TMP107:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP108:%.*]] = getelementptr inbounds i16, i16* [[TMP107]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP109:%.*]] = bitcast i16* [[TMP108]] to i32* -; CHECK-NEXT: [[TMP110:%.*]] = load i32, i32* [[TMP109]], align 4 -; CHECK-NEXT: [[TMP111:%.*]] = insertelement <4 x i32> [[TMP105]], i32 [[TMP110]], i32 2 +; CHECK-NEXT: [[TMP29:%.*]] = add i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP30:%.*]] = bitcast i32* [[BASE]] to i16* +; 
CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i16, i16* [[TMP30]], i64 [[TMP29]] +; CHECK-NEXT: [[TMP32:%.*]] = bitcast i16* [[TMP31]] to i32* +; CHECK-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <4 x i32> [[TMP27]], i32 [[TMP33]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE15]] ; CHECK: pred.load.continue15: -; CHECK-NEXT: [[TMP112:%.*]] = phi <4 x i32> [ [[TMP105]], [[PRED_LOAD_CONTINUE13]] ], [ [[TMP111]], [[PRED_LOAD_IF14]] ] -; CHECK-NEXT: [[TMP113:%.*]] = extractelement <4 x i1> [[TMP47]], i32 3 -; CHECK-NEXT: br i1 [[TMP113]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]] +; CHECK-NEXT: [[TMP35:%.*]] = phi <4 x i32> [ [[TMP27]], [[PRED_LOAD_CONTINUE13]] ], [ [[TMP34]], [[PRED_LOAD_IF14]] ] +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER7]], i32 0 +; CHECK-NEXT: br i1 [[TMP36]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]] ; CHECK: pred.load.if16: -; CHECK-NEXT: [[TMP114:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP115:%.*]] = getelementptr inbounds i16, i16* [[TMP114]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP116:%.*]] = bitcast i16* [[TMP115]] to i32* -; CHECK-NEXT: [[TMP117:%.*]] = load i32, i32* [[TMP116]], align 4 -; CHECK-NEXT: [[TMP118:%.*]] = insertelement <4 x i32> [[TMP112]], i32 [[TMP117]], i32 3 +; CHECK-NEXT: [[TMP37:%.*]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP38:%.*]] = bitcast i32* [[BASE]] to i16* +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds i16, i16* [[TMP38]], i64 [[TMP37]] +; CHECK-NEXT: [[TMP40:%.*]] = bitcast i16* [[TMP39]] to i32* +; CHECK-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 +; CHECK-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> poison, i32 [[TMP41]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE17]] ; CHECK: pred.load.continue17: -; CHECK-NEXT: [[TMP119:%.*]] = phi <4 x i32> [ [[TMP112]], [[PRED_LOAD_CONTINUE15]] ], [ [[TMP118]], [[PRED_LOAD_IF16]] ] -; 
CHECK-NEXT: [[TMP120:%.*]] = extractelement <4 x i1> [[TMP55]], i32 0 -; CHECK-NEXT: br i1 [[TMP120]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]] +; CHECK-NEXT: [[TMP43:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE15]] ], [ [[TMP42]], [[PRED_LOAD_IF16]] ] +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER7]], i32 1 +; CHECK-NEXT: br i1 [[TMP44]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]] ; CHECK: pred.load.if18: -; CHECK-NEXT: [[TMP121:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP122:%.*]] = getelementptr inbounds i16, i16* [[TMP121]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP123:%.*]] = bitcast i16* [[TMP122]] to i32* -; CHECK-NEXT: [[TMP124:%.*]] = load i32, i32* [[TMP123]], align 4 -; CHECK-NEXT: [[TMP125:%.*]] = insertelement <4 x i32> poison, i32 [[TMP124]], i32 0 +; CHECK-NEXT: [[TMP45:%.*]] = add i64 [[INDEX]], 5 +; CHECK-NEXT: [[TMP46:%.*]] = bitcast i32* [[BASE]] to i16* +; CHECK-NEXT: [[TMP47:%.*]] = getelementptr inbounds i16, i16* [[TMP46]], i64 [[TMP45]] +; CHECK-NEXT: [[TMP48:%.*]] = bitcast i16* [[TMP47]] to i32* +; CHECK-NEXT: [[TMP49:%.*]] = load i32, i32* [[TMP48]], align 4 +; CHECK-NEXT: [[TMP50:%.*]] = insertelement <4 x i32> [[TMP43]], i32 [[TMP49]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE19]] ; CHECK: pred.load.continue19: -; CHECK-NEXT: [[TMP126:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE17]] ], [ [[TMP125]], [[PRED_LOAD_IF18]] ] -; CHECK-NEXT: [[TMP127:%.*]] = extractelement <4 x i1> [[TMP55]], i32 1 -; CHECK-NEXT: br i1 [[TMP127]], label [[PRED_LOAD_IF20:%.*]], label [[PRED_LOAD_CONTINUE21:%.*]] +; CHECK-NEXT: [[TMP51:%.*]] = phi <4 x i32> [ [[TMP43]], [[PRED_LOAD_CONTINUE17]] ], [ [[TMP50]], [[PRED_LOAD_IF18]] ] +; CHECK-NEXT: [[TMP52:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER7]], i32 2 +; CHECK-NEXT: br i1 [[TMP52]], label [[PRED_LOAD_IF20:%.*]], label [[PRED_LOAD_CONTINUE21:%.*]] ; CHECK: pred.load.if20: -; CHECK-NEXT: 
[[TMP128:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP129:%.*]] = getelementptr inbounds i16, i16* [[TMP128]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP130:%.*]] = bitcast i16* [[TMP129]] to i32* -; CHECK-NEXT: [[TMP131:%.*]] = load i32, i32* [[TMP130]], align 4 -; CHECK-NEXT: [[TMP132:%.*]] = insertelement <4 x i32> [[TMP126]], i32 [[TMP131]], i32 1 +; CHECK-NEXT: [[TMP53:%.*]] = add i64 [[INDEX]], 6 +; CHECK-NEXT: [[TMP54:%.*]] = bitcast i32* [[BASE]] to i16* +; CHECK-NEXT: [[TMP55:%.*]] = getelementptr inbounds i16, i16* [[TMP54]], i64 [[TMP53]] +; CHECK-NEXT: [[TMP56:%.*]] = bitcast i16* [[TMP55]] to i32* +; CHECK-NEXT: [[TMP57:%.*]] = load i32, i32* [[TMP56]], align 4 +; CHECK-NEXT: [[TMP58:%.*]] = insertelement <4 x i32> [[TMP51]], i32 [[TMP57]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE21]] ; CHECK: pred.load.continue21: -; CHECK-NEXT: [[TMP133:%.*]] = phi <4 x i32> [ [[TMP126]], [[PRED_LOAD_CONTINUE19]] ], [ [[TMP132]], [[PRED_LOAD_IF20]] ] -; CHECK-NEXT: [[TMP134:%.*]] = extractelement <4 x i1> [[TMP55]], i32 2 -; CHECK-NEXT: br i1 [[TMP134]], label [[PRED_LOAD_IF22:%.*]], label [[PRED_LOAD_CONTINUE23:%.*]] +; CHECK-NEXT: [[TMP59:%.*]] = phi <4 x i32> [ [[TMP51]], [[PRED_LOAD_CONTINUE19]] ], [ [[TMP58]], [[PRED_LOAD_IF20]] ] +; CHECK-NEXT: [[TMP60:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER7]], i32 3 +; CHECK-NEXT: br i1 [[TMP60]], label [[PRED_LOAD_IF22:%.*]], label [[PRED_LOAD_CONTINUE23:%.*]] ; CHECK: pred.load.if22: -; CHECK-NEXT: [[TMP135:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP136:%.*]] = getelementptr inbounds i16, i16* [[TMP135]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP137:%.*]] = bitcast i16* [[TMP136]] to i32* -; CHECK-NEXT: [[TMP138:%.*]] = load i32, i32* [[TMP137]], align 4 -; CHECK-NEXT: [[TMP139:%.*]] = insertelement <4 x i32> [[TMP133]], i32 [[TMP138]], i32 2 +; CHECK-NEXT: [[TMP61:%.*]] = add i64 [[INDEX]], 7 +; CHECK-NEXT: [[TMP62:%.*]] = bitcast i32* [[BASE]] to i16* +; CHECK-NEXT: [[TMP63:%.*]] = 
getelementptr inbounds i16, i16* [[TMP62]], i64 [[TMP61]] +; CHECK-NEXT: [[TMP64:%.*]] = bitcast i16* [[TMP63]] to i32* +; CHECK-NEXT: [[TMP65:%.*]] = load i32, i32* [[TMP64]], align 4 +; CHECK-NEXT: [[TMP66:%.*]] = insertelement <4 x i32> [[TMP59]], i32 [[TMP65]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE23]] ; CHECK: pred.load.continue23: -; CHECK-NEXT: [[TMP140:%.*]] = phi <4 x i32> [ [[TMP133]], [[PRED_LOAD_CONTINUE21]] ], [ [[TMP139]], [[PRED_LOAD_IF22]] ] -; CHECK-NEXT: [[TMP141:%.*]] = extractelement <4 x i1> [[TMP55]], i32 3 -; CHECK-NEXT: br i1 [[TMP141]], label [[PRED_LOAD_IF24:%.*]], label [[PRED_LOAD_CONTINUE25:%.*]] +; CHECK-NEXT: [[TMP67:%.*]] = phi <4 x i32> [ [[TMP59]], [[PRED_LOAD_CONTINUE21]] ], [ [[TMP66]], [[PRED_LOAD_IF22]] ] +; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER8]], i32 0 +; CHECK-NEXT: br i1 [[TMP68]], label [[PRED_LOAD_IF24:%.*]], label [[PRED_LOAD_CONTINUE25:%.*]] ; CHECK: pred.load.if24: -; CHECK-NEXT: [[TMP142:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP143:%.*]] = getelementptr inbounds i16, i16* [[TMP142]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP144:%.*]] = bitcast i16* [[TMP143]] to i32* -; CHECK-NEXT: [[TMP145:%.*]] = load i32, i32* [[TMP144]], align 4 -; CHECK-NEXT: [[TMP146:%.*]] = insertelement <4 x i32> [[TMP140]], i32 [[TMP145]], i32 3 +; CHECK-NEXT: [[TMP69:%.*]] = add i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP70:%.*]] = bitcast i32* [[BASE]] to i16* +; CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds i16, i16* [[TMP70]], i64 [[TMP69]] +; CHECK-NEXT: [[TMP72:%.*]] = bitcast i16* [[TMP71]] to i32* +; CHECK-NEXT: [[TMP73:%.*]] = load i32, i32* [[TMP72]], align 4 +; CHECK-NEXT: [[TMP74:%.*]] = insertelement <4 x i32> poison, i32 [[TMP73]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE25]] ; CHECK: pred.load.continue25: -; CHECK-NEXT: [[TMP147:%.*]] = phi <4 x i32> [ [[TMP140]], [[PRED_LOAD_CONTINUE23]] ], [ [[TMP146]], [[PRED_LOAD_IF24]] ] -; CHECK-NEXT: [[TMP148:%.*]] = 
extractelement <4 x i1> [[TMP63]], i32 0 -; CHECK-NEXT: br i1 [[TMP148]], label [[PRED_LOAD_IF26:%.*]], label [[PRED_LOAD_CONTINUE27:%.*]] +; CHECK-NEXT: [[TMP75:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE23]] ], [ [[TMP74]], [[PRED_LOAD_IF24]] ] +; CHECK-NEXT: [[TMP76:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER8]], i32 1 +; CHECK-NEXT: br i1 [[TMP76]], label [[PRED_LOAD_IF26:%.*]], label [[PRED_LOAD_CONTINUE27:%.*]] ; CHECK: pred.load.if26: -; CHECK-NEXT: [[TMP149:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP150:%.*]] = getelementptr inbounds i16, i16* [[TMP149]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP151:%.*]] = bitcast i16* [[TMP150]] to i32* -; CHECK-NEXT: [[TMP152:%.*]] = load i32, i32* [[TMP151]], align 4 -; CHECK-NEXT: [[TMP153:%.*]] = insertelement <4 x i32> poison, i32 [[TMP152]], i32 0 +; CHECK-NEXT: [[TMP77:%.*]] = add i64 [[INDEX]], 9 +; CHECK-NEXT: [[TMP78:%.*]] = bitcast i32* [[BASE]] to i16* +; CHECK-NEXT: [[TMP79:%.*]] = getelementptr inbounds i16, i16* [[TMP78]], i64 [[TMP77]] +; CHECK-NEXT: [[TMP80:%.*]] = bitcast i16* [[TMP79]] to i32* +; CHECK-NEXT: [[TMP81:%.*]] = load i32, i32* [[TMP80]], align 4 +; CHECK-NEXT: [[TMP82:%.*]] = insertelement <4 x i32> [[TMP75]], i32 [[TMP81]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE27]] ; CHECK: pred.load.continue27: -; CHECK-NEXT: [[TMP154:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE25]] ], [ [[TMP153]], [[PRED_LOAD_IF26]] ] -; CHECK-NEXT: [[TMP155:%.*]] = extractelement <4 x i1> [[TMP63]], i32 1 -; CHECK-NEXT: br i1 [[TMP155]], label [[PRED_LOAD_IF28:%.*]], label [[PRED_LOAD_CONTINUE29:%.*]] +; CHECK-NEXT: [[TMP83:%.*]] = phi <4 x i32> [ [[TMP75]], [[PRED_LOAD_CONTINUE25]] ], [ [[TMP82]], [[PRED_LOAD_IF26]] ] +; CHECK-NEXT: [[TMP84:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER8]], i32 2 +; CHECK-NEXT: br i1 [[TMP84]], label [[PRED_LOAD_IF28:%.*]], label [[PRED_LOAD_CONTINUE29:%.*]] ; CHECK: pred.load.if28: -; CHECK-NEXT: [[TMP156:%.*]] = bitcast i32* 
[[BASE]] to i16* -; CHECK-NEXT: [[TMP157:%.*]] = getelementptr inbounds i16, i16* [[TMP156]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP158:%.*]] = bitcast i16* [[TMP157]] to i32* -; CHECK-NEXT: [[TMP159:%.*]] = load i32, i32* [[TMP158]], align 4 -; CHECK-NEXT: [[TMP160:%.*]] = insertelement <4 x i32> [[TMP154]], i32 [[TMP159]], i32 1 +; CHECK-NEXT: [[TMP85:%.*]] = add i64 [[INDEX]], 10 +; CHECK-NEXT: [[TMP86:%.*]] = bitcast i32* [[BASE]] to i16* +; CHECK-NEXT: [[TMP87:%.*]] = getelementptr inbounds i16, i16* [[TMP86]], i64 [[TMP85]] +; CHECK-NEXT: [[TMP88:%.*]] = bitcast i16* [[TMP87]] to i32* +; CHECK-NEXT: [[TMP89:%.*]] = load i32, i32* [[TMP88]], align 4 +; CHECK-NEXT: [[TMP90:%.*]] = insertelement <4 x i32> [[TMP83]], i32 [[TMP89]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE29]] ; CHECK: pred.load.continue29: -; CHECK-NEXT: [[TMP161:%.*]] = phi <4 x i32> [ [[TMP154]], [[PRED_LOAD_CONTINUE27]] ], [ [[TMP160]], [[PRED_LOAD_IF28]] ] -; CHECK-NEXT: [[TMP162:%.*]] = extractelement <4 x i1> [[TMP63]], i32 2 -; CHECK-NEXT: br i1 [[TMP162]], label [[PRED_LOAD_IF30:%.*]], label [[PRED_LOAD_CONTINUE31:%.*]] +; CHECK-NEXT: [[TMP91:%.*]] = phi <4 x i32> [ [[TMP83]], [[PRED_LOAD_CONTINUE27]] ], [ [[TMP90]], [[PRED_LOAD_IF28]] ] +; CHECK-NEXT: [[TMP92:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER8]], i32 3 +; CHECK-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF30:%.*]], label [[PRED_LOAD_CONTINUE31:%.*]] ; CHECK: pred.load.if30: -; CHECK-NEXT: [[TMP163:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP164:%.*]] = getelementptr inbounds i16, i16* [[TMP163]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP165:%.*]] = bitcast i16* [[TMP164]] to i32* -; CHECK-NEXT: [[TMP166:%.*]] = load i32, i32* [[TMP165]], align 4 -; CHECK-NEXT: [[TMP167:%.*]] = insertelement <4 x i32> [[TMP161]], i32 [[TMP166]], i32 2 +; CHECK-NEXT: [[TMP93:%.*]] = add i64 [[INDEX]], 11 +; CHECK-NEXT: [[TMP94:%.*]] = bitcast i32* [[BASE]] to i16* +; CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds i16, i16* 
[[TMP94]], i64 [[TMP93]] +; CHECK-NEXT: [[TMP96:%.*]] = bitcast i16* [[TMP95]] to i32* +; CHECK-NEXT: [[TMP97:%.*]] = load i32, i32* [[TMP96]], align 4 +; CHECK-NEXT: [[TMP98:%.*]] = insertelement <4 x i32> [[TMP91]], i32 [[TMP97]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE31]] ; CHECK: pred.load.continue31: -; CHECK-NEXT: [[TMP168:%.*]] = phi <4 x i32> [ [[TMP161]], [[PRED_LOAD_CONTINUE29]] ], [ [[TMP167]], [[PRED_LOAD_IF30]] ] -; CHECK-NEXT: [[TMP169:%.*]] = extractelement <4 x i1> [[TMP63]], i32 3 -; CHECK-NEXT: br i1 [[TMP169]], label [[PRED_LOAD_IF32:%.*]], label [[PRED_LOAD_CONTINUE33]] +; CHECK-NEXT: [[TMP99:%.*]] = phi <4 x i32> [ [[TMP91]], [[PRED_LOAD_CONTINUE29]] ], [ [[TMP98]], [[PRED_LOAD_IF30]] ] +; CHECK-NEXT: [[TMP100:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER9]], i32 0 +; CHECK-NEXT: br i1 [[TMP100]], label [[PRED_LOAD_IF32:%.*]], label [[PRED_LOAD_CONTINUE33:%.*]] ; CHECK: pred.load.if32: -; CHECK-NEXT: [[TMP170:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP171:%.*]] = getelementptr inbounds i16, i16* [[TMP170]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP172:%.*]] = bitcast i16* [[TMP171]] to i32* -; CHECK-NEXT: [[TMP173:%.*]] = load i32, i32* [[TMP172]], align 4 -; CHECK-NEXT: [[TMP174:%.*]] = insertelement <4 x i32> [[TMP168]], i32 [[TMP173]], i32 3 +; CHECK-NEXT: [[TMP101:%.*]] = add i64 [[INDEX]], 12 +; CHECK-NEXT: [[TMP102:%.*]] = bitcast i32* [[BASE]] to i16* +; CHECK-NEXT: [[TMP103:%.*]] = getelementptr inbounds i16, i16* [[TMP102]], i64 [[TMP101]] +; CHECK-NEXT: [[TMP104:%.*]] = bitcast i16* [[TMP103]] to i32* +; CHECK-NEXT: [[TMP105:%.*]] = load i32, i32* [[TMP104]], align 4 +; CHECK-NEXT: [[TMP106:%.*]] = insertelement <4 x i32> poison, i32 [[TMP105]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE33]] ; CHECK: pred.load.continue33: -; CHECK-NEXT: [[TMP175:%.*]] = phi <4 x i32> [ [[TMP168]], [[PRED_LOAD_CONTINUE31]] ], [ [[TMP174]], [[PRED_LOAD_IF32]] ] -; CHECK-NEXT: [[TMP176:%.*]] = xor <4 x i1> [[TMP39]], -; 
CHECK-NEXT: [[TMP177:%.*]] = xor <4 x i1> [[TMP47]], -; CHECK-NEXT: [[TMP178:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP179:%.*]] = xor <4 x i1> [[TMP63]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[TMP91]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI34:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP119]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI35:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP147]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI36:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP175]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP180]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP181]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI34]] -; CHECK-NEXT: [[TMP182]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI35]] -; CHECK-NEXT: [[TMP183]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI36]] +; CHECK-NEXT: [[TMP107:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE31]] ], [ [[TMP106]], [[PRED_LOAD_IF32]] ] +; CHECK-NEXT: [[TMP108:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER9]], i32 1 +; CHECK-NEXT: br i1 [[TMP108]], label [[PRED_LOAD_IF34:%.*]], label [[PRED_LOAD_CONTINUE35:%.*]] +; CHECK: pred.load.if34: +; CHECK-NEXT: [[TMP109:%.*]] = add i64 [[INDEX]], 13 +; CHECK-NEXT: [[TMP110:%.*]] = bitcast i32* [[BASE]] to i16* +; CHECK-NEXT: [[TMP111:%.*]] = getelementptr inbounds i16, i16* [[TMP110]], i64 [[TMP109]] +; CHECK-NEXT: [[TMP112:%.*]] = bitcast i16* [[TMP111]] to i32* +; CHECK-NEXT: [[TMP113:%.*]] = load i32, i32* [[TMP112]], align 4 +; CHECK-NEXT: [[TMP114:%.*]] = insertelement <4 x i32> [[TMP107]], i32 [[TMP113]], i32 1 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE35]] +; CHECK: pred.load.continue35: +; CHECK-NEXT: [[TMP115:%.*]] = phi <4 x i32> [ [[TMP107]], [[PRED_LOAD_CONTINUE33]] ], [ [[TMP114]], [[PRED_LOAD_IF34]] ] +; CHECK-NEXT: [[TMP116:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER9]], i32 2 +; CHECK-NEXT: br i1 [[TMP116]], label [[PRED_LOAD_IF36:%.*]], label 
[[PRED_LOAD_CONTINUE37:%.*]] +; CHECK: pred.load.if36: +; CHECK-NEXT: [[TMP117:%.*]] = add i64 [[INDEX]], 14 +; CHECK-NEXT: [[TMP118:%.*]] = bitcast i32* [[BASE]] to i16* +; CHECK-NEXT: [[TMP119:%.*]] = getelementptr inbounds i16, i16* [[TMP118]], i64 [[TMP117]] +; CHECK-NEXT: [[TMP120:%.*]] = bitcast i16* [[TMP119]] to i32* +; CHECK-NEXT: [[TMP121:%.*]] = load i32, i32* [[TMP120]], align 4 +; CHECK-NEXT: [[TMP122:%.*]] = insertelement <4 x i32> [[TMP115]], i32 [[TMP121]], i32 2 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE37]] +; CHECK: pred.load.continue37: +; CHECK-NEXT: [[TMP123:%.*]] = phi <4 x i32> [ [[TMP115]], [[PRED_LOAD_CONTINUE35]] ], [ [[TMP122]], [[PRED_LOAD_IF36]] ] +; CHECK-NEXT: [[TMP124:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER9]], i32 3 +; CHECK-NEXT: br i1 [[TMP124]], label [[PRED_LOAD_IF38:%.*]], label [[PRED_LOAD_CONTINUE39]] +; CHECK: pred.load.if38: +; CHECK-NEXT: [[TMP125:%.*]] = add i64 [[INDEX]], 15 +; CHECK-NEXT: [[TMP126:%.*]] = bitcast i32* [[BASE]] to i16* +; CHECK-NEXT: [[TMP127:%.*]] = getelementptr inbounds i16, i16* [[TMP126]], i64 [[TMP125]] +; CHECK-NEXT: [[TMP128:%.*]] = bitcast i16* [[TMP127]] to i32* +; CHECK-NEXT: [[TMP129:%.*]] = load i32, i32* [[TMP128]], align 4 +; CHECK-NEXT: [[TMP130:%.*]] = insertelement <4 x i32> [[TMP123]], i32 [[TMP129]], i32 3 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE39]] +; CHECK: pred.load.continue39: +; CHECK-NEXT: [[TMP131:%.*]] = phi <4 x i32> [ [[TMP123]], [[PRED_LOAD_CONTINUE37]] ], [ [[TMP130]], [[PRED_LOAD_IF38]] ] +; CHECK-NEXT: [[TMP132:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER]], +; CHECK-NEXT: [[TMP133:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER7]], +; CHECK-NEXT: [[TMP134:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER8]], +; CHECK-NEXT: [[TMP135:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER9]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> [[TMP35]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI40:%.*]] = select <4 x i1> 
[[WIDE_MASKED_GATHER7]], <4 x i32> [[TMP67]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI41:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> [[TMP99]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI42:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> [[TMP131]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP136]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP137]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI40]] +; CHECK-NEXT: [[TMP138]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI41]] +; CHECK-NEXT: [[TMP139]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI42]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP184:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; CHECK-NEXT: br i1 [[TMP184]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], +; CHECK-NEXT: [[TMP140:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; CHECK-NEXT: br i1 [[TMP140]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP181]], [[TMP180]] -; CHECK-NEXT: [[BIN_RDX37:%.*]] = add <4 x i32> [[TMP182]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX38:%.*]] = add <4 x i32> [[TMP183]], [[BIN_RDX37]] -; CHECK-NEXT: [[TMP185:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX38]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP137]], [[TMP136]] +; CHECK-NEXT: [[BIN_RDX43:%.*]] = add <4 x i32> [[TMP138]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX44:%.*]] = add <4 x i32> [[TMP139]], [[BIN_RDX43]] +; CHECK-NEXT: [[TMP141:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX44]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: 
[[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP185]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP141]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -802,7 +677,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP185]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP141]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -850,10 +725,14 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP84:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ 
zeroinitializer, [[VECTOR_PH]] ], [ [[TMP40:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], +; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 2 @@ -870,95 +749,56 @@ ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds 
i1, i1* [[TEST_BASE]], i64 [[TMP16]] -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> poison, i1 [[TMP33]], i32 0 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 1 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 2 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x i1> [[TMP39]], i1 [[TMP36]], i32 3 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> poison, i1 [[TMP41]], i32 0 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 1 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 2 -; CHECK-NEXT: [[TMP48:%.*]] = insertelement <4 x i1> [[TMP47]], i1 [[TMP44]], i32 3 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> poison, i1 [[TMP49]], i32 0 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 1 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 2 -; CHECK-NEXT: [[TMP56:%.*]] = insertelement <4 x i1> [[TMP55]], i1 [[TMP52]], i32 3 -; CHECK-NEXT: [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; CHECK-NEXT: [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1 -; 
CHECK-NEXT: [[TMP60:%.*]] = load i1, i1* [[TMP32]], align 1 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> poison, i1 [[TMP57]], i32 0 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 1 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 2 -; CHECK-NEXT: [[TMP64:%.*]] = insertelement <4 x i1> [[TMP63]], i1 [[TMP60]], i32 3 -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, i32* [[TMP65]], i32 0 -; CHECK-NEXT: [[TMP70:%.*]] = bitcast i32* [[TMP69]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP70]], i32 4, <4 x i1> [[TMP40]], <4 x i32> poison) -; CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[TMP65]], i32 4 -; CHECK-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP72]], i32 4, <4 x i1> [[TMP48]], <4 x i32> poison) -; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[TMP65]], i32 8 -; CHECK-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP74]], i32 4, <4 x i1> [[TMP56]], <4 x i32> poison) -; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TMP65]], i32 12 -; CHECK-NEXT: [[TMP76:%.*]] = bitcast i32* [[TMP75]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP76]], i32 4, <4 x i1> [[TMP64]], <4 x i32> poison) -; CHECK-NEXT: [[TMP77:%.*]] 
= xor <4 x i1> [[TMP40]], -; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP48]], -; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP56]], -; CHECK-NEXT: [[TMP80:%.*]] = xor <4 x i1> [[TMP64]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP40]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP48]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP56]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP64]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]] -; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]] -; CHECK-NEXT: [[TMP84]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD1]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD2]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP17]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP18]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP19]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP20]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* 
[[BASE]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = bitcast i32* [[TMP25]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP26]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> poison) +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 4 +; CHECK-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP27]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP28]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> poison) +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 8 +; CHECK-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP29]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP30]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> poison) +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 12 +; CHECK-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP32]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> poison) +; CHECK-NEXT: [[TMP33:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER]], +; CHECK-NEXT: [[TMP34:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER7]], +; CHECK-NEXT: [[TMP35:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER8]], +; CHECK-NEXT: [[TMP36:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER9]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer +; CHECK-NEXT: 
[[PREDPHI13:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> [[WIDE_MASKED_LOAD10]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI14:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> [[WIDE_MASKED_LOAD11]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI15:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> [[WIDE_MASKED_LOAD12]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP37]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP38]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI13]] +; CHECK-NEXT: [[TMP39]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI14]] +; CHECK-NEXT: [[TMP40]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI15]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP85:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP85]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], +; CHECK-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP41]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP82]], [[TMP81]] -; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP84]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP86:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP38]], [[TMP37]] +; CHECK-NEXT: [[BIN_RDX16:%.*]] = add <4 x i32> [[TMP39]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX17:%.*]] = add <4 x i32> [[TMP40]], [[BIN_RDX16]] +; CHECK-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX17]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 
[[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP42]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -977,7 +817,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], [[MIN_N]] ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP42]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -1022,10 +862,14 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP38:%.*]], 
[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], +; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1024, [[INDEX]] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 1 @@ -1043,95 +887,56 @@ ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 13 ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 14 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 15 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = 
getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 -; CHECK-NEXT: [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 -; CHECK-NEXT: [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: 
[[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; CHECK-NEXT: [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0 -; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4 -; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) -; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8 -; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) -; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12 -; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> 
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) -; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], -; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], -; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]] -; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]] -; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD1]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD2]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP16]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP17]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP18]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <4 x i1> 
@llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP19]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0 +; CHECK-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP25]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> poison) +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 4 +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP27]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> poison) +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 8 +; CHECK-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP29]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> poison) +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 12 +; CHECK-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP31]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> poison) +; CHECK-NEXT: [[TMP32:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER]], +; CHECK-NEXT: [[TMP33:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER7]], +; CHECK-NEXT: [[TMP34:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER8]], +; CHECK-NEXT: [[TMP35:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER9]], +; 
CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI13:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> [[WIDE_MASKED_LOAD10]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI14:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> [[WIDE_MASKED_LOAD11]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI15:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> [[WIDE_MASKED_LOAD12]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP36]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP37]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI13]] +; CHECK-NEXT: [[TMP38]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI14]] +; CHECK-NEXT: [[TMP39]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI15]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 3072 -; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], +; CHECK-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 3072 +; CHECK-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]] -; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP37]], [[TMP36]] +; CHECK-NEXT: [[BIN_RDX16:%.*]] = add <4 x i32> [[TMP38]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX17:%.*]] = add <4 x i32> [[TMP39]], [[BIN_RDX16]] +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX17]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 3072, 3072 ; CHECK-NEXT: br i1 
[[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 1024, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -1150,7 +955,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -1244,245 +1049,210 @@ ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE33:%.*]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP148:%.*]], [[PRED_LOAD_CONTINUE33]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP149:%.*]], [[PRED_LOAD_CONTINUE33]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP150:%.*]], [[PRED_LOAD_CONTINUE33]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP151:%.*]], [[PRED_LOAD_CONTINUE33]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE39:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE39]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] 
= phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP104:%.*]], [[PRED_LOAD_CONTINUE39]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP105:%.*]], [[PRED_LOAD_CONTINUE39]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP106:%.*]], [[PRED_LOAD_CONTINUE39]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP107:%.*]], [[PRED_LOAD_CONTINUE39]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], +; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2 -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 4 -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 6 -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 8 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 10 -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 12 -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 14 -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 16 -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 18 -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 20 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 22 -; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 24 -; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 26 -; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 28 -; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 30 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 
[[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, 
i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 -; CHECK-NEXT: [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 -; CHECK-NEXT: [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; CHECK-NEXT: [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = extractelement <4 x i1> [[TMP39]], i32 0 -; CHECK-NEXT: br i1 [[TMP64]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD1]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x 
i64> [[STEP_ADD2]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP0]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP1]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP2]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP3]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER]], i32 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP66:%.*]] = load i32, i32* [[TMP65]], align 4 -; CHECK-NEXT: [[TMP67:%.*]] = insertelement <4 x i32> poison, i32 [[TMP66]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP68:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP67]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP69:%.*]] = extractelement <4 x i1> [[TMP39]], i32 1 -; CHECK-NEXT: br i1 [[TMP69]], label [[PRED_LOAD_IF4:%.*]], label [[PRED_LOAD_CONTINUE5:%.*]] -; CHECK: pred.load.if4: -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP71:%.*]] = load i32, i32* [[TMP70]], align 4 -; CHECK-NEXT: [[TMP72:%.*]] = insertelement <4 x i32> [[TMP68]], i32 [[TMP71]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE5]] -; CHECK: 
pred.load.continue5: -; CHECK-NEXT: [[TMP73:%.*]] = phi <4 x i32> [ [[TMP68]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP72]], [[PRED_LOAD_IF4]] ] -; CHECK-NEXT: [[TMP74:%.*]] = extractelement <4 x i1> [[TMP39]], i32 2 -; CHECK-NEXT: br i1 [[TMP74]], label [[PRED_LOAD_IF6:%.*]], label [[PRED_LOAD_CONTINUE7:%.*]] -; CHECK: pred.load.if6: -; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP76:%.*]] = load i32, i32* [[TMP75]], align 4 -; CHECK-NEXT: [[TMP77:%.*]] = insertelement <4 x i32> [[TMP73]], i32 [[TMP76]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE7]] -; CHECK: pred.load.continue7: -; CHECK-NEXT: [[TMP78:%.*]] = phi <4 x i32> [ [[TMP73]], [[PRED_LOAD_CONTINUE5]] ], [ [[TMP77]], [[PRED_LOAD_IF6]] ] -; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i1> [[TMP39]], i32 3 -; CHECK-NEXT: br i1 [[TMP79]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]] -; CHECK: pred.load.if8: -; CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP81:%.*]] = load i32, i32* [[TMP80]], align 4 -; CHECK-NEXT: [[TMP82:%.*]] = insertelement <4 x i32> [[TMP78]], i32 [[TMP81]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE9]] -; CHECK: pred.load.continue9: -; CHECK-NEXT: [[TMP83:%.*]] = phi <4 x i32> [ [[TMP78]], [[PRED_LOAD_CONTINUE7]] ], [ [[TMP82]], [[PRED_LOAD_IF8]] ] -; CHECK-NEXT: [[TMP84:%.*]] = extractelement <4 x i1> [[TMP47]], i32 0 -; CHECK-NEXT: br i1 [[TMP84]], label [[PRED_LOAD_IF10:%.*]], label [[PRED_LOAD_CONTINUE11:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER]], i32 1 +; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF10:%.*]], label [[PRED_LOAD_CONTINUE11:%.*]] ; CHECK: pred.load.if10: -; CHECK-NEXT: [[TMP85:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP86:%.*]] = load 
i32, i32* [[TMP85]], align 4 -; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x i32> poison, i32 [[TMP86]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP13]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE11]] ; CHECK: pred.load.continue11: -; CHECK-NEXT: [[TMP88:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE9]] ], [ [[TMP87]], [[PRED_LOAD_IF10]] ] -; CHECK-NEXT: [[TMP89:%.*]] = extractelement <4 x i1> [[TMP47]], i32 1 -; CHECK-NEXT: br i1 [[TMP89]], label [[PRED_LOAD_IF12:%.*]], label [[PRED_LOAD_CONTINUE13:%.*]] +; CHECK-NEXT: [[TMP15:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER]], i32 2 +; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF12:%.*]], label [[PRED_LOAD_CONTINUE13:%.*]] ; CHECK: pred.load.if12: -; CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP91:%.*]] = load i32, i32* [[TMP90]], align 4 -; CHECK-NEXT: [[TMP92:%.*]] = insertelement <4 x i32> [[TMP88]], i32 [[TMP91]], i32 1 +; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 4 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP19]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE13]] ; CHECK: pred.load.continue13: -; CHECK-NEXT: [[TMP93:%.*]] = phi <4 x i32> [ [[TMP88]], [[PRED_LOAD_CONTINUE11]] ], [ [[TMP92]], [[PRED_LOAD_IF12]] ] -; CHECK-NEXT: [[TMP94:%.*]] = extractelement <4 x i1> [[TMP47]], i32 2 -; CHECK-NEXT: br i1 [[TMP94]], label [[PRED_LOAD_IF14:%.*]], label 
[[PRED_LOAD_CONTINUE15:%.*]] +; CHECK-NEXT: [[TMP21:%.*]] = phi <4 x i32> [ [[TMP15]], [[PRED_LOAD_CONTINUE11]] ], [ [[TMP20]], [[PRED_LOAD_IF12]] ] +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER]], i32 3 +; CHECK-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF14:%.*]], label [[PRED_LOAD_CONTINUE15:%.*]] ; CHECK: pred.load.if14: -; CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP96:%.*]] = load i32, i32* [[TMP95]], align 4 -; CHECK-NEXT: [[TMP97:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP96]], i32 2 +; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 6 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP21]], i32 [[TMP25]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE15]] ; CHECK: pred.load.continue15: -; CHECK-NEXT: [[TMP98:%.*]] = phi <4 x i32> [ [[TMP93]], [[PRED_LOAD_CONTINUE13]] ], [ [[TMP97]], [[PRED_LOAD_IF14]] ] -; CHECK-NEXT: [[TMP99:%.*]] = extractelement <4 x i1> [[TMP47]], i32 3 -; CHECK-NEXT: br i1 [[TMP99]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]] +; CHECK-NEXT: [[TMP27:%.*]] = phi <4 x i32> [ [[TMP21]], [[PRED_LOAD_CONTINUE13]] ], [ [[TMP26]], [[PRED_LOAD_IF14]] ] +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER7]], i32 0 +; CHECK-NEXT: br i1 [[TMP28]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]] ; CHECK: pred.load.if16: -; CHECK-NEXT: [[TMP100:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP101:%.*]] = load i32, i32* [[TMP100]], align 4 -; CHECK-NEXT: [[TMP102:%.*]] = insertelement <4 x i32> [[TMP98]], i32 [[TMP101]], i32 3 +; CHECK-NEXT: [[TMP29:%.*]] = add i64 [[OFFSET_IDX]], 8 +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP29]] +; CHECK-NEXT: 
[[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4 +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> poison, i32 [[TMP31]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE17]] ; CHECK: pred.load.continue17: -; CHECK-NEXT: [[TMP103:%.*]] = phi <4 x i32> [ [[TMP98]], [[PRED_LOAD_CONTINUE15]] ], [ [[TMP102]], [[PRED_LOAD_IF16]] ] -; CHECK-NEXT: [[TMP104:%.*]] = extractelement <4 x i1> [[TMP55]], i32 0 -; CHECK-NEXT: br i1 [[TMP104]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]] +; CHECK-NEXT: [[TMP33:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE15]] ], [ [[TMP32]], [[PRED_LOAD_IF16]] ] +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER7]], i32 1 +; CHECK-NEXT: br i1 [[TMP34]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]] ; CHECK: pred.load.if18: -; CHECK-NEXT: [[TMP105:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP106:%.*]] = load i32, i32* [[TMP105]], align 4 -; CHECK-NEXT: [[TMP107:%.*]] = insertelement <4 x i32> poison, i32 [[TMP106]], i32 0 +; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[OFFSET_IDX]], 10 +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP35]] +; CHECK-NEXT: [[TMP37:%.*]] = load i32, i32* [[TMP36]], align 4 +; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i32> [[TMP33]], i32 [[TMP37]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE19]] ; CHECK: pred.load.continue19: -; CHECK-NEXT: [[TMP108:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE17]] ], [ [[TMP107]], [[PRED_LOAD_IF18]] ] -; CHECK-NEXT: [[TMP109:%.*]] = extractelement <4 x i1> [[TMP55]], i32 1 -; CHECK-NEXT: br i1 [[TMP109]], label [[PRED_LOAD_IF20:%.*]], label [[PRED_LOAD_CONTINUE21:%.*]] +; CHECK-NEXT: [[TMP39:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_LOAD_CONTINUE17]] ], [ [[TMP38]], [[PRED_LOAD_IF18]] ] +; CHECK-NEXT: [[TMP40:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER7]], i32 2 +; CHECK-NEXT: br i1 [[TMP40]], label 
[[PRED_LOAD_IF20:%.*]], label [[PRED_LOAD_CONTINUE21:%.*]] ; CHECK: pred.load.if20: -; CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP111:%.*]] = load i32, i32* [[TMP110]], align 4 -; CHECK-NEXT: [[TMP112:%.*]] = insertelement <4 x i32> [[TMP108]], i32 [[TMP111]], i32 1 +; CHECK-NEXT: [[TMP41:%.*]] = add i64 [[OFFSET_IDX]], 12 +; CHECK-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP41]] +; CHECK-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4 +; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i32> [[TMP39]], i32 [[TMP43]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE21]] ; CHECK: pred.load.continue21: -; CHECK-NEXT: [[TMP113:%.*]] = phi <4 x i32> [ [[TMP108]], [[PRED_LOAD_CONTINUE19]] ], [ [[TMP112]], [[PRED_LOAD_IF20]] ] -; CHECK-NEXT: [[TMP114:%.*]] = extractelement <4 x i1> [[TMP55]], i32 2 -; CHECK-NEXT: br i1 [[TMP114]], label [[PRED_LOAD_IF22:%.*]], label [[PRED_LOAD_CONTINUE23:%.*]] +; CHECK-NEXT: [[TMP45:%.*]] = phi <4 x i32> [ [[TMP39]], [[PRED_LOAD_CONTINUE19]] ], [ [[TMP44]], [[PRED_LOAD_IF20]] ] +; CHECK-NEXT: [[TMP46:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER7]], i32 3 +; CHECK-NEXT: br i1 [[TMP46]], label [[PRED_LOAD_IF22:%.*]], label [[PRED_LOAD_CONTINUE23:%.*]] ; CHECK: pred.load.if22: -; CHECK-NEXT: [[TMP115:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP116:%.*]] = load i32, i32* [[TMP115]], align 4 -; CHECK-NEXT: [[TMP117:%.*]] = insertelement <4 x i32> [[TMP113]], i32 [[TMP116]], i32 2 +; CHECK-NEXT: [[TMP47:%.*]] = add i64 [[OFFSET_IDX]], 14 +; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP47]] +; CHECK-NEXT: [[TMP49:%.*]] = load i32, i32* [[TMP48]], align 4 +; CHECK-NEXT: [[TMP50:%.*]] = insertelement <4 x i32> [[TMP45]], i32 [[TMP49]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE23]] ; CHECK: pred.load.continue23: -; CHECK-NEXT: [[TMP118:%.*]] = phi <4 x i32> [ 
[[TMP113]], [[PRED_LOAD_CONTINUE21]] ], [ [[TMP117]], [[PRED_LOAD_IF22]] ] -; CHECK-NEXT: [[TMP119:%.*]] = extractelement <4 x i1> [[TMP55]], i32 3 -; CHECK-NEXT: br i1 [[TMP119]], label [[PRED_LOAD_IF24:%.*]], label [[PRED_LOAD_CONTINUE25:%.*]] +; CHECK-NEXT: [[TMP51:%.*]] = phi <4 x i32> [ [[TMP45]], [[PRED_LOAD_CONTINUE21]] ], [ [[TMP50]], [[PRED_LOAD_IF22]] ] +; CHECK-NEXT: [[TMP52:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER8]], i32 0 +; CHECK-NEXT: br i1 [[TMP52]], label [[PRED_LOAD_IF24:%.*]], label [[PRED_LOAD_CONTINUE25:%.*]] ; CHECK: pred.load.if24: -; CHECK-NEXT: [[TMP120:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP121:%.*]] = load i32, i32* [[TMP120]], align 4 -; CHECK-NEXT: [[TMP122:%.*]] = insertelement <4 x i32> [[TMP118]], i32 [[TMP121]], i32 3 +; CHECK-NEXT: [[TMP53:%.*]] = add i64 [[OFFSET_IDX]], 16 +; CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP53]] +; CHECK-NEXT: [[TMP55:%.*]] = load i32, i32* [[TMP54]], align 4 +; CHECK-NEXT: [[TMP56:%.*]] = insertelement <4 x i32> poison, i32 [[TMP55]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE25]] ; CHECK: pred.load.continue25: -; CHECK-NEXT: [[TMP123:%.*]] = phi <4 x i32> [ [[TMP118]], [[PRED_LOAD_CONTINUE23]] ], [ [[TMP122]], [[PRED_LOAD_IF24]] ] -; CHECK-NEXT: [[TMP124:%.*]] = extractelement <4 x i1> [[TMP63]], i32 0 -; CHECK-NEXT: br i1 [[TMP124]], label [[PRED_LOAD_IF26:%.*]], label [[PRED_LOAD_CONTINUE27:%.*]] +; CHECK-NEXT: [[TMP57:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE23]] ], [ [[TMP56]], [[PRED_LOAD_IF24]] ] +; CHECK-NEXT: [[TMP58:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER8]], i32 1 +; CHECK-NEXT: br i1 [[TMP58]], label [[PRED_LOAD_IF26:%.*]], label [[PRED_LOAD_CONTINUE27:%.*]] ; CHECK: pred.load.if26: -; CHECK-NEXT: [[TMP125:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP126:%.*]] = load i32, i32* [[TMP125]], align 4 -; CHECK-NEXT: [[TMP127:%.*]] 
= insertelement <4 x i32> poison, i32 [[TMP126]], i32 0 +; CHECK-NEXT: [[TMP59:%.*]] = add i64 [[OFFSET_IDX]], 18 +; CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP59]] +; CHECK-NEXT: [[TMP61:%.*]] = load i32, i32* [[TMP60]], align 4 +; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i32> [[TMP57]], i32 [[TMP61]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE27]] ; CHECK: pred.load.continue27: -; CHECK-NEXT: [[TMP128:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE25]] ], [ [[TMP127]], [[PRED_LOAD_IF26]] ] -; CHECK-NEXT: [[TMP129:%.*]] = extractelement <4 x i1> [[TMP63]], i32 1 -; CHECK-NEXT: br i1 [[TMP129]], label [[PRED_LOAD_IF28:%.*]], label [[PRED_LOAD_CONTINUE29:%.*]] +; CHECK-NEXT: [[TMP63:%.*]] = phi <4 x i32> [ [[TMP57]], [[PRED_LOAD_CONTINUE25]] ], [ [[TMP62]], [[PRED_LOAD_IF26]] ] +; CHECK-NEXT: [[TMP64:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER8]], i32 2 +; CHECK-NEXT: br i1 [[TMP64]], label [[PRED_LOAD_IF28:%.*]], label [[PRED_LOAD_CONTINUE29:%.*]] ; CHECK: pred.load.if28: -; CHECK-NEXT: [[TMP130:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP131:%.*]] = load i32, i32* [[TMP130]], align 4 -; CHECK-NEXT: [[TMP132:%.*]] = insertelement <4 x i32> [[TMP128]], i32 [[TMP131]], i32 1 +; CHECK-NEXT: [[TMP65:%.*]] = add i64 [[OFFSET_IDX]], 20 +; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP65]] +; CHECK-NEXT: [[TMP67:%.*]] = load i32, i32* [[TMP66]], align 4 +; CHECK-NEXT: [[TMP68:%.*]] = insertelement <4 x i32> [[TMP63]], i32 [[TMP67]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE29]] ; CHECK: pred.load.continue29: -; CHECK-NEXT: [[TMP133:%.*]] = phi <4 x i32> [ [[TMP128]], [[PRED_LOAD_CONTINUE27]] ], [ [[TMP132]], [[PRED_LOAD_IF28]] ] -; CHECK-NEXT: [[TMP134:%.*]] = extractelement <4 x i1> [[TMP63]], i32 2 -; CHECK-NEXT: br i1 [[TMP134]], label [[PRED_LOAD_IF30:%.*]], label [[PRED_LOAD_CONTINUE31:%.*]] +; CHECK-NEXT: [[TMP69:%.*]] = 
phi <4 x i32> [ [[TMP63]], [[PRED_LOAD_CONTINUE27]] ], [ [[TMP68]], [[PRED_LOAD_IF28]] ] +; CHECK-NEXT: [[TMP70:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER8]], i32 3 +; CHECK-NEXT: br i1 [[TMP70]], label [[PRED_LOAD_IF30:%.*]], label [[PRED_LOAD_CONTINUE31:%.*]] ; CHECK: pred.load.if30: -; CHECK-NEXT: [[TMP135:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP136:%.*]] = load i32, i32* [[TMP135]], align 4 -; CHECK-NEXT: [[TMP137:%.*]] = insertelement <4 x i32> [[TMP133]], i32 [[TMP136]], i32 2 +; CHECK-NEXT: [[TMP71:%.*]] = add i64 [[OFFSET_IDX]], 22 +; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP71]] +; CHECK-NEXT: [[TMP73:%.*]] = load i32, i32* [[TMP72]], align 4 +; CHECK-NEXT: [[TMP74:%.*]] = insertelement <4 x i32> [[TMP69]], i32 [[TMP73]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE31]] ; CHECK: pred.load.continue31: -; CHECK-NEXT: [[TMP138:%.*]] = phi <4 x i32> [ [[TMP133]], [[PRED_LOAD_CONTINUE29]] ], [ [[TMP137]], [[PRED_LOAD_IF30]] ] -; CHECK-NEXT: [[TMP139:%.*]] = extractelement <4 x i1> [[TMP63]], i32 3 -; CHECK-NEXT: br i1 [[TMP139]], label [[PRED_LOAD_IF32:%.*]], label [[PRED_LOAD_CONTINUE33]] +; CHECK-NEXT: [[TMP75:%.*]] = phi <4 x i32> [ [[TMP69]], [[PRED_LOAD_CONTINUE29]] ], [ [[TMP74]], [[PRED_LOAD_IF30]] ] +; CHECK-NEXT: [[TMP76:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER9]], i32 0 +; CHECK-NEXT: br i1 [[TMP76]], label [[PRED_LOAD_IF32:%.*]], label [[PRED_LOAD_CONTINUE33:%.*]] ; CHECK: pred.load.if32: -; CHECK-NEXT: [[TMP140:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP141:%.*]] = load i32, i32* [[TMP140]], align 4 -; CHECK-NEXT: [[TMP142:%.*]] = insertelement <4 x i32> [[TMP138]], i32 [[TMP141]], i32 3 +; CHECK-NEXT: [[TMP77:%.*]] = add i64 [[OFFSET_IDX]], 24 +; CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP77]] +; CHECK-NEXT: [[TMP79:%.*]] = load i32, i32* [[TMP78]], align 4 +; 
CHECK-NEXT: [[TMP80:%.*]] = insertelement <4 x i32> poison, i32 [[TMP79]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE33]] ; CHECK: pred.load.continue33: -; CHECK-NEXT: [[TMP143:%.*]] = phi <4 x i32> [ [[TMP138]], [[PRED_LOAD_CONTINUE31]] ], [ [[TMP142]], [[PRED_LOAD_IF32]] ] -; CHECK-NEXT: [[TMP144:%.*]] = xor <4 x i1> [[TMP39]], -; CHECK-NEXT: [[TMP145:%.*]] = xor <4 x i1> [[TMP47]], -; CHECK-NEXT: [[TMP146:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP147:%.*]] = xor <4 x i1> [[TMP63]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[TMP83]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI34:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP103]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI35:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP123]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI36:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP143]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP148]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP149]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI34]] -; CHECK-NEXT: [[TMP150]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI35]] -; CHECK-NEXT: [[TMP151]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI36]] +; CHECK-NEXT: [[TMP81:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE31]] ], [ [[TMP80]], [[PRED_LOAD_IF32]] ] +; CHECK-NEXT: [[TMP82:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER9]], i32 1 +; CHECK-NEXT: br i1 [[TMP82]], label [[PRED_LOAD_IF34:%.*]], label [[PRED_LOAD_CONTINUE35:%.*]] +; CHECK: pred.load.if34: +; CHECK-NEXT: [[TMP83:%.*]] = add i64 [[OFFSET_IDX]], 26 +; CHECK-NEXT: [[TMP84:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP83]] +; CHECK-NEXT: [[TMP85:%.*]] = load i32, i32* [[TMP84]], align 4 +; CHECK-NEXT: [[TMP86:%.*]] = insertelement <4 x i32> [[TMP81]], i32 [[TMP85]], i32 1 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE35]] +; CHECK: pred.load.continue35: +; CHECK-NEXT: [[TMP87:%.*]] = phi <4 x i32> [ [[TMP81]], 
[[PRED_LOAD_CONTINUE33]] ], [ [[TMP86]], [[PRED_LOAD_IF34]] ] +; CHECK-NEXT: [[TMP88:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER9]], i32 2 +; CHECK-NEXT: br i1 [[TMP88]], label [[PRED_LOAD_IF36:%.*]], label [[PRED_LOAD_CONTINUE37:%.*]] +; CHECK: pred.load.if36: +; CHECK-NEXT: [[TMP89:%.*]] = add i64 [[OFFSET_IDX]], 28 +; CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP89]] +; CHECK-NEXT: [[TMP91:%.*]] = load i32, i32* [[TMP90]], align 4 +; CHECK-NEXT: [[TMP92:%.*]] = insertelement <4 x i32> [[TMP87]], i32 [[TMP91]], i32 2 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE37]] +; CHECK: pred.load.continue37: +; CHECK-NEXT: [[TMP93:%.*]] = phi <4 x i32> [ [[TMP87]], [[PRED_LOAD_CONTINUE35]] ], [ [[TMP92]], [[PRED_LOAD_IF36]] ] +; CHECK-NEXT: [[TMP94:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER9]], i32 3 +; CHECK-NEXT: br i1 [[TMP94]], label [[PRED_LOAD_IF38:%.*]], label [[PRED_LOAD_CONTINUE39]] +; CHECK: pred.load.if38: +; CHECK-NEXT: [[TMP95:%.*]] = add i64 [[OFFSET_IDX]], 30 +; CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP95]] +; CHECK-NEXT: [[TMP97:%.*]] = load i32, i32* [[TMP96]], align 4 +; CHECK-NEXT: [[TMP98:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP97]], i32 3 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE39]] +; CHECK: pred.load.continue39: +; CHECK-NEXT: [[TMP99:%.*]] = phi <4 x i32> [ [[TMP93]], [[PRED_LOAD_CONTINUE37]] ], [ [[TMP98]], [[PRED_LOAD_IF38]] ] +; CHECK-NEXT: [[TMP100:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER]], +; CHECK-NEXT: [[TMP101:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER7]], +; CHECK-NEXT: [[TMP102:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER8]], +; CHECK-NEXT: [[TMP103:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER9]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> [[TMP27]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI40:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> [[TMP51]], <4 x i32> zeroinitializer +; 
CHECK-NEXT: [[PREDPHI41:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> [[TMP75]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI42:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> [[TMP99]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP104]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP105]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI40]] +; CHECK-NEXT: [[TMP106]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI41]] +; CHECK-NEXT: [[TMP107]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI42]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP152:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2048 -; CHECK-NEXT: br i1 [[TMP152]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], +; CHECK-NEXT: [[TMP108:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2048 +; CHECK-NEXT: br i1 [[TMP108]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP149]], [[TMP148]] -; CHECK-NEXT: [[BIN_RDX37:%.*]] = add <4 x i32> [[TMP150]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX38:%.*]] = add <4 x i32> [[TMP151]], [[BIN_RDX37]] -; CHECK-NEXT: [[TMP153:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX38]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP105]], [[TMP104]] +; CHECK-NEXT: [[BIN_RDX43:%.*]] = add <4 x i32> [[TMP106]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX44:%.*]] = add <4 x i32> [[TMP107]], [[BIN_RDX43]] +; CHECK-NEXT: [[TMP109:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX44]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 2048, 2048 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP153]], [[MIDDLE_BLOCK]] ] +; 
CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP109]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -1501,7 +1271,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4093 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP153]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP109]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -1541,10 +1311,14 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: 
[[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], +; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -1561,95 +1335,56 @@ ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP32:%.*]] = load i1, i1* 
[[TMP16]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 -; CHECK-NEXT: [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 -; CHECK-NEXT: [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; CHECK-NEXT: [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1 -; CHECK-NEXT: 
[[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0 -; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4 -; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) -; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8 -; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) -; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12 -; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) -; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], -; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> 
[[TMP47]], -; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]] -; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]] -; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD1]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD2]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP16]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP17]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP18]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP19]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds 
i32, i32* [[BASE]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0 +; CHECK-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP25]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> poison) +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 4 +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP27]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> poison) +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 8 +; CHECK-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP29]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> poison) +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 12 +; CHECK-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP31]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> poison) +; CHECK-NEXT: [[TMP32:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER]], +; CHECK-NEXT: [[TMP33:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER7]], +; CHECK-NEXT: [[TMP34:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER8]], +; CHECK-NEXT: [[TMP35:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER9]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI13:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> 
[[WIDE_MASKED_LOAD10]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI14:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> [[WIDE_MASKED_LOAD11]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI15:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> [[WIDE_MASKED_LOAD12]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP36]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP37]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI13]] +; CHECK-NEXT: [[TMP38]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI14]] +; CHECK-NEXT: [[TMP39]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI15]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], +; CHECK-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; CHECK-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]] -; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP37]], [[TMP36]] +; CHECK-NEXT: [[BIN_RDX16:%.*]] = add <4 x i32> [[TMP38]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX17:%.*]] = add <4 x i32> [[TMP39]], [[BIN_RDX16]] +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX17]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 
[ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -1668,7 +1403,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -1708,10 +1443,14 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ 
[[TMP39:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], +; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -1728,95 +1467,56 @@ ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 
[[TMP15]] -; CHECK-NEXT: [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 -; CHECK-NEXT: [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 -; CHECK-NEXT: [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; CHECK-NEXT: [[TMP59:%.*]] = 
load i1, i1* [[TMP31]], align 1 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0 -; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4 -; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) -; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8 -; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) -; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12 -; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) -; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], -; 
CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], -; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]] -; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]] -; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD1]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD2]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP16]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP17]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP18]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP19]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]] +; CHECK-NEXT: 
[[TMP21:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0 +; CHECK-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP25]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> poison) +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 4 +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP27]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> poison) +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 8 +; CHECK-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP29]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> poison) +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 12 +; CHECK-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP31]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> poison) +; CHECK-NEXT: [[TMP32:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER]], +; CHECK-NEXT: [[TMP33:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER7]], +; CHECK-NEXT: [[TMP34:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER8]], +; CHECK-NEXT: [[TMP35:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER9]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI13:%.*]] = select <4 x i1> 
[[WIDE_MASKED_GATHER7]], <4 x i32> [[WIDE_MASKED_LOAD10]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI14:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> [[WIDE_MASKED_LOAD11]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI15:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> [[WIDE_MASKED_LOAD12]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP36]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP37]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI13]] +; CHECK-NEXT: [[TMP38]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI14]] +; CHECK-NEXT: [[TMP39]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI15]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], +; CHECK-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; CHECK-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]] -; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP37]], [[TMP36]] +; CHECK-NEXT: [[BIN_RDX16:%.*]] = add <4 x i32> [[TMP38]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX17:%.*]] = add <4 x i32> [[TMP39]], [[BIN_RDX16]] +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX17]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; 
CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -1835,7 +1535,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -1875,10 +1575,14 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ 
zeroinitializer, [[VECTOR_PH]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], +; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -1895,95 +1599,56 @@ ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds 
i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 -; CHECK-NEXT: [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 -; CHECK-NEXT: [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; 
CHECK-NEXT: [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0 -; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4 -; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) -; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8 -; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) -; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12 -; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) -; CHECK-NEXT: [[TMP76:%.*]] 
= xor <4 x i1> [[TMP39]], -; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], -; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]] -; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]] -; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD1]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD2]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP16]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP17]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP18]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP19]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* 
[[BASE]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0 +; CHECK-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP25]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> poison) +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 4 +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP27]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> poison) +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 8 +; CHECK-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP29]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> poison) +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 12 +; CHECK-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP31]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> poison) +; CHECK-NEXT: [[TMP32:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER]], +; CHECK-NEXT: [[TMP33:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER7]], +; CHECK-NEXT: [[TMP34:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER8]], +; CHECK-NEXT: [[TMP35:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER9]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer +; CHECK-NEXT: 
[[PREDPHI13:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> [[WIDE_MASKED_LOAD10]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI14:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> [[WIDE_MASKED_LOAD11]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI15:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> [[WIDE_MASKED_LOAD12]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP36]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP37]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI13]] +; CHECK-NEXT: [[TMP38]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI14]] +; CHECK-NEXT: [[TMP39]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI15]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], +; CHECK-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; CHECK-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]] -; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP37]], [[TMP36]] +; CHECK-NEXT: [[BIN_RDX16:%.*]] = add <4 x i32> [[TMP38]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX17:%.*]] = add <4 x i32> [[TMP39]], [[BIN_RDX16]] +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX17]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, 
[[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -2002,7 +1667,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -2051,10 +1716,14 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP84:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ] +; 
CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP40:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], +; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 2 @@ -2071,95 +1740,56 @@ ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; 
CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP16]] -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> poison, i1 [[TMP33]], i32 0 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 1 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 2 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x i1> [[TMP39]], i1 [[TMP36]], i32 3 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> poison, i1 [[TMP41]], i32 0 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 1 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 2 -; CHECK-NEXT: [[TMP48:%.*]] = insertelement <4 x i1> [[TMP47]], i1 [[TMP44]], i32 3 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> poison, i1 [[TMP49]], i32 0 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 1 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 2 -; CHECK-NEXT: [[TMP56:%.*]] = insertelement <4 x i1> [[TMP55]], i1 [[TMP52]], i32 3 -; CHECK-NEXT: [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; CHECK-NEXT: 
[[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1 -; CHECK-NEXT: [[TMP60:%.*]] = load i1, i1* [[TMP32]], align 1 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> poison, i1 [[TMP57]], i32 0 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 1 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 2 -; CHECK-NEXT: [[TMP64:%.*]] = insertelement <4 x i1> [[TMP63]], i1 [[TMP60]], i32 3 -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, i32* [[TMP65]], i32 0 -; CHECK-NEXT: [[TMP70:%.*]] = bitcast i32* [[TMP69]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP70]], align 4 -; CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[TMP65]], i32 4 -; CHECK-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP72]], align 4 -; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[TMP65]], i32 8 -; CHECK-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4 -; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TMP65]], i32 12 -; CHECK-NEXT: [[TMP76:%.*]] = bitcast i32* [[TMP75]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP76]], align 4 -; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP40]], -; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP48]], -; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP56]], -; CHECK-NEXT: [[TMP80:%.*]] = xor <4 x i1> [[TMP64]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> 
[[TMP40]], <4 x i32> [[WIDE_LOAD]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP48]], <4 x i32> [[WIDE_LOAD4]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP56]], <4 x i32> [[WIDE_LOAD5]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP64]], <4 x i32> [[WIDE_LOAD6]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]] -; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]] -; CHECK-NEXT: [[TMP84]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD1]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD2]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP17]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP18]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP19]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP20]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 
[[TMP13]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = bitcast i32* [[TMP25]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP26]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 4 +; CHECK-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP27]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP28]], align 4 +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 8 +; CHECK-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP29]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP30]], align 4 +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 12 +; CHECK-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP32]], align 4 +; CHECK-NEXT: [[TMP33:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER]], +; CHECK-NEXT: [[TMP34:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER7]], +; CHECK-NEXT: [[TMP35:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER8]], +; CHECK-NEXT: [[TMP36:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER9]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> [[WIDE_LOAD]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI13:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> [[WIDE_LOAD10]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI14:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> [[WIDE_LOAD11]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI15:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> [[WIDE_LOAD12]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP37]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP38]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI13]] +; CHECK-NEXT: [[TMP39]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI14]] +; CHECK-NEXT: [[TMP40]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI15]] ; 
CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP85:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP85]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], +; CHECK-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP41]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP82]], [[TMP81]] -; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP84]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP86:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP38]], [[TMP37]] +; CHECK-NEXT: [[BIN_RDX16:%.*]] = add <4 x i32> [[TMP39]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX17:%.*]] = add <4 x i32> [[TMP40]], [[BIN_RDX16]] +; CHECK-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX17]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP42]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -2178,7 +1808,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], [[MIN]] ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], 
[[LATCH]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP42]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -2225,10 +1855,14 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], +; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -2245,95 +1879,56 @@ ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: 
[[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> 
[[TMP38]], i1 [[TMP35]], i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 -; CHECK-NEXT: [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 -; CHECK-NEXT: [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; CHECK-NEXT: [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]] -; CHECK-NEXT: 
[[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0 -; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4 -; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) -; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8 -; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) -; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12 -; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) -; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], -; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], -; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI9:%.*]] = 
select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]] -; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]] -; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD1]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD2]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP16]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP17]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP18]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP19]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0 +; CHECK-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP25]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER]], <4 x 
i32> poison) +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 4 +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP27]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> poison) +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 8 +; CHECK-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP29]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> poison) +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 12 +; CHECK-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP31]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> poison) +; CHECK-NEXT: [[TMP32:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER]], +; CHECK-NEXT: [[TMP33:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER7]], +; CHECK-NEXT: [[TMP34:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER8]], +; CHECK-NEXT: [[TMP35:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER9]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI13:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> [[WIDE_MASKED_LOAD10]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI14:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> [[WIDE_MASKED_LOAD11]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI15:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> [[WIDE_MASKED_LOAD12]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP36]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP37]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI13]] +; CHECK-NEXT: [[TMP38]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI14]] +; 
CHECK-NEXT: [[TMP39]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI15]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], +; CHECK-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; CHECK-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]] -; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP37]], [[TMP36]] +; CHECK-NEXT: [[BIN_RDX16:%.*]] = add <4 x i32> [[TMP38]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX17:%.*]] = add <4 x i32> [[TMP39]], [[BIN_RDX16]] +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX17]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -2352,7 +1947,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP25:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: 
[[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -2393,10 +1988,14 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], +; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -2413,95 +2012,56 @@ ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: 
[[TMP15:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 -; 
CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 -; CHECK-NEXT: [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 -; CHECK-NEXT: [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; CHECK-NEXT: [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* 
[[BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0 -; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4 -; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) -; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8 -; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) -; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12 -; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) -; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], -; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], -; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer 
-; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]] -; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]] -; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD1]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD2]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP16]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP17]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP18]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP19]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0 +; CHECK-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP25]], i32 4, <4 x 
i1> [[WIDE_MASKED_GATHER]], <4 x i32> poison) +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 4 +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP27]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> poison) +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 8 +; CHECK-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP29]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> poison) +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 12 +; CHECK-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP31]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> poison) +; CHECK-NEXT: [[TMP32:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER]], +; CHECK-NEXT: [[TMP33:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER7]], +; CHECK-NEXT: [[TMP34:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER8]], +; CHECK-NEXT: [[TMP35:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER9]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI13:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> [[WIDE_MASKED_LOAD10]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI14:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> [[WIDE_MASKED_LOAD11]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI15:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> [[WIDE_MASKED_LOAD12]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP36]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP37]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI13]] +; CHECK-NEXT: [[TMP38]] = add <4 x i32> 
[[VEC_PHI5]], [[PREDPHI14]] +; CHECK-NEXT: [[TMP39]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI15]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], +; CHECK-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; CHECK-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]] -; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP37]], [[TMP36]] +; CHECK-NEXT: [[BIN_RDX16:%.*]] = add <4 x i32> [[TMP38]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX17:%.*]] = add <4 x i32> [[TMP39]], [[BIN_RDX16]] +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX17]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -2520,7 +2080,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP27:![0-9]+]] ; CHECK: 
loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -2571,10 +2131,14 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], +; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -2591,95 +2155,56 @@ ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP14:%.*]] = add i64 
[[INDEX]], 14 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], 
i1 [[TMP34]], i32 2 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 -; CHECK-NEXT: [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 -; CHECK-NEXT: [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; CHECK-NEXT: [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP65:%.*]] = 
getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0 -; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4 -; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) -; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8 -; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) -; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12 -; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) -; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], -; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], -; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> 
[[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]] -; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]] -; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD1]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD2]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP16]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP17]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP18]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP19]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0 +; CHECK-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> 
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP25]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> poison) +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 4 +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP27]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> poison) +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 8 +; CHECK-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP29]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> poison) +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 12 +; CHECK-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP31]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> poison) +; CHECK-NEXT: [[TMP32:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER]], +; CHECK-NEXT: [[TMP33:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER7]], +; CHECK-NEXT: [[TMP34:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER8]], +; CHECK-NEXT: [[TMP35:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER9]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI13:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> [[WIDE_MASKED_LOAD10]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI14:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> [[WIDE_MASKED_LOAD11]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI15:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> [[WIDE_MASKED_LOAD12]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP36]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP37]] = add <4 x i32> 
[[VEC_PHI4]], [[PREDPHI13]] +; CHECK-NEXT: [[TMP38]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI14]] +; CHECK-NEXT: [[TMP39]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI15]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], +; CHECK-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; CHECK-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]] -; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP37]], [[TMP36]] +; CHECK-NEXT: [[BIN_RDX16:%.*]] = add <4 x i32> [[TMP38]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX17:%.*]] = add <4 x i32> [[TMP39]], [[BIN_RDX16]] +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX17]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[PREHEADER]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[PREHEADER]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -2698,7 +2223,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label 
[[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP29:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll b/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll --- a/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" @@ -12,9 +13,29 @@ ; } ;} -;CHECK-LABEL: @loop( -;CHECK-NOT: <4 x i32> define void @loop(i32* nocapture %a, i32* nocapture %b) nounwind uwtable { +; CHECK-LABEL: @loop( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[IDXPROM3:%.*]] = sext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IDXPROM3]] +; CHECK-NEXT: store i32 [[TMP0]], i32* [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = 
getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4 +; CHECK-NEXT: store i32 [[TMP2]], i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 512 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -42,9 +63,41 @@ ; The same loop with parallel loop metadata added to the loop branch ; and the memory instructions. -;CHECK-LABEL: @parallel_loop( -;CHECK: <4 x i32> define void @parallel_loop(i32* nocapture %a, i32* nocapture %b) nounwind uwtable { +; CHECK-LABEL: @parallel_loop( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4, !llvm.access.group !0 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !llvm.access.group !0 +; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i32> [[WIDE_LOAD1]] to <4 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A]], <4 x i64> [[TMP4]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> [[WIDE_LOAD]], <4 x i32*> [[TMP5]], i32 4, <4 x i1> ), !llvm.access.group !1 +; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* 
[[B]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4, !llvm.access.group !0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[WIDE_LOAD2]], <4 x i32>* [[TMP9]], align 4, !llvm.access.group !0 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -74,10 +127,30 @@ ; The same loop with an illegal parallel loop metadata: the memory ; accesses refer to a different loop's identifier. 
-;CHECK-LABEL: @mixed_metadata( -;CHECK-NOT: <4 x i32> define void @mixed_metadata(i32* nocapture %a, i32* nocapture %b) nounwind uwtable { +; CHECK-LABEL: @mixed_metadata( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4, !llvm.access.group !7 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4, !llvm.access.group !7 +; CHECK-NEXT: [[IDXPROM3:%.*]] = sext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IDXPROM3]] +; CHECK-NEXT: store i32 [[TMP0]], i32* [[ARRAYIDX4]], align 4, !llvm.access.group !8 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4, !llvm.access.group !7 +; CHECK-NEXT: store i32 [[TMP2]], i32* [[ARRAYIDX2]], align 4, !llvm.access.group !7 +; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 512 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll --- a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll @@ -18,73 +18,49 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; 
CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA:%.*]], i64 [[IDXPROM]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <8 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP10]], align 4, [[TBAA1:!tbaa !.*]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP0]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP1]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP2]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP3]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP4]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP5]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds 
[100 x i32], [100 x i32]* [[DATA]], i64 [[TMP6]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP7]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP11]], align 4, [[TBAA1]] -; CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP12]], align 4, [[TBAA1]] -; CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[TMP13]], align 4, [[TBAA1]] -; CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP14]], align 4, [[TBAA1]] -; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP15]], align 4, [[TBAA1]] -; CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP16]], align 4, [[TBAA1]] -; CHECK-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP17]], align 4, [[TBAA1]] -; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP18]], align 4, [[TBAA1]] -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <8 x i32> poison, i32 [[TMP19]], i32 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertelement <8 x i32> [[TMP27]], i32 [[TMP20]], i32 1 -; CHECK-NEXT: [[TMP29:%.*]] = insertelement <8 x i32> [[TMP28]], i32 [[TMP21]], i32 2 -; CHECK-NEXT: [[TMP30:%.*]] = insertelement <8 x i32> [[TMP29]], i32 [[TMP22]], i32 3 -; CHECK-NEXT: [[TMP31:%.*]] = insertelement <8 x i32> [[TMP30]], i32 [[TMP23]], i32 4 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP24]], i32 5 -; CHECK-NEXT: [[TMP33:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP25]], i32 6 -; CHECK-NEXT: [[TMP34:%.*]] = insertelement <8 x i32> [[TMP33]], i32 [[TMP26]], i32 7 -; CHECK-NEXT: [[TMP35:%.*]] = mul nsw <8 x i32> [[TMP34]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP36:%.*]] = add <8 x i32> [[VEC_PHI]], -; CHECK-NEXT: [[TMP37]] = add <8 x i32> [[TMP36]], [[TMP35]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 -; CHECK-NEXT: br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* 
[[DATA:%.*]], i64 [[IDXPROM]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA1:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], <4 x i64> [[VEC_IND]], i64 [[IDXPROM5]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP7]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA1]] +; CHECK-NEXT: [[TMP8:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_GATHER]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[VEC_PHI]], +; CHECK-NEXT: [[TMP10]] = add <4 x i32> [[TMP9]], [[TMP8]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP37]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, 96 +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, 100 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[ADD7_LCSSA:%.*]] = phi i32 [ 
[[ADD7:%.*]], [[FOR_BODY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ADD7_LCSSA]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[SUM_015:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD7]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[IDXPROM]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP40:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4, [[TBAA1]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4, !tbaa [[TBAA1]] ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[INDVARS_IV]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4, [[TBAA1]] -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP41]], [[TMP40]] +; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4, !tbaa [[TBAA1]] +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP14]], [[TMP13]] ; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SUM_015]], 4 ; CHECK-NEXT: [[ADD7]] = add i32 [[ADD]], [[MUL]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP7:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; entry: %idxprom = sext i32 %i to i64 diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll --- a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll @@ -480,8 +480,8 @@ ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi 
i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP52:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP53:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 @@ -493,62 +493,34 @@ ; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 ; CHECK-NEXT: [[TMP8:%.*]] = udiv <4 x i64> [[VEC_IND]], ; CHECK-NEXT: [[TMP9:%.*]] = udiv <4 x i64> [[STEP_ADD]], -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP8]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE:%.*]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP8]], i32 1 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP8]], i32 2 -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[TMP8]], i32 3 -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP16]] -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP9]], i32 1 -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP20]] -; CHECK-NEXT: 
[[TMP22:%.*]] = extractelement <4 x i64> [[TMP9]], i32 2 -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP22]] -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP9]], i32 3 -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = load i8, i8* [[TMP11]], align 1 -; CHECK-NEXT: [[TMP27:%.*]] = load i8, i8* [[TMP13]], align 1 -; CHECK-NEXT: [[TMP28:%.*]] = load i8, i8* [[TMP15]], align 1 -; CHECK-NEXT: [[TMP29:%.*]] = load i8, i8* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x i8> poison, i8 [[TMP26]], i32 0 -; CHECK-NEXT: [[TMP31:%.*]] = insertelement <4 x i8> [[TMP30]], i8 [[TMP27]], i32 1 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i8> [[TMP31]], i8 [[TMP28]], i32 2 -; CHECK-NEXT: [[TMP33:%.*]] = insertelement <4 x i8> [[TMP32]], i8 [[TMP29]], i32 3 -; CHECK-NEXT: [[TMP34:%.*]] = load i8, i8* [[TMP19]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i8, i8* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = load i8, i8* [[TMP23]], align 1 -; CHECK-NEXT: [[TMP37:%.*]] = load i8, i8* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i8> poison, i8 [[TMP34]], i32 0 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i8> [[TMP38]], i8 [[TMP35]], i32 1 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x i8> [[TMP39]], i8 [[TMP36]], i32 2 -; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x i8> [[TMP40]], i8 [[TMP37]], i32 3 -; CHECK-NEXT: [[TMP42:%.*]] = urem <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP43:%.*]] = urem <4 x i64> [[STEP_ADD]], -; CHECK-NEXT: [[TMP44:%.*]] = trunc <4 x i64> [[TMP42]] to <4 x i8> -; CHECK-NEXT: [[TMP45:%.*]] = trunc <4 x i64> [[TMP43]] to <4 x i8> -; CHECK-NEXT: [[TMP46:%.*]] = lshr <4 x i8> [[TMP33]], [[TMP44]] -; CHECK-NEXT: [[TMP47:%.*]] = lshr <4 x i8> [[TMP41]], [[TMP45]] -; CHECK-NEXT: [[TMP48:%.*]] = and <4 x i8> [[TMP46]], -; CHECK-NEXT: [[TMP49:%.*]] = and <4 x i8> [[TMP47]], -; 
CHECK-NEXT: [[TMP50:%.*]] = zext <4 x i8> [[TMP48]] to <4 x i32> -; CHECK-NEXT: [[TMP51:%.*]] = zext <4 x i8> [[TMP49]] to <4 x i32> -; CHECK-NEXT: [[TMP52]] = add <4 x i32> [[VEC_PHI]], [[TMP50]] -; CHECK-NEXT: [[TMP53]] = add <4 x i32> [[VEC_PHI2]], [[TMP51]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE:%.*]], <4 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], <4 x i64> [[TMP9]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> [[TMP10]], i32 1, <4 x i1> , <4 x i8> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> [[TMP11]], i32 1, <4 x i1> , <4 x i8> undef) +; CHECK-NEXT: [[TMP12:%.*]] = urem <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP13:%.*]] = urem <4 x i64> [[STEP_ADD]], +; CHECK-NEXT: [[TMP14:%.*]] = trunc <4 x i64> [[TMP12]] to <4 x i8> +; CHECK-NEXT: [[TMP15:%.*]] = trunc <4 x i64> [[TMP13]] to <4 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = lshr <4 x i8> [[WIDE_MASKED_GATHER]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = lshr <4 x i8> [[WIDE_MASKED_GATHER3]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = and <4 x i8> [[TMP16]], +; CHECK-NEXT: [[TMP19:%.*]] = and <4 x i8> [[TMP17]], +; CHECK-NEXT: [[TMP20:%.*]] = zext <4 x i8> [[TMP18]] to <4 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = zext <4 x i8> [[TMP19]] to <4 x i32> +; CHECK-NEXT: [[TMP22]] = add <4 x i32> [[VEC_PHI]], [[TMP20]] +; CHECK-NEXT: [[TMP23]] = add <4 x i32> [[VEC_PHI2]], [[TMP21]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], -; CHECK-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; CHECK-NEXT: br i1 [[TMP54]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop 
[[LOOP19:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP53]], [[TMP52]] -; CHECK-NEXT: [[TMP55:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP23]], [[TMP22]] +; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP55]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -566,7 +538,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LOOP]] ], [ [[TMP55]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LOOP]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll --- a/llvm/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -basic-aa -loop-vectorize -mcpu=corei7-avx -debug -S < %s 2>&1 | FileCheck %s ; REQUIRES: asserts @@ -16,7 +17,7 @@ ; consecutive vector of pointers store, therefore we should count it towards 
the ; widest vector count. ; -; CHECK: test_consecutive_store +; CHECK: LAA: Found a loop in test_consecutive_store ; CHECK: The Smallest and Widest types: 64 / 64 bits. define void @test_consecutive_store(%0**, %0**, %0** nocapture) nounwind ssp uwtable align 2 { %4 = load %0*, %0** %2, align 8 @@ -50,8 +51,8 @@ ; for (int i = 0; i < 1024; ++i) { ; p[i][y] = (int*) (1 + q[i]); ; } -; CHECK: test_nonconsecutive_store -; CHECK: The Smallest and Widest types: 16 / 16 bits. +; CHECK: LAA: Found a loop in test_nonconsecutive_store +; CHECK: The Smallest and Widest types: 16 / 64 bits. define void @test_nonconsecutive_store() nounwind ssp uwtable { br label %1 @@ -92,7 +93,7 @@ ;; Now we check the same rules for loads. We should take consecutive loads of ;; pointer types into account. -; CHECK: test_consecutive_ptr_load +; CHECK: LAA: Found a loop in test_consecutive_ptr_load ; CHECK: The Smallest and Widest types: 8 / 64 bits. define i8 @test_consecutive_ptr_load() nounwind readonly ssp uwtable { br label %1 @@ -116,8 +117,8 @@ } ;; However, we should not take unconsecutive loads of pointers into account. -; CHECK: test_nonconsecutive_ptr_load -; CHECK: LV: The Smallest and Widest types: 16 / 16 bits. +; CHECK: LAA: Found a loop in test_nonconsecutive_ptr_load +; CHECK: The Smallest and Widest types: 16 / 64 bits. 
define void @test_nonconsecutive_ptr_load() nounwind ssp uwtable { br label %1 diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll --- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll @@ -805,45 +805,15 @@ ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], -; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = extractelement <8 x i32> [[TMP0]], i32 0 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP0]], i32 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP3]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP0]], i32 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP5]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP0]], i32 3 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP7]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP0]], i32 4 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP9]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP0]], i32 5 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP11]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP0]], i32 6 -; 
DISABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP13]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[TMP0]], i32 7 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP15]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = load i8, i8* [[TMP2]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = load i8, i8* [[TMP4]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = load i8, i8* [[TMP6]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = load i8, i8* [[TMP8]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = load i8, i8* [[TMP10]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = load i8, i8* [[TMP12]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP23:%.*]] = load i8, i8* [[TMP14]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = load i8, i8* [[TMP16]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = insertelement <8 x i8> poison, i8 [[TMP17]], i32 0 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = insertelement <8 x i8> [[TMP25]], i8 [[TMP18]], i32 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = insertelement <8 x i8> [[TMP26]], i8 [[TMP19]], i32 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP28:%.*]] = insertelement <8 x i8> [[TMP27]], i8 [[TMP20]], i32 3 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP29:%.*]] = insertelement <8 x i8> [[TMP28]], i8 [[TMP21]], i32 4 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP30:%.*]] = insertelement <8 x i8> [[TMP29]], i8 [[TMP22]], i32 5 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP31:%.*]] = insertelement <8 x i8> [[TMP30]], i8 [[TMP23]], i32 6 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP32:%.*]] = insertelement <8 x i8> [[TMP31]], i8 [[TMP24]], i32 7 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP34:%.*]] = bitcast i8* [[TMP33]] to <8 x i8>* -; DISABLED_MASKED_STRIDED-NEXT: store <8 x i8> 
[[TMP32]], <8 x i8>* [[TMP34]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], <8 x i32> [[TMP0]] +; DISABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> [[TMP1]], i32 1, <8 x i1> , <8 x i8> undef) +; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <8 x i8>* +; DISABLED_MASKED_STRIDED-NEXT: store <8 x i8> [[WIDE_MASKED_GATHER]], <8 x i8>* [[TMP3]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], -; DISABLED_MASKED_STRIDED-NEXT: [[TMP35:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP35]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP4]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; DISABLED_MASKED_STRIDED: for.end: ; DISABLED_MASKED_STRIDED-NEXT: ret void ; @@ -852,17 +822,17 @@ ; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; ENABLED_MASKED_STRIDED: vector.body: ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>* -; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP2]], i32 1, <16 x i1> , <16 x i8> poison) -; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], 
<16 x i8> poison, <8 x i32> -; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <8 x i8>* -; ENABLED_MASKED_STRIDED-NEXT: store <8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP4]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], +; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], <8 x i32> [[TMP0]] +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> [[TMP1]], i32 1, <8 x i1> , <8 x i8> undef) +; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <8 x i8>* +; ENABLED_MASKED_STRIDED-NEXT: store <8 x i8> [[WIDE_MASKED_GATHER]], <8 x i8>* [[TMP3]], align 1 ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], +; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP4]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; ENABLED_MASKED_STRIDED: for.end: ; ENABLED_MASKED_STRIDED-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll --- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll +++ 
b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll @@ -29,70 +29,43 @@ ; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <4 x i16>* ; DISABLED_MASKED_STRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl nuw nsw <4 x i64> [[VEC_IND]], -; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[POINTS:%.*]], i64 [[TMP3]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP5]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP7]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP9]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 0 -; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP11]], i16* [[TMP4]], align 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 1 -; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP12]], i16* [[TMP6]], align 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 -; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP13]], i16* [[TMP8]], align 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 -; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP14]], i16* [[TMP10]], align 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i64 [[INDEX]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = 
bitcast i16* [[TMP15]] to <4 x i16>* -; DISABLED_MASKED_STRIDED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP16]], align 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = or <4 x i64> [[TMP2]], -; DISABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP17]], i32 0 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP18]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP17]], i32 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP20]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP17]], i32 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP22]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP17]], i32 3 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP24]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i32 0 -; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP26]], i16* [[TMP19]], align 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i32 1 -; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP27]], i16* [[TMP21]], align 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP28:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i32 2 -; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP28]], i16* [[TMP23]], align 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP29:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i32 3 -; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP29]], i16* [[TMP25]], align 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[POINTS:%.*]], <4 x i64> [[TMP2]] +; DISABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> [[WIDE_LOAD]], <4 x i16*> [[TMP3]], i32 2, <4 x i1> ) +; DISABLED_MASKED_STRIDED-NEXT: 
[[TMP4:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i64 [[INDEX]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <4 x i16>* +; DISABLED_MASKED_STRIDED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = or <4 x i64> [[TMP2]], +; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], <4 x i64> [[TMP6]] +; DISABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> [[WIDE_LOAD1]], <4 x i16*> [[TMP7]], i32 2, <4 x i1> ) ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], -; DISABLED_MASKED_STRIDED-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP30]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP8]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; DISABLED_MASKED_STRIDED: for.end: ; DISABLED_MASKED_STRIDED-NEXT: ret void ; ; ENABLED_MASKED_STRIDED-LABEL: @test1( ; ENABLED_MASKED_STRIDED-NEXT: entry: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[POINTS:%.*]], i64 -1 ; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; ENABLED_MASKED_STRIDED: vector.body: ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i64 [[INDEX]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = bitcast i16* [[TMP1]] to <4 x i16>* -; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP2]], align 2 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[INDEX]], 2 +; 
ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i64 [[INDEX]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <4 x i16>* +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl nuw nsw <4 x i64> [[VEC_IND]], +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[POINTS:%.*]], <4 x i64> [[TMP2]] +; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> [[WIDE_LOAD]], <4 x i16*> [[TMP3]], i32 2, <4 x i1> ) ; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i64 [[INDEX]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <4 x i16>* ; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], 1 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 [[TMP6]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = bitcast i16* [[TMP7]] to <16 x i16>* -; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> [[WIDE_LOAD1]], <16 x i32> -; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> [[INTERLEAVED_VEC]], <16 x i16>* [[TMP8]], i32 2, <16 x i1> ) +; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = or <4 x i64> [[TMP2]], +; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], <4 x i64> [[TMP6]] +; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> [[WIDE_LOAD1]], <4 x i16*> [[TMP7]], i32 2, <4 x i1> ) ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; ENABLED_MASKED_STRIDED-NEXT: 
[[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP9]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP8]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; ENABLED_MASKED_STRIDED: for.end: ; ENABLED_MASKED_STRIDED-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll b/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll --- a/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -O3 -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" @@ -5,12 +6,22 @@ @x = common global [1024 x x86_fp80] zeroinitializer, align 16 -;CHECK-LABEL: @example( -;CHECK-NOT: bitcast x86_fp80* {{%[^ ]+}} to <{{[2-9][0-9]*}} x x86_fp80>* -;CHECK: store -;CHECK: ret void - define void @example() nounwind ssp uwtable { +; CHECK-LABEL: @example( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1024 x x86_fp80], [1024 x x86_fp80]* @x, i64 0, <2 x i64> [[VEC_IND]] +; CHECK-NEXT: call void @llvm.masked.scatter.v2f80.v2p0f80(<2 x x86_fp80> , <2 x x86_fp80*> [[TMP0]], i32 16, <2 x 
i1> ) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP1]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; entry: br label %for.body