diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -3844,7 +3844,15 @@
   // Even in the case of (loop invariant) stride whose value is not known at
   // compile time, the address computation will not incur more than one extra
   // ADD instruction.
-  if (Ty->isVectorTy() && SE) {
+  if (Ty->isVectorTy() && SE && !ST->hasAVX2()) {
+    // FIXME: this is a hack to artificially favor interleaved loads over gather
+    // on pre-AVX2. We do this because we can't correctly model the cost of an
+    // interleaved load, we overshoot by a an order of magnitude (because we
+    // essentially just fallback to the same sequence as gather) but we can
+    // model the cost of gather, and the cost is more favorable. Yet the
+    // interleaved load is better in general in reality.
+    // TODO: AVX2 is a cut-off because we have correct intereaving costs
+    // for a select number of triples.
     if (!BaseT::isStridedAccess(Ptr))
       return NumVectorInstToHideOverhead;
     if (!BaseT::getConstantStrideStep(SE, Ptr))
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7058,6 +7058,7 @@
   unsigned AS = getLoadStoreAddressSpace(I);
   Value *Ptr = getLoadStorePointerOperand(I);
   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
+  // FIXME: PtrTy should not be a vector, it's a hack.
 
   // Figure out whether the access is strided and get the stride value
   // if it's known in compile time
diff --git a/llvm/test/Analysis/CostModel/X86/gather-i16-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/gather-i16-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/gather-i16-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/gather-i16-with-i8-index.ll
@@ -36,26 +36,26 @@
 ; AVX1: LV: Found an estimated cost of 388 for VF 32 For instruction:   %valB = load i16, i16* %inB, align 2
 ;
 ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 24 for VF 2 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 48 for VF 4 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 96 for VF 8 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 194 for VF 16 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 388 for VF 32 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 8 for VF 4 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 16 for VF 8 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 34 for VF 16 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 68 for VF 32 For instruction:   %valB = load i16, i16* %inB, align 2
 ;
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX2-FASTGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX2-FASTGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX2-FASTGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX2-FASTGATHER: LV: Found an estimated cost of 218 for VF 16 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX2-FASTGATHER: LV: Found an estimated cost of 436 for VF 32 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX2-FASTGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX2-FASTGATHER: LV: Found an estimated cost of 14 for VF 4 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX2-FASTGATHER: LV: Found an estimated cost of 28 for VF 8 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX2-FASTGATHER: LV: Found an estimated cost of 58 for VF 16 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX2-FASTGATHER: LV: Found an estimated cost of 116 for VF 32 For instruction:   %valB = load i16, i16* %inB, align 2
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX512: LV: Found an estimated cost of 26 for VF 2 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX512: LV: Found an estimated cost of 54 for VF 4 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX512: LV: Found an estimated cost of 110 for VF 8 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX512: LV: Found an estimated cost of 222 for VF 16 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX512: LV: Found an estimated cost of 444 for VF 32 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX512: LV: Found an estimated cost of 888 for VF 64 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX512: LV: Found an estimated cost of 6 for VF 2 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX512: LV: Found an estimated cost of 14 for VF 4 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX512: LV: Found an estimated cost of 30 for VF 8 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX512: LV: Found an estimated cost of 62 for VF 16 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX512: LV: Found an estimated cost of 124 for VF 32 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX512: LV: Found an estimated cost of 248 for VF 64 For instruction:   %valB = load i16, i16* %inB, align 2
 ;
 ; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   %valB = load i16, i16* %inB, align 4
 define void @test() {
diff --git a/llvm/test/Analysis/CostModel/X86/gather-i32-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/gather-i32-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/gather-i32-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/gather-i32-with-i8-index.ll
@@ -36,11 +36,11 @@
 ; AVX1: LV: Found an estimated cost of 392 for VF 32 For instruction:   %valB = load i32, i32* %inB, align 4
 ;
 ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i32, i32* %inB, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 24 for VF 2 For instruction:   %valB = load i32, i32* %inB, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 48 for VF 4 For instruction:   %valB = load i32, i32* %inB, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 98 for VF 8 For instruction:   %valB = load i32, i32* %inB, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 196 for VF 16 For instruction:   %valB = load i32, i32* %inB, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 392 for VF 32 For instruction:   %valB = load i32, i32* %inB, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction:   %valB = load i32, i32* %inB, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 8 for VF 4 For instruction:   %valB = load i32, i32* %inB, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 18 for VF 8 For instruction:   %valB = load i32, i32* %inB, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 36 for VF 16 For instruction:   %valB = load i32, i32* %inB, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 72 for VF 32 For instruction:   %valB = load i32, i32* %inB, align 4
 ;
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i32, i32* %inB, align 4
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction:   %valB = load i32, i32* %inB, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/gather-i64-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/gather-i64-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/gather-i64-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/gather-i64-with-i8-index.ll
@@ -36,11 +36,11 @@
 ; AVX1: LV: Found an estimated cost of 400 for VF 32 For instruction:   %valB = load i64, i64* %inB, align 8
 ;
 ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i64, i64* %inB, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 24 for VF 2 For instruction:   %valB = load i64, i64* %inB, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 50 for VF 4 For instruction:   %valB = load i64, i64* %inB, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 100 for VF 8 For instruction:   %valB = load i64, i64* %inB, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 200 for VF 16 For instruction:   %valB = load i64, i64* %inB, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 400 for VF 32 For instruction:   %valB = load i64, i64* %inB, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction:   %valB = load i64, i64* %inB, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 10 for VF 4 For instruction:   %valB = load i64, i64* %inB, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 20 for VF 8 For instruction:   %valB = load i64, i64* %inB, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 40 for VF 16 For instruction:   %valB = load i64, i64* %inB, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 80 for VF 32 For instruction:   %valB = load i64, i64* %inB, align 8
 ;
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i64, i64* %inB, align 8
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction:   %valB = load i64, i64* %inB, align 8
diff --git a/llvm/test/Analysis/CostModel/X86/gather-i8-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/gather-i8-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/gather-i8-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/gather-i8-with-i8-index.ll
@@ -36,26 +36,26 @@
 ; AVX1: LV: Found an estimated cost of 386 for VF 32 For instruction:   %valB = load i8, i8* %inB, align 1
 ;
 ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 24 for VF 2 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 48 for VF 4 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 96 for VF 8 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 192 for VF 16 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 386 for VF 32 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 8 for VF 4 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 16 for VF 8 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 32 for VF 16 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 66 for VF 32 For instruction:   %valB = load i8, i8* %inB, align 1
 ;
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX2-FASTGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX2-FASTGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX2-FASTGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX2-FASTGATHER: LV: Found an estimated cost of 216 for VF 16 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX2-FASTGATHER: LV: Found an estimated cost of 434 for VF 32 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX2-FASTGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX2-FASTGATHER: LV: Found an estimated cost of 14 for VF 4 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX2-FASTGATHER: LV: Found an estimated cost of 28 for VF 8 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX2-FASTGATHER: LV: Found an estimated cost of 56 for VF 16 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX2-FASTGATHER: LV: Found an estimated cost of 114 for VF 32 For instruction:   %valB = load i8, i8* %inB, align 1
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX512: LV: Found an estimated cost of 26 for VF 2 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX512: LV: Found an estimated cost of 54 for VF 4 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX512: LV: Found an estimated cost of 110 for VF 8 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX512: LV: Found an estimated cost of 220 for VF 16 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX512: LV: Found an estimated cost of 442 for VF 32 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX512: LV: Found an estimated cost of 884 for VF 64 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX512: LV: Found an estimated cost of 6 for VF 2 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX512: LV: Found an estimated cost of 14 for VF 4 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX512: LV: Found an estimated cost of 30 for VF 8 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX512: LV: Found an estimated cost of 60 for VF 16 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX512: LV: Found an estimated cost of 122 for VF 32 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX512: LV: Found an estimated cost of 244 for VF 64 For instruction:   %valB = load i8, i8* %inB, align 1
 ;
 ; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   %valB = load i8, i8* %inB, align 4
 define void @test() {
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
@@ -26,11 +26,11 @@
 ; AVX1: LV: Found an estimated cost of 430 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 26 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 50 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 99 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 215 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 430 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX2: LV: Found an estimated cost of 20 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX2: LV: Found an estimated cost of 40 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX2: LV: Found an estimated cost of 80 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX2: LV: Found an estimated cost of 170 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX2: LV: Found an estimated cost of 340 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX512: LV: Found an estimated cost of 11 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll
@@ -26,7 +26,7 @@
 ; AVX1: LV: Found an estimated cost of 100 for VF 32 For instruction:   %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 5 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 10 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 20 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll
@@ -26,11 +26,11 @@
 ; AVX1: LV: Found an estimated cost of 104 for VF 32 For instruction:   %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 84 for VF 32 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 36 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 72 for VF 32 For instruction:   %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 1 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/masked-scatter-i32-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/masked-scatter-i32-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/masked-scatter-i32-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-scatter-i32-with-i8-index.ll
@@ -50,8 +50,8 @@
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 40 for VF 32 For instruction:   store i32 %valB, i32* %out, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %valB, i32* %out, align 4
-; AVX512: LV: Found an estimated cost of 10 for VF 2 For instruction:   store i32 %valB, i32* %out, align 4
-; AVX512: LV: Found an estimated cost of 22 for VF 4 For instruction:   store i32 %valB, i32* %out, align 4
+; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction:   store i32 %valB, i32* %out, align 4
+; AVX512: LV: Found an estimated cost of 11 for VF 4 For instruction:   store i32 %valB, i32* %out, align 4
 ; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction:   store i32 %valB, i32* %out, align 4
 ; AVX512: LV: Found an estimated cost of 18 for VF 16 For instruction:   store i32 %valB, i32* %out, align 4
 ; AVX512: LV: Found an estimated cost of 36 for VF 32 For instruction:   store i32 %valB, i32* %out, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/scatter-i16-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/scatter-i16-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/scatter-i16-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/scatter-i16-with-i8-index.ll
@@ -36,26 +36,26 @@
 ; AVX1: LV: Found an estimated cost of 448 for VF 32 For instruction:   store i16 %valB, i16* %out, align 2
 ;
 ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 224 for VF 16 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 14 for VF 4 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 28 for VF 8 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 64 for VF 16 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 128 for VF 32 For instruction:   store i16 %valB, i16* %out, align 2
 ;
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX2-FASTGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX2-FASTGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX2-FASTGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX2-FASTGATHER: LV: Found an estimated cost of 224 for VF 16 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX2-FASTGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX2-FASTGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX2-FASTGATHER: LV: Found an estimated cost of 14 for VF 4 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX2-FASTGATHER: LV: Found an estimated cost of 28 for VF 8 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX2-FASTGATHER: LV: Found an estimated cost of 64 for VF 16 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX2-FASTGATHER: LV: Found an estimated cost of 128 for VF 32 For instruction:   store i16 %valB, i16* %out, align 2
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX512: LV: Found an estimated cost of 26 for VF 2 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX512: LV: Found an estimated cost of 54 for VF 4 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX512: LV: Found an estimated cost of 110 for VF 8 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX512: LV: Found an estimated cost of 228 for VF 16 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX512: LV: Found an estimated cost of 464 for VF 32 For instruction:   store i16 %valB, i16* %out, align 2
-; AVX512: LV: Found an estimated cost of 928 for VF 64 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX512: LV: Found an estimated cost of 6 for VF 2 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX512: LV: Found an estimated cost of 14 for VF 4 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX512: LV: Found an estimated cost of 30 for VF 8 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX512: LV: Found an estimated cost of 68 for VF 16 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX512: LV: Found an estimated cost of 144 for VF 32 For instruction:   store i16 %valB, i16* %out, align 2
+; AVX512: LV: Found an estimated cost of 288 for VF 64 For instruction:   store i16 %valB, i16* %out, align 2
 ;
 ; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   store i16 %valB, i16* %out
 define void @test() {
diff --git a/llvm/test/Analysis/CostModel/X86/scatter-i32-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/scatter-i32-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/scatter-i32-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/scatter-i32-with-i8-index.ll
@@ -36,18 +36,18 @@
 ; AVX1: LV: Found an estimated cost of 448 for VF 32 For instruction:   store i32 %valB, i32* %out, align 4
 ;
 ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %valB, i32* %out, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction:   store i32 %valB, i32* %out, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction:   store i32 %valB, i32* %out, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 112 for VF 8 For instruction:   store i32 %valB, i32* %out, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 224 for VF 16 For instruction:   store i32 %valB, i32* %out, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction:   store i32 %valB, i32* %out, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction:   store i32 %valB, i32* %out, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 14 for VF 4 For instruction:   store i32 %valB, i32* %out, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 32 for VF 8 For instruction:   store i32 %valB, i32* %out, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 64 for VF 16 For instruction:   store i32 %valB, i32* %out, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 128 for VF 32 For instruction:   store i32 %valB, i32* %out, align 4
 ;
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %valB, i32* %out, align 4
-; AVX2-FASTGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction:   store i32 %valB, i32* %out, align 4
-; AVX2-FASTGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction:   store i32 %valB, i32* %out, align 4
-; AVX2-FASTGATHER: LV: Found an estimated cost of 112 for VF 8 For instruction:   store i32 %valB, i32* %out, align 4
-; AVX2-FASTGATHER: LV: Found an estimated cost of 224 for VF 16 For instruction:   store i32 %valB, i32* %out, align 4
-; AVX2-FASTGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction:   store i32 %valB, i32* %out, align 4
+; AVX2-FASTGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction:   store i32 %valB, i32* %out, align 4
+; AVX2-FASTGATHER: LV: Found an estimated cost of 14 for VF 4 For instruction:   store i32 %valB, i32* %out, align 4
+; AVX2-FASTGATHER: LV: Found an estimated cost of 32 for VF 8 For instruction:   store i32 %valB, i32* %out, align 4
+; AVX2-FASTGATHER: LV: Found an estimated cost of 64 for VF 16 For instruction:   store i32 %valB, i32* %out, align 4
+; AVX2-FASTGATHER: LV: Found an estimated cost of 128 for VF 32 For instruction:   store i32 %valB, i32* %out, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %valB, i32* %out
 ; AVX512: LV: Found an estimated cost of 6 for VF 2 For instruction:   store i32 %valB, i32* %out
diff --git a/llvm/test/Analysis/CostModel/X86/scatter-i64-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/scatter-i64-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/scatter-i64-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/scatter-i64-with-i8-index.ll
@@ -36,18 +36,18 @@
 ; AVX1: LV: Found an estimated cost of 448 for VF 32 For instruction:   store i64 %valB, i64* %out, align 8
 ;
 ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %valB, i64* %out, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction:   store i64 %valB, i64* %out, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 56 for VF 4 For instruction:   store i64 %valB, i64* %out, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 112 for VF 8 For instruction:   store i64 %valB, i64* %out, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 224 for VF 16 For instruction:   store i64 %valB, i64* %out, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction:   store i64 %valB, i64* %out, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction:   store i64 %valB, i64* %out, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 16 for VF 4 For instruction:   store i64 %valB, i64* %out, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 32 for VF 8 For instruction:   store i64 %valB, i64* %out, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 64 for VF 16 For instruction:   store i64 %valB, i64* %out, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 128 for VF 32 For instruction:   store i64 %valB, i64* %out, align 8
 ;
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %valB, i64* %out, align 8
-; AVX2-FASTGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction:   store i64 %valB, i64* %out, align 8
-; AVX2-FASTGATHER: LV: Found an estimated cost of 56 for VF 4 For instruction:   store i64 %valB, i64* %out, align 8
-; AVX2-FASTGATHER: LV: Found an estimated cost of 112 for VF 8 For instruction:   store i64 %valB, i64* %out, align 8
-; AVX2-FASTGATHER: LV: Found an estimated cost of 224 for VF 16 For instruction:   store i64 %valB, i64* %out, align 8
-; AVX2-FASTGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction:   store i64 %valB, i64* %out, align 8
+; AVX2-FASTGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction:   store i64 %valB, i64* %out, align 8
+; AVX2-FASTGATHER: LV: Found an estimated cost of 16 for VF 4 For instruction:   store i64 %valB, i64* %out, align 8
+; AVX2-FASTGATHER: LV: Found an estimated cost of 32 for VF 8 For instruction:   store i64 %valB, i64* %out, align 8
+; AVX2-FASTGATHER: LV: Found an estimated cost of 64 for VF 16 For instruction:   store i64 %valB, i64* %out, align 8
+; AVX2-FASTGATHER: LV: Found an estimated cost of 128 for VF 32 For instruction:   store i64 %valB, i64* %out, align 8
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %valB, i64* %out, align 8
 ; AVX512: LV: Found an estimated cost of 6 for VF 2 For instruction:   store i64 %valB, i64* %out, align 8
diff --git a/llvm/test/Analysis/CostModel/X86/scatter-i8-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/scatter-i8-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/scatter-i8-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/scatter-i8-with-i8-index.ll
@@ -36,26 +36,26 @@
 ; AVX1: LV: Found an estimated cost of 448 for VF 32 For instruction:   store i8 %valB, i8* %out, align 1
 ;
 ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 216 for VF 16 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 14 for VF 4 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 28 for VF 8 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 56 for VF 16 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 128 for VF 32 For instruction:   store i8 %valB, i8* %out, align 1
 ;
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX2-FASTGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX2-FASTGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX2-FASTGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX2-FASTGATHER: LV: Found an estimated cost of 216 for VF 16 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX2-FASTGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX2-FASTGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX2-FASTGATHER: LV: Found an estimated cost of 14 for VF 4 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX2-FASTGATHER: LV: Found an estimated cost of 28 for VF 8 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX2-FASTGATHER: LV: Found an estimated cost of 56 for VF 16 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX2-FASTGATHER: LV: Found an estimated cost of 128 for VF 32 For instruction:   store i8 %valB, i8* %out, align 1
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX512: LV: Found an estimated cost of 26 for VF 2 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX512: LV: Found an estimated cost of 54 for VF 4 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX512: LV: Found an estimated cost of 110 for VF 8 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX512: LV: Found an estimated cost of 220 for VF 16 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX512: LV: Found an estimated cost of 456 for VF 32 For instruction:   store i8 %valB, i8* %out, align 1
-; AVX512: LV: Found an estimated cost of 928 for VF 64 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX512: LV: Found an estimated cost of 6 for VF 2 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX512: LV: Found an estimated cost of 14 for VF 4 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX512: LV: Found an estimated cost of 30 for VF 8 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX512: LV: Found an estimated cost of 60 for VF 16 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX512: LV: Found an estimated cost of 136 for VF 32 For instruction:   store i8 %valB, i8* %out, align 1
+; AVX512: LV: Found an estimated cost of 288 for VF 64 For instruction:   store i8 %valB, i8* %out, align 1
 ;
 ; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   store i8 %valB, i8* %out
 define void @test() {
diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
@@ -1640,44 +1640,53 @@
 ; FVW2-NEXT:    [[IND_END14:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP12]]
 ; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FVW2:       vector.body:
-; FVW2-NEXT:    [[POINTER_PHI:%.*]] = phi float* [ [[DEST]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
 ; FVW2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; FVW2-NEXT:    [[NEXT_GEP:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX]]
-; FVW2-NEXT:    [[TMP13:%.*]] = getelementptr float, float* [[POINTER_PHI]], <2 x i64> <i64 0, i64 16>
-; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr float, float* [[POINTER_PHI]], <2 x i64> <i64 32, i64 48>
-; FVW2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP]], i64 [[IDXPROM]]
-; FVW2-NEXT:    [[TMP16:%.*]] = bitcast float* [[TMP15]] to <2 x float>*
-; FVW2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP16]], align 4, !alias.scope !7
-; FVW2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP15]], i64 2
-; FVW2-NEXT:    [[TMP18:%.*]] = bitcast float* [[TMP17]] to <2 x float>*
-; FVW2-NEXT:    [[WIDE_LOAD16:%.*]] = load <2 x float>, <2 x float>* [[TMP18]], align 4, !alias.scope !7
-; FVW2-NEXT:    call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD]], <2 x float*> [[TMP13]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT:    call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD16]], <2 x float*> [[TMP14]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT:    [[TMP19:%.*]] = bitcast float* [[NEXT_GEP]] to <2 x float>*
-; FVW2-NEXT:    [[WIDE_LOAD17:%.*]] = load <2 x float>, <2 x float>* [[TMP19]], align 4, !alias.scope !14
-; FVW2-NEXT:    [[TMP20:%.*]] = getelementptr float, float* [[NEXT_GEP]], i64 2
+; FVW2-NEXT:    [[TMP13:%.*]] = shl i64 [[INDEX]], 4
+; FVW2-NEXT:    [[NEXT_GEP16:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP13]]
+; FVW2-NEXT:    [[TMP14:%.*]] = shl i64 [[INDEX]], 4
+; FVW2-NEXT:    [[TMP15:%.*]] = or i64 [[TMP14]], 16
+; FVW2-NEXT:    [[NEXT_GEP17:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP15]]
+; FVW2-NEXT:    [[TMP16:%.*]] = shl i64 [[INDEX]], 4
+; FVW2-NEXT:    [[TMP17:%.*]] = or i64 [[TMP16]], 32
+; FVW2-NEXT:    [[NEXT_GEP18:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP17]]
+; FVW2-NEXT:    [[TMP18:%.*]] = shl i64 [[INDEX]], 4
+; FVW2-NEXT:    [[TMP19:%.*]] = or i64 [[TMP18]], 48
+; FVW2-NEXT:    [[NEXT_GEP19:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP19]]
+; FVW2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP]], i64 [[IDXPROM]]
 ; FVW2-NEXT:    [[TMP21:%.*]] = bitcast float* [[TMP20]] to <2 x float>*
-; FVW2-NEXT:    [[WIDE_LOAD18:%.*]] = load <2 x float>, <2 x float>* [[TMP21]], align 4, !alias.scope !14
-; FVW2-NEXT:    [[TMP22:%.*]] = extractelement <2 x float*> [[TMP13]], i32 0
-; FVW2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, float* [[TMP22]], i64 1
-; FVW2-NEXT:    [[TMP24:%.*]] = extractelement <2 x float*> [[TMP13]], i32 1
-; FVW2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds float, float* [[TMP24]], i64 1
-; FVW2-NEXT:    [[TMP26:%.*]] = extractelement <2 x float*> [[TMP14]], i32 0
-; FVW2-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP26]], i64 1
-; FVW2-NEXT:    [[TMP28:%.*]] = extractelement <2 x float*> [[TMP14]], i32 1
-; FVW2-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, float* [[TMP28]], i64 1
-; FVW2-NEXT:    [[TMP30:%.*]] = extractelement <2 x float> [[WIDE_LOAD17]], i32 0
-; FVW2-NEXT:    store float [[TMP30]], float* [[TMP23]], align 4, !alias.scope !10, !noalias !12
-; FVW2-NEXT:    [[TMP31:%.*]] = extractelement <2 x float> [[WIDE_LOAD17]], i32 1
-; FVW2-NEXT:    store float [[TMP31]], float* [[TMP25]], align 4, !alias.scope !10, !noalias !12
-; FVW2-NEXT:    [[TMP32:%.*]] = extractelement <2 x float> [[WIDE_LOAD18]], i32 0
-; FVW2-NEXT:    store float [[TMP32]], float* [[TMP27]], align 4, !alias.scope !10, !noalias !12
-; FVW2-NEXT:    [[TMP33:%.*]] = extractelement <2 x float> [[WIDE_LOAD18]], i32 1
-; FVW2-NEXT:    store float [[TMP33]], float* [[TMP29]], align 4, !alias.scope !10, !noalias !12
+; FVW2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP21]], align 4, !alias.scope !7
+; FVW2-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP20]], i64 2
+; FVW2-NEXT:    [[TMP23:%.*]] = bitcast float* [[TMP22]] to <2 x float>*
+; FVW2-NEXT:    [[WIDE_LOAD20:%.*]] = load <2 x float>, <2 x float>* [[TMP23]], align 4, !alias.scope !7
+; FVW2-NEXT:    [[TMP24:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0
+; FVW2-NEXT:    store float [[TMP24]], float* [[NEXT_GEP16]], align 4, !alias.scope !10, !noalias !12
+; FVW2-NEXT:    [[TMP25:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1
+; FVW2-NEXT:    store float [[TMP25]], float* [[NEXT_GEP17]], align 4, !alias.scope !10, !noalias !12
+; FVW2-NEXT:    [[TMP26:%.*]] = extractelement <2 x float> [[WIDE_LOAD20]], i32 0
+; FVW2-NEXT:    store float [[TMP26]], float* [[NEXT_GEP18]], align 4, !alias.scope !10, !noalias !12
+; FVW2-NEXT:    [[TMP27:%.*]] = extractelement <2 x float> [[WIDE_LOAD20]], i32 1
+; FVW2-NEXT:    store float [[TMP27]], float* [[NEXT_GEP19]], align 4, !alias.scope !10, !noalias !12
+; FVW2-NEXT:    [[TMP28:%.*]] = bitcast float* [[NEXT_GEP]] to <2 x float>*
+; FVW2-NEXT:    [[WIDE_LOAD21:%.*]] = load <2 x float>, <2 x float>* [[TMP28]], align 4, !alias.scope !14
+; FVW2-NEXT:    [[TMP29:%.*]] = getelementptr float, float* [[NEXT_GEP]], i64 2
+; FVW2-NEXT:    [[TMP30:%.*]] = bitcast float* [[TMP29]] to <2 x float>*
+; FVW2-NEXT:    [[WIDE_LOAD22:%.*]] = load <2 x float>, <2 x float>* [[TMP30]], align 4, !alias.scope !14
+; FVW2-NEXT:    [[TMP31:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP16]], i64 1
+; FVW2-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP17]], i64 1
+; FVW2-NEXT:    [[TMP33:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP18]], i64 1
+; FVW2-NEXT:    [[TMP34:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP19]], i64 1
+; FVW2-NEXT:    [[TMP35:%.*]] = extractelement <2 x float> [[WIDE_LOAD21]], i32 0
+; FVW2-NEXT:    store float [[TMP35]], float* [[TMP31]], align 4, !alias.scope !10, !noalias !12
+; FVW2-NEXT:    [[TMP36:%.*]] = extractelement <2 x float> [[WIDE_LOAD21]], i32 1
+; FVW2-NEXT:    store float [[TMP36]], float* [[TMP32]], align 4, !alias.scope !10, !noalias !12
+; FVW2-NEXT:    [[TMP37:%.*]] = extractelement <2 x float> [[WIDE_LOAD22]], i32 0
+; FVW2-NEXT:    store float [[TMP37]], float* [[TMP33]], align 4, !alias.scope !10, !noalias !12
+; FVW2-NEXT:    [[TMP38:%.*]] = extractelement <2 x float> [[WIDE_LOAD22]], i32 1
+; FVW2-NEXT:    store float [[TMP38]], float* [[TMP34]], align 4, !alias.scope !10, !noalias !12
 ; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; FVW2-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; FVW2-NEXT:    [[PTR_IND]] = getelementptr float, float* [[POINTER_PHI]], i64 64
-; FVW2-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; FVW2-NEXT:    [[TMP39:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; FVW2-NEXT:    br i1 [[TMP39]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; FVW2:       middle.block:
 ; FVW2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
 ; FVW2-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER]]
@@ -1689,11 +1698,11 @@
 ; FVW2-NEXT:    [[PTR_ADDR_012:%.*]] = phi float* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[PTR_ADDR_012_PH]], [[FOR_BODY_PREHEADER]] ]
 ; FVW2-NEXT:    [[DEST_ADDR_011:%.*]] = phi float* [ [[ADD_PTR6:%.*]], [[FOR_BODY]] ], [ [[DEST_ADDR_011_PH]], [[FOR_BODY_PREHEADER]] ]
 ; FVW2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[PTR_ADDR_012]], i64 [[IDXPROM]]
-; FVW2-NEXT:    [[TMP35:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; FVW2-NEXT:    store float [[TMP35]], float* [[DEST_ADDR_011]], align 4
-; FVW2-NEXT:    [[TMP36:%.*]] = load float, float* [[PTR_ADDR_012]], align 4
+; FVW2-NEXT:    [[TMP40:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; FVW2-NEXT:    store float [[TMP40]], float* [[DEST_ADDR_011]], align 4
+; FVW2-NEXT:    [[TMP41:%.*]] = load float, float* [[PTR_ADDR_012]], align 4
 ; FVW2-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DEST_ADDR_011]], i64 1
-; FVW2-NEXT:    store float [[TMP36]], float* [[ARRAYIDX5]], align 4
+; FVW2-NEXT:    store float [[TMP41]], float* [[ARRAYIDX5]], align 4
 ; FVW2-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds float, float* [[PTR_ADDR_012]], i64 1
 ; FVW2-NEXT:    [[ADD_PTR6]] = getelementptr inbounds float, float* [[DEST_ADDR_011]], i64 16
 ; FVW2-NEXT:    [[CMP_NOT:%.*]] = icmp eq float* [[INCDEC_PTR]], [[ADD_PTR]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
@@ -1429,29 +1429,169 @@
 ; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; ENABLED_MASKED_STRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; ENABLED_MASKED_STRIDED:       vector.body:
-; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE31:%.*]] ]
+; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE31]] ]
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP2]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>*
 ; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
-; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison)
+; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP4]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison)
 ; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
 ; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = or i32 [[TMP1]], 1
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC1]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i8> [[STRIDED_VEC1]], <8 x i8> [[STRIDED_VEC]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = sub <8 x i8> zeroinitializer, [[TMP6]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 -1
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[TMP8]], i32 [[TMP4]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to <16 x i8>*
-; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-; ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[TMP10]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = or <8 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC1]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i8> [[STRIDED_VEC1]], <8 x i8> [[STRIDED_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[TMP9]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP11:%.*]] = extractelement <8 x i8> [[TMP7]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP11]], i8* [[TMP10]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP12:%.*]] = extractelement <8 x i1> [[TMP0]], i32 1
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP12]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if2:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP13]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP15:%.*]] = extractelement <8 x i8> [[TMP7]], i32 1
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP15]], i8* [[TMP14]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE3]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue3:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP16:%.*]] = extractelement <8 x i1> [[TMP0]], i32 2
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP16]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if4:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP17]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP19:%.*]] = extractelement <8 x i8> [[TMP7]], i32 2
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP19]], i8* [[TMP18]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE5]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue5:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP20:%.*]] = extractelement <8 x i1> [[TMP0]], i32 3
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP20]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if6:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP21:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP21]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP23:%.*]] = extractelement <8 x i8> [[TMP7]], i32 3
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP23]], i8* [[TMP22]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE7]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue7:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP24:%.*]] = extractelement <8 x i1> [[TMP0]], i32 4
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP24]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if8:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP25:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP25]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP27:%.*]] = extractelement <8 x i8> [[TMP7]], i32 4
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP27]], i8* [[TMP26]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE9]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue9:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP28:%.*]] = extractelement <8 x i1> [[TMP0]], i32 5
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP28]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if10:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP29:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP29]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP31:%.*]] = extractelement <8 x i8> [[TMP7]], i32 5
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP31]], i8* [[TMP30]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE11]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue11:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP32:%.*]] = extractelement <8 x i1> [[TMP0]], i32 6
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP32]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if12:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP33:%.*]] = extractelement <8 x i32> [[TMP1]], i32 6
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP33]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP35:%.*]] = extractelement <8 x i8> [[TMP7]], i32 6
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP35]], i8* [[TMP34]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE13]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue13:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP36:%.*]] = extractelement <8 x i1> [[TMP0]], i32 7
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP36]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if14:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP37:%.*]] = extractelement <8 x i32> [[TMP1]], i32 7
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP37]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP39:%.*]] = extractelement <8 x i8> [[TMP7]], i32 7
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP39]], i8* [[TMP38]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE15]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue15:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP40:%.*]] = sub <8 x i8> zeroinitializer, [[TMP7]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP41:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP41]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if16:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP42:%.*]] = extractelement <8 x i32> [[TMP5]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP42]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP44:%.*]] = extractelement <8 x i8> [[TMP40]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP44]], i8* [[TMP43]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE17]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue17:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP45:%.*]] = extractelement <8 x i1> [[TMP0]], i32 1
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP45]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if18:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP46:%.*]] = extractelement <8 x i32> [[TMP5]], i32 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP47:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP46]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP48:%.*]] = extractelement <8 x i8> [[TMP40]], i32 1
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP48]], i8* [[TMP47]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE19]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue19:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP49:%.*]] = extractelement <8 x i1> [[TMP0]], i32 2
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP49]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if20:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP50:%.*]] = extractelement <8 x i32> [[TMP5]], i32 2
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP50]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP52:%.*]] = extractelement <8 x i8> [[TMP40]], i32 2
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP52]], i8* [[TMP51]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE21]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue21:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP53:%.*]] = extractelement <8 x i1> [[TMP0]], i32 3
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP53]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if22:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP54:%.*]] = extractelement <8 x i32> [[TMP5]], i32 3
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP55:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP54]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP56:%.*]] = extractelement <8 x i8> [[TMP40]], i32 3
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP56]], i8* [[TMP55]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE23]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue23:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP57:%.*]] = extractelement <8 x i1> [[TMP0]], i32 4
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP57]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if24:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP58:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP58]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP60:%.*]] = extractelement <8 x i8> [[TMP40]], i32 4
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP60]], i8* [[TMP59]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE25]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue25:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP61:%.*]] = extractelement <8 x i1> [[TMP0]], i32 5
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP61]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if26:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP62:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP62]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP64:%.*]] = extractelement <8 x i8> [[TMP40]], i32 5
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP64]], i8* [[TMP63]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE27]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue27:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP65:%.*]] = extractelement <8 x i1> [[TMP0]], i32 6
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP65]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if28:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP66:%.*]] = extractelement <8 x i32> [[TMP5]], i32 6
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP67:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP66]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP68:%.*]] = extractelement <8 x i8> [[TMP40]], i32 6
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP68]], i8* [[TMP67]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE29]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue29:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP69:%.*]] = extractelement <8 x i1> [[TMP0]], i32 7
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP69]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31]]
+; ENABLED_MASKED_STRIDED:       pred.store.if30:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP70:%.*]] = extractelement <8 x i32> [[TMP5]], i32 7
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP70]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP72:%.*]] = extractelement <8 x i8> [[TMP40]], i32 7
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP72]], i8* [[TMP71]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE31]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue31:
 ; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
-; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP11]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP73:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP73]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; ENABLED_MASKED_STRIDED:       for.end:
 ; ENABLED_MASKED_STRIDED-NEXT:    ret void
 ;
@@ -2615,31 +2755,171 @@
 ; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; ENABLED_MASKED_STRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; ENABLED_MASKED_STRIDED:       vector.body:
-; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE33:%.*]] ]
+; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE33]] ]
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = icmp sgt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = shl nuw nsw i32 [[INDEX]], 1
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP2]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = select <8 x i1> [[TMP1]], <8 x i1> [[TMP0]], <8 x i1> zeroinitializer
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>*
-; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
-; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP5]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison)
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[TMP2]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP3]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = select <8 x i1> [[TMP1]], <8 x i1> [[TMP0]], <8 x i1> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP4]] to <16 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP6]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison)
 ; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
 ; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = or i32 [[TMP2]], 1
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC3]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i8> [[STRIDED_VEC3]], <8 x i8> [[STRIDED_VEC]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP9:%.*]] = sub <8 x i8> zeroinitializer, [[TMP8]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 -1
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TMP10]], i32 [[TMP6]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP11]] to <16 x i8>*
-; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP8]], <8 x i8> [[TMP9]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-; ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[TMP12]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = or <8 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC3]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i8> [[STRIDED_VEC3]], <8 x i8> [[STRIDED_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP10:%.*]] = extractelement <8 x i1> [[TMP5]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP2]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[TMP11]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP13:%.*]] = extractelement <8 x i8> [[TMP9]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP13]], i8* [[TMP12]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP14:%.*]] = extractelement <8 x i1> [[TMP5]], i32 1
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if4:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP15:%.*]] = extractelement <8 x i32> [[TMP2]], i32 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP15]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP17:%.*]] = extractelement <8 x i8> [[TMP9]], i32 1
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP17]], i8* [[TMP16]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE5]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue5:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP18:%.*]] = extractelement <8 x i1> [[TMP5]], i32 2
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP18]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if6:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP19:%.*]] = extractelement <8 x i32> [[TMP2]], i32 2
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP19]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP21:%.*]] = extractelement <8 x i8> [[TMP9]], i32 2
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP21]], i8* [[TMP20]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE7]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue7:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP22:%.*]] = extractelement <8 x i1> [[TMP5]], i32 3
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP22]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if8:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP23:%.*]] = extractelement <8 x i32> [[TMP2]], i32 3
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP23]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP25:%.*]] = extractelement <8 x i8> [[TMP9]], i32 3
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP25]], i8* [[TMP24]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE9]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue9:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP26:%.*]] = extractelement <8 x i1> [[TMP5]], i32 4
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP26]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if10:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP27:%.*]] = extractelement <8 x i32> [[TMP2]], i32 4
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP27]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP29:%.*]] = extractelement <8 x i8> [[TMP9]], i32 4
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP29]], i8* [[TMP28]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE11]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue11:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP30:%.*]] = extractelement <8 x i1> [[TMP5]], i32 5
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP30]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if12:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP31:%.*]] = extractelement <8 x i32> [[TMP2]], i32 5
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP31]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP33:%.*]] = extractelement <8 x i8> [[TMP9]], i32 5
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP33]], i8* [[TMP32]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE13]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue13:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP34:%.*]] = extractelement <8 x i1> [[TMP5]], i32 6
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP34]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if14:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP35:%.*]] = extractelement <8 x i32> [[TMP2]], i32 6
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP35]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP37:%.*]] = extractelement <8 x i8> [[TMP9]], i32 6
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP37]], i8* [[TMP36]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE15]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue15:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP38:%.*]] = extractelement <8 x i1> [[TMP5]], i32 7
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP38]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if16:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP39:%.*]] = extractelement <8 x i32> [[TMP2]], i32 7
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP39]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP41:%.*]] = extractelement <8 x i8> [[TMP9]], i32 7
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP41]], i8* [[TMP40]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE17]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue17:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP42:%.*]] = sub <8 x i8> zeroinitializer, [[TMP9]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP43:%.*]] = extractelement <8 x i1> [[TMP5]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP43]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if18:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP44:%.*]] = extractelement <8 x i32> [[TMP7]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP45:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP44]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP46:%.*]] = extractelement <8 x i8> [[TMP42]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP46]], i8* [[TMP45]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE19]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue19:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP47:%.*]] = extractelement <8 x i1> [[TMP5]], i32 1
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP47]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if20:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP48:%.*]] = extractelement <8 x i32> [[TMP7]], i32 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP49:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP48]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP50:%.*]] = extractelement <8 x i8> [[TMP42]], i32 1
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP50]], i8* [[TMP49]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE21]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue21:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP51:%.*]] = extractelement <8 x i1> [[TMP5]], i32 2
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP51]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if22:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP52:%.*]] = extractelement <8 x i32> [[TMP7]], i32 2
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP52]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP54:%.*]] = extractelement <8 x i8> [[TMP42]], i32 2
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP54]], i8* [[TMP53]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE23]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue23:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP55:%.*]] = extractelement <8 x i1> [[TMP5]], i32 3
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP55]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if24:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP56:%.*]] = extractelement <8 x i32> [[TMP7]], i32 3
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP57:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP56]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP58:%.*]] = extractelement <8 x i8> [[TMP42]], i32 3
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP58]], i8* [[TMP57]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE25]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue25:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP59:%.*]] = extractelement <8 x i1> [[TMP5]], i32 4
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP59]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if26:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP60:%.*]] = extractelement <8 x i32> [[TMP7]], i32 4
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP60]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP62:%.*]] = extractelement <8 x i8> [[TMP42]], i32 4
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP62]], i8* [[TMP61]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE27]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue27:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP63:%.*]] = extractelement <8 x i1> [[TMP5]], i32 5
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP63]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if28:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP64:%.*]] = extractelement <8 x i32> [[TMP7]], i32 5
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP65:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP64]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP66:%.*]] = extractelement <8 x i8> [[TMP42]], i32 5
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP66]], i8* [[TMP65]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE29]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue29:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP67:%.*]] = extractelement <8 x i1> [[TMP5]], i32 6
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP67]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if30:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP68:%.*]] = extractelement <8 x i32> [[TMP7]], i32 6
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP69:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP68]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP70:%.*]] = extractelement <8 x i8> [[TMP42]], i32 6
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP70]], i8* [[TMP69]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE31]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue31:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP71:%.*]] = extractelement <8 x i1> [[TMP5]], i32 7
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP71]], label [[PRED_STORE_IF32:%.*]], label [[PRED_STORE_CONTINUE33]]
+; ENABLED_MASKED_STRIDED:       pred.store.if32:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP72:%.*]] = extractelement <8 x i32> [[TMP7]], i32 7
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP72]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP74:%.*]] = extractelement <8 x i8> [[TMP42]], i32 7
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP74]], i8* [[TMP73]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE33]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue33:
 ; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
 ; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP13]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP75:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP75]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; ENABLED_MASKED_STRIDED:       for.end:
 ; ENABLED_MASKED_STRIDED-NEXT:    ret void
 ;
@@ -3053,30 +3333,169 @@
 ; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; ENABLED_MASKED_STRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; ENABLED_MASKED_STRIDED:       vector.body:
-; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> poison, i32 [[INDEX]], i32 0
-; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> poison, <8 x i32> zeroinitializer
-; ENABLED_MASKED_STRIDED-NEXT:    [[INDUCTION:%.*]] = or <8 x i32> [[BROADCAST_SPLAT2]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = icmp ule <8 x i32> [[INDUCTION]], [[BROADCAST_SPLAT]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE31:%.*]] ]
+; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE31]] ]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP2]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>*
 ; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
-; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison)
+; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP4]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison)
 ; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = or i32 [[TMP1]], 1
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC3]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i8> [[STRIDED_VEC3]], <8 x i8> [[STRIDED_VEC]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = sub <8 x i8> zeroinitializer, [[TMP6]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 -1
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[TMP8]], i32 [[TMP4]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to <16 x i8>*
-; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-; ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[TMP10]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
+; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = or <8 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC1]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i8> [[STRIDED_VEC1]], <8 x i8> [[STRIDED_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[TMP9]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP11:%.*]] = extractelement <8 x i8> [[TMP7]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP11]], i8* [[TMP10]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP12:%.*]] = extractelement <8 x i1> [[TMP0]], i32 1
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP12]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if2:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP13]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP15:%.*]] = extractelement <8 x i8> [[TMP7]], i32 1
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP15]], i8* [[TMP14]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE3]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue3:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP16:%.*]] = extractelement <8 x i1> [[TMP0]], i32 2
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP16]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if4:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP17]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP19:%.*]] = extractelement <8 x i8> [[TMP7]], i32 2
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP19]], i8* [[TMP18]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE5]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue5:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP20:%.*]] = extractelement <8 x i1> [[TMP0]], i32 3
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP20]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if6:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP21:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP21]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP23:%.*]] = extractelement <8 x i8> [[TMP7]], i32 3
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP23]], i8* [[TMP22]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE7]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue7:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP24:%.*]] = extractelement <8 x i1> [[TMP0]], i32 4
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP24]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if8:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP25:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP25]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP27:%.*]] = extractelement <8 x i8> [[TMP7]], i32 4
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP27]], i8* [[TMP26]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE9]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue9:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP28:%.*]] = extractelement <8 x i1> [[TMP0]], i32 5
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP28]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if10:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP29:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP29]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP31:%.*]] = extractelement <8 x i8> [[TMP7]], i32 5
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP31]], i8* [[TMP30]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE11]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue11:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP32:%.*]] = extractelement <8 x i1> [[TMP0]], i32 6
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP32]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if12:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP33:%.*]] = extractelement <8 x i32> [[TMP1]], i32 6
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP33]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP35:%.*]] = extractelement <8 x i8> [[TMP7]], i32 6
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP35]], i8* [[TMP34]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE13]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue13:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP36:%.*]] = extractelement <8 x i1> [[TMP0]], i32 7
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP36]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if14:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP37:%.*]] = extractelement <8 x i32> [[TMP1]], i32 7
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP37]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP39:%.*]] = extractelement <8 x i8> [[TMP7]], i32 7
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP39]], i8* [[TMP38]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE15]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue15:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP40:%.*]] = sub <8 x i8> zeroinitializer, [[TMP7]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP41:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP41]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if16:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP42:%.*]] = extractelement <8 x i32> [[TMP5]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP42]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP44:%.*]] = extractelement <8 x i8> [[TMP40]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP44]], i8* [[TMP43]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE17]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue17:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP45:%.*]] = extractelement <8 x i1> [[TMP0]], i32 1
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP45]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if18:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP46:%.*]] = extractelement <8 x i32> [[TMP5]], i32 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP47:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP46]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP48:%.*]] = extractelement <8 x i8> [[TMP40]], i32 1
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP48]], i8* [[TMP47]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE19]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue19:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP49:%.*]] = extractelement <8 x i1> [[TMP0]], i32 2
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP49]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if20:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP50:%.*]] = extractelement <8 x i32> [[TMP5]], i32 2
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP50]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP52:%.*]] = extractelement <8 x i8> [[TMP40]], i32 2
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP52]], i8* [[TMP51]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE21]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue21:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP53:%.*]] = extractelement <8 x i1> [[TMP0]], i32 3
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP53]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if22:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP54:%.*]] = extractelement <8 x i32> [[TMP5]], i32 3
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP55:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP54]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP56:%.*]] = extractelement <8 x i8> [[TMP40]], i32 3
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP56]], i8* [[TMP55]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE23]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue23:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP57:%.*]] = extractelement <8 x i1> [[TMP0]], i32 4
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP57]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if24:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP58:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP58]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP60:%.*]] = extractelement <8 x i8> [[TMP40]], i32 4
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP60]], i8* [[TMP59]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE25]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue25:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP61:%.*]] = extractelement <8 x i1> [[TMP0]], i32 5
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP61]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if26:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP62:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP62]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP64:%.*]] = extractelement <8 x i8> [[TMP40]], i32 5
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP64]], i8* [[TMP63]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE27]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue27:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP65:%.*]] = extractelement <8 x i1> [[TMP0]], i32 6
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP65]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if28:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP66:%.*]] = extractelement <8 x i32> [[TMP5]], i32 6
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP67:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP66]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP68:%.*]] = extractelement <8 x i8> [[TMP40]], i32 6
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP68]], i8* [[TMP67]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE29]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue29:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP69:%.*]] = extractelement <8 x i1> [[TMP0]], i32 7
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP69]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31]]
+; ENABLED_MASKED_STRIDED:       pred.store.if30:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP70:%.*]] = extractelement <8 x i32> [[TMP5]], i32 7
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP70]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP72:%.*]] = extractelement <8 x i8> [[TMP40]], i32 7
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP72]], i8* [[TMP71]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE31]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue31:
 ; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP11]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP73:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP73]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; ENABLED_MASKED_STRIDED:       for.end:
 ; ENABLED_MASKED_STRIDED-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll
@@ -246,31 +246,95 @@
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = add nsw i64 [[WIDE_TRIP_COUNT]], -1
 ; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i32 0
 ; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[POINTS:%.*]], i64 -1
 ; ENABLED_MASKED_STRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; ENABLED_MASKED_STRIDED:       vector.body:
-; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i32 0
-; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
-; ENABLED_MASKED_STRIDED-NEXT:    [[INDUCTION:%.*]] = or <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3>
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = icmp ule <4 x i64> [[INDUCTION]], [[BROADCAST_SPLAT]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i64 [[INDEX]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = bitcast i16* [[TMP2]] to <4 x i16>*
-; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP3]], i32 2, <4 x i1> [[TMP1]], <4 x i16> poison)
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = shl nsw i64 [[INDEX]], 2
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i64 [[INDEX]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = bitcast i16* [[TMP5]] to <4 x i16>*
-; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_LOAD3:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP6]], i32 2, <4 x i1> [[TMP1]], <4 x i16> poison)
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = or i64 [[TMP4]], 1
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 [[TMP7]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP9:%.*]] = bitcast i16* [[TMP8]] to <16 x i16>*
-; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i16> [[WIDE_MASKED_LOAD]], <4 x i16> [[WIDE_MASKED_LOAD3]], <16 x i32> <i32 0, i32 4, i32 undef, i32 undef, i32 1, i32 5, i32 undef, i32 undef, i32 2, i32 6, i32 undef, i32 undef, i32 3, i32 7, i32 undef, i32 undef>
-; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_MASK:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP10:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false>
-; ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> [[INTERLEAVED_VEC]], <16 x i16>* [[TMP9]], i32 2, <16 x i1> [[TMP10]])
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE15:%.*]] ]
+; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[FOR_BODY_PREHEADER]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE15]] ]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i64 [[INDEX]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = bitcast i16* [[TMP1]] to <4 x i16>*
+; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP2]], i32 2, <4 x i1> [[TMP0]], <4 x i16> poison)
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = shl nsw <4 x i64> [[VEC_IND]], <i64 2, i64 2, i64 2, i64 2>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[POINTS:%.*]], i64 [[TMP5]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = extractelement <4 x i16> [[WIDE_MASKED_LOAD]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    store i16 [[TMP7]], i16* [[TMP6]], align 2
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP8]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if1:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP9]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP11:%.*]] = extractelement <4 x i16> [[WIDE_MASKED_LOAD]], i32 1
+; ENABLED_MASKED_STRIDED-NEXT:    store i16 [[TMP11]], i16* [[TMP10]], align 2
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE2]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue2:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP12:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP12]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if3:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP13:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP13]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP15:%.*]] = extractelement <4 x i16> [[WIDE_MASKED_LOAD]], i32 2
+; ENABLED_MASKED_STRIDED-NEXT:    store i16 [[TMP15]], i16* [[TMP14]], align 2
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE4]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue4:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP16:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP16]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if5:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP17:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP17]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP19:%.*]] = extractelement <4 x i16> [[WIDE_MASKED_LOAD]], i32 3
+; ENABLED_MASKED_STRIDED-NEXT:    store i16 [[TMP19]], i16* [[TMP18]], align 2
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE6]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue6:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i64 [[INDEX]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP21:%.*]] = bitcast i16* [[TMP20]] to <4 x i16>*
+; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_LOAD7:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP21]], i32 2, <4 x i1> [[TMP0]], <4 x i16> poison)
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP22:%.*]] = or <4 x i64> [[TMP3]], <i64 1, i64 1, i64 1, i64 1>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP23:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP23]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if8:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP22]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP24]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP26:%.*]] = extractelement <4 x i16> [[WIDE_MASKED_LOAD7]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    store i16 [[TMP26]], i16* [[TMP25]], align 2
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE9]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue9:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP27:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP27]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if10:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP22]], i32 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP28]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP30:%.*]] = extractelement <4 x i16> [[WIDE_MASKED_LOAD7]], i32 1
+; ENABLED_MASKED_STRIDED-NEXT:    store i16 [[TMP30]], i16* [[TMP29]], align 2
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE11]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue11:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP31:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP31]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if12:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP32:%.*]] = extractelement <4 x i64> [[TMP22]], i32 2
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP32]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP34:%.*]] = extractelement <4 x i16> [[WIDE_MASKED_LOAD7]], i32 2
+; ENABLED_MASKED_STRIDED-NEXT:    store i16 [[TMP34]], i16* [[TMP33]], align 2
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE13]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue13:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP35:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP35]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15]]
+; ENABLED_MASKED_STRIDED:       pred.store.if14:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP36:%.*]] = extractelement <4 x i64> [[TMP22]], i32 3
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP36]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP38:%.*]] = extractelement <4 x i16> [[WIDE_MASKED_LOAD7]], i32 3
+; ENABLED_MASKED_STRIDED-NEXT:    store i16 [[TMP38]], i16* [[TMP37]], align 2
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE15]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue15:
 ; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP11]], label [[FOR_END_LOOPEXIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP39:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP39]], label [[FOR_END_LOOPEXIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; ENABLED_MASKED_STRIDED:       for.end.loopexit:
 ; ENABLED_MASKED_STRIDED-NEXT:    br label [[FOR_END]]
 ; ENABLED_MASKED_STRIDED:       for.end:
@@ -377,21 +441,53 @@
 ; ENABLED_MASKED_STRIDED-NEXT:  entry:
 ; ENABLED_MASKED_STRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; ENABLED_MASKED_STRIDED:       vector.body:
-; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
+; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i64 [[INDEX]]
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <4 x i16>*
 ; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = icmp sgt <4 x i16> [[WIDE_LOAD]], zeroinitializer
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = mul nuw nsw i64 [[INDEX]], 3
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[POINTS:%.*]], i64 [[TMP3]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <12 x i16>*
-; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> poison, <12 x i32> <i32 0, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 3, i32 undef, i32 undef>
-; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_MASK:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = and <12 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false>
-; ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v12i16.p0v12i16(<12 x i16> [[INTERLEAVED_VEC]], <12 x i16>* [[TMP5]], i32 2, <12 x i1> [[TMP6]])
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = mul nuw nsw <4 x i64> [[VEC_IND]], <i64 3, i64 3, i64 3, i64 3>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[POINTS:%.*]], i64 [[TMP5]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    store i16 [[TMP7]], i16* [[TMP6]], align 2
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP8]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if1:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP9]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP11:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 1
+; ENABLED_MASKED_STRIDED-NEXT:    store i16 [[TMP11]], i16* [[TMP10]], align 2
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE2]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue2:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP12:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP12]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if3:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP13:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP13]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP15:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
+; ENABLED_MASKED_STRIDED-NEXT:    store i16 [[TMP15]], i16* [[TMP14]], align 2
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE4]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue4:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP16:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP16]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
+; ENABLED_MASKED_STRIDED:       pred.store.if5:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP17:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP17]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP19:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
+; ENABLED_MASKED_STRIDED-NEXT:    store i16 [[TMP19]], i16* [[TMP18]], align 2
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE6]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue6:
 ; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP7]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP20]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; ENABLED_MASKED_STRIDED:       for.end:
 ; ENABLED_MASKED_STRIDED-NEXT:    ret void
 ;