diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -5205,6 +5205,7 @@
         Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
 
   // Get the cost of all the memory operations.
+  // FIXME: discount dead loads.
   InstructionCost MemOpCosts = getMemoryOpCost(
       Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind);
 
@@ -5424,22 +5425,25 @@
   };
 
   if (Opcode == Instruction::Load) {
-    // FIXME: if we have a partially-interleaved groups, with gaps,
-    // should we discount the not-demanded indicies?
+    auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
+                              MemOpCosts](const CostTblEntry *Entry) {
+      return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
+    };
+
     if (ST->hasAVX2())
       if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
                                               ETy.getSimpleVT()))
-        return MemOpCosts + Entry->Cost;
+        return GetDiscountedCost(Entry);
 
     if (ST->hasSSSE3())
       if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
                                               ETy.getSimpleVT()))
-        return MemOpCosts + Entry->Cost;
+        return GetDiscountedCost(Entry);
 
     if (ST->hasSSE2())
      if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
                                               ETy.getSimpleVT()))
-        return MemOpCosts + Entry->Cost;
+        return GetDiscountedCost(Entry);
   } else {
     assert(Opcode == Instruction::Store &&
            "Expected Store Instruction at this point");
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2-indices-0u.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2-indices-0u.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2-indices-0u.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2-indices-0u.ll
@@ -13,24 +13,24 @@
 ; CHECK: LV: Checking a loop in "test"
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; SSE2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
-; SSE2: LV: Found an estimated cost of 4 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
+; SSE2: LV: Found an estimated cost of 2 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; SSE2: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 30 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 60 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX1: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX1: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX1: LV: Found an estimated cost of 2 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX1: LV: Found an estimated cost of 2 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 24 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 48 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 96 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 6 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 12 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 24 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 2 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 2 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 16 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 1 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-01u.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-01u.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-01u.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-01u.ll
@@ -26,11 +26,11 @@
 ; AVX1: LV: Found an estimated cost of 188 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 5 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 44 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 16 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll
@@ -26,11 +26,11 @@
 ; AVX1: LV: Found an estimated cost of 100 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 5 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 44 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 6 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 11 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 23 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 1 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-012u.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-012u.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-012u.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-012u.ll
@@ -26,11 +26,11 @@
 ; AVX1: LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 84 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 67 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
@@ -26,11 +26,11 @@
 ; AVX1: LV: Found an estimated cost of 192 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 84 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 12 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 24 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 50 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll
@@ -26,11 +26,11 @@
 ; AVX1: LV: Found an estimated cost of 104 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 84 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 2 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 16 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 33 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 1 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll b/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll
@@ -27,24 +27,42 @@
 ; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP6]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi i64* [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <4 x i64>
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <4 x i64>
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <4 x i64>
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <4 x i64>
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i64*> [[TMP7]] to <4 x %0**>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i64*> [[TMP8]] to <4 x %0**>
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <4 x i64*> [[TMP9]] to <4 x %0**>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i64*> [[TMP10]] to <4 x %0**>
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP11]], i32 8, <4 x i1> , <4 x %0*> undef)
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP12]], i32 8, <4 x i1> , <4 x %0*> undef)
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP13]], i32 8, <4 x i1> , <4 x %0*> undef)
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER7:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP14]], i32 8, <4 x i1> , <4 x %0*> undef)
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; CHECK-NEXT:    [[NEXT_GEP5:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 4
+; CHECK-NEXT:    [[NEXT_GEP6:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 12
+; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; CHECK-NEXT:    [[NEXT_GEP7:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i64* [[NEXT_GEP]] to %0**
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i64* [[NEXT_GEP5]] to %0**
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i64* [[NEXT_GEP6]] to %0**
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i64* [[NEXT_GEP7]] to %0**
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr %0*, %0** [[TMP15]], i32 0
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast %0** [[TMP19]] to <16 x %0*>*
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr %0*, %0** [[TMP16]], i32 0
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast %0** [[TMP21]] to <16 x %0*>*
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr %0*, %0** [[TMP17]], i32 0
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast %0** [[TMP23]] to <16 x %0*>*
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr %0*, %0** [[TMP18]], i32 0
+; CHECK-NEXT:    [[TMP26:%.*]] = bitcast %0** [[TMP25]] to <16 x %0*>*
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x %0*>, <16 x %0*>* [[TMP20]], align 8
+; CHECK-NEXT:    [[WIDE_VEC8:%.*]] = load <16 x %0*>, <16 x %0*>* [[TMP22]], align 8
+; CHECK-NEXT:    [[WIDE_VEC9:%.*]] = load <16 x %0*>, <16 x %0*>* [[TMP24]], align 8
+; CHECK-NEXT:    [[WIDE_VEC10:%.*]] = load <16 x %0*>, <16 x %0*>* [[TMP26]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x %0*> [[WIDE_VEC]], <16 x %0*> poison, <4 x i32>
+; CHECK-NEXT:    [[STRIDED_VEC11:%.*]] = shufflevector <16 x %0*> [[WIDE_VEC8]], <16 x %0*> poison, <4 x i32>
+; CHECK-NEXT:    [[STRIDED_VEC12:%.*]] = shufflevector <16 x %0*> [[WIDE_VEC9]], <16 x %0*> poison, <4 x i32>
+; CHECK-NEXT:    [[STRIDED_VEC13:%.*]] = shufflevector <16 x %0*> [[WIDE_VEC10]], <16 x %0*> poison, <4 x i32>
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    [[PTR_IND]] = getelementptr i64, i64* [[POINTER_PHI]], i64 64
-; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -95,24 +113,42 @@
 ; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP6]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi i64* [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <4 x i64>
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <4 x i64>
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <4 x i64>
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <4 x i64>
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i64*> [[TMP7]] to <4 x %1**>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i64*> [[TMP8]] to <4 x %1**>
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <4 x i64*> [[TMP9]] to <4 x %1**>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i64*> [[TMP10]] to <4 x %1**>
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP11]], i32 8, <4 x i1> , <4 x %1*> undef)
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP12]], i32 8, <4 x i1> , <4 x %1*> undef)
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP13]], i32 8, <4 x i1> , <4 x %1*> undef)
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER7:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP14]], i32 8, <4 x i1> , <4 x %1*> undef)
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; CHECK-NEXT:    [[NEXT_GEP5:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 4
+; CHECK-NEXT:    [[NEXT_GEP6:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 12
+; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; CHECK-NEXT:    [[NEXT_GEP7:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i64* [[NEXT_GEP]] to %1**
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i64* [[NEXT_GEP5]] to %1**
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i64* [[NEXT_GEP6]] to %1**
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i64* [[NEXT_GEP7]] to %1**
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr %1*, %1** [[TMP15]], i32 0
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast %1** [[TMP19]] to <16 x %1*>*
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr %1*, %1** [[TMP16]], i32 0
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast %1** [[TMP21]] to <16 x %1*>*
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr %1*, %1** [[TMP17]], i32 0
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast %1** [[TMP23]] to <16 x %1*>*
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr %1*, %1** [[TMP18]], i32 0
+; CHECK-NEXT:    [[TMP26:%.*]] = bitcast %1** [[TMP25]] to <16 x %1*>*
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x %1*>, <16 x %1*>* [[TMP20]], align 8
+; CHECK-NEXT:    [[WIDE_VEC8:%.*]] = load <16 x %1*>, <16 x %1*>* [[TMP22]], align 8
+; CHECK-NEXT:    [[WIDE_VEC9:%.*]] = load <16 x %1*>, <16 x %1*>* [[TMP24]], align 8
+; CHECK-NEXT:    [[WIDE_VEC10:%.*]] = load <16 x %1*>, <16 x %1*>* [[TMP26]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x %1*> [[WIDE_VEC]], <16 x %1*> poison, <4 x i32>
+; CHECK-NEXT:    [[STRIDED_VEC11:%.*]] = shufflevector <16 x %1*> [[WIDE_VEC8]], <16 x %1*> poison, <4 x i32>
+; CHECK-NEXT:    [[STRIDED_VEC12:%.*]] = shufflevector <16 x %1*> [[WIDE_VEC9]], <16 x %1*> poison, <4 x i32>
+; CHECK-NEXT:    [[STRIDED_VEC13:%.*]] = shufflevector <16 x %1*> [[WIDE_VEC10]], <16 x %1*> poison, <4 x i32>
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    [[PTR_IND]] = getelementptr i64, i64* [[POINTER_PHI]], i64 64
-; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
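Note for reviewers: the updated test expectations all follow the new formula MemOpCosts + ceil(NumMembers * Entry->Cost / Factor), i.e. the shuffle cost from the per-ISA tables is scaled by the fraction of interleave-group members actually demanded, while the memory cost is still charged in full. Below is a minimal standalone sketch of that arithmetic, checked against the VF-4 stride-3 deltas above. The local divideCeil mirrors llvm::divideCeil from llvm/Support/MathExtras.h, and the memory cost of 2 and table cost of 3 are illustrative values inferred from those two tests, not quoted from the real X86 cost tables.

#include <cassert>
#include <cstdint>

// Local stand-in for llvm::divideCeil (llvm/Support/MathExtras.h).
static uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
  return (Numerator + Denominator - 1) / Denominator;
}

// New model: charge the interleave-shuffle cost only for the demanded
// members of the group; the memory cost is unchanged. MemOpCost and
// EntryCost are hypothetical inputs, not real cost-table entries.
static uint64_t discountedLoadCost(uint64_t MemOpCost, uint64_t EntryCost,
                                   uint64_t Factor, uint64_t NumMembers) {
  return MemOpCost + divideCeil(NumMembers * EntryCost, Factor);
}

int main() {
  // Stride-3 group at VF 4 with only index 0 demanded (indices-0uu): the
  // old model charged MemOpCost + EntryCost = 2 + 3 = 5; the new model
  // charges 2 + ceil(1 * 3 / 3) = 3, matching the AVX2 VF-4 change above.
  assert(discountedLoadCost(2, 3, 3, 1) == 3);
  // Same group with indices 0 and 1 demanded (indices-01u):
  // 2 + ceil(2 * 3 / 3) = 4, down from 5.
  assert(discountedLoadCost(2, 3, 3, 2) == 4);
  return 0;
}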