diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -5028,12 +5028,6 @@
                                              Alignment, AddressSpace, CostKind,
                                              UseMaskForCond, UseMaskForGaps);
 
-  // We currently Support only fully-interleaved groups, with no gaps.
-  // TODO: Support also strided loads (interleaved-groups with gaps).
-  if (Indices.size() && Indices.size() != Factor)
-    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                             Alignment, AddressSpace, CostKind);
-
   // VecTy for interleave memop is <VF*Factor x Elt>.
   // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
   // VecTy = <12 x i32>.
@@ -5249,12 +5243,16 @@
   };
 
   if (Opcode == Instruction::Load) {
+    // FIXME: If we have a partially-interleaved group, with gaps,
+    // should we discount the not-demanded indices?
    if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
                                            ETy.getSimpleVT()))
      return MemOpCosts + Entry->Cost;
  } else {
    assert(Opcode == Instruction::Store &&
           "Expected Store Instruction at this point");
+    assert((!Indices.size() || Indices.size() == Factor) &&
+           "Interleaved store only supports fully-interleaved groups.");
    if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
                                            ETy.getSimpleVT()))
      return MemOpCosts + Entry->Cost;
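For context, here is a minimal IR sketch of the kind of group the removed early-exit used to send to the base-class fallback: a factor-3 interleaved load where only indices 0 and 1 are demanded and index 2 is a gap. Function and value names are hypothetical, not from the patch; the cost-model tests below exercise exactly these shapes.

```llvm
; Hypothetical reduced example: VF=4, factor 3, demanded indices {0,1}.
; One wide load covers all groups; one strided shuffle per demanded index.
; Element 2 of each group is loaded but never extracted (the "gap").
define void @stride3_indices_01u(<12 x i32>* %in, <4 x i32>* %out0, <4 x i32>* %out1) {
  %wide = load <12 x i32>, <12 x i32>* %in, align 4
  %v0 = shufflevector <12 x i32> %wide, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %v1 = shufflevector <12 x i32> %wide, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  store <4 x i32> %v0, <4 x i32>* %out0, align 16
  store <4 x i32> %v1, <4 x i32>* %out1, align 16
  ret void
}
```

With the early-exit gone, this shape is priced via the AVX2InterleavedLoadTbl entry for factor 3 rather than the base-class scalarization estimate, which is where the large cost drops in the tests below come from.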
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2-indices-0u.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2-indices-0u.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2-indices-0u.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2-indices-0u.ll
@@ -24,10 +24,10 @@
 ; AVX1: LV: Found an estimated cost of 48 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 11 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 24 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 48 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 6 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 12 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 1 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-01u.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-01u.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-01u.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-01u.ll
@@ -24,10 +24,10 @@
 ; AVX1: LV: Found an estimated cost of 94 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 12 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 21 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 47 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 94 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 5 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll
@@ -24,10 +24,10 @@
 ; AVX1: LV: Found an estimated cost of 50 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 7 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 11 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 25 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 50 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 5 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 1 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
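The store path, by contrast, still insists on fully-interleaved groups; that is what the new assert in the first hunk documents. As a point of contrast with the gappy loads tested above, here is a minimal sketch of the allowed store shape, assuming a hypothetical factor-2 group (names and masks are illustrative, not from the patch):

```llvm
; Hypothetical fully-interleaved factor-2 store: both indices present, no gaps.
define void @store_stride2_full(<4 x i32> %a, <4 x i32> %b, <8 x i32>* %out) {
  ; Interleave the two operand vectors: a0,b0,a1,b1,a2,b2,a3,b3.
  %iv = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x i32> %iv, <8 x i32>* %out, align 4
  ret void
}
```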
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-012u.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-012u.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-012u.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-012u.ll
@@ -24,10 +24,10 @@
 ; AVX1: LV: Found an estimated cost of 140 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 16 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 32 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 70 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 140 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
@@ -24,10 +24,10 @@
 ; AVX1: LV: Found an estimated cost of 96 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 11 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 22 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 48 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 96 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll
@@ -24,10 +24,10 @@
 ; AVX1: LV: Found an estimated cost of 52 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 12 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 26 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 52 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 1 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
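The pr48340.ll update below shows the end-to-end payoff: with the gap restriction lifted, the vectorizer now picks the interleaved-access path for these stride-2 pointer loads, replacing four masked gathers per iteration with wide consecutive loads plus even-lane shuffles. A minimal sketch of the per-group shape, assuming hypothetical names (the authoritative pattern is in the CHECK lines that follow):

```llvm
; Hypothetical reduction of the new vector.body pattern: one wide <8 x i64*>
; load covers four stride-2 groups; a single shuffle keeps the even lanes
; (index 0 of each group); the odd lanes are the unused gap.
define <4 x i64*> @gapped_stride2_load(<8 x i64*>* %p) {
  %wide = load <8 x i64*>, <8 x i64*>* %p, align 8
  %v0 = shufflevector <8 x i64*> %wide, <8 x i64*> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  ret <4 x i64*> %v0
}
```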
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll b/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize --force-vector-width=4 --force-vector-interleave=0 -S -o - < %s | FileCheck %s
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
@@ -8,11 +9,75 @@
 define void @foo(i64* %p, i64* %p.last) unnamed_addr #0 {
 ; CHECK-LABEL: @foo(
-; CHECK: vector.body:
-; CHECK: [[WIDE_MASKED_GATHER0:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP5:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
-; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP6:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
-; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP7:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
-; CHECK-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP8:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[P4:%.*]] = ptrtoint i64* [[P:%.*]] to i64
+; CHECK-NEXT: [[P_LAST1:%.*]] = ptrtoint i64* [[P_LAST:%.*]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[P_LAST1]], -16
+; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[P4]]
+; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 4
+; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP3]], 16
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 16
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 16, i64 [[N_MOD_VF]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[N_VEC]], 2
+; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP6]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP10]]
+; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 2
+; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 12
+; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 2
+; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast i64* [[NEXT_GEP]] to %0**
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64* [[NEXT_GEP5]] to %0**
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast i64* [[NEXT_GEP6]] to %0**
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64* [[NEXT_GEP7]] to %0**
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr %0*, %0** [[TMP15]], i32 0
+; CHECK-NEXT: [[TMP20:%.*]] = bitcast %0** [[TMP19]] to <8 x %0*>*
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr %0*, %0** [[TMP16]], i32 0
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast %0** [[TMP21]] to <8 x %0*>*
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr %0*, %0** [[TMP17]], i32 0
+; CHECK-NEXT: [[TMP24:%.*]] = bitcast %0** [[TMP23]] to <8 x %0*>*
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr %0*, %0** [[TMP18]], i32 0
+; CHECK-NEXT: [[TMP26:%.*]] = bitcast %0** [[TMP25]] to <8 x %0*>*
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x %0*>, <8 x %0*>* [[TMP20]], align 8
+; CHECK-NEXT: [[WIDE_VEC8:%.*]] = load <8 x %0*>, <8 x %0*>* [[TMP22]], align 8
+; CHECK-NEXT: [[WIDE_VEC9:%.*]] = load <8 x %0*>, <8 x %0*>* [[TMP24]], align 8
+; CHECK-NEXT: [[WIDE_VEC10:%.*]] = load <8 x %0*>, <8 x %0*>* [[TMP26]], align 8
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x %0*> [[WIDE_VEC]], <8 x %0*> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC11:%.*]] = shufflevector <8 x %0*> [[WIDE_VEC8]], <8 x %0*> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC12:%.*]] = shufflevector <8 x %0*> [[WIDE_VEC9]], <8 x %0*> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC13:%.*]] = shufflevector <8 x %0*> [[WIDE_VEC10]], <8 x %0*> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[P]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[P2:%.*]] = phi i64* [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[P_INC:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[P_INC]] = getelementptr inbounds i64, i64* [[P2]], i64 2
+; CHECK-NEXT: [[P3:%.*]] = bitcast i64* [[P2]] to %0**
+; CHECK-NEXT: [[V:%.*]] = load %0*, %0** [[P3]], align 8
+; CHECK-NEXT: [[B:%.*]] = icmp eq i64* [[P_INC]], [[P_LAST]]
+; CHECK-NEXT: br i1 [[B]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
 entry:
   br label %loop
@@ -30,11 +95,75 @@
 define void @bar(i64* %p, i64* %p.last) unnamed_addr #0 {
 ; CHECK-LABEL: @bar(
-; CHECK: vector.body:
-; CHECK: [[WIDE_MASKED_GATHER0:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP5:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
-; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP6:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
-; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP7:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
-; CHECK-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP8:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[P4:%.*]] = ptrtoint i64* [[P:%.*]] to i64
+; CHECK-NEXT: [[P_LAST1:%.*]] = ptrtoint i64* [[P_LAST:%.*]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[P_LAST1]], -16
+; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[P4]]
+; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 4
+; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP3]], 16
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 16
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 16, i64 [[N_MOD_VF]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[N_VEC]], 2
+; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP6]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP10]]
+; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 2
+; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 12
+; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 2
+; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast i64* [[NEXT_GEP]] to %1**
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64* [[NEXT_GEP5]] to %1**
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast i64* [[NEXT_GEP6]] to %1**
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64* [[NEXT_GEP7]] to %1**
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr %1*, %1** [[TMP15]], i32 0
+; CHECK-NEXT: [[TMP20:%.*]] = bitcast %1** [[TMP19]] to <8 x %1*>*
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr %1*, %1** [[TMP16]], i32 0
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast %1** [[TMP21]] to <8 x %1*>*
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr %1*, %1** [[TMP17]], i32 0
+; CHECK-NEXT: [[TMP24:%.*]] = bitcast %1** [[TMP23]] to <8 x %1*>*
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr %1*, %1** [[TMP18]], i32 0
+; CHECK-NEXT: [[TMP26:%.*]] = bitcast %1** [[TMP25]] to <8 x %1*>*
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x %1*>, <8 x %1*>* [[TMP20]], align 8
+; CHECK-NEXT: [[WIDE_VEC8:%.*]] = load <8 x %1*>, <8 x %1*>* [[TMP22]], align 8
+; CHECK-NEXT: [[WIDE_VEC9:%.*]] = load <8 x %1*>, <8 x %1*>* [[TMP24]], align 8
+; CHECK-NEXT: [[WIDE_VEC10:%.*]] = load <8 x %1*>, <8 x %1*>* [[TMP26]], align 8
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x %1*> [[WIDE_VEC]], <8 x %1*> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC11:%.*]] = shufflevector <8 x %1*> [[WIDE_VEC8]], <8 x %1*> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC12:%.*]] = shufflevector <8 x %1*> [[WIDE_VEC9]], <8 x %1*> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC13:%.*]] = shufflevector <8 x %1*> [[WIDE_VEC10]], <8 x %1*> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[P]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[P2:%.*]] = phi i64* [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[P_INC:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[P_INC]] = getelementptr inbounds i64, i64* [[P2]], i64 2
+; CHECK-NEXT: [[P3:%.*]] = bitcast i64* [[P2]] to %1**
+; CHECK-NEXT: [[V:%.*]] = load %1*, %1** [[P3]], align 8
+; CHECK-NEXT: [[B:%.*]] = icmp eq i64* [[P_INC]], [[P_LAST]]
+; CHECK-NEXT: br i1 [[B]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
 entry:
   br label %loop