Index: llvm/lib/Transforms/Vectorize/VectorCombine.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -29,6 +29,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Vectorize.h"
+#include <numeric>
 
 using namespace llvm;
 using namespace llvm::PatternMatch;
@@ -100,6 +101,9 @@
   Type *ScalarTy = Scalar->getType();
   if (!Load || !Load->isSimple())
     return false;
+  auto *Ty = dyn_cast<FixedVectorType>(I.getType());
+  if (!Ty)
+    return false;
 
   // TODO: Extend this to match GEP with constant offsets.
   Value *PtrOp = Load->getPointerOperand()->stripPointerCasts();
@@ -113,9 +117,6 @@
   // Check safety of replacing the scalar load with a larger vector load.
   unsigned VecNumElts = VectorSize / ScalarSize;
   auto *VectorTy = VectorType::get(ScalarTy, VecNumElts, false);
-  // TODO: Allow insert/extract subvector if the type does not match.
-  if (VectorTy != I.getType())
-    return false;
   Align Alignment = Load->getAlign();
   const DataLayout &DL = I.getModule()->getDataLayout();
   if (!isSafeToLoadUnconditionally(PtrOp, VectorTy, Alignment, DL, Load, &DT))
@@ -140,7 +141,15 @@
   // inselt undef, load Scalar, 0 --> load VecPtr
   IRBuilder<> Builder(Load);
   Value *CastedPtr = Builder.CreateBitCast(PtrOp, VectorTy->getPointerTo(AS));
-  LoadInst *VecLd = Builder.CreateAlignedLoad(VectorTy, CastedPtr, Alignment);
+  Value *VecLd = Builder.CreateAlignedLoad(VectorTy, CastedPtr, Alignment);
+
+  // If the insert type does not match the target's minimum vector type,
+  // use an identity shuffle to shrink/grow the vector.
+  if (Ty != VectorTy) {
+    SmallVector<int, 16> Mask(Ty->getNumElements());
+    std::iota(Mask.begin(), Mask.end(), 0);
+    VecLd = Builder.CreateShuffleVector(VecLd, UndefValue::get(VectorTy), Mask);
+  }
   replaceValue(I, *VecLd);
   ++NumVecLoad;
   return true;
Index: llvm/test/Transforms/VectorCombine/X86/load.ll
===================================================================
--- llvm/test/Transforms/VectorCombine/X86/load.ll
+++ llvm/test/Transforms/VectorCombine/X86/load.ll
@@ -346,12 +346,11 @@
   ret <4 x float> %r
 }
 
-; TODO: Should load v4i32.
-
 define <8 x i32> @load_i32_insert_v8i32(i32* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_i32_insert_v8i32(
-; CHECK-NEXT:    [[S:%.*]] = load i32, i32* [[P:%.*]], align 4
-; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i32> undef, i32 [[S]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    ret <8 x i32> [[R]]
 ;
   %s = load i32, i32* %p, align 4
@@ -359,13 +358,10 @@
   ret <8 x i32> %r
 }
 
-; TODO: Should load v4i32.
-
 define <8 x i32> @casted_load_i32_insert_v8i32(<4 x i32>* align 4 dereferenceable(16) %p) {
 ; CHECK-LABEL: @casted_load_i32_insert_v8i32(
-; CHECK-NEXT:    [[B:%.*]] = bitcast <4 x i32>* [[P:%.*]] to i32*
-; CHECK-NEXT:    [[S:%.*]] = load i32, i32* [[B]], align 4
-; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i32> undef, i32 [[S]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[P:%.*]], align 4
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    ret <8 x i32> [[R]]
 ;
   %b = bitcast <4 x i32>* %p to i32*
@@ -374,12 +370,11 @@
   ret <8 x i32> %r
 }
 
-; TODO: Should load v4f32.
-
 define <8 x float> @load_f32_insert_v8f32(float* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_f32_insert_v8f32(
-; CHECK-NEXT:    [[S:%.*]] = load float, float* [[P:%.*]], align 4
-; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x float> undef, float [[S]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    ret <8 x float> [[R]]
 ;
   %s = load float, float* %p, align 4
@@ -387,12 +382,11 @@
   ret <8 x float> %r
 }
 
-; TODO: Should load v4f32.
-
 define <2 x float> @load_f32_insert_v2f32(float* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_f32_insert_v2f32(
-; CHECK-NEXT:    [[S:%.*]] = load float, float* [[P:%.*]], align 4
-; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x float> undef, float [[S]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
   %s = load float, float* %p, align 4