diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -86,6 +86,7 @@
   InstructionWorklist Worklist;
 
   bool vectorizeLoadInsert(Instruction &I);
+  bool widenSubvectorLoad(Instruction &I);
   ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
                                         ExtractElementInst *Ext1,
                                         unsigned PreferredExtractIndex) const;
@@ -265,6 +266,66 @@
   return true;
 }
 
+/// If we are loading a vector and then inserting it into a larger vector with
+/// undefined elements, try to load the larger vector and eliminate the insert.
+/// This removes a shuffle in IR and may allow combining of other loaded values.
+bool VectorCombine::widenSubvectorLoad(Instruction &I) {
+  // Match subvector insert of fixed vector.
+  auto *Ty = dyn_cast<FixedVectorType>(I.getType());
+  auto *Shuf = dyn_cast<ShuffleVectorInst>(&I);
+  if (!Ty || !Shuf || !Shuf->isIdentityWithPadding())
+    return false;
+
+  // Allow a non-canonical shuffle mask that is choosing elements from op1.
+  unsigned NumOpElts =
+      cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements();
+  unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) {
+    return M >= (int)(NumOpElts);
+  });
+
+  auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex));
+  if (!canWidenLoad(Load, TTI))
+    return false;
+
+  // We use minimal alignment (maximum flexibility) because we only care about
+  // the dereferenceable region. When calculating cost and creating a new op,
+  // we may use a larger value based on alignment attributes.
+  const DataLayout &DL = I.getModule()->getDataLayout();
+  Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
+  assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
+  Align Alignment = Load->getAlign();
+  if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), DL, Load, &AC, &DT))
+    return false;
+
+  Alignment = std::max(SrcPtr->getPointerAlignment(DL), Alignment);
+  Type *LoadTy = Load->getType();
+  unsigned AS = Load->getPointerAddressSpace();
+
+  // Original pattern: insert_subvector (load PtrOp)
+  // This conservatively assumes that the cost of a subvector insert into an
+  // undef value is 0. We could add that cost if the cost model accurately
+  // reflects the real cost of that operation.
+  InstructionCost OldCost =
+      TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);
+
+  // New pattern: load PtrOp
+  InstructionCost NewCost =
+      TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS);
+
+  // We can aggressively convert to the vector form because the backend can
+  // invert this transform if it does not result in a performance win.
+  if (OldCost < NewCost || !NewCost.isValid())
+    return false;
+
+  IRBuilder<> Builder(Load);
+  Value *CastedPtr =
+      Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Ty->getPointerTo(AS));
+  Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment);
+  replaceValue(I, *VecLd);
+  ++NumVecLoad;
+  return true;
+}
+
 /// Determine which, if any, of the inputs should be replaced by a shuffle
 /// followed by extract from a different index.
 ExtractElementInst *VectorCombine::getShuffleExtract(
@@ -1646,6 +1707,7 @@
     Builder.SetInsertPoint(&I);
     if (!ScalarizationOnly) {
       MadeChange |= vectorizeLoadInsert(I);
+      MadeChange |= widenSubvectorLoad(I);
       MadeChange |= foldExtractExtract(I);
      MadeChange |= foldInsExtFNeg(I);
       MadeChange |= foldBitcastShuf(I);
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
--- a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 --data-layout="e" | FileCheck %s --check-prefixes=CHECK
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 --data-layout="e" | FileCheck %s --check-prefixes=CHECK
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 --data-layout="E" | FileCheck %s --check-prefixes=CHECK
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 --data-layout="E" | FileCheck %s --check-prefixes=CHECK
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 --data-layout="e" | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 --data-layout="e" | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 --data-layout="E" | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 --data-layout="E" | FileCheck %s --check-prefixes=CHECK,AVX
 
 ;-------------------------------------------------------------------------------
 ; Here we know we can load 128 bits as per dereferenceability and alignment.
@@ -252,10 +252,11 @@
   ret <2 x i4> %r
 }
 
+; Load the 128-bit vector because there is no additional cost.
+
 define <4 x float> @load_v1f32_v4f32(ptr dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_v1f32_v4f32(
-; CHECK-NEXT:    [[L:%.*]] = load <1 x float>, ptr [[P:%.*]], align 16
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <1 x float> [[L]], <1 x float> poison, <4 x i32>
+; CHECK-NEXT:    [[S:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
 ; CHECK-NEXT:    ret <4 x float> [[S]]
 ;
   %l = load <1 x float>, ptr %p, align 16
@@ -263,10 +264,12 @@
   ret <4 x float> %s
 }
 
+; Load the 128-bit vector because there is no additional cost.
+; Alignment is taken from param attr.
+
 define <4 x float> @load_v2f32_v4f32(ptr align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_v2f32_v4f32(
-; CHECK-NEXT:    [[L:%.*]] = load <2 x float>, ptr [[P:%.*]], align 1
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x float> [[L]], <2 x float> poison, <4 x i32>
+; CHECK-NEXT:    [[S:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
 ; CHECK-NEXT:    ret <4 x float> [[S]]
 ;
   %l = load <2 x float>, ptr %p, align 1
@@ -274,10 +277,11 @@
   ret <4 x float> %s
 }
 
+; Load the 128-bit vector because there is no additional cost.
+
 define <4 x float> @load_v3f32_v4f32(ptr dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_v3f32_v4f32(
-; CHECK-NEXT:    [[L:%.*]] = load <3 x float>, ptr [[P:%.*]], align 1
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <3 x float> [[L]], <3 x float> poison, <4 x i32>
+; CHECK-NEXT:    [[S:%.*]] = load <4 x float>, ptr [[P:%.*]], align 1
 ; CHECK-NEXT:    ret <4 x float> [[S]]
 ;
   %l = load <3 x float>, ptr %p, align 1
@@ -285,6 +289,8 @@
   ret <4 x float> %s
 }
 
+; Negative test - the shuffle must be a simple subvector insert.
+
 define <4 x float> @load_v3f32_v4f32_wrong_mask(ptr dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_v3f32_v4f32_wrong_mask(
 ; CHECK-NEXT:    [[L:%.*]] = load <3 x float>, ptr [[P:%.*]], align 1
@@ -296,6 +302,8 @@
   ret <4 x float> %s
 }
 
+; Negative test - must be dereferenceable to vector width.
+
 define <4 x float> @load_v3f32_v4f32_not_deref(ptr dereferenceable(15) %p) {
 ; CHECK-LABEL: @load_v3f32_v4f32_not_deref(
 ; CHECK-NEXT:    [[L:%.*]] = load <3 x float>, ptr [[P:%.*]], align 16
@@ -307,21 +315,28 @@
   ret <4 x float> %s
 }
 
+; Without AVX, the cost of loading 256-bits would be greater.
+
 define <8 x float> @load_v2f32_v8f32(ptr dereferenceable(32) %p) {
-; CHECK-LABEL: @load_v2f32_v8f32(
-; CHECK-NEXT:    [[L:%.*]] = load <2 x float>, ptr [[P:%.*]], align 1
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x float> [[L]], <2 x float> poison, <8 x i32>
-; CHECK-NEXT:    ret <8 x float> [[S]]
+; SSE-LABEL: @load_v2f32_v8f32(
+; SSE-NEXT:    [[L:%.*]] = load <2 x float>, ptr [[P:%.*]], align 1
+; SSE-NEXT:    [[S:%.*]] = shufflevector <2 x float> [[L]], <2 x float> poison, <8 x i32>
+; SSE-NEXT:    ret <8 x float> [[S]]
+;
+; AVX-LABEL: @load_v2f32_v8f32(
+; AVX-NEXT:    [[S:%.*]] = load <8 x float>, ptr [[P:%.*]], align 1
+; AVX-NEXT:    ret <8 x float> [[S]]
 ;
   %l = load <2 x float>, ptr %p, align 1
   %s = shufflevector <2 x float> %l, <2 x float> poison, <8 x i32>
   ret <8 x float> %s
 }
 
+; Integer type is ok too.
+
 define <4 x i32> @load_v2i32_v4i32(ptr dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_v2i32_v4i32(
-; CHECK-NEXT:    [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x i32> [[L]], <2 x i32> poison, <4 x i32>
+; CHECK-NEXT:    [[S:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1
 ; CHECK-NEXT:    ret <4 x i32> [[S]]
 ;
   %l = load <2 x i32>, ptr %p, align 1
@@ -329,6 +344,8 @@
   ret <4 x i32> %s
 }
 
+; TODO: We assumed the shuffle mask is canonical.
+
 define <4 x i32> @load_v2i32_v4i32_non_canonical_mask(ptr dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_v2i32_v4i32_non_canonical_mask(
 ; CHECK-NEXT:    [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1
@@ -340,10 +357,11 @@
   ret <4 x i32> %s
 }
 
+; Allow non-canonical commuted shuffle.
+
 define <4 x i32> @load_v2i32_v4i32_non_canonical_mask_commute(ptr dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_v2i32_v4i32_non_canonical_mask_commute(
-; CHECK-NEXT:    [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x i32> poison, <2 x i32> [[L]], <4 x i32>
+; CHECK-NEXT:    [[S:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1
 ; CHECK-NEXT:    ret <4 x i32> [[S]]
 ;
   %l = load <2 x i32>, ptr %p, align 1
@@ -351,11 +369,12 @@
   ret <4 x i32> %s
 }
 
+; The wide load must be in the same addrspace as the original load.
+
 define <4 x i32> @load_v2i32_v4i32_addrspacecast(ptr addrspace(5) align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_v2i32_v4i32_addrspacecast(
-; CHECK-NEXT:    [[ASC:%.*]] = addrspacecast ptr addrspace(5) [[P:%.*]] to ptr addrspace(42)
-; CHECK-NEXT:    [[L:%.*]] = load <2 x i32>, ptr addrspace(42) [[ASC]], align 4
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x i32> [[L]], <2 x i32> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[P:%.*]] to ptr addrspace(42)
+; CHECK-NEXT:    [[S:%.*]] = load <4 x i32>, ptr addrspace(42) [[TMP1]], align 16
 ; CHECK-NEXT:    ret <4 x i32> [[S]]
 ;
   %asc = addrspacecast ptr addrspace(5) %p to ptr addrspace(42)
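
For reference, and not part of the patch itself: a minimal IR sketch of the transform that widenSubvectorLoad performs, assuming a pointer that is dereferenceable to the full vector width. The function name @widen_example and the exact mask padding (undef vs. poison) are illustrative, not taken from the test file.

; Before vector-combine: a narrow load feeding an identity-with-padding shuffle.
define <4 x float> @widen_example(ptr dereferenceable(16) %p) {
  %l = load <2 x float>, ptr %p, align 1
  %s = shufflevector <2 x float> %l, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ret <4 x float> %s
}
; After the transform, both instructions are expected to collapse into a single
; wide load of the full vector, e.g.:
;   %s = load <4 x float>, ptr %p, align 1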