diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1183,6 +1183,10 @@
   /// split during legalization. Zero is returned when the answer is unknown.
   unsigned getNumberOfParts(Type *Tp) const;
 
+  /// \returns The type of the piece into which the provided type must be
+  /// split during legalization. Null is returned when the answer is unknown.
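+  /// For example, on a target with 128-bit vector registers, a <3 x float>
+  /// may legalize into a single <4 x float> part.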
+  Type *getLegalizedPartType(Type *Tp) const;
+
   /// \returns The cost of the address computation. For most targets this can be
   /// merged into the instruction indexing mode. Some targets might want to
   /// distinguish between address computation for memory operations on vector
@@ -1632,6 +1636,7 @@
                                            ArrayRef<Type *> Tys,
                                            TTI::TargetCostKind CostKind) = 0;
   virtual unsigned getNumberOfParts(Type *Tp) = 0;
+  virtual Type *getLegalizedPartType(Type *Tp) = 0;
   virtual InstructionCost
   getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr) = 0;
   virtual InstructionCost
@@ -2145,6 +2150,9 @@
   unsigned getNumberOfParts(Type *Tp) override {
     return Impl.getNumberOfParts(Tp);
   }
+  Type *getLegalizedPartType(Type *Tp) override {
+    return Impl.getLegalizedPartType(Tp);
+  }
   InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
                                             const SCEV *Ptr) override {
     return Impl.getAddressComputationCost(Ty, SE, Ptr);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -615,6 +615,7 @@
   }
 
   unsigned getNumberOfParts(Type *Tp) const { return 0; }
+  Type *getLegalizedPartType(Type *Tp) const { return nullptr; }
 
   InstructionCost getAddressComputationCost(Type *Tp, ScalarEvolution *,
                                             const SCEV *) const {
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1984,6 +1984,12 @@
     return *LT.first.getValue();
   }
 
+  Type *getLegalizedPartType(Type *Tp) {
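+    // Ask the target how Tp will be legalized and map the resulting simple
+    // value type back to an IR type.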
+    std::pair<InstructionCost, MVT> LT =
+        getTLI()->getTypeLegalizationCost(DL, Tp);
+    return EVT(LT.second).getTypeForEVT(Tp->getContext());
+  }
+
   InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
                                             const SCEV *) {
     return 0;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -879,6 +879,10 @@
   return TTIImpl->getNumberOfParts(Tp);
 }
 
+Type *TargetTransformInfo::getLegalizedPartType(Type *Tp) const {
+  return TTIImpl->getLegalizedPartType(Tp);
+}
+
 InstructionCost
 TargetTransformInfo::getAddressComputationCost(Type *Tp, ScalarEvolution *SE,
                                                const SCEV *Ptr) const {
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -36,6 +36,7 @@
 
 #define DEBUG_TYPE "vector-combine"
 STATISTIC(NumVecLoad, "Number of vector loads formed");
+STATISTIC(NumVecLoadWiden, "Number of vector loads widened");
 STATISTIC(NumVecCmp, "Number of vector compares formed");
 STATISTIC(NumVecBO, "Number of vector binops formed");
 STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
@@ -75,6 +76,7 @@
   AssumptionCache &AC;
 
   bool vectorizeLoadInsert(Instruction &I);
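+  /// Widen a vector load that covers only part of its legalized register
+  /// width into a full-width load plus a subvector extract. Returns true
+  /// if the IR was changed.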
+  bool widenPartialVectorLoad(Instruction &I);
   ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
                                         ExtractElementInst *Ext1,
                                         unsigned PreferredExtractIndex) const;
@@ -231,6 +233,88 @@
   return true;
 }
 
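+/// Try to replace a vector load that uses only part of the legalized vector
+/// register(s) with a load of the full legalized width followed by a shuffle
+/// extracting the originally requested elements. For example, with 128-bit
+/// vector registers, a <3 x float> load becomes a <4 x float> load plus a
+/// shufflevector of the low three lanes, provided the extra bytes are known
+/// to be dereferenceable and the wider load is not more costly.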
+bool VectorCombine::widenPartialVectorLoad(Instruction &I) {
+  const DataLayout &DL = I.getModule()->getDataLayout();
+
+  // Only simple (non-volatile, non-atomic) loads may be widened.
+  auto *Load = dyn_cast<LoadInst>(&I);
+  if (!Load || !Load->isSimple())
+    return false;
+
+  Value *OrigPtr = Load->getPointerOperand();
+  Align Alignment = Load->getAlign();
+  unsigned AS = Load->getPointerAddressSpace();
+
+  // What vector type do we currently load?
+  auto *OrigVecTy = dyn_cast<FixedVectorType>(Load->getType());
+  if (!OrigVecTy)
+    return false;
+
+  Type *ScalarEltTy = OrigVecTy->getScalarType();
+  unsigned OrigNumElts = OrigVecTy->getNumElements();
+  unsigned NumBitsPerElt = DL.getTypeSizeInBits(ScalarEltTy);
+
+  // How will that type be legalized? I.e., into what vector register type
+  // will it be loaded, and how many such registers will be occupied?
+  auto *LegalizedPartVecTy =
+      dyn_cast_or_null<FixedVectorType>(TTI.getLegalizedPartType(OrigVecTy));
+  unsigned NumOfLegalizedVecParts = TTI.getNumberOfParts(OrigVecTy);
+
+  // If it doesn't legalize into (a number of) vector registers, don't bother.
+  if (!LegalizedPartVecTy || !NumOfLegalizedVecParts)
+    return false;
+
+  unsigned OrigBitCount = DL.getTypeSizeInBits(OrigVecTy);
+  unsigned LegalizedVecBitCount =
+      NumOfLegalizedVecParts * DL.getTypeSizeInBits(LegalizedPartVecTy);
+  assert(LegalizedVecBitCount >= OrigBitCount &&
+         "Number of bits-to-be-loaded shouldn't decrease!");
+
+  // Do we already load exactly the full legalized width?
+  if (OrigBitCount == LegalizedVecBitCount)
+    return false;
+
+  // How many more elements would we need to load?
+  unsigned NumExtraBits = LegalizedVecBitCount - OrigBitCount;
+  if (NumExtraBits % NumBitsPerElt != 0)
+    return false; // Not a multiple of element size.
+  // FIXME: we may be able to handle some cases where the extra bits are a
+  // multiple of the byte size.
+
+  unsigned NumExtraElts = NumExtraBits / NumBitsPerElt;
+
+  auto *WideVecTy =
+      FixedVectorType::get(ScalarEltTy, OrigNumElts + NumExtraElts);
+  assert(DL.getTypeSizeInBits(WideVecTy) == LegalizedVecBitCount &&
+         "Failed to properly widen OrigVecTy to match the total legalized "
+         "vector size?");
+
+  // Okay, we currently load less than the full width of the legalized
+  // vectors. Would widening the load be more costly than the current load?
+  InstructionCost OldLoadCost =
+      TTI.getMemoryOpCost(Instruction::Load, OrigVecTy, Alignment, AS);
+  InstructionCost NewLoadCost =
+      TTI.getMemoryOpCost(Instruction::Load, WideVecTy, Alignment, AS);
+  if (NewLoadCost > OldLoadCost)
+    return false;
+
+  // It would not be more costly. But can we perform such a wide load?
+  if (!isSafeToLoadUnconditionally(OrigPtr, WideVecTy, Align(1), DL, Load, &DT,
+                                   /*TLI=*/nullptr))
+    return false;
+
+  IRBuilder<> Builder(Load);
+  Value *CastedPtr =
+      Builder.CreateBitCast(OrigPtr, WideVecTy->getPointerTo(AS));
+  Value *WideVecLd = Builder.CreateAlignedLoad(WideVecTy, CastedPtr, Alignment);
+  // We loaded some extra elements; we only need the low OrigNumElts ones.
+  // This is endianness-insensitive.
+  SmallVector<int, 32> Mask(OrigNumElts);
+  std::iota(Mask.begin(), Mask.end(), 0);
+  Value *ExtractedLowSubvector = Builder.CreateShuffleVector(WideVecLd, Mask);
+  replaceValue(I, *ExtractedLowSubvector);
+  ++NumVecLoadWiden;
+  return true;
+}
+
 /// Determine which, if any, of the inputs should be replaced by a shuffle
 /// followed by extract from a different index.
 ExtractElementInst *VectorCombine::getShuffleExtract(
@@ -967,6 +1051,7 @@
         continue;
       Builder.SetInsertPoint(&I);
       MadeChange |= vectorizeLoadInsert(I);
+      MadeChange |= widenPartialVectorLoad(I);
       MadeChange |= foldExtractExtract(I);
       MadeChange |= foldBitcastShuf(I);
       MadeChange |= scalarizeBinopOrCmp(I);
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -587,8 +587,10 @@
 define <4 x float> @load_v2f32_extract_insert_v4f32(<2 x float>* align 16 dereferenceable(16) %p) nofree nosync {
 ; CHECK-LABEL: @load_v2f32_extract_insert_v4f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float>* [[P:%.*]] to <4 x float>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[L:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[S:%.*]] = extractelement <2 x float> [[L]], i32 0
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> poison, float [[S]], i32 0
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %l = load <2 x float>, <2 x float>* %p, align 4
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
--- a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
@@ -29,7 +29,9 @@
 
 define <2 x float> @vec_with_2elts(<2 x float>* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @vec_with_2elts(
-; CHECK-NEXT:    [[R:%.*]] = load <2 x float>, <2 x float>* [[P:%.*]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float>* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
   %r = load <2 x float>, <2 x float>* %p, align 16
@@ -38,7 +40,9 @@
 
 define <3 x float> @vec_with_3elts(<3 x float>* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @vec_with_3elts(
-; CHECK-NEXT:    [[R:%.*]] = load <3 x float>, <3 x float>* [[P:%.*]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <3 x float>* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
 ; CHECK-NEXT:    ret <3 x float> [[R]]
 ;
   %r = load <3 x float>, <3 x float>* %p, align 16
@@ -70,7 +74,9 @@
 ; We can load 128 bits, and the fact that it's underaligned isn't relevant.
 define <3 x float> @vec_with_3elts_underaligned(<3 x float>* align 8 dereferenceable(16) %p) {
 ; CHECK-LABEL: @vec_with_3elts_underaligned(
-; CHECK-NEXT:    [[R:%.*]] = load <3 x float>, <3 x float>* [[P:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <3 x float>* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 8
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
 ; CHECK-NEXT:    ret <3 x float> [[R]]
 ;
   %r = load <3 x float>, <3 x float>* %p, align 8
@@ -112,7 +118,9 @@
 
 define <2 x float> @vec_with_2elts_256bits(<2 x float>* align 32 dereferenceable(32) %p) {
 ; CHECK-LABEL: @vec_with_2elts_256bits(
-; CHECK-NEXT:    [[R:%.*]] = load <2 x float>, <2 x float>* [[P:%.*]], align 32
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float>* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 32
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
   %r = load <2 x float>, <2 x float>* %p, align 32
@@ -121,7 +129,9 @@
 
 define <3 x float> @vec_with_3elts_256bits(<3 x float>* align 32 dereferenceable(32) %p) {
 ; CHECK-LABEL: @vec_with_3elts_256bits(
-; CHECK-NEXT:    [[R:%.*]] = load <3 x float>, <3 x float>* [[P:%.*]], align 32
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <3 x float>* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 32
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
 ; CHECK-NEXT:    ret <3 x float> [[R]]
 ;
   %r = load <3 x float>, <3 x float>* %p, align 32
@@ -139,7 +149,9 @@
 
 define <5 x float> @vec_with_5elts_256bits(<5 x float>* align 32 dereferenceable(32) %p) {
 ; CHECK-LABEL: @vec_with_5elts_256bits(
-; CHECK-NEXT:    [[R:%.*]] = load <5 x float>, <5 x float>* [[P:%.*]], align 32
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <5 x float>* [[P:%.*]] to <8 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* [[TMP1]], align 32
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4>
 ; CHECK-NEXT:    ret <5 x float> [[R]]
 ;
   %r = load <5 x float>, <5 x float>* %p, align 32
@@ -148,7 +160,9 @@
 
 define <6 x float> @vec_with_6elts_256bits(<6 x float>* align 32 dereferenceable(32) %p) {
 ; CHECK-LABEL: @vec_with_6elts_256bits(
-; CHECK-NEXT:    [[R:%.*]] = load <6 x float>, <6 x float>* [[P:%.*]], align 32
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <6 x float>* [[P:%.*]] to <8 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* [[TMP1]], align 32
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
 ; CHECK-NEXT:    ret <6 x float> [[R]]
 ;
   %r = load <6 x float>, <6 x float>* %p, align 32
@@ -157,7 +171,9 @@
 
 define <7 x float> @vec_with_7elts_256bits(<7 x float>* align 32 dereferenceable(32) %p) {
 ; CHECK-LABEL: @vec_with_7elts_256bits(
-; CHECK-NEXT:    [[R:%.*]] = load <7 x float>, <7 x float>* [[P:%.*]], align 32
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <7 x float>* [[P:%.*]] to <8 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* [[TMP1]], align 32
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6>
 ; CHECK-NEXT:    ret <7 x float> [[R]]
 ;
   %r = load <7 x float>, <7 x float>* %p, align 32
@@ -187,6 +203,7 @@
 ;-------------------------------------------------------------------------------
 
 ; Weird types we don't deal with
+
 define <2 x i7> @vec_with_two_subbyte_elts(<2 x i7>* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @vec_with_two_subbyte_elts(
 ; CHECK-NEXT:    [[R:%.*]] = load <2 x i7>, <2 x i7>* [[P:%.*]], align 16
@@ -216,7 +233,9 @@
 
 define <2 x float> @vec_with_2elts_addressspace(<2 x float> addrspace(2)* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @vec_with_2elts_addressspace(
-; CHECK-NEXT:    [[R:%.*]] = load <2 x float>, <2 x float> addrspace(2)* [[P:%.*]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> addrspace(2)* [[P:%.*]] to <4 x float> addrspace(2)*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float> addrspace(2)* [[TMP1]], align 16
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
   %r = load <2 x float>, <2 x float> addrspace(2)* %p, align 16
@@ -225,11 +244,13 @@
 
 ;-------------------------------------------------------------------------------
 
-; Widening these would change the legalized type, so leave them alone.
+; Weird types we do deal with
 
 define <2 x i1> @vec_with_2elts_128bits_i1(<2 x i1>* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @vec_with_2elts_128bits_i1(
-; CHECK-NEXT:    [[R:%.*]] = load <2 x i1>, <2 x i1>* [[P:%.*]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i1>* [[P:%.*]] to <128 x i1>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <128 x i1>, <128 x i1>* [[TMP1]], align 16
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <128 x i1> [[TMP2]], <128 x i1> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    ret <2 x i1> [[R]]
 ;
   %r = load <2 x i1>, <2 x i1>* %p, align 16
@@ -237,7 +258,9 @@
 }
 define <2 x i2> @vec_with_2elts_128bits_i2(<2 x i2>* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @vec_with_2elts_128bits_i2(
-; CHECK-NEXT:    [[R:%.*]] = load <2 x i2>, <2 x i2>* [[P:%.*]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i2>* [[P:%.*]] to <64 x i2>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <64 x i2>, <64 x i2>* [[TMP1]], align 16
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <64 x i2> [[TMP2]], <64 x i2> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    ret <2 x i2> [[R]]
 ;
   %r = load <2 x i2>, <2 x i2>* %p, align 16
@@ -245,7 +268,9 @@
 }
 define <2 x i4> @vec_with_2elts_128bits_i4(<2 x i4>* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @vec_with_2elts_128bits_i4(
-; CHECK-NEXT:    [[R:%.*]] = load <2 x i4>, <2 x i4>* [[P:%.*]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i4>* [[P:%.*]] to <32 x i4>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i4>, <32 x i4>* [[TMP1]], align 16
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <32 x i4> [[TMP2]], <32 x i4> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    ret <2 x i4> [[R]]
 ;
   %r = load <2 x i4>, <2 x i4>* %p, align 16
diff --git a/llvm/test/Transforms/VectorCombine/X86/load.ll b/llvm/test/Transforms/VectorCombine/X86/load.ll
--- a/llvm/test/Transforms/VectorCombine/X86/load.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load.ll
@@ -587,8 +587,10 @@
 define <4 x float> @load_v2f32_extract_insert_v4f32(<2 x float>* align 16 dereferenceable(16) %p) nofree nosync {
 ; CHECK-LABEL: @load_v2f32_extract_insert_v4f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float>* [[P:%.*]] to <4 x float>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[L:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[S:%.*]] = extractelement <2 x float> [[L]], i32 0
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %l = load <2 x float>, <2 x float>* %p, align 4