Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -6058,7 +6058,6 @@
 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
                                         SDLoc &DL, SelectionDAG &DAG,
                                         bool isAfterLegalize) {
-  EVT EltVT = VT.getVectorElementType();
   unsigned NumElems = Elts.size();
 
   LoadSDNode *LDBase = nullptr;
@@ -6069,7 +6068,9 @@
   // non-consecutive, bail out.
   for (unsigned i = 0; i < NumElems; ++i) {
     SDValue Elt = Elts[i];
-
+    // Look through a bitcast.
+    if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST)
+      Elt = Elt.getOperand(0);
     if (!Elt.getNode() ||
         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
       return SDValue();
@@ -6084,7 +6085,8 @@
       continue;
 
     LoadSDNode *LD = cast<LoadSDNode>(Elt);
-    if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
+    EVT LdVT = Elt.getValueType();
+    if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i))
       return SDValue();
     LastLoadedElt = i;
   }
@@ -6119,6 +6121,7 @@
 
   //TODO: The code below fires only for for loading the low v2i32 / v2f32
   //of a v4i32 / v4f32. It's probably worth generalizing.
+  EVT EltVT = VT.getVectorElementType();
   if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
@@ -13206,7 +13209,19 @@
   SDValue Idx = Op.getOperand(2);
   MVT OpVT = Op.getSimpleValueType();
   MVT SubVecVT = SubVec.getSimpleValueType();
-
+
+  // Fold two 16-byte subvector loads into one 32-byte load:
+  // (insert_subvector (insert_subvector undef, (load addr)), (load addr + 16))
+  // --> load32 addr
+  if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
+      OpVT.is256BitVector() &&
+      !Subtarget->isUnalignedMem32Slow()) {
+    SDValue Ops[] = { Vec.getOperand(1), SubVec };
+    SDValue LD = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false);
+    if (LD.getNode())
+      return LD;
+  }
+
   if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
       SubVecVT.is128BitVector() && isa<ConstantSDNode>(Idx)) {
     unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -8141,49 +8141,6 @@
           (INSERT_get_vinsert128_imm VR256:$ins))>;
 }
 
-// Combine two consecutive 16-byte loads with a common destination register into
-// one 32-byte load to that register.
-let Predicates = [HasAVX, HasFastMem32] in {
-  def : Pat<(insert_subvector
-              (v8f32 (insert_subvector undef, (loadv4f32 addr:$src), (iPTR 0))),
-              (loadv4f32 (add addr:$src, (iPTR 16))),
-              (iPTR 4)),
-            (VMOVUPSYrm addr:$src)>;
-
-  def : Pat<(insert_subvector
-              (v4f64 (insert_subvector undef, (loadv2f64 addr:$src), (iPTR 0))),
-              (loadv2f64 (add addr:$src, (iPTR 16))),
-              (iPTR 2)),
-            (VMOVUPDYrm addr:$src)>;
-
-  def : Pat<(insert_subvector
-              (v32i8 (insert_subvector
-                       undef, (bc_v16i8 (loadv2i64 addr:$src)), (iPTR 0))),
-              (bc_v16i8 (loadv2i64 (add addr:$src, (iPTR 16)))),
-              (iPTR 16)),
-            (VMOVDQUYrm addr:$src)>;
-
-  def : Pat<(insert_subvector
-              (v16i16 (insert_subvector
-                        undef, (bc_v8i16 (loadv2i64 addr:$src)), (iPTR 0))),
-              (bc_v8i16 (loadv2i64 (add addr:$src, (iPTR 16)))),
-              (iPTR 8)),
-            (VMOVDQUYrm addr:$src)>;
-
-  def : Pat<(insert_subvector
-              (v8i32 (insert_subvector
-                       undef, (bc_v4i32 (loadv2i64 addr:$src)), (iPTR 0))),
-              (bc_v4i32 (loadv2i64 (add addr:$src, (iPTR 16)))),
-              (iPTR 4)),
-            (VMOVDQUYrm addr:$src)>;
-
-  def : Pat<(insert_subvector
-              (v4i64 (insert_subvector undef, (loadv2i64 addr:$src), (iPTR 0))),
-              (loadv2i64 (add addr:$src, (iPTR 16))),
-              (iPTR 2)),
-            (VMOVDQUYrm addr:$src)>;
-}
-
 let Predicates = [HasAVX1Only] in {
 def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
                                   (iPTR imm)),
Index: test/CodeGen/X86/unaligned-32-byte-memops.ll
===================================================================
--- test/CodeGen/X86/unaligned-32-byte-memops.ll
+++ test/CodeGen/X86/unaligned-32-byte-memops.ll
@@ -65,8 +65,9 @@
   ; HASWELL: vmovups
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1
-  %v1 = load <4 x float>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 1
+  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 2
+  %v1 = load <4 x float>* %ptr1, align 1
   %v2 = load <4 x float>* %ptr2, align 1
   %shuffle = shufflevector <4 x float> %v1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
   %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v2, i8 1)
@@ -88,8 +89,9 @@
   ; HASWELL: vmovups
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1
-  %v1 = load <4 x float>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 2
+  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 3
+  %v1 = load <4 x float>* %ptr1, align 1
   %v2 = load <4 x float>* %ptr2, align 1
   %shuffle = shufflevector <4 x float> %v2, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3>
   %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v1, i8 0)
@@ -111,8 +113,9 @@
   ; HASWELL: vmovups
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1
-  %v1 = load <4 x float>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 3
+  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 4
+  %v1 = load <4 x float>* %ptr1, align 1
   %v2 = load <4 x float>* %ptr2, align 1
   %v3 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x float> %v3
@@ -133,8 +136,9 @@
   ; HASWELL: vmovups
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1
-  %v1 = load <4 x float>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 4
+  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 5
+  %v1 = load <4 x float>* %ptr1, align 1
   %v2 = load <4 x float>* %ptr2, align 1
   %v3 = shufflevector <4 x float> %v2, <4 x float> %v1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
   ret <8 x float> %v3
@@ -160,12 +164,13 @@
   ; BTVER2-NEXT: vinsertf128
   ; BTVER2-NEXT: retq
 
-  ; HASWELL: vmovdqu
-  ; HASWELL-NEXT: vpaddq
+  ; HASWELL-NOT: vextract
+  ; HASWELL: vpaddq
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <2 x i64>* %ptr, i64 1
-  %v1 = load <2 x i64>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <2 x i64>* %ptr, i64 5
+  %ptr2 = getelementptr inbounds <2 x i64>* %ptr, i64 6
+  %v1 = load <2 x i64>* %ptr1, align 1
   %v2 = load <2 x i64>* %ptr2, align 1
   %v3 = shufflevector <2 x i64> %v1, <2 x i64> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %v4 = add <4 x i64> %v3, %x
@@ -187,12 +192,13 @@
   ; BTVER2-NEXT: vinsertf128
   ; BTVER2-NEXT: retq
 
-  ; HASWELL: vmovdqu
-  ; HASWELL-NEXT: vpaddd
+  ; HASWELL-NOT: vextract
+  ; HASWELL: vpaddd
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <4 x i32>* %ptr, i64 1
-  %v1 = load <4 x i32>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <4 x i32>* %ptr, i64 6
+  %ptr2 = getelementptr inbounds <4 x i32>* %ptr, i64 7
+  %v1 = load <4 x i32>* %ptr1, align 1
   %v2 = load <4 x i32>* %ptr2, align 1
   %v3 = shufflevector <4 x i32> %v1, <4 x i32> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %v4 = add <8 x i32> %v3, %x
@@ -214,12 +220,13 @@
   ; BTVER2-NEXT: vinsertf128
   ; BTVER2-NEXT: retq
 
-  ; HASWELL: vmovdqu
-  ; HASWELL-NEXT: vpaddw
+  ; HASWELL-NOT: vextract
+  ; HASWELL: vpaddw
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <8 x i16>* %ptr, i64 1
-  %v1 = load <8 x i16>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <8 x i16>* %ptr, i64 7
+  %ptr2 = getelementptr inbounds <8 x i16>* %ptr, i64 8
+  %v1 = load <8 x i16>* %ptr1, align 1
   %v2 = load <8 x i16>* %ptr2, align 1
   %v3 = shufflevector <8 x i16> %v1, <8 x i16> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %v4 = add <16 x i16> %v3, %x
@@ -241,12 +248,13 @@
   ; BTVER2-NEXT: vinsertf128
   ; BTVER2-NEXT: retq
 
-  ; HASWELL: vmovdqu
-  ; HASWELL-NEXT: vpaddb
+  ; HASWELL-NOT: vextract
+  ; HASWELL: vpaddb
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <16 x i8>* %ptr, i64 1
-  %v1 = load <16 x i8>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <16 x i8>* %ptr, i64 8
+  %ptr2 = getelementptr inbounds <16 x i8>* %ptr, i64 9
+  %v1 = load <16 x i8>* %ptr1, align 1
   %v2 = load <16 x i8>* %ptr2, align 1
   %v3 = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   %v4 = add <32 x i8> %v3, %x
@@ -261,16 +269,17 @@
   ; SANDYB-NEXT: vaddpd
   ; SANDYB-NEXT: retq
 
-  ; BTVER2: vmovupd
-  ; BTVER2-NEXT: vaddpd
+  ; BTVER2-NOT: vinsertf128
+  ; BTVER2: vaddpd
   ; BTVER2-NEXT: retq
 
-  ; HASWELL: vmovupd
+  ; HASWELL-NOT: vinsertf128
   ; HASWELL: vaddpd
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <2 x double>* %ptr, i64 1
-  %v1 = load <2 x double>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <2 x double>* %ptr, i64 9
+  %ptr2 = getelementptr inbounds <2 x double>* %ptr, i64 10
+  %v1 = load <2 x double>* %ptr1, align 1
   %v2 = load <2 x double>* %ptr2, align 1
   %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %v4 = fadd <4 x double> %v3, %x
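
Note (not part of the patch): a minimal IR sketch of the pattern the new insert_subvector DAG combine targets, written in the same IR style as the test file above; the function name is illustrative. Two consecutive unaligned 16-byte loads concatenated into a 256-bit vector are expected to lower to a single 32-byte vmovups instead of vmovups + vinsertf128. The combine is gated on !Subtarget->isUnalignedMem32Slow(), so subtargets with slow unaligned 32-byte accesses are unaffected.

; Sketch only: two consecutive unaligned 16-byte loads merged into one 32-byte load.
define <8 x float> @sketch_merge_consecutive_loads(<4 x float>* %ptr) {
  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1    ; bytes 16..31
  %v1 = load <4 x float>* %ptr, align 1                      ; bytes 0..15, unaligned
  %v2 = load <4 x float>* %ptr2, align 1                     ; bytes 16..31, unaligned
  ; concatenate the two 128-bit halves into one 256-bit value
  %v3 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %v3
}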