Index: lib/Target/X86/X86InstrInfo.td
===================================================================
--- lib/Target/X86/X86InstrInfo.td
+++ lib/Target/X86/X86InstrInfo.td
@@ -773,6 +773,7 @@
 def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">;
 def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">;
 def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">;
+def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">;
 
 //===----------------------------------------------------------------------===//
 // X86 Instruction Format Definitions.
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -8454,6 +8454,17 @@
                           (INSERT_get_vinsert128_imm VR256:$ins))>;
 }
 
+// Combine two consecutive 16-byte loads with a common destination register into
+// one 32-byte load to that register.
+let Predicates = [HasAVX, HasFastMem32] in {
+  // TODO: Add patterns for other data types, aligned ops, and stores.
+  def : Pat<(insert_subvector
+              (v8f32 (insert_subvector undef, (loadv4f32 addr:$src), (iPTR 0))),
+              (loadv4f32 (add addr:$src, (iPTR 16))),
+              (iPTR 4)),
+            (VMOVUPSYrm addr:$src)>;
+}
+
 let Predicates = [HasAVX1Only] in {
   def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
                                     (iPTR imm)),
Index: test/CodeGen/X86/unaligned-32-byte-memops.ll
===================================================================
--- test/CodeGen/X86/unaligned-32-byte-memops.ll
+++ test/CodeGen/X86/unaligned-32-byte-memops.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefix=SANDYB
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx-i | FileCheck %s --check-prefix=SANDYB
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefix=SANDYB --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx-i | FileCheck %s --check-prefix=SANDYB --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2 --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s --check-prefix=HASWELL --check-prefix=CHECK
 
 ; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte load
 ; because that is slower than two 16-byte loads.
@@ -44,3 +44,103 @@
   store <8 x float> %A, <8 x float>* %P, align 16
   ret void
 }
+
+; Merge two consecutive 16-byte subvector loads into a single 32-byte load
+; if it's faster.
+
+declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8)
+
+; Use the vinsertf128 intrinsic to model source code
+; that explicitly uses AVX intrinsics.
+define <8 x float> @combine_16_byte_loads(float* %ptr) {
+  ; CHECK-LABEL: combine_16_byte_loads
+
+  ; SANDYB: vmovups
+  ; SANDYB-NEXT: vinsertf128
+  ; SANDYB-NEXT: retq
+
+  ; BTVER2: vmovups
+  ; BTVER2-NEXT: retq
+
+  ; HASWELL: vmovups
+  ; HASWELL-NEXT: retq
+
+  %p1 = bitcast float* %ptr to <4 x float>*
+  %v1 = load <4 x float>* %p1, align 1
+  %ptr2 = getelementptr inbounds float* %ptr, i64 4
+  %p2 = bitcast float* %ptr2 to <4 x float>*
+  %v2 = load <4 x float>* %p2, align 1
+  %shuffle = shufflevector <4 x float> %v1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v2, i8 1)
+  ret <8 x float> %v3
+}
+
+; Swap the operands of the shufflevector and vinsertf128 to ensure that the
+; pattern still matches.
+define <8 x float> @combine_16_byte_loads_swap(float* %ptr) {
+  ; CHECK-LABEL: combine_16_byte_loads_swap
+
+  ; SANDYB: vmovups
+  ; SANDYB-NEXT: vinsertf128
+  ; SANDYB-NEXT: retq
+
+  ; BTVER2: vmovups
+  ; BTVER2-NEXT: retq
+
+  ; HASWELL: vmovups
+  ; HASWELL-NEXT: retq
+
+  %p1 = bitcast float* %ptr to <4 x float>*
+  %v1 = load <4 x float>* %p1, align 1
+  %ptr2 = getelementptr inbounds float* %ptr, i64 4
+  %p2 = bitcast float* %ptr2 to <4 x float>*
+  %v2 = load <4 x float>* %p2, align 1
+  %shuffle = shufflevector <4 x float> %v2, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3>
+  %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v1, i8 0)
+  ret <8 x float> %v3
+}
+
+; Replace the vinsertf128 intrinsic with a shufflevector as might be
+; expected from auto-vectorized code.
+define <8 x float> @combine_16_byte_loads_no_intrinsic(<4 x float>* %ptr) {
+  ; CHECK-LABEL: combine_16_byte_loads_no_intrinsic
+
+  ; SANDYB: vmovups
+  ; SANDYB-NEXT: vinsertf128
+  ; SANDYB-NEXT: retq
+
+  ; BTVER2: vmovups
+  ; BTVER2-NEXT: retq
+
+  ; HASWELL: vmovups
+  ; HASWELL-NEXT: retq
+
+  %v1 = load <4 x float>* %ptr, align 1
+  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1
+  %v2 = load <4 x float>* %ptr2, align 1
+  %v3 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %v3
+}
+
+; Swap the order of the shufflevector operands to ensure that the
+; pattern still matches.
+define <8 x float> @combine_16_byte_loads_no_intrinsic_swap(<4 x float>* %ptr) {
+  ; CHECK-LABEL: combine_16_byte_loads_no_intrinsic_swap
+
+  ; SANDYB: vmovups
+  ; SANDYB-NEXT: vinsertf128
+  ; SANDYB-NEXT: retq
+
+  ; BTVER2: vmovups
+  ; BTVER2-NEXT: retq
+
+  ; HASWELL: vmovups
+  ; HASWELL-NEXT: retq
+
+  %v1 = load <4 x float>* %ptr, align 1
+  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1
+  %v2 = load <4 x float>* %ptr2, align 1
+  %v3 = shufflevector <4 x float> %v2, <4 x float> %v1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+  ret <8 x float> %v3
+}
+
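Note (illustrative, not part of the patch): the new TableGen pattern targets the common AVX idiom of loading 32 unaligned bytes as two 16-byte halves and stitching them together with vinsertf128, which the first test above models in IR. A minimal C sketch of that idiom, using a hypothetical helper name, might look like the following; on CPUs whose subtarget does not report slow unaligned 32-byte accesses (e.g. btver2 or core-avx2/Haswell in the RUN lines), the added pattern should let this lower to a single 32-byte vmovups.

  #include <immintrin.h>

  /* Hypothetical helper, for illustration only: load 32 unaligned bytes as two
     16-byte halves and merge them -- the idiom the new pattern folds into one
     32-byte VMOVUPSYrm when HasFastMem32 holds. */
  static __m256 load_32_bytes_unaligned(const float *p) {
    __m128 lo = _mm_loadu_ps(p);      /* first 16-byte unaligned load   */
    __m128 hi = _mm_loadu_ps(p + 4);  /* next 16 bytes (4 floats later) */
    return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1);
  }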