Index: lib/Target/ARM/ARMISelLowering.h
===================================================================
--- lib/Target/ARM/ARMISelLowering.h
+++ lib/Target/ARM/ARMISelLowering.h
@@ -431,6 +431,10 @@
     bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                            Type *Ty) const override;
 
+    /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
+    /// with this index.
+    bool isExtractSubvectorCheap(EVT ResVT, unsigned Index) const override;
+
     /// \brief Returns true if an argument of type Ty needs to be passed in a
     /// contiguous block of registers in calling convention CallConv.
     bool functionArgumentNeedsConsecutiveRegisters(
Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -12899,6 +12899,14 @@
   return true;
 }
 
+bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT,
+                                                unsigned Index) const {
+  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
+    return false;
+
+  return (Index == 0 || Index == ResVT.getVectorNumElements());
+}
+
 Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
                                         ARM_MB::MemBOpt Domain) const {
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
Index: test/CodeGen/ARM/vext.ll
===================================================================
--- test/CodeGen/ARM/vext.ll
+++ test/CodeGen/ARM/vext.ll
@@ -134,28 +134,34 @@
   ret <4 x i16> %tmp3
 }
 
-; We should ignore a build_vector with more than two sources.
-; Use illegal <32 x i16> type to produce such a shuffle after legalizing types.
-; Try to look for fallback to by-element inserts.
+; FIXME: Lower this more efficiently. (Given an arbitrary <32 x i16>, I think
+; the most efficient lowering is three vext shuffles.)
 define <4 x i16> @test_multisource(<32 x i16>* %B) nounwind {
 ;CHECK-LABEL: test_multisource:
-;CHECK: vmov.16 [[REG:d[0-9]+]][0]
-;CHECK: vmov.16 [[REG]][1]
-;CHECK: vmov.16 [[REG]][2]
-;CHECK: vmov.16 [[REG]][3]
+;CHECK: vld1.16
+;CHECK-NEXT: vld1.64
+;CHECK-NEXT: vld1.64
+;CHECK-NEXT: vld1.64
+;CHECK-NEXT: vorr
+;CHECK-NEXT: vzip.16
+;CHECK-NEXT: vext.16
+;CHECK-NEXT: vtrn.16
+;CHECK-NEXT: vext.16
+;CHECK-NEXT: vext.16
+;CHECK-NEXT: vmov r0, r1
   %tmp1 = load <32 x i16>, <32 x i16>* %B
   %tmp2 = shufflevector <32 x i16> %tmp1, <32 x i16> undef, <4 x i32>
   ret <4 x i16> %tmp2
 }
 
-; We don't handle shuffles using more than half of a 128-bit vector.
-; Again, test for fallback to by-element inserts.
+; If we split the operand into two <4 x i16> vectors, this becomes
+; a vuzp.
 define <4 x i16> @test_largespan(<8 x i16>* %B) nounwind {
 ;CHECK-LABEL: test_largespan:
-;CHECK: vmov.16 [[REG:d[0-9]+]][0]
-;CHECK: vmov.16 [[REG]][1]
-;CHECK: vmov.16 [[REG]][2]
-;CHECK: vmov.16 [[REG]][3]
+;CHECK: vld1.64
+;CHECK-NEXT: vorr
+;CHECK-NEXT: vuzp.16
+;CHECK-NEXT: vmov r0, r1
   %tmp1 = load <8 x i16>, <8 x i16>* %B
   %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32>
   ret <4 x i16> %tmp2
Index: test/CodeGen/ARM/vpadd.ll
===================================================================
--- test/CodeGen/ARM/vpadd.ll
+++ test/CodeGen/ARM/vpadd.ll
@@ -138,20 +138,36 @@
   ret <2 x i64> %tmp2
 }
 
-; Test AddCombine optimization that generates a vpaddl.s
-define void @addCombineToVPADDL() nounwind ssp {
-; CHECK: vpaddl.s8
-  %cbcr = alloca <16 x i8>, align 16
-  %X = alloca <8 x i8>, align 8
+; Combine vuzp+vadd->vpadd.
+; FIXME: Implement this optimization
+define void @addCombineToVPADD(<16 x i8> *%cbcr, <8 x i8> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADD:
+; CHECK: vuzp.8
+; CHECK: vadd.i8
   %tmp = load <16 x i8>, <16 x i8>* %cbcr
   %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32>
-  %tmp2 = load <16 x i8>, <16 x i8>* %cbcr
-  %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32>
+  %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32>
   %add = add <8 x i8> %tmp3, %tmp1
   store <8 x i8> %add, <8 x i8>* %X, align 8
   ret void
 }
 
+; Combine vuzp+vaddl->vpaddl
+; FIXME: Implement this optimization.
+define void @addCombineToVPADDL_sext(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADDL_sext:
+; CHECK: vuzp.8
+; CHECK: vaddl.s8
+  %tmp = load <16 x i8>, <16 x i8>* %cbcr
+  %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32>
+  %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32>
+  %tmp4 = sext <8 x i8> %tmp3 to <8 x i16>
+  %tmp5 = sext <8 x i8> %tmp1 to <8 x i16>
+  %add = add <8 x i16> %tmp4, %tmp5
+  store <8 x i16> %add, <8 x i16>* %X, align 8
+  ret void
+}
+
 ; Legalization produces a EXTRACT_VECTOR_ELT DAG node which performs an extend from
 ; i16 to i32. In this case the input for the formed VPADDL needs to be a vector of i16s.
 define <2 x i16> @fromExtendingExtractVectorElt(<4 x i16> %in) {
Index: test/CodeGen/ARM/vuzp.ll
===================================================================
--- test/CodeGen/ARM/vuzp.ll
+++ test/CodeGen/ARM/vuzp.ll
@@ -6,14 +6,14 @@
 ; CHECK-NEXT: vldr d16, [r1]
 ; CHECK-NEXT: vldr d17, [r0]
 ; CHECK-NEXT: vuzp.8 d17, d16
-; CHECK-NEXT: vadd.i8 d16, d17, d16
+; CHECK-NEXT: vmul.i8 d16, d17, d16
 ; CHECK-NEXT: vmov r0, r1, d16
 ; CHECK-NEXT: mov pc, lr
   %tmp1 = load <8 x i8>, <8 x i8>* %A
   %tmp2 = load <8 x i8>, <8 x i8>* %B
   %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32>
   %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32>
-  %tmp5 = add <8 x i8> %tmp3, %tmp4
+  %tmp5 = mul <8 x i8> %tmp3, %tmp4
   ret <8 x i8> %tmp5
 }
 
@@ -38,14 +38,14 @@
 ; CHECK-NEXT: vldr d16, [r1]
 ; CHECK-NEXT: vldr d17, [r0]
 ; CHECK-NEXT: vuzp.16 d17, d16
-; CHECK-NEXT: vadd.i16 d16, d17, d16
+; CHECK-NEXT: vmul.i16 d16, d17, d16
 ; CHECK-NEXT: vmov r0, r1, d16
 ; CHECK-NEXT: mov pc, lr
   %tmp1 = load <4 x i16>, <4 x i16>* %A
   %tmp2 = load <4 x i16>, <4 x i16>* %B
   %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32>
   %tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32>
-  %tmp5 = add <4 x i16> %tmp3, %tmp4
+  %tmp5 = mul <4 x i16> %tmp3, %tmp4
   ret <4 x i16> %tmp5
 }
 
@@ -206,14 +206,14 @@
 ; CHECK-NEXT: vldr d16, [r1]
 ; CHECK-NEXT: vldr d17, [r0]
 ; CHECK-NEXT: vuzp.8 d17, d16
-; CHECK-NEXT: vadd.i8 d16, d17, d16
+; CHECK-NEXT: vmul.i8 d16, d17, d16
 ; CHECK-NEXT: vmov r0, r1, d16
 ; CHECK-NEXT: mov pc, lr
   %tmp1 = load <8 x i8>, <8 x i8>* %A
   %tmp2 = load <8 x i8>, <8 x i8>* %B
   %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32>
   %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32>
-  %tmp5 = add <8 x i8> %tmp3, %tmp4
+  %tmp5 = mul <8 x i8> %tmp3, %tmp4
   ret <8 x i8> %tmp5
 }
 
@@ -370,3 +370,20 @@
   %rv = select <10 x i1> %c, <10 x i8> %tr0, <10 x i8> %tr1
   ret <10 x i8> %rv
 }
+
+%struct.uint8x8x2_t = type { [2 x <8 x i8>] }
+define %struct.uint8x8x2_t @vuzp_extract_subvector(<16 x i8> %t) #0 {
+; CHECK-LABEL: vuzp_extract_subvector
+; CHECK: vmov
+; CHECK-NEXT: vmov
+; CHECK-NEXT: vorr
+; CHECK-NEXT: vuzp.8
+; CHECK-NEXT: vmov
+; CHECK-NEXT: vmov
+
+  %vuzp.i = shufflevector <16 x i8> %t, <16 x i8> undef, <8 x i32>
+  %vuzp1.i = shufflevector <16 x i8> %t, <16 x i8> undef, <8 x i32>
+  %.fca.0.0.insert = insertvalue %struct.uint8x8x2_t undef, <8 x i8> %vuzp.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.uint8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1
+  ret %struct.uint8x8x2_t %.fca.0.1.insert
+}
Index: test/CodeGen/ARM/vzip.ll
===================================================================
--- test/CodeGen/ARM/vzip.ll
+++ test/CodeGen/ARM/vzip.ll
@@ -309,8 +309,11 @@
 define void @vzip_vext_factor(<8 x i16>* %A, <4 x i16>* %B) {
 entry:
   ; CHECK-LABEL: vzip_vext_factor
-  ; CHECK: vext.16 d16, d16, d17, #3
-  ; CHECK: vzip
+  ; CHECK: vld1.64
+  ; CHECK-NEXT: vext.16
+  ; CHECK-NEXT: vext.16
+  ; CHECK-NEXT: vext.16
+  ; CHECK-NEXT: vstr
   %tmp1 = load <8 x i16>, <8 x i16>* %A
   %0 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32>
   store <4 x i16> %0, <4 x i16>* %B
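
Note (illustration only, not part of the diff): the new hook reports an EXTRACT_SUBVECTOR as cheap only when it takes the low or the high half of a wider vector (Index == 0 or Index == ResVT.getVectorNumElements()), since on NEON each half of a q-register is already addressable as a d-register. A minimal IR sketch of the two cheap cases, assuming a NEON-enabled ARM triple; the function names are made up for this example:

define <8 x i8> @extract_low_half(<16 x i8> %v) {
  ; Index 0: the low <8 x i8> half of the q-register.
  %lo = shufflevector <16 x i8> %v, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i8> %lo
}

define <8 x i8> @extract_high_half(<16 x i8> %v) {
  ; Index 8 (== ResVT.getVectorNumElements()): the high <8 x i8> half.
  %hi = shufflevector <16 x i8> %v, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i8> %hi
}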