Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -9420,6 +9420,48 @@
   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
 }
 
+// Fold ADD(EXTR_SUBV(VUZP.0, Idx), EXTR_SUBV(VUZP.1, Idx)) into a VPADD of the
+// two halves of the VUZP operand that feeds the extracted half of the results.
+static SDValue AddCombineVUZPToVPADD(SDNode *N, SDValue N0, SDValue N1,
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     const ARMSubtarget *Subtarget) {
+  // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
+  EVT VT = N->getValueType(0);
+  if (!VT.is64BitVector())
+    return SDValue();
+  // Check for ADD(EXTR_SUBV(VUZP.0), EXTR_SUBV(VUZP.1)).
+  if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+      N1.getOpcode() != ISD::EXTRACT_SUBVECTOR)
+    return SDValue();
+  SDValue N00 = N0.getOperand(0);
+  SDValue N10 = N1.getOperand(0);
+  if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
+      N00 == N10)
+    return SDValue();
+
+  // Both extracts must take the same half (index 0 or NumElts) of a VUZP
+  // result twice as wide as the ADD, or the sums would mix VUZP operands.
+  unsigned NumElts = VT.getVectorNumElements();
+  auto *Idx = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+  if (!Idx || N0.getOperand(1) != N1.getOperand(1) ||
+      (Idx->getZExtValue() != 0 && Idx->getZExtValue() != NumElts) ||
+      N00.getValueType().getVectorNumElements() != 2 * NumElts)
+    return SDValue();
+
+  // Generate vpadd: operand 0 feeds the low result half, operand 1 the high.
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+  SDValue Src = N00.getOperand(Idx->getZExtValue() == 0 ? 0 : 1);
+  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Src,
+                           DAG.getIntPtrConstant(0, dl));
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Src,
+                           DAG.getIntPtrConstant(NumElts, dl));
+  SDValue IID = DAG.getConstant(
+      Intrinsic::arm_neon_vpadd, dl,
+      DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()));
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, IID, Lo, Hi);
+}
+
 static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1,
                                       TargetLowering::DAGCombinerInfo &DCI,
                                       const ARMSubtarget *Subtarget) {
@@ -9926,6 +9968,9 @@
   if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget))
     return Result;
 
+  if (SDValue Result = AddCombineVUZPToVPADD(N, N0, N1, DCI, Subtarget))
+    return Result;
+
   // Attempt to create vpaddl for this add.
   if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))
     return Result;
Index: test/CodeGen/ARM/vpadd.ll
===================================================================
--- test/CodeGen/ARM/vpadd.ll
+++ test/CodeGen/ARM/vpadd.ll
@@ -385,6 +385,26 @@
   ret void
 }
 
+; PR32999: Combine vuzp+extract_subvector+add->vpadd (even lanes + odd lanes)
+define void @pr32999(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp {
+; CHECK-LABEL: pr32999:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vmovl.u8 q8, d16
+; CHECK-NEXT:    vpadd.i16 d16, d16, d17
+; CHECK-NEXT:    vstr d16, [r1]
+; CHECK-NEXT:    mov pc, lr
+  %tmp = load <16 x i8>, <16 x i8>* %cbcr
+  %tmp1 = zext <16 x i8> %tmp to <16 x i16> ; widen every i8 lane to i16
+  %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef> ; even lanes 0,2,4,6
+  %tmp2a = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; keep only the 4 defined lanes
+  %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> ; odd lanes 1,3,5,7
+  %tmp3a = shufflevector <8 x i16> %tmp3, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; keep only the 4 defined lanes
+  %add = add <4 x i16> %tmp2a, %tmp3a ; lane i = %tmp1[2*i] + %tmp1[2*i+1], i.e. a pairwise add
+  store <4 x i16> %add, <4 x i16>* %X, align 8
+  ret void
+}
+
 ; Combine vuzp+vaddl->vpaddl
 define void @addCombineToVPADDLq_s16(<8 x i16> *%cbcr, <4 x i32> *%X) nounwind ssp {
 ; CHECK-LABEL: addCombineToVPADDLq_s16: