Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -490,6 +490,7 @@
   setTargetDAGCombine(ISD::INTRINSIC_VOID);
   setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
 
   MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
   MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
@@ -8564,6 +8565,81 @@
   return SDValue();
 }
 
+/// Target-specific DAG combine for the across-vector add reduction (addv).
+/// Example:
+///   ext   v1.16b, v0.16b, v0.16b, #8
+///   add   v0.4s, v1.4s, v0.4s
+///   dup   v1.4s, v0.s[1]
+///   add   v0.4s, v1.4s, v0.4s
+/// becomes:
+///   addv  s0, v0.4s
+static SDValue
+performAcrossLaneReductionCombine(SDNode *N, SelectionDAG &DAG,
+                                  const AArch64Subtarget *Subtarget) {
+  if (!Subtarget->hasNEON())
+    return SDValue();
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  // If the input vector is not an ADD, we cannot do this combine.
+  if (N0->getOpcode() != ISD::ADD)
+    return SDValue();
+
+  // The vector extract index must be constant zero.
+  if (!isa<ConstantSDNode>(N1) || cast<ConstantSDNode>(N1)->getZExtValue())
+    return SDValue();
+
+  EVT EltTy = N0.getValueType().getVectorElementType();
+  if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
+    return SDValue();
+
+  unsigned NumVecElts = N0.getValueType().getVectorNumElements();
+  unsigned NumMaxSubAddElts = NumVecElts / 2;
+  unsigned NumAddElts = 1;
+  SDValue InputADD = N0;
+  // Iterate over each step of the reduction.
+  while (NumAddElts <= NumMaxSubAddElts) {
+    if (InputADD.getOpcode() != ISD::ADD)
+      return SDValue();
+    SDValue ADD = InputADD.getOperand(0);
+    SDValue SV = InputADD.getOperand(1);
+    if (SV.getOpcode() != ISD::VECTOR_SHUFFLE) {
+      ADD = InputADD.getOperand(1);
+      SV = InputADD.getOperand(0);
+      if (SV.getOpcode() != ISD::VECTOR_SHUFFLE)
+        return SDValue();
+    }
+    // Check if this is one step of the addition reduction.
+    // E.g.,
+    //   %add = add %1, %0
+    //   %svn = vector_shuffle %add, <2, 3, u, u>
+    //   %inputadd = add %add, %svn
+    if (SV.getOperand(0) != ADD)
+      return SDValue();
+
+    ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(SV);
+    ArrayRef<int> Mask = SVN->getMask();
+    // Check the mask value in each step of the sub-addition.
+    // E.g., for the add reduction of an <8 x i16> vector, the mask
+    // values in each sub-addition are:
+    //   step 3 : <4,5,6,7,u,u,u,u>
+    //   step 2 : <2,3,u,u,u,u,u,u>
+    //   step 1 : <1,u,u,u,u,u,u,u>
+    for (unsigned int i = 0; i < NumVecElts; ++i)
+      if ((i >= NumAddElts && Mask[i] >= 0) ||
+          (i < NumAddElts &&
+           static_cast<unsigned>(Mask[i]) != (NumAddElts + i)))
+        return SDValue();
+    // Move to the next step.
+    InputADD = ADD;
+    NumAddElts = NumAddElts << 1;
+  }
+  SDLoc DL(N);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
+                     DAG.getNode(AArch64ISD::UADDV, DL,
                                 InputADD.getSimpleValueType(), InputADD),
+                     DAG.getConstant(0, DL, MVT::i64));
+}
+
 /// Target-specific DAG combine function for NEON load/store intrinsics
 /// to merge base address updates.
 static SDValue performNEONPostLDSTCombine(SDNode *N,
@@ -9158,6 +9234,8 @@
     return performNVCASTCombine(N);
   case ISD::INSERT_VECTOR_ELT:
     return performPostLD1Combine(N, DCI, true);
+  case ISD::EXTRACT_VECTOR_ELT:
+    return performAcrossLaneReductionCombine(N, DAG, Subtarget);
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN:
     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
Index: test/CodeGen/AArch64/aarch64-addv.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/aarch64-addv.ll
@@ -0,0 +1,56 @@
+; RUN: llc -march=aarch64 < %s | FileCheck %s
+
+define i8 @f_v16i8(<16 x i8>* %arr) {
+; CHECK-LABEL: f_v16i8
+; CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.16b
+  %bin.rdx = load <16 x i8>, <16 x i8>* %arr
+  %rdx.shuf0 = shufflevector <16 x i8> %bin.rdx, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx0 = add <16 x i8> %bin.rdx, %rdx.shuf0
+  %rdx.shuf = shufflevector <16 x i8> %bin.rdx0, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx11 = add <16 x i8> %bin.rdx0, %rdx.shuf
+  %rdx.shuf12 = shufflevector <16 x i8> %bin.rdx11, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx13 = add <16 x i8> %bin.rdx11, %rdx.shuf12
+  %rdx.shuf13 = shufflevector <16 x i8> %bin.rdx13, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx14 = add <16 x i8> %bin.rdx13, %rdx.shuf13
+  %r = extractelement <16 x i8> %bin.rdx14, i32 0
+  ret i8 %r
+}
+
+
+define i16 @f_v8i16(<8 x i16>* %arr) {
+; CHECK-LABEL: f_v8i16
+; CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.8h
+  %bin.rdx = load <8 x i16>, <8 x i16>* %arr
+  %rdx.shuf = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx11 = add <8 x i16> %bin.rdx, %rdx.shuf
+  %rdx.shuf12 = shufflevector <8 x i16> %bin.rdx11, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx13 = add <8 x i16> %bin.rdx11, %rdx.shuf12
+  %rdx.shuf13 = shufflevector <8 x i16> %bin.rdx13, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx14 = add <8 x i16> %bin.rdx13, %rdx.shuf13
+  %r = extractelement <8 x i16> %bin.rdx14, i32 0
+  ret i16 %r
+}
+
+
+define i32 @f_v4i32(<4 x i32>* %arr) {
+; CHECK-LABEL: f_v4i32
+; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s
+  %bin.rdx = load <4 x i32>, <4 x i32>* %arr
+  %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %bin.rdx11 = add <4 x i32> %bin.rdx, %rdx.shuf
+  %rdx.shuf12 = shufflevector <4 x i32> %bin.rdx11, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %bin.rdx13 = add <4 x i32> %bin.rdx11, %rdx.shuf12
+  %r = extractelement <4 x i32> %bin.rdx13, i32 0
+  ret i32 %r
+}
+
+
+define i64 @f_v2i64(<2 x i64>* %arr) {
+; CHECK-LABEL: f_v2i64
+; CHECK-NOT: addv
+  %bin.rdx = load <2 x i64>, <2 x i64>* %arr
+  %rdx.shuf0 = shufflevector <2 x i64> %bin.rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+  %bin.rdx0 = add <2 x i64> %bin.rdx, %rdx.shuf0
+  %r = extractelement <2 x i64> %bin.rdx0, i32 0
+  ret i64 %r
+}