diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -310,6 +310,7 @@
   findAddrModeSVELoadStore(SDNode *N, unsigned Opc_rr, unsigned Opc_ri,
                            const SDValue &OldBase, const SDValue &OldOffset,
                            unsigned Scale);
+  void optimizeShiftAccumulate(SDNode *N);
 
   bool tryBitfieldExtractOp(SDNode *N);
   bool tryBitfieldExtractOpFromSExt(SDNode *N);
@@ -2730,6 +2731,31 @@
   return false;
 }
 
+// Turn an OR into an ADD if it is adding 2 operands with no common bits
+// and one of the operands is a VLSHR.
+//
+// We can select this into a USRA instruction.
+void AArch64DAGToDAGISel::optimizeShiftAccumulate(SDNode *N) {
+  if (N->getOpcode() != ISD::OR)
+    return;
+
+  // Only turn an OR into an ADD if the operands have no common bits set.
+  if (!CurDAG->haveNoCommonBitsSet(N->getOperand(0), N->getOperand(1)))
+    return;
+
+  for (const SDValue &Op : N->op_values()) {
+    if (Op->getOpcode() == AArch64ISD::VLSHR) {
+      SmallVector<SDValue, 2> Ops;
+      for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+        Ops.push_back(N->getOperand(i));
+
+      CurDAG->MorphNodeTo(N, ISD::ADD, CurDAG->getVTList(N->getValueType(0)),
+                          Ops);
+      return;
+    }
+  }
+}
+
 bool AArch64DAGToDAGISel::tryBitfieldInsertOp(SDNode *N) {
   if (N->getOpcode() != ISD::OR)
     return false;
@@ -3469,6 +3495,9 @@
   case ISD::OR:
     if (tryBitfieldInsertOp(Node))
       return;
+
+    // See whether we can turn this into a shift-accumulate.
+    optimizeShiftAccumulate(Node);
     break;
 
   case ISD::EXTRACT_SUBVECTOR: {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1796,6 +1796,13 @@
     Known = KnownBits::commonBits(Known, Known2);
     break;
   }
+  case AArch64ISD::VLSHR: {
+    KnownBits Known2;
+    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
+    Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
+    Known = KnownBits::lshr(Known, Known2);
+    break;
+  }
   case AArch64ISD::LOADgot:
   case AArch64ISD::ADDlow: {
     if (!Subtarget->isTargetILP32())
diff --git a/llvm/test/CodeGen/AArch64/shift-accumulate.ll b/llvm/test/CodeGen/AArch64/shift-accumulate.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/shift-accumulate.ll
@@ -0,0 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s
+
+define dso_local i32 @usra(<16 x i8> %0) local_unnamed_addr #0 align 32 {
+; CHECK-LABEL: usra:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushr v0.16b, v0.16b, #7
+; CHECK-NEXT:    usra v0.8h, v0.8h, #7
+  %2 = lshr <16 x i8> %0, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+  %3 = bitcast <16 x i8> %2 to <8 x i16>
+  %4 = lshr <8 x i16> %3, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+  %5 = or <8 x i16> %4, %3
+  %6 = bitcast <8 x i16> %5 to <16 x i8>
+  %7 = extractelement <16 x i8> %6, i32 0
+  %8 = zext i8 %7 to i32
+  ret i32 %8
+}