Index: llvm/include/llvm/CodeGen/ISDOpcodes.h
===================================================================
--- llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -611,6 +611,15 @@
   MULHU,
   MULHS,
 
+  /// HADDS/HADDU - Halving add - Add two integers using an integer of type
+  /// i[N+1], halving the result by shifting it one bit right.
+  HADDS,
+  HADDU,
+  /// RHADDS/RHADDU - Rounding halving add - Add two integers using an integer
+  /// of type i[N+2], add 1 and halve the result by shifting it one bit right.
+  RHADDS,
+  RHADDU,
+
   // ABDS/ABDU - Absolute difference - Return the absolute difference between
   // two numbers interpreted as signed/unsigned.
   // i.e trunc(abs(sext(Op0) - sext(Op1))) becomes abds(Op0, Op1)
Index: llvm/include/llvm/Target/TargetSelectionDAG.td
===================================================================
--- llvm/include/llvm/Target/TargetSelectionDAG.td
+++ llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -369,6 +369,10 @@
                         [SDNPCommutative, SDNPAssociative]>;
 def mulhs      : SDNode<"ISD::MULHS"     , SDTIntBinOp, [SDNPCommutative]>;
 def mulhu      : SDNode<"ISD::MULHU"     , SDTIntBinOp, [SDNPCommutative]>;
+def hadds      : SDNode<"ISD::HADDS"     , SDTIntBinOp, [SDNPCommutative]>;
+def haddu      : SDNode<"ISD::HADDU"     , SDTIntBinOp, [SDNPCommutative]>;
+def rhadds     : SDNode<"ISD::RHADDS"    , SDTIntBinOp, [SDNPCommutative]>;
+def rhaddu     : SDNode<"ISD::RHADDU"    , SDTIntBinOp, [SDNPCommutative]>;
 def abds       : SDNode<"ISD::ABDS"      , SDTIntBinOp, [SDNPCommutative]>;
 def abdu       : SDNode<"ISD::ABDU"      , SDTIntBinOp, [SDNPCommutative]>;
 def smullohi   : SDNode<"ISD::SMUL_LOHI" , SDTIntBinHiLoOp, [SDNPCommutative]>;
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -231,6 +231,10 @@
   case ISD::MUL:                        return "mul";
   case ISD::MULHU:                      return "mulhu";
   case ISD::MULHS:                      return "mulhs";
+  case ISD::HADDU:                      return "haddu";
+  case ISD::HADDS:                      return "hadds";
+  case ISD::RHADDU:                     return "rhaddu";
+  case ISD::RHADDS:                     return "rhadds";
   case ISD::ABDS:                       return "abds";
   case ISD::ABDU:                       return "abdu";
   case ISD::SDIV:                       return "sdiv";
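For reference, the semantics of the four new nodes can be written out as scalar C++ over i8 elements. This is an illustrative sketch only; the ref_* helper names are invented here and are not part of the patch:

    #include <cstdint>

    // hadds/haddu: add in a type one bit wider, then halve by shifting right.
    int8_t ref_hadds(int8_t A, int8_t B) {
      return int8_t((int16_t(A) + int16_t(B)) >> 1); // arithmetic shift
    }
    uint8_t ref_haddu(uint8_t A, uint8_t B) {
      return uint8_t((uint16_t(A) + uint16_t(B)) >> 1); // logical shift
    }
    // rhadds/rhaddu: the rounding variants add 1 before halving.
    int8_t ref_rhadds(int8_t A, int8_t B) {
      return int8_t((int16_t(A) + int16_t(B) + 1) >> 1);
    }
    uint8_t ref_rhaddu(uint8_t A, uint8_t B) {
      return uint8_t((uint16_t(A) + uint16_t(B) + 1) >> 1);
    }

For example, ref_haddu(7, 8) is 7 (halving rounds down) while ref_rhaddu(7, 8) is 8 (the rounding variant rounds up).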
Index: llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -880,6 +880,91 @@
                                          Depth);
 }
 
+// Attempt to form ext(hadd(A, B)) from shr(add(ext(A), ext(B)), 1), or to
+// form ext(rhadd(A, B)) from shr(add(ext(A), ext(B), 1), 1).
+static SDValue combineShiftToHADD(SDValue Op, SelectionDAG &DAG,
+                                  const TargetLowering &TLI,
+                                  const APInt &DemandedBits) {
+  assert((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) &&
+         "SRL or SRA node is required here!");
+  // Is the right shift using an immediate value of 1?
+  ConstantSDNode *N1C = isConstOrConstSplat(Op.getOperand(1));
+  if (!N1C || !N1C->isOne())
+    return SDValue();
+
+  // We are looking for an hadd
+  //   add(ext, ext)
+  // or one of these as a rhadd
+  //   add(add(ext, ext), 1)
+  //   add(add(ext, 1), ext)
+  //   add(ext, add(ext, 1))
+  SDValue Add = Op.getOperand(0);
+  if (Add.getOpcode() != ISD::ADD)
+    return SDValue();
+
+  SDValue ExtOpA = Add.getOperand(0);
+  SDValue ExtOpB = Add.getOperand(1);
+  auto MatchOperands = [&](SDValue Op1, SDValue Op2, SDValue Op3) {
+    ConstantSDNode *ConstOp;
+    if ((ConstOp = isConstOrConstSplat(Op1)) && ConstOp->isOne()) {
+      ExtOpA = Op2;
+      ExtOpB = Op3;
+      return true;
+    }
+    if ((ConstOp = isConstOrConstSplat(Op2)) && ConstOp->isOne()) {
+      ExtOpA = Op1;
+      ExtOpB = Op3;
+      return true;
+    }
+    if ((ConstOp = isConstOrConstSplat(Op3)) && ConstOp->isOne()) {
+      ExtOpA = Op1;
+      ExtOpB = Op2;
+      return true;
+    }
+    return false;
+  };
+  bool IsRHADD =
+      (ExtOpA.getOpcode() == ISD::ADD &&
+       MatchOperands(ExtOpA.getOperand(0), ExtOpA.getOperand(1), ExtOpB)) ||
+      (ExtOpB.getOpcode() == ISD::ADD &&
+       MatchOperands(ExtOpB.getOperand(0), ExtOpB.getOperand(1), ExtOpA));
+
+  // This handles the valid cases of:
+  //   The two extends are zext and the shift is a srl or sra.
+  //   The two extends are sext and the shift is a sra.
+  //   The two extends are sext and the shift is a srl and the top bit is not
+  //   demanded.
+  auto IsValidCombo = [](unsigned ShiftOp, unsigned ExtOp,
+                         const APInt &DemandedBits) {
+    if (ExtOp == ISD::ZERO_EXTEND)
+      return true;
+    if (ExtOp != ISD::SIGN_EXTEND)
+      return false;
+    if (ShiftOp == ISD::SRA)
+      return true;
+    return DemandedBits.countLeadingZeros() >= 1;
+  };
+  unsigned ExtOpAOpc = ExtOpA.getOpcode();
+  if (ExtOpAOpc != ExtOpB.getOpcode() ||
+      !IsValidCombo(Op.getOpcode(), ExtOpAOpc, DemandedBits))
+    return SDValue();
+
+  // Are the two pre-extension operands, OpA and OpB, of the same value type,
+  // and is the resulting hadd/rhadd operation legal or custom for that type?
+  SDValue OpA = ExtOpA.getOperand(0);
+  SDValue OpB = ExtOpB.getOperand(0);
+  EVT VT = OpA.getValueType();
+  bool IsSignExtend = ExtOpAOpc == ISD::SIGN_EXTEND;
+  unsigned HADDOpc = IsRHADD ? (IsSignExtend ? ISD::RHADDS : ISD::RHADDU)
+                             : (IsSignExtend ? ISD::HADDS : ISD::HADDU);
+  if (VT != OpB.getValueType() || !TLI.isOperationLegalOrCustom(HADDOpc, VT))
+    return SDValue();
+
+  SDLoc DL(Op);
+  SDValue ResultHADD = DAG.getNode(HADDOpc, DL, VT, OpA, OpB);
+  return DAG.getNode(ExtOpAOpc, DL, Op.getValueType(), ResultHADD);
+}
+
 /// Look at Op. At this point, we know that only the OriginalDemandedBits of the
 /// result of Op are ever used downstream. If we can use this information to
 /// simplify Op, create a new simplified DAG node and return true, returning the
@@ -1542,6 +1627,10 @@
     SDValue Op1 = Op.getOperand(1);
     EVT ShiftVT = Op1.getValueType();
 
+    // Try to match HADD/RHADD patterns.
+    if (SDValue HADD = combineShiftToHADD(Op, TLO.DAG, *this, DemandedBits))
+      return TLO.CombineTo(Op, HADD);
+
     if (const APInt *SA =
             TLO.DAG.getValidShiftAmountConstant(Op, DemandedElts)) {
       unsigned ShAmt = SA->getZExtValue();
@@ -1608,6 +1697,10 @@
     if (DemandedBits.isOneValue())
       return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT, Op0, Op1));
 
+    // Try to match HADD/RHADD patterns.
+    if (SDValue HADD = combineShiftToHADD(Op, TLO.DAG, *this, DemandedBits))
+      return TLO.CombineTo(Op, HADD);
+
    if (const APInt *SA =
            TLO.DAG.getValidShiftAmountConstant(Op, DemandedElts)) {
      unsigned ShAmt = SA->getZExtValue();
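The sext-plus-srl case is the subtle one: srl shifts a zero into the sign position, so the result can disagree with ext(hadds(A, B)) in the top bit, and the fold is only safe when that bit is not demanded. A self-contained check of the worst case (an invented example, not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      // srl(add(sext(A), sext(B)), 1) for A = B = -1, extending i8 to i16.
      uint16_t Srl = uint16_t(int16_t(-1) + int16_t(-1)) >> 1; // 0x7fff
      // sext(hadds(A, B)) = sext(i8 -1) = 0xffff.
      uint16_t Ext = uint16_t(int16_t(int8_t(-1)));            // 0xffff
      // The two disagree only in the top bit, which the combine requires
      // to be undemanded (DemandedBits.countLeadingZeros() >= 1).
      assert((Srl ^ Ext) == 0x8000);
      return 0;
    }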
Index: llvm/lib/CodeGen/TargetLoweringBase.cpp
===================================================================
--- llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -813,6 +813,12 @@
     setOperationAction(ISD::SUBC, VT, Expand);
     setOperationAction(ISD::SUBE, VT, Expand);
 
+    // Halving adds
+    setOperationAction(ISD::HADDS, VT, Expand);
+    setOperationAction(ISD::HADDU, VT, Expand);
+    setOperationAction(ISD::RHADDS, VT, Expand);
+    setOperationAction(ISD::RHADDU, VT, Expand);
+
     // Absolute difference
     setOperationAction(ISD::ABDS, VT, Expand);
     setOperationAction(ISD::ABDU, VT, Expand);
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -228,14 +228,6 @@
   SADDV,
   UADDV,
 
-  // Vector halving addition
-  SHADD,
-  UHADD,
-
-  // Vector rounding halving addition
-  SRHADD,
-  URHADD,
-
   // Unsigned Add Long Pairwise
   UADDLP,
 
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -896,7 +896,6 @@
   setTargetDAGCombine(ISD::ZERO_EXTEND);
   setTargetDAGCombine(ISD::SIGN_EXTEND);
   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
-  setTargetDAGCombine(ISD::TRUNCATE);
   setTargetDAGCombine(ISD::CONCAT_VECTORS);
   setTargetDAGCombine(ISD::STORE);
   if (Subtarget->supportsAddressTopByteIgnored())
@@ -1052,6 +1051,10 @@
 
   for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
                  MVT::v4i32}) {
+    setOperationAction(ISD::HADDS, VT, Legal);
+    setOperationAction(ISD::HADDU, VT, Legal);
+    setOperationAction(ISD::RHADDS, VT, Legal);
+    setOperationAction(ISD::RHADDU, VT, Legal);
     setOperationAction(ISD::ABDS, VT, Legal);
     setOperationAction(ISD::ABDU, VT, Legal);
   }
@@ -1994,10 +1997,6 @@
     MAKE_CASE(AArch64ISD::FCMLTz)
     MAKE_CASE(AArch64ISD::SADDV)
     MAKE_CASE(AArch64ISD::UADDV)
-    MAKE_CASE(AArch64ISD::SRHADD)
-    MAKE_CASE(AArch64ISD::URHADD)
-    MAKE_CASE(AArch64ISD::SHADD)
-    MAKE_CASE(AArch64ISD::UHADD)
     MAKE_CASE(AArch64ISD::SDOT)
     MAKE_CASE(AArch64ISD::UDOT)
     MAKE_CASE(AArch64ISD::SMINV)
@@ -4104,9 +4103,8 @@
                          IntNo == Intrinsic::aarch64_neon_shadd);
     bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
                           IntNo == Intrinsic::aarch64_neon_urhadd);
-    unsigned Opcode =
-        IsSignedAdd ? (IsRoundingAdd ? AArch64ISD::SRHADD : AArch64ISD::SHADD)
-                    : (IsRoundingAdd ? AArch64ISD::URHADD : AArch64ISD::UHADD);
+    unsigned Opcode = IsSignedAdd ? (IsRoundingAdd ? ISD::RHADDS : ISD::HADDS)
+                                  : (IsRoundingAdd ? ISD::RHADDU : ISD::HADDU);
     return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
                        Op.getOperand(2));
   }
@@ -13224,89 +13222,6 @@
   return SDValue();
 }
 
-// Attempt to form urhadd(OpA, OpB) from
-// truncate(vlshr(sub(zext(OpB), xor(zext(OpA), Ones(ElemSizeInBits))), 1))
-// or uhadd(OpA, OpB) from truncate(vlshr(add(zext(OpA), zext(OpB)), 1)).
-// The original form of the first expression is
-// truncate(srl(add(zext(OpB), add(zext(OpA), 1)), 1)) and the
-// (OpA + OpB + 1) subexpression will have been changed to (OpB - (~OpA)).
-// Before this function is called the srl will have been lowered to
-// AArch64ISD::VLSHR.
-// This pass can also recognize signed variants of the patterns that use sign
-// extension instead of zero extension and form a srhadd(OpA, OpB) or a
-// shadd(OpA, OpB) from them.
-static SDValue
-performVectorTruncateCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
-                             SelectionDAG &DAG) {
-  EVT VT = N->getValueType(0);
-
-  // Since we are looking for a right shift by a constant value of 1 and we are
-  // operating on types at least 16 bits in length (sign/zero extended OpA and
-  // OpB, which are at least 8 bits), it follows that the truncate will always
-  // discard the shifted-in bit and therefore the right shift will be logical
-  // regardless of the signedness of OpA and OpB.
-  SDValue Shift = N->getOperand(0);
-  if (Shift.getOpcode() != AArch64ISD::VLSHR)
-    return SDValue();
-
-  // Is the right shift using an immediate value of 1?
-  uint64_t ShiftAmount = Shift.getConstantOperandVal(1);
-  if (ShiftAmount != 1)
-    return SDValue();
-
-  SDValue ExtendOpA, ExtendOpB;
-  SDValue ShiftOp0 = Shift.getOperand(0);
-  unsigned ShiftOp0Opc = ShiftOp0.getOpcode();
-  if (ShiftOp0Opc == ISD::SUB) {
-
-    SDValue Xor = ShiftOp0.getOperand(1);
-    if (Xor.getOpcode() != ISD::XOR)
-      return SDValue();
-
-    // Is the XOR using a constant amount of all ones in the right hand side?
-    uint64_t C;
-    if (!isAllConstantBuildVector(Xor.getOperand(1), C))
-      return SDValue();
-
-    unsigned ElemSizeInBits = VT.getScalarSizeInBits();
-    APInt CAsAPInt(ElemSizeInBits, C);
-    if (CAsAPInt != APInt::getAllOnesValue(ElemSizeInBits))
-      return SDValue();
-
-    ExtendOpA = Xor.getOperand(0);
-    ExtendOpB = ShiftOp0.getOperand(0);
-  } else if (ShiftOp0Opc == ISD::ADD) {
-    ExtendOpA = ShiftOp0.getOperand(0);
-    ExtendOpB = ShiftOp0.getOperand(1);
-  } else
-    return SDValue();
-
-  unsigned ExtendOpAOpc = ExtendOpA.getOpcode();
-  unsigned ExtendOpBOpc = ExtendOpB.getOpcode();
-  if (!(ExtendOpAOpc == ExtendOpBOpc &&
-        (ExtendOpAOpc == ISD::ZERO_EXTEND || ExtendOpAOpc == ISD::SIGN_EXTEND)))
-    return SDValue();
-
-  // Is the result of the right shift being truncated to the same value type as
-  // the original operands, OpA and OpB?
-  SDValue OpA = ExtendOpA.getOperand(0);
-  SDValue OpB = ExtendOpB.getOperand(0);
-  EVT OpAVT = OpA.getValueType();
-  assert(ExtendOpA.getValueType() == ExtendOpB.getValueType());
-  if (!(VT == OpAVT && OpAVT == OpB.getValueType()))
-    return SDValue();
-
-  SDLoc DL(N);
-  bool IsSignExtend = ExtendOpAOpc == ISD::SIGN_EXTEND;
-  bool IsRHADD = ShiftOp0Opc == ISD::SUB;
-  unsigned HADDOpc = IsSignExtend
-                         ? (IsRHADD ? AArch64ISD::SRHADD : AArch64ISD::SHADD)
-                         : (IsRHADD ? AArch64ISD::URHADD : AArch64ISD::UHADD);
-  SDValue ResultHADD = DAG.getNode(HADDOpc, DL, VT, OpA, OpB);
-
-  return ResultHADD;
-}
-
 static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
   switch (Opcode) {
   case ISD::FADD:
@@ -13418,8 +13333,8 @@
   //  ->
   //  (v16i8(urhadd(v16i8 OpA, v16i8 OpB)))
   if (N->getNumOperands() == 2 && N0Opc == N1Opc &&
-      (N0Opc == AArch64ISD::URHADD || N0Opc == AArch64ISD::SRHADD ||
-       N0Opc == AArch64ISD::UHADD || N0Opc == AArch64ISD::SHADD)) {
+      (N0Opc == ISD::RHADDU || N0Opc == ISD::RHADDS || N0Opc == ISD::HADDU ||
+       N0Opc == ISD::HADDS)) {
     SDValue N00 = N0->getOperand(0);
     SDValue N01 = N0->getOperand(1);
     SDValue N10 = N1->getOperand(0);
@@ -16520,8 +16435,6 @@
     return performExtendCombine(N, DCI, DAG);
   case ISD::SIGN_EXTEND_INREG:
     return performSignExtendInRegCombine(N, DCI, DAG);
-  case ISD::TRUNCATE:
-    return performVectorTruncateCombine(N, DCI, DAG);
   case ISD::CONCAT_VECTORS:
     return performConcatVectorsCombine(N, DCI, DAG);
   case ISD::SELECT:
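The removed comment notes that an earlier combine turns the (OpA + OpB + 1) subexpression into (OpB - (~OpA)), which is why the old AArch64 matcher had to recognize the sub/xor form as well as the add form. That rewrite is just the two's complement identity a + b + 1 == b - (~a), since ~a == -a - 1. A quick exhaustive check, illustrative only and not part of the patch:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t A = 0; A < 256; ++A)
        for (uint32_t B = 0; B < 256; ++B)
          // ~A == -A - 1 in two's complement, so B - ~A == A + B + 1.
          assert(A + B + 1 == B - ~A);
      return 0;
    }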
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -580,11 +580,6 @@
 def AArch64smaxv   : SDNode<"AArch64ISD::SMAXV", SDT_AArch64UnaryVec>;
 def AArch64umaxv   : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>;
 
-def AArch64srhadd  : SDNode<"AArch64ISD::SRHADD", SDT_AArch64binvec>;
-def AArch64urhadd  : SDNode<"AArch64ISD::URHADD", SDT_AArch64binvec>;
-def AArch64shadd   : SDNode<"AArch64ISD::SHADD", SDT_AArch64binvec>;
-def AArch64uhadd   : SDNode<"AArch64ISD::UHADD", SDT_AArch64binvec>;
-
 def AArch64uabd    : PatFrags<(ops node:$lhs, node:$rhs),
                               [(abdu node:$lhs, node:$rhs),
                                (int_aarch64_neon_uabd node:$lhs, node:$rhs)]>;
@@ -4325,7 +4320,7 @@
 defm SABA    : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba",
    TriOpFrag<(add node:$LHS, (AArch64sabd node:$MHS, node:$RHS))> >;
 defm SABD    : SIMDThreeSameVectorBHS<0,0b01110,"sabd", AArch64sabd>;
-defm SHADD   : SIMDThreeSameVectorBHS<0,0b00000,"shadd", AArch64shadd>;
+defm SHADD   : SIMDThreeSameVectorBHS<0,0b00000,"shadd", hadds>;
 defm SHSUB   : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_aarch64_neon_shsub>;
 defm SMAXP   : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_aarch64_neon_smaxp>;
 defm SMAX    : SIMDThreeSameVectorBHS<0,0b01100,"smax", smax>;
@@ -4337,14 +4332,14 @@
 defm SQRSHL  : SIMDThreeSameVector<0,0b01011,"sqrshl", int_aarch64_neon_sqrshl>;
 defm SQSHL   : SIMDThreeSameVector<0,0b01001,"sqshl", int_aarch64_neon_sqshl>;
 defm SQSUB   : SIMDThreeSameVector<0,0b00101,"sqsub", int_aarch64_neon_sqsub>;
-defm SRHADD  : SIMDThreeSameVectorBHS<0,0b00010,"srhadd", AArch64srhadd>;
+defm SRHADD  : SIMDThreeSameVectorBHS<0,0b00010,"srhadd", rhadds>;
 defm SRSHL   : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>;
 defm SSHL    : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>;
 defm SUB     : SIMDThreeSameVector<1,0b10000,"sub", sub>;
 defm UABA    : SIMDThreeSameVectorBHSTied<1, 0b01111, "uaba",
    TriOpFrag<(add node:$LHS, (AArch64uabd node:$MHS, node:$RHS))> >;
 defm UABD    : SIMDThreeSameVectorBHS<1,0b01110,"uabd", AArch64uabd>;
-defm UHADD   : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", AArch64uhadd>;
+defm UHADD   : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", haddu>;
 defm UHSUB   : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_aarch64_neon_uhsub>;
 defm UMAXP   : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp>;
 defm UMAX    : SIMDThreeSameVectorBHS<1,0b01100,"umax", umax>;
@@ -4354,7 +4349,7 @@
 defm UQRSHL  : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>;
 defm UQSHL   : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>;
 defm UQSUB   : SIMDThreeSameVector<1,0b00101,"uqsub", int_aarch64_neon_uqsub>;
-defm URHADD  : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", AArch64urhadd>;
+defm URHADD  : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", rhaddu>;
 defm URSHL   : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>;
 defm USHL    : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>;
 defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah",
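With the selection patterns now keyed off the generic hadds/haddu/rhadds/rhaddu nodes, the AArch64-specific SDNode definitions can be dropped entirely, and any other backend with equivalent instructions can opt in the same way AArch64 does above. A minimal sketch of what that would look like for a hypothetical target (the vector type list here is an assumption, not from the patch):

    // In the hypothetical target's TargetLowering constructor: mark the
    // halving adds Legal so combineShiftToHADD is allowed to form them
    // (TargetLoweringBase defaults all four to Expand).
    for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32}) {
      setOperationAction(ISD::HADDS, VT, Legal);
      setOperationAction(ISD::HADDU, VT, Legal);
      setOperationAction(ISD::RHADDS, VT, Legal);
      setOperationAction(ISD::RHADDU, VT, Legal);
    }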
Index: llvm/test/CodeGen/AArch64/arm64-vhadd.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-vhadd.ll
+++ llvm/test/CodeGen/AArch64/arm64-vhadd.ll
@@ -705,8 +705,8 @@
 define <4 x i32> @hadd16_sext_asr(<4 x i16> %src1, <4 x i16> %src2) nounwind {
 ; CHECK-LABEL: hadd16_sext_asr:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    saddl.4s v0, v0, v1
-; CHECK-NEXT:    sshr.4s v0, v0, #1
+; CHECK-NEXT:    shadd.4h v0, v0, v1
+; CHECK-NEXT:    sshll.4s v0, v0, #0
 ; CHECK-NEXT:    ret
   %zextsrc1 = sext <4 x i16> %src1 to <4 x i32>
   %zextsrc2 = sext <4 x i16> %src2 to <4 x i32>
@@ -718,8 +718,8 @@
 define <4 x i32> @hadd16_zext_asr(<4 x i16> %src1, <4 x i16> %src2) nounwind {
 ; CHECK-LABEL: hadd16_zext_asr:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uaddl.4s v0, v0, v1
-; CHECK-NEXT:    ushr.4s v0, v0, #1
+; CHECK-NEXT:    uhadd.4h v0, v0, v1
+; CHECK-NEXT:    ushll.4s v0, v0, #0
 ; CHECK-NEXT:    ret
   %zextsrc1 = zext <4 x i16> %src1 to <4 x i32>
   %zextsrc2 = zext <4 x i16> %src2 to <4 x i32>
@@ -744,8 +744,8 @@
 define <4 x i32> @hadd16_zext_lsr(<4 x i16> %src1, <4 x i16> %src2) nounwind {
 ; CHECK-LABEL: hadd16_zext_lsr:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uaddl.4s v0, v0, v1
-; CHECK-NEXT:    ushr.4s v0, v0, #1
+; CHECK-NEXT:    uhadd.4h v0, v0, v1
+; CHECK-NEXT:    ushll.4s v0, v0, #0
 ; CHECK-NEXT:    ret
   %zextsrc1 = zext <4 x i16> %src1 to <4 x i32>
   %zextsrc2 = zext <4 x i16> %src2 to <4 x i32>
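The hadd16 tests now select a narrow shadd/uhadd followed by a plain extend instead of a widening add plus shift. The two sequences agree lane-wise because the halved sum always fits back in the narrow type; the same identity modeled on i8 so it can be checked exhaustively (illustrative only, not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (int A = -128; A < 128; ++A)
        for (int B = -128; B < 128; ++B) {
          int32_t WideWay = (A + B) >> 1;           // widen, add, shift (saddl + sshr)
          int32_t NarrowWay = int8_t((A + B) >> 1); // shadd, then sign-extend (sshll)
          assert(WideWay == NarrowWay);             // truncation is lossless here
        }
      return 0;
    }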
@@ -759,10 +759,9 @@
 define <4 x i64> @hadd32_sext_asr(<4 x i32> %src1, <4 x i32> %src2) nounwind {
 ; CHECK-LABEL: hadd32_sext_asr:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    saddl.2d v2, v0, v1
-; CHECK-NEXT:    saddl2.2d v0, v0, v1
-; CHECK-NEXT:    sshr.2d v1, v0, #1
-; CHECK-NEXT:    sshr.2d v0, v2, #1
+; CHECK-NEXT:    shadd.4s v0, v0, v1
+; CHECK-NEXT:    sshll2.2d v1, v0, #0
+; CHECK-NEXT:    sshll.2d v0, v0, #0
 ; CHECK-NEXT:    ret
   %zextsrc1 = sext <4 x i32> %src1 to <4 x i64>
   %zextsrc2 = sext <4 x i32> %src2 to <4 x i64>
@@ -774,10 +773,9 @@
 define <4 x i64> @hadd32_zext_asr(<4 x i32> %src1, <4 x i32> %src2) nounwind {
 ; CHECK-LABEL: hadd32_zext_asr:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uaddl.2d v2, v0, v1
-; CHECK-NEXT:    uaddl2.2d v0, v0, v1
-; CHECK-NEXT:    ushr.2d v1, v0, #1
-; CHECK-NEXT:    ushr.2d v0, v2, #1
+; CHECK-NEXT:    uhadd.4s v0, v0, v1
+; CHECK-NEXT:    ushll2.2d v1, v0, #0
+; CHECK-NEXT:    ushll.2d v0, v0, #0
 ; CHECK-NEXT:    ret
   %zextsrc1 = zext <4 x i32> %src1 to <4 x i64>
   %zextsrc2 = zext <4 x i32> %src2 to <4 x i64>
@@ -804,10 +802,9 @@
 define <4 x i64> @hadd32_zext_lsr(<4 x i32> %src1, <4 x i32> %src2) nounwind {
 ; CHECK-LABEL: hadd32_zext_lsr:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uaddl.2d v2, v0, v1
-; CHECK-NEXT:    uaddl2.2d v0, v0, v1
-; CHECK-NEXT:    ushr.2d v1, v0, #1
-; CHECK-NEXT:    ushr.2d v0, v2, #1
+; CHECK-NEXT:    uhadd.4s v0, v0, v1
+; CHECK-NEXT:    ushll2.2d v1, v0, #0
+; CHECK-NEXT:    ushll.2d v0, v0, #0
 ; CHECK-NEXT:    ret
   %zextsrc1 = zext <4 x i32> %src1 to <4 x i64>
   %zextsrc2 = zext <4 x i32> %src2 to <4 x i64>
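The hadd32 tests show the larger win: the i32-to-i64 widening previously needed two widening adds and two shifts, while the new form is one uhadd/shadd plus two extends. At the source level, loops like the one below are what the generic combine targets; whether a particular frontend and vectorizer reach exactly this DAG is not guaranteed by the patch (illustrative C++):

    #include <cstddef>
    #include <cstdint>

    // Rounding average of two byte arrays; when vectorized, the
    // (a + b + 1) >> 1 pattern computed in a widened type can now become
    // ISD::RHADDU and select urhadd directly, with no target-specific
    // combine involved.
    void average_round_up(const uint8_t *A, const uint8_t *B, uint8_t *Out,
                          size_t N) {
      for (size_t I = 0; I != N; ++I)
        Out[I] = uint8_t((uint16_t(A[I]) + uint16_t(B[I]) + 1) >> 1);
    }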