Index: docs/LangRef.rst
===================================================================
--- docs/LangRef.rst
+++ docs/LangRef.rst
@@ -10387,6 +10387,55 @@
     %1 = select <4 x i1> %ispos, <4 x i32> %sub, <4 x i32> %neg
 
 
+'``llvm.hadd.*``'
+^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic. The operand is a vector of any integer or floating point type.
+Floating point vectors are only supported in fast-math mode.
+
+.. code-block:: llvm
+
+      declare i32 @llvm.hadd.i32.v4i32(<4 x i32> %a)
+      declare float @llvm.hadd.f32.v4f32(<4 x float> %a)
+
+
+Overview:
+"""""""""
+
+The ``llvm.hadd`` intrinsic returns the horizontal (reduction) sum of the elements of its
+vector operand, whose elements are either integers or floating point numbers.
+
+.. note::
+
+    These intrinsics are primarily used during the code generation stage of
+    compilation. They are generated by compiler passes such as the Loop and
+    SLP vectorizers.
+    The expectation is that frontends should not need to generate these
+    intrinsics themselves.
+
+Arguments:
+""""""""""
+
+The argument is a vector of integers or floating point numbers.
+
+Semantics:
+""""""""""
+
+The expression::
+
+    call i32 @llvm.hadd.i32.v4i32(<4 x i32> %a)
+
+is equivalent to::
+
+    %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+    %2 = add <4 x i32> %a, %1
+    %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+    %4 = add <4 x i32> %2, %3
+    %5 = extractelement <4 x i32> %4, i32 0
+
+
 Half Precision Floating Point Intrinsics
 ----------------------------------------
 
Index: include/llvm/CodeGen/ISDOpcodes.h
===================================================================
--- include/llvm/CodeGen/ISDOpcodes.h
+++ include/llvm/CodeGen/ISDOpcodes.h
@@ -342,6 +342,11 @@
     /// vector. These nodes are generated from llvm.*absdiff* intrinsics.
     SABSDIFF, UABSDIFF,
 
+    /// HADD/FHADD - Horizontal sum across the elements of the sole
+    /// integer (HADD) or floating point (FHADD) input vector.
+    /// These nodes are generated from the llvm.hadd* intrinsics.
+    HADD, FHADD,
+
     /// Bit counting operators with an undefined result for zero inputs.
     CTTZ_ZERO_UNDEF, CTLZ_ZERO_UNDEF,
 
Index: include/llvm/IR/Intrinsics.td
===================================================================
--- include/llvm/IR/Intrinsics.td
+++ include/llvm/IR/Intrinsics.td
@@ -612,6 +612,9 @@
 def int_uabsdiff : Intrinsic<[llvm_anyvector_ty],
                         [ LLVMMatchType<0>, LLVMMatchType<0> ], [IntrNoMem]>;
 
+// Calculate the horizontal (reduction) sum across the elements of the input vector.
+def int_hadd : Intrinsic<[llvm_any_ty], [llvm_anyvector_ty], [IntrNoMem]>;
+
 //===-------------------------- Masked Intrinsics -------------------------===//
 //
 def int_masked_store : Intrinsic<[], [llvm_anyvector_ty, LLVMPointerTo<0>,
Index: include/llvm/Target/TargetSelectionDAG.td
===================================================================
--- include/llvm/Target/TargetSelectionDAG.td
+++ include/llvm/Target/TargetSelectionDAG.td
@@ -388,6 +388,14 @@
 
 def sabsdiff   : SDNode<"ISD::SABSDIFF"   , SDTIntBinOp>;
 def uabsdiff   : SDNode<"ISD::UABSDIFF"   , SDTIntBinOp>;
+// Horizontal sums take one vector operand and produce one non-matching result
+// type, so the same-type unary profiles (e.g. SDTIntUnaryOp) do not fit them.
+def hadd       : SDNode<"ISD::HADD",
+                        SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisVec<1>,
+                                             SDTCisInt<1>]>>;
+def fhadd      : SDNode<"ISD::FHADD",
+                        SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVec<1>,
+                                             SDTCisFP<1>]>>;
 def sext_inreg : SDNode<"ISD::SIGN_EXTEND_INREG", SDTExtInreg>;
 def bswap      : SDNode<"ISD::BSWAP"      , SDTIntUnaryOp>;
 def ctlz       : SDNode<"ISD::CTLZ"       , SDTIntUnaryOp>;
Index: lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2891,6 +2891,41 @@
   case ISD::BSWAP:
     Results.push_back(ExpandBSWAP(Node->getOperand(0), dl));
     break;
+  case ISD::FHADD:
+  case ISD::HADD: {
+    // Expand a horizontal sum into a sequence of shuffle+add steps: each step
+    // adds the upper half of the remaining lanes onto the lower half, and
+    // lane 0 of the final vector is extracted as the result.
+    const bool IsFP = Node->getOpcode() == ISD::FHADD;
+    assert((!IsFP || TM.Options.UnsafeFPMath) &&
+           "Floating point horizontal sum only supported for fast-math");
+    SDValue OpVal = Node->getOperand(0);
+    EVT VT = OpVal.getValueType();
+    unsigned NumElems = VT.getVectorNumElements();
+    EVT EltVT = VT.getVectorElementType();
+    // Repeated halving only covers every lane for power-of-2 lane counts.
+    assert(NumElems != 0 && (NumElems & (NumElems - 1)) == 0 &&
+           "Horizontal sum expansion requires a power-of-2 element count");
+
+    // Entries written by an earlier, wider step are not reset; they only
+    // affect lanes that no later step reads.
+    SmallVector<int, 16> ShuffleMask(NumElems, -1);
+    for (unsigned i = NumElems; i != 1; i >>= 1) {
+      // Move the upper half of the vector to the lower half.
+      for (unsigned j = 0; j != i / 2; ++j)
+        ShuffleMask[j] = (i / 2 + j);
+      SDValue Shuffle = DAG.getVectorShuffle(VT, dl, OpVal, DAG.getUNDEF(VT),
+                                             &ShuffleMask[0]);
+      OpVal = DAG.getNode(IsFP ? ISD::FADD : ISD::ADD, dl, VT, OpVal, Shuffle);
+    }
+    Tmp1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, OpVal,
+                       DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+    // Convert to the node's result type, which need not equal the element type.
+    if (!IsFP)
+      Tmp1 = DAG.getAnyExtOrTrunc(Tmp1, dl, Node->getValueType(0));
+    Results.push_back(Tmp1);
+    break;
+  }
   case ISD::FRAMEADDR:
   case ISD::RETURNADDR:
   case ISD::FRAME_TO_ARGS_OFFSET:
Index: lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -890,6 +890,7 @@
   case ISD::SRL:
   case ISD::ROTL:
   case ISD::ROTR: Res = PromoteIntOp_Shift(N); break;
+  case ISD::HADD: Res = PromoteIntOp_HADD(N); break;
   }
 
   // If the result is null, the sub-method took care of registering results etc.
@@ -1223,6 +1224,11 @@
                                 N->getOperand(0).getValueType().getScalarType());
 }
 
+SDValue DAGTypeLegalizer::PromoteIntOp_HADD(SDNode *N) {
+  // Rebuild the node on the promoted operand; the result type is unchanged.
+  SDValue PromotedVec = GetPromotedInteger(N->getOperand(0));
+  return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), PromotedVec);
+}
 
 //===----------------------------------------------------------------------===//
 //  Integer Result Expansion
Index: lib/CodeGen/SelectionDAG/LegalizeTypes.h
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -294,6 +294,7 @@
   SDValue PromoteIntOp_ZERO_EXTEND(SDNode *N);
   SDValue PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo);
   SDValue PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_HADD(SDNode *N);
 
   void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code);
 
@@ -639,6 +640,7 @@
   SDValue SplitVecOp_VSELECT(SDNode *N, unsigned OpNo);
   SDValue SplitVecOp_UnaryOp(SDNode *N);
   SDValue SplitVecOp_TruncateHelper(SDNode *N);
+  SDValue SplitVecOp_HADD(SDNode *N);
 
   SDValue SplitVecOp_BITCAST(SDNode *N);
   SDValue SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N);
@@ -710,6 +712,7 @@
   SDValue WidenVecOp_SETCC(SDNode* N);
 
   SDValue WidenVecOp_Convert(SDNode *N);
+  SDValue WidenVecOp_HADD(SDNode *N);
 
   //===--------------------------------------------------------------------===//
   // Vector Widening Utilities Support: LegalizeVectorTypes.cpp
Index: lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -59,6 +59,9 @@
   /// \brief Implements unrolling a VSETCC.
   SDValue UnrollVSETCC(SDValue Op);
 
+  /// \brief Implements unrolling a HADD/FHADD.
+  SDValue UnrollHADD(SDValue Op);
+
   /// \brief Implement expand-based legalization of vector operations.
   ///
   /// This is just a high-level routine to dispatch to specific code paths for
@@ -714,6 +717,9 @@
   case ISD::UABSDIFF:
   case ISD::SABSDIFF:
     return ExpandABSDIFF(Op);
+  case ISD::HADD:
+  case ISD::FHADD:
+    return UnrollHADD(Op);
   default:
     return DAG.UnrollVectorOp(Op.getNode());
   }
@@ -1045,6 +1051,41 @@
   return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
 }
 
+/// Expand HADD/FHADD into shuffle+add steps: each step adds the upper half of
+/// the remaining lanes onto the lower half, and lane 0 of the final vector is
+/// extracted as the result.
+SDValue VectorLegalizer::UnrollHADD(SDValue Op) {
+  const bool IsFP = Op->getOpcode() == ISD::FHADD;
+  assert((!IsFP || DAG.getTarget().Options.UnsafeFPMath) &&
+         "Floating point horizontal sum only supported for fast-math");
+  SDLoc dl(Op);
+  SDValue OpVal = Op.getOperand(0);
+  EVT VT = OpVal.getValueType();
+  unsigned NumElems = VT.getVectorNumElements();
+  EVT EltVT = VT.getVectorElementType();
+  // Repeated halving only covers every lane for power-of-2 lane counts.
+  assert(NumElems != 0 && (NumElems & (NumElems - 1)) == 0 &&
+         "Horizontal sum expansion requires a power-of-2 element count");
+
+  // Entries written by an earlier, wider step are not reset; they only affect
+  // lanes that no later step reads.
+  SmallVector<int, 16> ShuffleMask(NumElems, -1);
+  for (unsigned i = NumElems; i != 1; i >>= 1) {
+    // Move the upper half of the vector to the lower half.
+    for (unsigned j = 0; j != i / 2; ++j)
+      ShuffleMask[j] = (i / 2 + j);
+    SDValue Shuffle =
+        DAG.getVectorShuffle(VT, dl, OpVal, DAG.getUNDEF(VT), &ShuffleMask[0]);
+    OpVal = DAG.getNode(IsFP ? ISD::FADD : ISD::ADD, dl, VT, OpVal, Shuffle);
+  }
+  SDValue Tmp1 = DAG.getNode(
+      ISD::EXTRACT_VECTOR_ELT, dl, EltVT, OpVal,
+      DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+  // Convert to the node's result type, which need not equal the element type.
+  if (!IsFP)
+    Tmp1 = DAG.getAnyExtOrTrunc(Tmp1, dl, Op.getValueType());
+  return Tmp1;
+}
 }
 
 bool SelectionDAG::LegalizeVectors() {
Index: lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -643,6 +643,8 @@
   case ISD::SINT_TO_FP:
   case ISD::TRUNCATE:
   case ISD::UINT_TO_FP:
+  case ISD::HADD:
+  case ISD::FHADD:
     SplitVecRes_UnaryOp(N, Lo, Hi);
     break;
 
@@ -1398,6 +1400,10 @@
     case ISD::FTRUNC:
       Res = SplitVecOp_UnaryOp(N);
       break;
+    case ISD::HADD:
+    case ISD::FHADD:
+      Res = SplitVecOp_HADD(N);
+      break;
     }
   }
 
@@ -1483,6 +1489,18 @@
                      JoinIntegers(Lo, Hi));
 }
 
+// Reduce each half of the split operand, then add the two partial results.
+SDValue DAGTypeLegalizer::SplitVecOp_HADD(SDNode *N) {
+  SDLoc dl(N);
+  const unsigned Opc = N->getOpcode();
+  const unsigned AddOpc = Opc == ISD::HADD ? ISD::ADD : ISD::FADD;
+  SDValue LoSum, HiSum;
+  GetSplitVector(N->getOperand(0), LoSum, HiSum);
+  LoSum = DAG.getNode(Opc, dl, N->getValueType(0), LoSum);
+  HiSum = DAG.getNode(Opc, dl, N->getValueType(0), HiSum);
+  return DAG.getNode(AddOpc, dl, N->getValueType(0), LoSum, HiSum);
+}
+
 SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N) {
   // We know that the extracted result type is legal.
   EVT SubVT = N->getValueType(0);
@@ -2833,6 +2851,10 @@
   case ISD::TRUNCATE:
     Res = WidenVecOp_Convert(N);
     break;
+  case ISD::HADD:
+  case ISD::FHADD:
+    Res = WidenVecOp_HADD(N);
+    break;
   }
 
   // If Res is null, the sub-method took care of registering the result.
@@ -2851,6 +2873,39 @@
   return false;
 }
 
+SDValue DAGTypeLegalizer::WidenVecOp_HADD(SDNode *N) {
+  const bool IsFP = N->getOpcode() == ISD::FHADD;
+  assert((!IsFP || DAG.getTarget().Options.UnsafeFPMath) &&
+         "Floating point horizontal sum only supported for fast-math");
+  // Since widen introduces undefs, scalarize the horizontal sum for actual
+  // number of vector elements.
+  // TODO: Improve scalarization using vector shift and add on supported
+  // targets.
+  SDValue Op = N->getOperand(0);
+  EVT VT = Op.getValueType();
+  unsigned NumElems = VT.getVectorNumElements();
+  EVT EltVT = VT.getVectorElementType();
+  EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+  unsigned AddOpc = IsFP ? ISD::FADD : ISD::ADD;
+  SDLoc dl(Op);
+
+  // Start from element 0 so that a single-element operand needs no add.
+  SDValue Sum = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op,
+                            DAG.getConstant(0, dl, IdxVT));
+  for (unsigned i = 1; i < NumElems; ++i) {
+    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op,
+                              DAG.getConstant(i, dl, IdxVT));
+    // The first add takes the running sum on the left and later adds take it
+    // on the right, keeping the operand order of the generated chain stable.
+    Sum = i == 1 ? DAG.getNode(AddOpc, dl, EltVT, Sum, Elt)
+                 : DAG.getNode(AddOpc, dl, EltVT, Elt, Sum);
+  }
+  // Convert to the node's result type, which need not equal the element type.
+  if (!IsFP)
+    Sum = DAG.getAnyExtOrTrunc(Sum, dl, N->getValueType(0));
+  return Sum;
+}
+
 SDValue DAGTypeLegalizer::WidenVecOp_EXTEND(SDNode *N) {
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4663,6 +4663,16 @@
                              getValue(I.getArgOperand(0)),
                              getValue(I.getArgOperand(1))));
     return nullptr;
+  case Intrinsic::hadd: {
+    // Lower llvm.hadd to FHADD for floating point element types and to HADD
+    // otherwise; the node's result type is the call's result type.
+    SDValue Vec = getValue(I.getArgOperand(0));
+    bool IsFP = Vec.getValueType().getVectorElementType().isFloatingPoint();
+    unsigned Opc = IsFP ? ISD::FHADD : ISD::HADD;
+    EVT ResVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+    setValue(&I, DAG.getNode(Opc, sdl, ResVT, Vec));
+    return nullptr;
+  }
   case Intrinsic::cttz: {
     SDValue Arg = getValue(I.getArgOperand(0));
     ConstantInt *CI = cast<ConstantInt>(I.getArgOperand(1));
Index: lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -162,6 +162,8 @@
   case ISD::FLOG:                       return "flog";
   case ISD::FLOG2:                      return "flog2";
   case ISD::FLOG10:                     return "flog10";
+  case ISD::HADD:                       return "hadd";
+  case ISD::FHADD:                      return "fhadd";
 
   // Binary operators
   case ISD::ADD:                        return "add";
Index: lib/CodeGen/TargetLoweringBase.cpp
===================================================================
--- lib/CodeGen/TargetLoweringBase.cpp
+++ lib/CodeGen/TargetLoweringBase.cpp
@@ -829,6 +829,8 @@
     setOperationAction(ISD::UMULO, VT, Expand);
     setOperationAction(ISD::UABSDIFF, VT, Expand);
     setOperationAction(ISD::SABSDIFF, VT, Expand);
+    setOperationAction(ISD::HADD, VT, Expand);
+    setOperationAction(ISD::FHADD, VT, Expand);
 
     // These library functions default to expand.
     setOperationAction(ISD::FROUND, VT, Expand);
Index: test/CodeGen/X86/vec-hadd-float-128.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/vec-hadd-float-128.ll
@@ -0,0 +1,45 @@
+; RUN:  llc < %s -mtriple=x86_64-unknown-linux-gnu -enable-unsafe-fp-math | FileCheck --check-prefix=UNSAFE %s
+
+; Codegen for llvm.hadd on floating point vectors of at most 128 bits.
+; The floating point form is only supported with unsafe FP math enabled.
+
+declare float @llvm.hadd.f32.v2f32(<2 x float>)
+
+define float @test1_hadd_float_f32(<2 x float> %a1) {
+; UNSAFE-LABEL: test1_hadd_float_f32:
+; UNSAFE:             movaps  %xmm0, %xmm1
+; UNSAFE-NEXT:        shufps  $229, %xmm1, %xmm1      # xmm1 = xmm1[1,1,2,3]
+; UNSAFE-NEXT:        addss   %xmm0, %xmm1
+; UNSAFE-NEXT:        movaps  %xmm1, %xmm0
+; UNSAFE-NEXT:        retq
+  %1 = call float @llvm.hadd.f32.v2f32(<2 x float> %a1)
+  ret float %1
+}
+
+declare float @llvm.hadd.f32.v4f32(<4 x float>)
+
+define float @test2_hadd_float_f32(<4 x float> %a1) {
+; UNSAFE-LABEL: test2_hadd_float_f32:
+; UNSAFE:             movapd  %xmm0, %xmm1
+; UNSAFE-NEXT:        shufpd  $1, %xmm1, %xmm1        # xmm1 = xmm1[1,0]
+; UNSAFE-NEXT:        addps   %xmm0, %xmm1
+; UNSAFE-NEXT:        movaps  %xmm1, %xmm0
+; UNSAFE-NEXT:        shufps  $237, %xmm0, %xmm0      # xmm0 = xmm0[1,3,2,3]
+; UNSAFE-NEXT:        addps   %xmm1, %xmm0
+; UNSAFE-NEXT:        retq
+  %1 = call float @llvm.hadd.f32.v4f32(<4 x float> %a1)
+  ret float %1
+}
+
+declare double @llvm.hadd.f64.v2f64(<2 x double>)
+
+define double @test1_hadd_float_f64(<2 x double> %a1) {
+; UNSAFE-LABEL: test1_hadd_float_f64:
+; UNSAFE:             movapd  %xmm0, %xmm1
+; UNSAFE-NEXT:        shufpd  $1, %xmm1, %xmm1        # xmm1 = xmm1[1,0]
+; UNSAFE-NEXT:        addpd   %xmm0, %xmm1
+; UNSAFE-NEXT:        movapd  %xmm1, %xmm0
+; UNSAFE-NEXT:        retq
+  %1 = call double @llvm.hadd.f64.v2f64(<2 x double> %a1)
+  ret double %1
+}
Index: test/CodeGen/X86/vec-hadd-float-256.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/vec-hadd-float-256.ll
@@ -0,0 +1,29 @@
+; RUN:  llc < %s -mtriple=x86_64-unknown-linux-gnu -enable-unsafe-fp-math | FileCheck --check-prefix=UNSAFE %s
+
+; Codegen for llvm.hadd on a floating point vector wider than one xmm register:
+; the expected code reduces xmm-sized pieces and adds the scalar results.
+
+declare double @llvm.hadd.f64.v8f64(<8 x double>)
+
+define double @test2_hadd_float_f64(<8 x double> %a1) {
+; UNSAFE-LABEL: test2_hadd_float_f64:
+; UNSAFE:             movapd  {{.*}}
+; UNSAFE-NEXT:        shufpd  {{.*#+}}       xmm4 = xmm4[1,0]
+; UNSAFE-NEXT:        addpd   {{.*}}
+; UNSAFE-NEXT:        movapd  {{.*}}
+; UNSAFE-NEXT:        shufpd  {{.*#+}}       xmm3 = xmm3[1,0]
+; UNSAFE-NEXT:        addpd   {{.*}}
+; UNSAFE-NEXT:        addsd   {{.*}}
+; UNSAFE-NEXT:        movapd  {{.*}}
+; UNSAFE-NEXT:        shufpd  {{.*#+}}       xmm2 = xmm2[1,0]
+; UNSAFE-NEXT:        addpd   {{.*}}
+; UNSAFE-NEXT:        movapd  {{.*}}
+; UNSAFE-NEXT:        shufpd  {{.*#+}}       xmm1 = xmm1[1,0]
+; UNSAFE-NEXT:        addpd   {{.*}}
+; UNSAFE-NEXT:        addsd   {{.*}}
+; UNSAFE-NEXT:        addsd   {{.*}}
+; UNSAFE-NEXT:        movapd  {{.*}}
+; UNSAFE-NEXT:        retq
+  %1 = call double @llvm.hadd.f64.v8f64(<8 x double> %a1)
+  ret double %1
+}
Index: test/CodeGen/X86/vec-hadd-int-128.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/vec-hadd-int-128.ll
@@ -0,0 +1,74 @@
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s -check-prefix=CHECK
+
+; Codegen for llvm.hadd on integer vectors of at most 128 bits. The <3 x i32>
+; case is expected to be summed element by element in scalar registers.
+
+declare i8 @llvm.hadd.i8.v4i8(<4 x i8>)
+
+define i8 @test_hadd_int_i8(<4 x i8> %a1) {
+; CHECK-LABEL: test_hadd_int_i8:
+; CHECK:             pshufd  {{.*#+}}              xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT:        paddd   {{.*}}
+; CHECK-NEXT:        pshufd  {{.*#+}}              xmm0 = xmm1[1,3,2,3]
+; CHECK-NEXT:        paddd   {{.*}}
+; CHECK-NEXT:        movd    %xmm0, %eax
+; CHECK-NEXT:        retq
+  %1 = call i8 @llvm.hadd.i8.v4i8(<4 x i8> %a1)
+  ret i8 %1
+}
+
+declare i16 @llvm.hadd.i16.v4i16(<4 x i16>)
+
+define i16 @test_hadd_int_i16(<4 x i16> %a1) {
+; CHECK-LABEL: test_hadd_int_i16:
+; CHECK:             pshufd  {{.*#+}}                 xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT:        paddd   {{.*}}
+; CHECK-NEXT:        pshufd  {{.*#+}}                 xmm0 = xmm1[1,3,2,3]
+; CHECK-NEXT:        paddd   {{.*}}
+; CHECK-NEXT:        movd    %xmm0, %eax
+; CHECK-NEXT:        retq
+  %1 = call i16 @llvm.hadd.i16.v4i16(<4 x i16> %a1)
+  ret i16 %1
+}
+
+declare i32 @llvm.hadd.i32.v3i32(<3 x i32>)
+
+define i32 @test1_hadd_int_i32(<3 x i32> %a1) {
+; CHECK-LABEL: test1_hadd_int_i32:
+; CHECK:             movd    %xmm0, %eax
+; CHECK-NEXT:        pshufd  {{.*#+}}                xmm1 = xmm0[1,1,2,3]
+; CHECK-NEXT:        movd    %xmm1, %ecx
+; CHECK-NEXT:        addl    %eax, %ecx
+; CHECK-NEXT:        pshufd  {{.*#+}}                xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT:        movd    %xmm0, %eax
+; CHECK-NEXT:        addl    %ecx, %eax
+; CHECK-NEXT:        retq
+  %1 = call i32 @llvm.hadd.i32.v3i32(<3 x i32> %a1)
+  ret i32 %1
+}
+
+declare i32 @llvm.hadd.i32.v4i32(<4 x i32>)
+
+define i32 @test2_hadd_int_i32(<4 x i32> %a1) {
+; CHECK-LABEL: test2_hadd_int_i32:
+; CHECK:             pshufd  {{.*#+}}                xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT:        paddd   {{.*}}
+; CHECK-NEXT:        pshufd  {{.*#+}}                xmm0 = xmm1[1,3,2,3]
+; CHECK-NEXT:        paddd   {{.*}}
+; CHECK-NEXT:        movd    %xmm0, %eax
+; CHECK-NEXT:        retq
+  %1 = call i32 @llvm.hadd.i32.v4i32(<4 x i32> %a1)
+  ret i32 %1
+}
+
+declare i64 @llvm.hadd.i64.v2i64(<2 x i64>)
+
+define i64 @test1_hadd_int_i64(<2 x i64> %a1) {
+; CHECK-LABEL: test1_hadd_int_i64:
+; CHECK:             pshufd  {{.*#+}}                xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT:        paddq   {{.*}}
+; CHECK-NEXT:        movd    %xmm1, %rax
+; CHECK-NEXT:        retq
+  %1 = call i64 @llvm.hadd.i64.v2i64(<2 x i64> %a1)
+  ret i64 %1
+}
Index: test/CodeGen/X86/vec-hadd-int-256.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/vec-hadd-int-256.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=CHECK
+; The expected code reduces two xmm-sized halves separately and adds the scalar results.
+declare i64 @llvm.hadd.i64.v4i64(<4 x i64>)
+
+define i64 @test2_hadd_int_i64(<4 x i64> %a1) {
+; CHECK-LABEL: test2_hadd_int_i64:
+; CHECK:             pshufd  {{.*#+}}                xmm2 = xmm1[2,3,0,1]
+; CHECK-NEXT:        paddq   {{.*}}
+; CHECK-NEXT:        movd    %xmm2, %rcx
+; CHECK-NEXT:        pshufd  {{.*#+}}       xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT:        paddq   {{.*}}
+; CHECK-NEXT:        movd    %xmm1, %rax
+; CHECK-NEXT:        addq    %rcx, %rax
+; CHECK-NEXT:        retq
+  %1 = call i64 @llvm.hadd.i64.v4i64(<4 x i64> %a1)
+  ret i64 %1
+}