Index: docs/LangRef.rst
===================================================================
--- docs/LangRef.rst
+++ docs/LangRef.rst
@@ -10785,6 +10785,55 @@
       %1 = select <4 x i1> %ispos, <4 x i32> %sub, <4 x i32> %neg
 
+'``llvm.hsum.*``'
+^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic. The argument is a vector of any integer or
+floating-point type. Floating-point types are only supported in fast-math mode.
+
+.. code-block:: llvm
+
+      declare i32 @llvm.hsum.i32.v4i32(<4 x i32> %a)
+      declare float @llvm.hsum.f32.v4f32(<4 x float> %a)
+
+
+Overview:
+"""""""""
+
+The '``llvm.hsum``' intrinsic returns the horizontal (reduction) sum of the
+elements of the vector operand, treating them as integers or floats.
+
+.. note::
+
+   These intrinsics are primarily used during the code generation stage of
+   compilation. They are generated by compiler passes such as the Loop and
+   SLP vectorizers.
+   The expectation is that frontends should not need to generate these
+   intrinsics themselves.
+
+Arguments:
+""""""""""
+
+The argument is a vector of integer or floating-point values.
+
+Semantics:
+""""""""""
+
+The expression::
+
+      call i32 @llvm.hsum.i32.v4i32(<4 x i32> %a)
+
+is equivalent to::
+
+      %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+      %2 = add <4 x i32> %a, %1
+      %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+      %4 = add <4 x i32> %2, %3
+      %5 = extractelement <4 x i32> %4, i32 0
+
+
 Half Precision Floating Point Intrinsics
 ----------------------------------------
 
Index: include/llvm/CodeGen/ISDOpcodes.h
===================================================================
--- include/llvm/CodeGen/ISDOpcodes.h
+++ include/llvm/CodeGen/ISDOpcodes.h
@@ -342,6 +342,12 @@
     /// vector. These nodes are generated from llvm.*absdiff* intrinsics.
     SABSDIFF, UABSDIFF,
 
+    /// HSUM/FHSUM(VEC) - Horizontal sum across the elements of the sole
+    /// integer or floating-point input vector. Returns a vector of the same
+    /// type as VEC, with the scalar result of the sum stored in its first
+    /// element. These nodes are generated from llvm.hsum* intrinsics.
+    HSUM, FHSUM,
+
     /// Bit counting operators with an undefined result for zero inputs.
     CTTZ_ZERO_UNDEF, CTLZ_ZERO_UNDEF,
 
Index: include/llvm/IR/Intrinsics.td
===================================================================
--- include/llvm/IR/Intrinsics.td
+++ include/llvm/IR/Intrinsics.td
@@ -612,6 +612,9 @@
 def int_uabsdiff : Intrinsic<[llvm_anyvector_ty],
                              [ LLVMMatchType<0>, LLVMMatchType<0> ], [IntrNoMem]>;
 
+// Calculate the horizontal/reduction sum across the elements of input vector.
+def int_hsum : Intrinsic<[llvm_any_ty], [llvm_anyvector_ty], [IntrNoMem]>;
+
 //===-------------------------- Masked Intrinsics -------------------------===//
 //
 def int_masked_store : Intrinsic<[], [llvm_anyvector_ty, LLVMPointerTo<0>,
Index: include/llvm/Target/TargetSelectionDAG.td
===================================================================
--- include/llvm/Target/TargetSelectionDAG.td
+++ include/llvm/Target/TargetSelectionDAG.td
@@ -388,6 +388,8 @@
 def sabsdiff   : SDNode<"ISD::SABSDIFF"  , SDTIntBinOp>;
 def uabsdiff   : SDNode<"ISD::UABSDIFF"  , SDTIntBinOp>;
+def hsum       : SDNode<"ISD::HSUM"      , SDTIntUnaryOp>;
+def fhsum      : SDNode<"ISD::FHSUM"     , SDTIntUnaryOp>;
 def sext_inreg : SDNode<"ISD::SIGN_EXTEND_INREG", SDTExtInreg>;
 def bswap      : SDNode<"ISD::BSWAP"     , SDTIntUnaryOp>;
 def ctlz       : SDNode<"ISD::CTLZ"      , SDTIntUnaryOp>;
Index: lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -150,6 +150,9 @@
   case ISD::SABSDIFF:
     Res = PromoteIntRes_SimpleIntBinOp(N); break;
+  case ISD::HSUM:
+    Res = PromoteIntRes_HSUM(N);
+    break;
   }
 
   // If the result is null then the sub-method took care of registering it.
@@ -157,6 +160,12 @@
     SetPromotedInteger(SDValue(N, ResNo), Res);
 }
 
+SDValue DAGTypeLegalizer::PromoteIntRes_HSUM(SDNode *N) {
+  SDValue OpVal = GetPromotedInteger(N->getOperand(0));
+  return DAG.getNode(N->getOpcode(), SDLoc(N),
+                     OpVal.getValueType(), OpVal);
+}
+
 SDValue DAGTypeLegalizer::PromoteIntRes_MERGE_VALUES(SDNode *N,
                                                      unsigned ResNo) {
   SDValue Op = DisintegrateMERGE_VALUES(N, ResNo);
@@ -890,6 +899,7 @@
   case ISD::SRL:
   case ISD::ROTL:
   case ISD::ROTR: Res = PromoteIntOp_Shift(N); break;
+  case ISD::HSUM: Res = PromoteIntOp_HSUM(N); break;
   }
 
   // If the result is null, the sub-method took care of registering results etc.
@@ -1223,6 +1233,11 @@
                                 N->getOperand(0).getValueType().getScalarType());
 }
 
+SDValue DAGTypeLegalizer::PromoteIntOp_HSUM(SDNode *N) {
+  SDValue Op = GetPromotedInteger(N->getOperand(0));
+  return DAG.getNode(N->getOpcode(), SDLoc(N),
+                     N->getValueType(0), Op);
+}
 
 //===----------------------------------------------------------------------===//
 //  Integer Result Expansion
Index: lib/CodeGen/SelectionDAG/LegalizeTypes.h
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -264,6 +264,7 @@
   SDValue PromoteIntRes_UNDEF(SDNode *N);
   SDValue PromoteIntRes_VAARG(SDNode *N);
   SDValue PromoteIntRes_XMULO(SDNode *N, unsigned ResNo);
+  SDValue PromoteIntRes_HSUM(SDNode *N);
 
   // Integer Operand Promotion.
   bool PromoteIntegerOperand(SDNode *N, unsigned OperandNo);
@@ -294,6 +295,7 @@
   SDValue PromoteIntOp_ZERO_EXTEND(SDNode *N);
   SDValue PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo);
   SDValue PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_HSUM(SDNode *N);
 
   void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code);
 
@@ -640,6 +642,7 @@
   SDValue SplitVecOp_VSELECT(SDNode *N, unsigned OpNo);
   SDValue SplitVecOp_UnaryOp(SDNode *N);
   SDValue SplitVecOp_TruncateHelper(SDNode *N);
+  SDValue SplitVecOp_HSUM(SDNode *N);
 
   SDValue SplitVecOp_BITCAST(SDNode *N);
   SDValue SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N);
@@ -714,6 +717,7 @@
   SDValue WidenVecOp_Convert(SDNode *N);
   SDValue WidenVecOp_FCOPYSIGN(SDNode *N);
+  SDValue WidenVecOp_HSUM(SDNode *N);
 
   //===--------------------------------------------------------------------===//
   // Vector Widening Utilities Support: LegalizeVectorTypes.cpp
Index: lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -106,6 +106,7 @@
   SDValue ExpandStore(SDValue Op);
   SDValue ExpandFNEG(SDValue Op);
   SDValue ExpandABSDIFF(SDValue Op);
+  SDValue ExpandHSUM(SDValue Op);
 
   /// \brief Implements vector promotion.
   ///
@@ -331,6 +332,8 @@
   case ISD::UMAX:
   case ISD::UABSDIFF:
   case ISD::SABSDIFF:
+  case ISD::HSUM:
+  case ISD::FHSUM:
     QueryType = Node->getValueType(0);
     break;
   case ISD::FP_ROUND_INREG:
@@ -716,6 +719,9 @@
   case ISD::UABSDIFF:
   case ISD::SABSDIFF:
     return ExpandABSDIFF(Op);
+  case ISD::HSUM:
+  case ISD::FHSUM:
+    return ExpandHSUM(Op);
   default:
     return DAG.UnrollVectorOp(Op.getNode());
   }
@@ -1047,6 +1053,29 @@
   return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
 }
 
+SDValue VectorLegalizer::ExpandHSUM(SDValue Op) {
+  bool isExpand = true;
+  if (Op->getOpcode() == ISD::FHSUM && !DAG.getTarget().Options.UnsafeFPMath)
+    isExpand = false;
+  assert(isExpand &&
+         "Floating point horizontal sum only supported for fast-math");
+  SDLoc dl(Op);
+  SDValue OpVal = Op.getOperand(0);
+  EVT VT = OpVal.getValueType();
+  unsigned NumElems = VT.getVectorNumElements();
+
+  SmallVector<int, 8> ShuffleMask(NumElems, -1);
+  for (unsigned i = NumElems; i != 1; i >>= 1) {
+    // Move the upper half of the vector to the lower half.
+    for (unsigned j = 0; j != i / 2; ++j)
+      ShuffleMask[j] = (i / 2 + j);
+    SDValue Shuffle =
+        DAG.getVectorShuffle(VT, dl, OpVal, DAG.getUNDEF(VT), &ShuffleMask[0]);
+    OpVal = DAG.getNode(Op->getOpcode() == ISD::HSUM ? ISD::ADD : ISD::FADD,
+                        dl, VT, OpVal, Shuffle);
+  }
+  return OpVal;
+}
 }
 
 bool SelectionDAG::LegalizeVectors() {
Index: lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -646,6 +646,8 @@
   case ISD::SINT_TO_FP:
   case ISD::TRUNCATE:
   case ISD::UINT_TO_FP:
+  case ISD::HSUM:
+  case ISD::FHSUM:
     SplitVecRes_UnaryOp(N, Lo, Hi);
     break;
 
@@ -1422,6 +1424,10 @@
   case ISD::FTRUNC:
     Res = SplitVecOp_UnaryOp(N);
     break;
+  case ISD::HSUM:
+  case ISD::FHSUM:
+    Res = SplitVecOp_HSUM(N);
+    break;
   }
 }
@@ -1507,6 +1513,18 @@
                  JoinIntegers(Lo, Hi));
 }
 
+SDValue DAGTypeLegalizer::SplitVecOp_HSUM(SDNode *N) {
+  SDValue Lo, Hi;
+  EVT SubVT = N->getValueType(0);
+  SDValue OpVal = N->getOperand(0);
+  SDLoc dl(N);
+  GetSplitVector(OpVal, Lo, Hi);
+  Lo = DAG.getNode(N->getOpcode(), dl, SubVT, Lo);
+  Hi = DAG.getNode(N->getOpcode(), dl, SubVT, Hi);
+  return DAG.getNode(N->getOpcode() == ISD::HSUM ? ISD::ADD : ISD::FADD, dl,
+                     SubVT, Lo, Hi);
+}
+
 SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N) {
   // We know that the extracted result type is legal.
   EVT SubVT = N->getValueType(0);
@@ -2034,6 +2052,10 @@
   case ISD::FMA:
     Res = WidenVecRes_Ternary(N);
     break;
+  case ISD::HSUM:
+  case ISD::FHSUM:
+    Res = WidenVecRes_Unary(N);
+    break;
   }
 
   // If Res is null, the sub-method took care of registering the result.
@@ -2879,6 +2901,10 @@
   case ISD::TRUNCATE:
     Res = WidenVecOp_Convert(N);
     break;
+  case ISD::HSUM:
+  case ISD::FHSUM:
+    Res = WidenVecOp_HSUM(N);
+    break;
   }
 
   // If Res is null, the sub-method took care of registering the result.
@@ -2897,6 +2923,40 @@
   return false;
 }
 
+SDValue DAGTypeLegalizer::WidenVecOp_HSUM(SDNode *N) {
+  bool isExpand = true;
+  if (N->getOpcode() == ISD::FHSUM && !DAG.getTarget().Options.UnsafeFPMath)
+    isExpand = false;
+  assert(isExpand &&
+         "Floating point horizontal sum only supported for fast-math");
+
+  // Since widening introduces undef elements, scalarize the horizontal sum
+  // over the actual number of vector elements.
+  // TODO: Improve scalarization using vector shift and add on supported
+  // targets.
+  SDValue Op = N->getOperand(0);
+  EVT VT = Op.getValueType();
+  unsigned NumElems = VT.getVectorNumElements();
+  EVT EltVT = VT.getVectorElementType();
+  SDLoc dl(Op);
+
+  SDValue LHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op,
+      DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+  SDValue RHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op,
+      DAG.getConstant(1, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+  SDValue Ops;
+  Ops = DAG.getNode(N->getOpcode() == ISD::HSUM ? ISD::ADD : ISD::FADD, dl,
+                    EltVT, LHSElem, RHSElem);
+  for (unsigned i = 2; i < NumElems; ++i) {
+    LHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op,
+        DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+    Ops = DAG.getNode(N->getOpcode() == ISD::HSUM ? ISD::ADD : ISD::FADD, dl,
+                      EltVT, LHSElem, Ops);
+  }
+  return Ops;
+}
+
+
 SDValue DAGTypeLegalizer::WidenVecOp_EXTEND(SDNode *N) {
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4710,6 +4710,24 @@
                              getValue(I.getArgOperand(0)),
                              getValue(I.getArgOperand(1))));
     return nullptr;
+  case Intrinsic::hsum: {
+    /// At the IR level, this intrinsic is defined to return a scalar value.
+    /// At the SDAG level, however, it is transformed into two nodes:
+    /// *HSUM + EXTRACT_VECTOR_ELT. The idea is to keep the result of *HSUM
+    /// in a vector so that, if required, additional vector operations can be
+    /// performed on this vector result.
+    SDValue Arg = getValue(I.getArgOperand(0));
+    EVT Ty = Arg.getValueType();
+    EVT EltVT = Ty.getVectorElementType();
+    SDValue Hsum =
+        DAG.getNode(EltVT.isFloatingPoint() ? ISD::FHSUM : ISD::HSUM, sdl, Ty,
+                    Arg);
+    Hsum = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, sdl, EltVT, Hsum,
+                       DAG.getConstant(0, sdl,
+                                       TLI.getVectorIdxTy(DAG.getDataLayout())));
+    setValue(&I, Hsum);
+    return nullptr;
+  }
   case Intrinsic::cttz: {
     SDValue Arg = getValue(I.getArgOperand(0));
    ConstantInt *CI = cast<ConstantInt>(I.getArgOperand(1));
Index: lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -164,6 +164,8 @@
   case ISD::FLOG:                       return "flog";
   case ISD::FLOG2:                      return "flog2";
   case ISD::FLOG10:                     return "flog10";
+  case ISD::HSUM:                       return "hsum";
+  case ISD::FHSUM:                      return "fhsum";
 
   // Binary operators
   case ISD::ADD:                        return "add";
Index: lib/CodeGen/TargetLoweringBase.cpp
===================================================================
--- lib/CodeGen/TargetLoweringBase.cpp
+++ lib/CodeGen/TargetLoweringBase.cpp
@@ -831,6 +831,8 @@
     setOperationAction(ISD::UMULO, VT, Expand);
     setOperationAction(ISD::UABSDIFF, VT, Expand);
     setOperationAction(ISD::SABSDIFF, VT, Expand);
+    setOperationAction(ISD::HSUM, VT, Expand);
+    setOperationAction(ISD::FHSUM, VT, Expand);
 
     // These library functions default to expand.
     setOperationAction(ISD::FROUND, VT, Expand);
Index: test/CodeGen/X86/vec-hadd-float-128.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/vec-hadd-float-128.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -enable-unsafe-fp-math | FileCheck --check-prefix=UNSAFE %s
+
+
+declare float @llvm.hsum.f32.v2f32(<2 x float>)
+
+define float @test1_hsum_float_f32(<2 x float> %a1) {
+; UNSAFE-LABEL: test1_hsum_float_f32:
+; UNSAFE:       # BB#0:
+; UNSAFE-NEXT:    movapd %xmm0, %xmm1
+; UNSAFE-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; UNSAFE-NEXT:    addps %xmm0, %xmm1
+; UNSAFE-NEXT:    movaps %xmm1, %xmm0
+; UNSAFE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; UNSAFE-NEXT:    addps %xmm1, %xmm0
+; UNSAFE-NEXT:    retq
+  %1 = call float @llvm.hsum.f32.v2f32(<2 x float> %a1)
+  ret float %1
+}
+
+declare float @llvm.hsum.f32.v4f32(<4 x float>)
+
+define float @test2_hsum_float_f32(<4 x float> %a1) {
+; UNSAFE-LABEL: test2_hsum_float_f32:
+; UNSAFE:       # BB#0:
+; UNSAFE-NEXT:    movapd %xmm0, %xmm1
+; UNSAFE-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; UNSAFE-NEXT:    addps %xmm0, %xmm1
+; UNSAFE-NEXT:    movaps %xmm1, %xmm0
+; UNSAFE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; UNSAFE-NEXT:    addps %xmm1, %xmm0
+; UNSAFE-NEXT:    retq
+  %1 = call float @llvm.hsum.f32.v4f32(<4 x float> %a1)
+  ret float %1
+}
+
+declare double @llvm.hsum.f64.v2f64(<2 x double>)
+
+define double @test1_hsum_float_f64(<2 x double> %a1) {
+; UNSAFE-LABEL: test1_hsum_float_f64:
+; UNSAFE:       # BB#0:
+; UNSAFE-NEXT:    movapd %xmm0, %xmm1
+; UNSAFE-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; UNSAFE-NEXT:    addpd %xmm0, %xmm1
+; UNSAFE-NEXT:    movapd %xmm1, %xmm0
+; UNSAFE-NEXT:    retq
+  %1 = call double @llvm.hsum.f64.v2f64(<2 x double> %a1)
+  ret double %1
+}
Index: test/CodeGen/X86/vec-hadd-float-256.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/vec-hadd-float-256.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -enable-unsafe-fp-math | FileCheck --check-prefix=UNSAFE %s
+
+declare double @llvm.hsum.f64.v4f64(<4 x double>)
+
+define double @test_hsum_float_f64(<4 x double> %a1) {
+; UNSAFE-LABEL: test_hsum_float_f64:
+; UNSAFE:       # BB#0:
+; UNSAFE-NEXT:    movapd %xmm0, %xmm1
+; UNSAFE-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; UNSAFE-NEXT:    addpd %xmm0, %xmm1
+; UNSAFE-NEXT:    movapd %xmm1, %xmm0
+; UNSAFE-NEXT:    retq
+  %1 = call double @llvm.hsum.f64.v4f64(<4 x double> %a1)
+  ret double %1
+}
Index: test/CodeGen/X86/vec-hadd-int-128.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/vec-hadd-int-128.ll
@@ -0,0 +1,75 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s -check-prefix=CHECK
+
+declare i8 @llvm.hsum.i8.v4i8(<4 x i8>)
+
+define i8 @test_hsum_int_i8(<4 x i8> %a1) {
+; CHECK-LABEL: test_hsum_int_i8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT:    paddd %xmm0, %xmm1
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; CHECK-NEXT:    paddd %xmm1, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; CHECK-NEXT:    retq
+  %1 = call i8 @llvm.hsum.i8.v4i8(<4 x i8> %a1)
+  ret i8 %1
+}
+
+declare i16 @llvm.hsum.i16.v4i16(<4 x i16>)
+
+define i16 @test_hsum_int_i16(<4 x i16> %a1) {
+; CHECK-LABEL: test_hsum_int_i16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT:    paddd %xmm0, %xmm1
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; CHECK-NEXT:    paddd %xmm1, %xmm0
+; CHECK-NEXT:    movd %xmm0, %eax
+; CHECK-NEXT:    retq
+  %1 = call i16 @llvm.hsum.i16.v4i16(<4 x i16> %a1)
+  ret i16 %1
+}
+
+declare i32 @llvm.hsum.i32.v3i32(<3 x i32>)
+
+define i32 @test1_hsum_int_i32(<3 x i32> %a1) {
+; CHECK-LABEL: test1_hsum_int_i32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT:    paddd %xmm0, %xmm1
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; CHECK-NEXT:    paddd %xmm1, %xmm0
+; CHECK-NEXT:    movd %xmm0, %eax
+; CHECK-NEXT:    retq
+  %1 = call i32 @llvm.hsum.i32.v3i32(<3 x i32> %a1)
+  ret i32 %1
+}
+
+declare i32 @llvm.hsum.i32.v4i32(<4 x i32>)
+
+define i32 @test2_hsum_int_i32(<4 x i32> %a1) {
+; CHECK-LABEL: test2_hsum_int_i32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT:    paddd %xmm0, %xmm1
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; CHECK-NEXT:    paddd %xmm1, %xmm0
+; CHECK-NEXT:    movd %xmm0, %eax
+; CHECK-NEXT:    retq
+  %1 = call i32 @llvm.hsum.i32.v4i32(<4 x i32> %a1)
+  ret i32 %1
+}
+
+declare i64 @llvm.hsum.i64.v2i64(<2 x i64>)
+
+define i64 @test1_hsum_int_i64(<2 x i64> %a1) {
+; CHECK-LABEL: test1_hsum_int_i64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT:    paddq %xmm0, %xmm1
+; CHECK-NEXT:    movd %xmm1, %rax
+; CHECK-NEXT:    retq
+  %1 = call i64 @llvm.hsum.i64.v2i64(<2 x i64> %a1)
+  ret i64 %1
+}
Index: test/CodeGen/X86/vec-hadd-int-256.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/vec-hadd-int-256.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=CHECK
+
+declare i64 @llvm.hsum.i64.v4i64(<4 x i64>)
+
+define i64 @test2_hsum_int_i64(<4 x i64> %a1) {
+; CHECK-LABEL: test2_hsum_int_i64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT:    paddq %xmm0, %xmm1
+; CHECK-NEXT:    movd %xmm1, %rax
+; CHECK-NEXT:    retq
+  %1 = call i64 @llvm.hsum.i64.v4i64(<4 x i64> %a1)
+  ret i64 %1
+}
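
For reference, the IR-level form a vectorizer would be expected to emit is sketched below. The function and its name are hypothetical and not part of the patch; the call matches the overloaded naming used in the tests above and stands in for the shuffle/add reduction sequence shown in the LangRef semantics section.

; Hypothetical caller: horizontal sum of a <4 x i32> accumulator, as a
; vectorizer-generated reduction epilogue would emit it with this intrinsic.
define i32 @reduce_v4i32(<4 x i32> %acc) {
entry:
  %sum = call i32 @llvm.hsum.i32.v4i32(<4 x i32> %acc)
  ret i32 %sum
}

declare i32 @llvm.hsum.i32.v4i32(<4 x i32>)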