Index: docs/LangRef.rst
===================================================================
--- docs/LangRef.rst
+++ docs/LangRef.rst
@@ -10785,6 +10785,55 @@
       %1 = select <4 x i1> %ispos, <4 x i32> %sub, <4 x i32> %neg
 
+'``llvm.hsum.*``'
+^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic. The argument is a vector of any integer or
+floating-point type. Floating-point types are only supported in fast-math mode.
+
+.. code-block:: llvm
+
+      declare i32 @llvm.hsum.i32.v4i32(<4 x i32> %a)
+      declare float @llvm.hsum.f32.v4f32(<4 x float> %a)
+
+
+Overview:
+"""""""""
+
+The '``llvm.hsum``' intrinsic returns the horizontal (reduction) sum of the
+elements of the vector operand, treating them as integers or floats.
+
+.. note::
+
+   These intrinsics are primarily used during the code generation stage of
+   compilation. They are generated by compiler passes such as the Loop and
+   SLP vectorizers.
+   The expectation is that frontends should not need to generate these
+   intrinsics themselves.
+
+Arguments:
+""""""""""
+
+The argument is a vector of integer or floating-point values.
+
+Semantics:
+""""""""""
+
+The expression::
+
+      call i32 @llvm.hsum.i32.v4i32(<4 x i32> %a)
+
+is equivalent to::
+
+      %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+      %2 = add <4 x i32> %a, %1
+      %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+      %4 = add <4 x i32> %2, %3
+      %5 = extractelement <4 x i32> %4, i32 0
+
+
 Half Precision Floating Point Intrinsics
 ----------------------------------------
 
Index: include/llvm/CodeGen/ISDOpcodes.h
===================================================================
--- include/llvm/CodeGen/ISDOpcodes.h
+++ include/llvm/CodeGen/ISDOpcodes.h
@@ -342,6 +342,12 @@
     /// vector. These nodes are generated from llvm.*absdiff* intrinsics.
     SABSDIFF, UABSDIFF,
 
+    /// HSUM/FHSUM(VEC) - Horizontal sum across the elements of the sole
+    /// integer or floating-point input vector. Returns a vector of the same
+    /// type as VEC, with the scalar result of the sum stored in its first
+    /// element. These nodes are generated from llvm.hsum* intrinsics.
+    HSUM, FHSUM,
+
     /// Bit counting operators with an undefined result for zero inputs.
     CTTZ_ZERO_UNDEF, CTLZ_ZERO_UNDEF,
 
Index: include/llvm/IR/Intrinsics.td
===================================================================
--- include/llvm/IR/Intrinsics.td
+++ include/llvm/IR/Intrinsics.td
@@ -612,6 +612,9 @@
 def int_uabsdiff : Intrinsic<[llvm_anyvector_ty],
                              [ LLVMMatchType<0>, LLVMMatchType<0> ], [IntrNoMem]>;
 
+// Calculate the horizontal/reduction sum across the elements of input vector.
+def int_hsum : Intrinsic<[llvm_any_ty], [llvm_anyvector_ty], [IntrNoMem]>;
+
 //===-------------------------- Masked Intrinsics -------------------------===//
 //
 def int_masked_store : Intrinsic<[], [llvm_anyvector_ty, LLVMPointerTo<0>,
Index: include/llvm/Target/TargetSelectionDAG.td
===================================================================
--- include/llvm/Target/TargetSelectionDAG.td
+++ include/llvm/Target/TargetSelectionDAG.td
@@ -388,6 +388,8 @@
 def sabsdiff   : SDNode<"ISD::SABSDIFF"  , SDTIntBinOp>;
 def uabsdiff   : SDNode<"ISD::UABSDIFF"  , SDTIntBinOp>;
+def hsum       : SDNode<"ISD::HSUM"      , SDTIntUnaryOp>;
+def fhsum      : SDNode<"ISD::FHSUM"     , SDTIntUnaryOp>;
 def sext_inreg : SDNode<"ISD::SIGN_EXTEND_INREG", SDTExtInreg>;
 def bswap      : SDNode<"ISD::BSWAP"     , SDTIntUnaryOp>;
 def ctlz       : SDNode<"ISD::CTLZ"      , SDTIntUnaryOp>;
Index: lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -150,6 +150,9 @@
   case ISD::SABSDIFF:
     Res = PromoteIntRes_SimpleIntBinOp(N); break;
+  case ISD::HSUM:
+    Res = PromoteIntRes_HSUM(N);
+    break;
   }
 
   // If the result is null then the sub-method took care of registering it.
@@ -157,6 +160,12 @@
     SetPromotedInteger(SDValue(N, ResNo), Res);
 }
 
+SDValue DAGTypeLegalizer::PromoteIntRes_HSUM(SDNode *N) {
+  SDValue OpVal = GetPromotedInteger(N->getOperand(0));
+  return DAG.getNode(N->getOpcode(), SDLoc(N),
+                     OpVal.getValueType(), OpVal);
+}
+
 SDValue DAGTypeLegalizer::PromoteIntRes_MERGE_VALUES(SDNode *N,
                                                      unsigned ResNo) {
   SDValue Op = DisintegrateMERGE_VALUES(N, ResNo);
@@ -890,6 +899,7 @@
   case ISD::SRL:
   case ISD::ROTL:
   case ISD::ROTR: Res = PromoteIntOp_Shift(N); break;
+  case ISD::HSUM: Res = PromoteIntOp_HSUM(N); break;
   }
 
   // If the result is null, the sub-method took care of registering results etc.
@@ -1223,6 +1233,11 @@
                                 N->getOperand(0).getValueType().getScalarType());
 }
 
+SDValue DAGTypeLegalizer::PromoteIntOp_HSUM(SDNode *N) {
+  SDValue Op = GetPromotedInteger(N->getOperand(0));
+  return DAG.getNode(N->getOpcode(), SDLoc(N),
+                     N->getValueType(0), Op);
+}
 
 //===----------------------------------------------------------------------===//
 //  Integer Result Expansion
Index: lib/CodeGen/SelectionDAG/LegalizeTypes.h
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -264,6 +264,7 @@
   SDValue PromoteIntRes_UNDEF(SDNode *N);
   SDValue PromoteIntRes_VAARG(SDNode *N);
   SDValue PromoteIntRes_XMULO(SDNode *N, unsigned ResNo);
+  SDValue PromoteIntRes_HSUM(SDNode *N);
 
   // Integer Operand Promotion.
   bool PromoteIntegerOperand(SDNode *N, unsigned OperandNo);
@@ -294,6 +295,7 @@
   SDValue PromoteIntOp_ZERO_EXTEND(SDNode *N);
   SDValue PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo);
   SDValue PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_HSUM(SDNode *N);
 
   void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code);
 
@@ -640,6 +642,7 @@
   SDValue SplitVecOp_VSELECT(SDNode *N, unsigned OpNo);
   SDValue SplitVecOp_UnaryOp(SDNode *N);
   SDValue SplitVecOp_TruncateHelper(SDNode *N);
+  SDValue SplitVecOp_HSUM(SDNode *N);
 
   SDValue SplitVecOp_BITCAST(SDNode *N);
   SDValue SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N);
@@ -714,6 +717,7 @@
   SDValue WidenVecOp_Convert(SDNode *N);
   SDValue WidenVecOp_FCOPYSIGN(SDNode *N);
+  SDValue WidenVecOp_HSUM(SDNode *N);
 
   //===--------------------------------------------------------------------===//
   // Vector Widening Utilities Support: LegalizeVectorTypes.cpp
Index: lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -106,6 +106,7 @@
   SDValue ExpandStore(SDValue Op);
   SDValue ExpandFNEG(SDValue Op);
   SDValue ExpandABSDIFF(SDValue Op);
+  SDValue ExpandHSUM(SDValue Op);
 
   /// \brief Implements vector promotion.
   ///
@@ -331,6 +332,8 @@
   case ISD::UMAX:
   case ISD::UABSDIFF:
   case ISD::SABSDIFF:
+  case ISD::HSUM:
+  case ISD::FHSUM:
     QueryType = Node->getValueType(0);
     break;
   case ISD::FP_ROUND_INREG:
@@ -716,6 +719,9 @@
   case ISD::UABSDIFF:
   case ISD::SABSDIFF:
     return ExpandABSDIFF(Op);
+  case ISD::HSUM:
+  case ISD::FHSUM:
+    return ExpandHSUM(Op);
   default:
     return DAG.UnrollVectorOp(Op.getNode());
   }
@@ -1047,6 +1053,29 @@
   return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
 }
 
+SDValue VectorLegalizer::ExpandHSUM(SDValue Op) {
+  bool isExpand = true;
+  if (Op->getOpcode() == ISD::FHSUM && !DAG.getTarget().Options.UnsafeFPMath)
+    isExpand = false;
+  assert(isExpand &&
+         "Floating point horizontal sum only supported for fast-math");
+  SDLoc dl(Op);
+  SDValue OpVal = Op.getOperand(0);
+  EVT VT = OpVal.getValueType();
+  unsigned NumElems = VT.getVectorNumElements();
+
+  SmallVector<int, 8> ShuffleMask(NumElems, -1);
+  for (unsigned i = NumElems; i != 1; i >>= 1) {
+    // Move the upper half of the vector to the lower half.
+    for (unsigned j = 0; j != i / 2; ++j)
+      ShuffleMask[j] = (i / 2 + j);
+    SDValue Shuffle =
+        DAG.getVectorShuffle(VT, dl, OpVal, DAG.getUNDEF(VT), &ShuffleMask[0]);
+    OpVal = DAG.getNode(Op->getOpcode() == ISD::HSUM ? ISD::ADD : ISD::FADD,
+                        dl, VT, OpVal, Shuffle);
+  }
+  return OpVal;
+}
 }
 
 bool SelectionDAG::LegalizeVectors() {
Index: lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -646,6 +646,8 @@
   case ISD::SINT_TO_FP:
   case ISD::TRUNCATE:
   case ISD::UINT_TO_FP:
+  case ISD::HSUM:
+  case ISD::FHSUM:
     SplitVecRes_UnaryOp(N, Lo, Hi);
     break;
 
@@ -1422,6 +1424,10 @@
   case ISD::FTRUNC:
     Res = SplitVecOp_UnaryOp(N);
     break;
+  case ISD::HSUM:
+  case ISD::FHSUM:
+    Res = SplitVecOp_HSUM(N);
+    break;
   }
 }
@@ -1507,6 +1513,18 @@
                  JoinIntegers(Lo, Hi));
 }
 
+SDValue DAGTypeLegalizer::SplitVecOp_HSUM(SDNode *N) {
+  SDValue Lo, Hi;
+  EVT SubVT = N->getValueType(0);
+  SDValue OpVal = N->getOperand(0);
+  SDLoc dl(N);
+  GetSplitVector(OpVal, Lo, Hi);
+  Lo = DAG.getNode(N->getOpcode(), dl, SubVT, Lo);
+  Hi = DAG.getNode(N->getOpcode(), dl, SubVT, Hi);
+  return DAG.getNode(N->getOpcode() == ISD::HSUM ? ISD::ADD : ISD::FADD, dl,
+                     SubVT, Lo, Hi);
+}
+
 SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N) {
   // We know that the extracted result type is legal.
   EVT SubVT = N->getValueType(0);
@@ -2034,6 +2052,10 @@
   case ISD::FMA:
     Res = WidenVecRes_Ternary(N);
     break;
+  case ISD::HSUM:
+  case ISD::FHSUM:
+    Res = WidenVecRes_Unary(N);
+    break;
   }
 
   // If Res is null, the sub-method took care of registering the result.
@@ -2879,6 +2901,10 @@
   case ISD::TRUNCATE:
     Res = WidenVecOp_Convert(N);
     break;
+  case ISD::HSUM:
+  case ISD::FHSUM:
+    Res = WidenVecOp_HSUM(N);
+    break;
   }
 
   // If Res is null, the sub-method took care of registering the result.
@@ -2897,6 +2923,40 @@
   return false;
 }
 
+SDValue DAGTypeLegalizer::WidenVecOp_HSUM(SDNode *N) {
+  bool isExpand = true;
+  if (N->getOpcode() == ISD::FHSUM && !DAG.getTarget().Options.UnsafeFPMath)
+    isExpand = false;
+  assert(isExpand &&
+         "Floating point horizontal sum only supported for fast-math");
+
+  // Since widening introduces undef elements, scalarize the horizontal sum
+  // over the actual number of vector elements.
+  // TODO: Improve scalarization using vector shift and add on supported
+  // targets.
+  SDValue Op = N->getOperand(0);
+  EVT VT = Op.getValueType();
+  unsigned NumElems = VT.getVectorNumElements();
+  EVT EltVT = VT.getVectorElementType();
+  SDLoc dl(Op);
+
+  SDValue LHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op,
+      DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+  SDValue RHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op,
+      DAG.getConstant(1, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+  SDValue Ops;
+  Ops = DAG.getNode(N->getOpcode() == ISD::HSUM ? ISD::ADD : ISD::FADD, dl,
+                    EltVT, LHSElem, RHSElem);
+  for (unsigned i = 2; i < NumElems; ++i) {
+    LHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op,
+        DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+    Ops = DAG.getNode(N->getOpcode() == ISD::HSUM ? ISD::ADD : ISD::FADD, dl,
+                      EltVT, LHSElem, Ops);
+  }
+  return Ops;
+}
+
+
 SDValue DAGTypeLegalizer::WidenVecOp_EXTEND(SDNode *N) {
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4710,6 +4710,24 @@
                              getValue(I.getArgOperand(0)),
                              getValue(I.getArgOperand(1))));
     return nullptr;
+  case Intrinsic::hsum: {
+    /// At the IR level, this intrinsic is defined to return a scalar value.
+    /// At the SDAG level, however, it is transformed into two nodes:
+    /// *HSUM + EXTRACT_VECTOR_ELT. The idea is to keep the result of *HSUM
+    /// in a vector so that, if required, additional vector operations can be
+    /// performed on this vector result.
+    SDValue Arg = getValue(I.getArgOperand(0));
+    EVT Ty = Arg.getValueType();
+    EVT EltVT = Ty.getVectorElementType();
+    SDValue Hsum =
+        DAG.getNode(EltVT.isFloatingPoint() ? ISD::FHSUM : ISD::HSUM, sdl, Ty,
+                    Arg);
+    Hsum = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, sdl, EltVT, Hsum,
+                       DAG.getConstant(0, sdl,
+                                       TLI.getVectorIdxTy(DAG.getDataLayout())));
+    setValue(&I, Hsum);
+    return nullptr;
+  }
   case Intrinsic::cttz: {
     SDValue Arg = getValue(I.getArgOperand(0));
    ConstantInt *CI = cast<ConstantInt>(I.getArgOperand(1));
Index: lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -164,6 +164,8 @@
   case ISD::FLOG:                       return "flog";
   case ISD::FLOG2:                      return "flog2";
   case ISD::FLOG10:                     return "flog10";
+  case ISD::HSUM:                       return "hsum";
+  case ISD::FHSUM:                      return "fhsum";
 
   // Binary operators
   case ISD::ADD:                        return "add";
Index: lib/CodeGen/TargetLoweringBase.cpp
===================================================================
--- lib/CodeGen/TargetLoweringBase.cpp
+++ lib/CodeGen/TargetLoweringBase.cpp
@@ -831,6 +831,8 @@
     setOperationAction(ISD::UMULO, VT, Expand);
     setOperationAction(ISD::UABSDIFF, VT, Expand);
     setOperationAction(ISD::SABSDIFF, VT, Expand);
+    setOperationAction(ISD::HSUM, VT, Expand);
+    setOperationAction(ISD::FHSUM, VT, Expand);
 
     // These library functions default to expand.
     setOperationAction(ISD::FROUND, VT, Expand);
Index: test/CodeGen/X86/vec-hadd-float-128.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/vec-hadd-float-128.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -enable-unsafe-fp-math | FileCheck --check-prefix=UNSAFE %s
+
+
+declare float @llvm.hsum.f32.v2f32(<2 x float>)
+
+define float @test1_hsum_float_f32(<2 x float> %a1) {
+; UNSAFE-LABEL: test1_hsum_float_f32:
+; UNSAFE:       # BB#0:
+; UNSAFE-NEXT:    movapd %xmm0, %xmm1
+; UNSAFE-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; UNSAFE-NEXT:    addps %xmm0, %xmm1
+; UNSAFE-NEXT:    movaps %xmm1, %xmm0
+; UNSAFE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; UNSAFE-NEXT:    addps %xmm1, %xmm0
+; UNSAFE-NEXT:    retq
+  %1 = call float @llvm.hsum.f32.v2f32(<2 x float> %a1)
+  ret float %1
+}
+
+declare float @llvm.hsum.f32.v4f32(<4 x float>)
+
+define float @test2_hsum_float_f32(<4 x float> %a1) {
+; UNSAFE-LABEL: test2_hsum_float_f32:
+; UNSAFE:       # BB#0:
+; UNSAFE-NEXT:    movapd %xmm0, %xmm1
+; UNSAFE-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; UNSAFE-NEXT:    addps %xmm0, %xmm1
+; UNSAFE-NEXT:    movaps %xmm1, %xmm0
+; UNSAFE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; UNSAFE-NEXT:    addps %xmm1, %xmm0
+; UNSAFE-NEXT:    retq
+  %1 = call float @llvm.hsum.f32.v4f32(<4 x float> %a1)
+  ret float %1
+}
+
+declare double @llvm.hsum.f64.v2f64(<2 x double>)
+
+define double @test1_hsum_float_f64(<2 x double> %a1) {
+; UNSAFE-LABEL: test1_hsum_float_f64:
+; UNSAFE:       # BB#0:
+; UNSAFE-NEXT:    movapd %xmm0, %xmm1
+; UNSAFE-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; UNSAFE-NEXT:    addpd %xmm0, %xmm1
+; UNSAFE-NEXT:    movapd %xmm1, %xmm0
+; UNSAFE-NEXT:    retq
+  %1 = call double @llvm.hsum.f64.v2f64(<2 x double> %a1)
+  ret double %1
+}
Index: test/CodeGen/X86/vec-hadd-float-256.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/vec-hadd-float-256.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -enable-unsafe-fp-math | FileCheck --check-prefix=UNSAFE %s
+
+declare double @llvm.hsum.f64.v4f64(<4 x double>)
+
+define double @test_hsum_float_f64(<4 x double> %a1) {
+; UNSAFE-LABEL: test_hsum_float_f64:
+; UNSAFE:       # BB#0:
+; UNSAFE-NEXT:    movapd %xmm0, %xmm1
+; UNSAFE-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; UNSAFE-NEXT:    addpd %xmm0, %xmm1
+; UNSAFE-NEXT:    movapd %xmm1, %xmm0
+; UNSAFE-NEXT:    retq
+  %1 = call double @llvm.hsum.f64.v4f64(<4 x double> %a1)
+  ret double %1
+}
Index: test/CodeGen/X86/vec-hadd-int-128.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/vec-hadd-int-128.ll
@@ -0,0 +1,75 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s -check-prefix=CHECK
+
+declare i8 @llvm.hsum.i8.v4i8(<4 x i8>)
+
+define i8 @test_hsum_int_i8(<4 x i8> %a1) {
+; CHECK-LABEL: test_hsum_int_i8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT:    paddd %xmm0, %xmm1
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; CHECK-NEXT:    paddd %xmm1, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; CHECK-NEXT:    retq
+  %1 = call i8 @llvm.hsum.i8.v4i8(<4 x i8> %a1)
+  ret i8 %1
+}
+
+declare i16 @llvm.hsum.i16.v4i16(<4 x i16>)
+
+define i16 @test_hsum_int_i16(<4 x i16> %a1) {
+; CHECK-LABEL: test_hsum_int_i16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT:    paddd %xmm0, %xmm1
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; CHECK-NEXT:    paddd %xmm1, %xmm0
+; CHECK-NEXT:    movd %xmm0, %eax
+; CHECK-NEXT:    retq
+  %1 = call i16 @llvm.hsum.i16.v4i16(<4 x i16> %a1)
+  ret i16 %1
+}
+
+declare i32 @llvm.hsum.i32.v3i32(<3 x i32>)
+
+define i32 @test1_hsum_int_i32(<3 x i32> %a1) {
+; CHECK-LABEL: test1_hsum_int_i32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT:    paddd %xmm0, %xmm1
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; CHECK-NEXT:    paddd %xmm1, %xmm0
+; CHECK-NEXT:    movd %xmm0, %eax
+; CHECK-NEXT:    retq
+  %1 = call i32 @llvm.hsum.i32.v3i32(<3 x i32> %a1)
+  ret i32 %1
+}
+
+declare i32 @llvm.hsum.i32.v4i32(<4 x i32>)
+
+define i32 @test2_hsum_int_i32(<4 x i32> %a1) {
+; CHECK-LABEL: test2_hsum_int_i32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT:    paddd %xmm0, %xmm1
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; CHECK-NEXT:    paddd %xmm1, %xmm0
+; CHECK-NEXT:    movd %xmm0, %eax
+; CHECK-NEXT:    retq
+  %1 = call i32 @llvm.hsum.i32.v4i32(<4 x i32> %a1)
+  ret i32 %1
+}
+
+declare i64 @llvm.hsum.i64.v2i64(<2 x i64>)
+
+define i64 @test1_hsum_int_i64(<2 x i64> %a1) {
+; CHECK-LABEL: test1_hsum_int_i64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT:    paddq %xmm0, %xmm1
+; CHECK-NEXT:    movd %xmm1, %rax
+; CHECK-NEXT:    retq
+  %1 = call i64 @llvm.hsum.i64.v2i64(<2 x i64> %a1)
+  ret i64 %1
+}
Index: test/CodeGen/X86/vec-hadd-int-256.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/vec-hadd-int-256.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=CHECK
+
+declare i64 @llvm.hsum.i64.v4i64(<4 x i64>)
+
+define i64 @test2_hsum_int_i64(<4 x i64> %a1) {
+; CHECK-LABEL: test2_hsum_int_i64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT:    paddq %xmm0, %xmm1
+; CHECK-NEXT:    movd %xmm1, %rax
+; CHECK-NEXT:    retq
+  %1 = call i64 @llvm.hsum.i64.v4i64(<4 x i64> %a1)
+  ret i64 %1
+}
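
For reference, the IR-level form a vectorizer would be expected to emit is sketched below. The function and its name are hypothetical and not part of the patch; the call matches the overloaded naming used in the tests above and stands in for the shuffle/add reduction sequence shown in the LangRef semantics section.

; Hypothetical caller: horizontal sum of a <4 x i32> accumulator, as a
; vectorizer-generated reduction epilogue would emit it with this intrinsic.
define i32 @reduce_v4i32(<4 x i32> %acc) {
entry:
  %sum = call i32 @llvm.hsum.i32.v4i32(<4 x i32> %acc)
  ret i32 %sum
}

declare i32 @llvm.hsum.i32.v4i32(<4 x i32>)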