Index: docs/LangRef.rst
===================================================================
--- docs/LangRef.rst
+++ docs/LangRef.rst
@@ -10815,6 +10815,55 @@
        %1 = select <4 x i1> %ispos, <4 x i32> %sub, <4 x i32> %neg
 
+'``llvm.hsum.*``'
+^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic. The argument may be a vector of any integer
+or floating-point type. The order of the additions performed by the intrinsic
+is undefined.
+
+.. code-block:: llvm
+
+      declare i32 @llvm.hsum.i32.v4i32(<4 x i32> %a)
+      declare float @llvm.hsum.f32.v4f32(<4 x float> %a)
+
+
+Overview:
+"""""""""
+
+The ``llvm.hsum`` intrinsic returns the horizontal (reduction) sum of the
+elements of its vector operand as an integer or floating-point scalar. If the
+integer result overflows, the behavior is undefined.
+
+.. note::
+
+   These intrinsics are primarily used during the code generation stage of
+   compilation. They are generated by compiler passes such as the Loop and
+   SLP vectorizers; frontends are not expected to need to generate these
+   intrinsics themselves.
+
+Arguments:
+""""""""""
+
+The argument is a vector of integer or floating-point values.
+
+Semantics:
+""""""""""
+
+The expression::
+
+      call i32 @llvm.hsum.i32.v4i32(<4 x i32> %a)
+
+is equivalent to::
+
+      %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+      %2 = add <4 x i32> %a, %1
+      %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+      %4 = add <4 x i32> %2, %3
+      %5 = extractelement <4 x i32> %4, i32 0
+
+
 Half Precision Floating Point Intrinsics
 ----------------------------------------
 
Index: include/llvm/CodeGen/ISDOpcodes.h
===================================================================
--- include/llvm/CodeGen/ISDOpcodes.h
+++ include/llvm/CodeGen/ISDOpcodes.h
@@ -342,6 +342,12 @@
   /// vector. These nodes are generated from llvm.*absdiff* intrinsics.
   SABSDIFF, UABSDIFF,
 
+  /// HSUM/FHSUM(VEC) - Horizontal sum across the elements of the sole integer
+  /// or floating-point input vector. Returns a vector of the same type as
+  /// VEC, with the scalar result of the intrinsic stored in its first
+  /// element. These nodes are generated from llvm.hsum* intrinsics.
+  HSUM, FHSUM,
+
   /// Bit counting operators with an undefined result for zero inputs.
   CTTZ_ZERO_UNDEF, CTLZ_ZERO_UNDEF,
 
Index: include/llvm/IR/Intrinsics.td
===================================================================
--- include/llvm/IR/Intrinsics.td
+++ include/llvm/IR/Intrinsics.td
@@ -613,6 +613,9 @@
 def int_uabsdiff : Intrinsic<[llvm_anyvector_ty],
                              [ LLVMMatchType<0>, LLVMMatchType<0> ], [IntrNoMem]>;
 
+// Calculate the horizontal/reduction sum across the elements of the input
+// vector.
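+// For illustration, two concrete instantiations this definition permits
+// (the same signatures as the LangRef examples above):
+//   declare i32 @llvm.hsum.i32.v4i32(<4 x i32> %a)
+//   declare float @llvm.hsum.f32.v4f32(<4 x float> %a)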
+def int_hsum : Intrinsic<[llvm_any_ty], [llvm_anyvector_ty], [IntrNoMem]>;
+
 //===-------------------------- Masked Intrinsics -------------------------===//
 //
 def int_masked_store : Intrinsic<[], [llvm_anyvector_ty, LLVMPointerTo<0>,
Index: include/llvm/Target/TargetSelectionDAG.td
===================================================================
--- include/llvm/Target/TargetSelectionDAG.td
+++ include/llvm/Target/TargetSelectionDAG.td
@@ -388,6 +388,8 @@
 def sabsdiff   : SDNode<"ISD::SABSDIFF" , SDTIntBinOp>;
 def uabsdiff   : SDNode<"ISD::UABSDIFF" , SDTIntBinOp>;
+def hsum       : SDNode<"ISD::HSUM"     , SDTIntUnaryOp>;
+def fhsum      : SDNode<"ISD::FHSUM"    , SDTFPUnaryOp>;
 def sext_inreg : SDNode<"ISD::SIGN_EXTEND_INREG", SDTExtInreg>;
 def bswap      : SDNode<"ISD::BSWAP"    , SDTIntUnaryOp>;
 def ctlz       : SDNode<"ISD::CTLZ"     , SDTIntUnaryOp>;
Index: lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -150,6 +150,9 @@
   case ISD::SABSDIFF:
     Res = PromoteIntRes_SimpleIntBinOp(N); break;
+  case ISD::HSUM:
+    Res = PromoteIntRes_HSUM(N);
+    break;
   }
 
   // If the result is null then the sub-method took care of registering it.
@@ -157,6 +160,12 @@
   SetPromotedInteger(SDValue(N, ResNo), Res);
 }
 
+SDValue DAGTypeLegalizer::PromoteIntRes_HSUM(SDNode *N) {
+  SDValue OpVal = GetPromotedInteger(N->getOperand(0));
+  return DAG.getNode(N->getOpcode(), SDLoc(N),
+                     OpVal.getValueType(), OpVal);
+}
+
 SDValue DAGTypeLegalizer::PromoteIntRes_MERGE_VALUES(SDNode *N,
                                                      unsigned ResNo) {
   SDValue Op = DisintegrateMERGE_VALUES(N, ResNo);
@@ -890,6 +899,7 @@
   case ISD::SRL:
   case ISD::ROTL:
   case ISD::ROTR: Res = PromoteIntOp_Shift(N); break;
+  case ISD::HSUM: Res = PromoteIntOp_HSUM(N); break;
   }
 
   // If the result is null, the sub-method took care of registering results etc.
@@ -1223,6 +1233,11 @@
                                N->getOperand(0).getValueType().getScalarType());
 }
 
+SDValue DAGTypeLegalizer::PromoteIntOp_HSUM(SDNode *N) {
+  SDValue Op = GetPromotedInteger(N->getOperand(0));
+  return DAG.getNode(N->getOpcode(), SDLoc(N),
+                     N->getValueType(0), Op);
+}
+
 //===----------------------------------------------------------------------===//
 //  Integer Result Expansion
 
Index: lib/CodeGen/SelectionDAG/LegalizeTypes.h
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -264,6 +264,7 @@
   SDValue PromoteIntRes_UNDEF(SDNode *N);
   SDValue PromoteIntRes_VAARG(SDNode *N);
   SDValue PromoteIntRes_XMULO(SDNode *N, unsigned ResNo);
+  SDValue PromoteIntRes_HSUM(SDNode *N);
 
   // Integer Operand Promotion.
   bool PromoteIntegerOperand(SDNode *N, unsigned OperandNo);
@@ -294,6 +295,7 @@
   SDValue PromoteIntOp_ZERO_EXTEND(SDNode *N);
   SDValue PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo);
   SDValue PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_HSUM(SDNode *N);
 
   void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code);
 
@@ -635,6 +637,7 @@
   void SplitVecRes_UNDEF(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, SDValue &Lo,
                                   SDValue &Hi);
+  void SplitVecRes_HSUM(SDNode *N, SDValue &Lo, SDValue &Hi);
 
   // Vector Operand Splitting: <128 x ty> -> 2 x <64 x ty>.
   bool SplitVectorOperand(SDNode *N, unsigned OpNo);
@@ -715,6 +718,7 @@
   SDValue WidenVecOp_Convert(SDNode *N);
   SDValue WidenVecOp_FCOPYSIGN(SDNode *N);
+  SDValue WidenVecOp_HSUM(SDNode *N);
 
   //===--------------------------------------------------------------------===//
   // Vector Widening Utilities Support: LegalizeVectorTypes.cpp
Index: lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -106,6 +106,7 @@
   SDValue ExpandStore(SDValue Op);
   SDValue ExpandFNEG(SDValue Op);
   SDValue ExpandABSDIFF(SDValue Op);
+  SDValue ExpandHSUM(SDValue Op);
 
   /// \brief Implements vector promotion.
   ///
@@ -331,6 +332,8 @@
   case ISD::UMAX:
   case ISD::UABSDIFF:
   case ISD::SABSDIFF:
+  case ISD::HSUM:
+  case ISD::FHSUM:
     QueryType = Node->getValueType(0);
     break;
   case ISD::FP_ROUND_INREG:
@@ -716,6 +719,9 @@
   case ISD::UABSDIFF:
   case ISD::SABSDIFF:
     return ExpandABSDIFF(Op);
+  case ISD::HSUM:
+  case ISD::FHSUM:
+    return ExpandHSUM(Op);
   default:
     return DAG.UnrollVectorOp(Op.getNode());
   }
@@ -1047,6 +1053,24 @@
   return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
 }
 
+SDValue VectorLegalizer::ExpandHSUM(SDValue Op) {
+  SDLoc dl(Op);
+  SDValue OpVal = Op.getOperand(0);
+  EVT VT = OpVal.getValueType();
+  unsigned NumElems = VT.getVectorNumElements();
+
+  // Reduce in log2(NumElems) steps: shuffle the upper half down onto the
+  // lower half and add, until the full sum lands in element 0.
+  SmallVector<int, 8> ShuffleMask(NumElems, -1);
+  for (unsigned i = NumElems; i != 1; i >>= 1) {
+    // Move the upper half of the vector to the lower half.
+    for (unsigned j = 0; j != i / 2; ++j)
+      ShuffleMask[j] = (i / 2 + j);
+    SDValue Shuffle =
+        DAG.getVectorShuffle(VT, dl, OpVal, DAG.getUNDEF(VT), &ShuffleMask[0]);
+    OpVal = DAG.getNode(Op->getOpcode() == ISD::HSUM ? ISD::ADD : ISD::FADD, dl,
+                        VT, OpVal, Shuffle);
+  }
+  return OpVal;
+}
+
 }
 
 bool SelectionDAG::LegalizeVectors() {
Index: lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -649,6 +649,11 @@
     SplitVecRes_UnaryOp(N, Lo, Hi);
     break;
 
+  case ISD::HSUM:
+  case ISD::FHSUM:
+    SplitVecRes_HSUM(N, Lo, Hi);
+    break;
+
   case ISD::ANY_EXTEND:
   case ISD::SIGN_EXTEND:
   case ISD::ZERO_EXTEND:
@@ -696,6 +701,18 @@
     SetSplitVector(SDValue(N, ResNo), Lo, Hi);
 }
 
+void DAGTypeLegalizer::SplitVecRes_HSUM(SDNode *N, SDValue &Lo, SDValue &Hi) {
+  SDValue OpVal = N->getOperand(0);
+  SDLoc dl(N);
+  GetSplitVector(OpVal, Lo, Hi);
+  EVT NewVT = Lo.getValueType();
+  // Horizontally sum each half, then add the two partial results; element 0
+  // of Lo then holds the full sum and Hi is unused.
+  Lo = DAG.getNode(N->getOpcode(), dl, NewVT, Lo);
+  Hi = DAG.getNode(N->getOpcode(), dl, NewVT, Hi);
+  Lo = DAG.getNode(N->getOpcode() == ISD::HSUM ? ISD::ADD : ISD::FADD, dl,
+                   NewVT, Lo, Hi);
+  Hi = DAG.getUNDEF(NewVT);
+}
+
 void DAGTypeLegalizer::SplitVecRes_BinOp(SDNode *N, SDValue &Lo,
                                          SDValue &Hi) {
   SDValue LHSLo, LHSHi;
@@ -2034,6 +2051,10 @@
   case ISD::FMA:
     Res = WidenVecRes_Ternary(N);
     break;
+  case ISD::HSUM:
+  case ISD::FHSUM:
+    Res = WidenVecRes_Unary(N);
+    break;
   }
 
   // If Res is null, the sub-method took care of registering the result.
@@ -2879,6 +2900,10 @@
   case ISD::TRUNCATE:
     Res = WidenVecOp_Convert(N);
     break;
+  case ISD::HSUM:
+  case ISD::FHSUM:
+    Res = WidenVecOp_HSUM(N);
+    break;
   }
 
   // If Res is null, the sub-method took care of registering the result.
@@ -2897,6 +2922,34 @@
   return false;
 }
 
+SDValue DAGTypeLegalizer::WidenVecOp_HSUM(SDNode *N) {
+  // Since widening introduces undef elements, scalarize the horizontal sum
+  // over the actual number of vector elements.
+  // TODO: Improve the scalarization using vector shift and add on supported
+  // targets.
+  SDValue Op = N->getOperand(0);
+  EVT VT = Op.getValueType();
+  unsigned NumElems = VT.getVectorNumElements();
+  EVT EltVT = VT.getVectorElementType();
+  SDLoc dl(Op);
+
+  SDValue LHSElem = DAG.getNode(
+      ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op,
+      DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+  SDValue RHSElem = DAG.getNode(
+      ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op,
+      DAG.getConstant(1, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+  SDValue Ops = DAG.getNode(N->getOpcode() == ISD::HSUM ? ISD::ADD : ISD::FADD,
+                            dl, EltVT, LHSElem, RHSElem);
+  for (unsigned i = 2; i < NumElems; ++i) {
+    LHSElem = DAG.getNode(
+        ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op,
+        DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+    Ops = DAG.getNode(N->getOpcode() == ISD::HSUM ? ISD::ADD : ISD::FADD, dl,
+                      EltVT, LHSElem, Ops);
+  }
+  return Ops;
+}
+
 SDValue DAGTypeLegalizer::WidenVecOp_EXTEND(SDNode *N) {
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4745,6 +4745,24 @@
                              getValue(I.getArgOperand(0)),
                              getValue(I.getArgOperand(1))));
     return nullptr;
+  case Intrinsic::hsum: {
+    // At the IR level, this intrinsic is defined to return a scalar value.
+    // At the SDAG level, however, it is transformed into two nodes:
+    // *HSUM + EXTRACT_VECTOR_ELT. The idea is to keep the result of *HSUM
+    // in a vector so that, if required, additional vector operations can be
+    // performed on that vector result.
+    SDValue Arg = getValue(I.getArgOperand(0));
+    EVT Ty = Arg.getValueType();
+    EVT EltVT = Ty.getVectorElementType();
+    SDValue Hsum = DAG.getNode(
+        EltVT.isFloatingPoint() ? ISD::FHSUM : ISD::HSUM, sdl, Ty, Arg);
+    Hsum = DAG.getNode(
+        ISD::EXTRACT_VECTOR_ELT, sdl, EltVT, Hsum,
+        DAG.getConstant(0, sdl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+    setValue(&I, Hsum);
+    return nullptr;
+  }
   case Intrinsic::cttz: {
     SDValue Arg = getValue(I.getArgOperand(0));
     ConstantInt *CI = cast<ConstantInt>(I.getArgOperand(1));
Index: lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -164,6 +164,8 @@
   case ISD::FLOG:                       return "flog";
   case ISD::FLOG2:                      return "flog2";
   case ISD::FLOG10:                     return "flog10";
+  case ISD::HSUM:                       return "hsum";
+  case ISD::FHSUM:                      return "fhsum";
 
   // Binary operators
   case ISD::ADD:                        return "add";
Index: lib/CodeGen/TargetLoweringBase.cpp
===================================================================
--- lib/CodeGen/TargetLoweringBase.cpp
+++ lib/CodeGen/TargetLoweringBase.cpp
@@ -829,6 +829,8 @@
     setOperationAction(ISD::UMULO, VT, Expand);
     setOperationAction(ISD::UABSDIFF, VT, Expand);
     setOperationAction(ISD::SABSDIFF, VT, Expand);
+    setOperationAction(ISD::HSUM, VT, Expand);
+    setOperationAction(ISD::FHSUM, VT, Expand);
 
     // These library functions default to expand.
     setOperationAction(ISD::FROUND, VT, Expand);
Index: test/CodeGen/X86/vec-hadd-float-128.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/vec-hadd-float-128.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck --check-prefix=UNSAFE %s
+
+declare float @llvm.hsum.f32.v2f32(<2 x float>)
+
+define float @test1_hsum_float_f32(<2 x float> %a1) {
+; UNSAFE-LABEL: test1_hsum_float_f32:
+; UNSAFE: # BB#0:
+; UNSAFE-NEXT: movapd %xmm0, %xmm1
+; UNSAFE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; UNSAFE-NEXT: addps %xmm0, %xmm1
+; UNSAFE-NEXT: movaps %xmm1, %xmm0
+; UNSAFE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; UNSAFE-NEXT: addps %xmm1, %xmm0
+; UNSAFE-NEXT: retq
+  %1 = call float @llvm.hsum.f32.v2f32(<2 x float> %a1)
+  ret float %1
+}
+
+declare float @llvm.hsum.f32.v4f32(<4 x float>)
+
+define float @test2_hsum_float_f32(<4 x float> %a1) {
+; UNSAFE-LABEL: test2_hsum_float_f32:
+; UNSAFE: # BB#0:
+; UNSAFE-NEXT: movapd %xmm0, %xmm1
+; UNSAFE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; UNSAFE-NEXT: addps %xmm0, %xmm1
+; UNSAFE-NEXT: movaps %xmm1, %xmm0
+; UNSAFE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; UNSAFE-NEXT: addps %xmm1, %xmm0
+; UNSAFE-NEXT: retq
+  %1 = call float @llvm.hsum.f32.v4f32(<4 x float> %a1)
+  ret float %1
+}
+
+declare double @llvm.hsum.f64.v2f64(<2 x double>)
+
+define double @test1_hsum_float_f64(<2 x double> %a1) {
+; UNSAFE-LABEL: test1_hsum_float_f64:
+; UNSAFE: # BB#0:
+; UNSAFE-NEXT: movapd %xmm0, %xmm1
+; UNSAFE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; UNSAFE-NEXT: addpd %xmm0, %xmm1
+; UNSAFE-NEXT: movapd %xmm1, %xmm0
+; UNSAFE-NEXT: retq
+  %1 = call double @llvm.hsum.f64.v2f64(<2 x double> %a1)
+  ret double %1
+}
Index: test/CodeGen/X86/vec-hadd-float-256.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/vec-hadd-float-256.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck --check-prefix=UNSAFE %s
+
+declare double @llvm.hsum.f64.v4f64(<4 x double>)
+
+define double @test_hsum_float_f64(<4 x double> %a1) {
+; UNSAFE-LABEL: test_hsum_float_f64:
+; UNSAFE: # BB#0:
+; UNSAFE-NEXT: movapd %xmm1, %xmm2
+; UNSAFE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1,0]
+; UNSAFE-NEXT: addpd %xmm1, %xmm2
+; UNSAFE-NEXT: movapd %xmm0, %xmm1
+; UNSAFE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; UNSAFE-DAG: addpd %xmm2, %xmm1
+; UNSAFE-DAG: addpd %xmm0, %xmm1
+; UNSAFE-NEXT: movapd %xmm1, %xmm0
+; UNSAFE-NEXT: retq
+  %1 = call double @llvm.hsum.f64.v4f64(<4 x double> %a1)
+  ret double %1
+}
Index: test/CodeGen/X86/vec-hadd-int-128.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/vec-hadd-int-128.ll
@@ -0,0 +1,75 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s -check-prefix=CHECK
+
+declare i8 @llvm.hsum.i8.v4i8(<4 x i8>)
+
+define i8 @test_hsum_int_i8(<4 x i8> %a1) {
+; CHECK-LABEL: test_hsum_int_i8:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT: paddd %xmm0, %xmm1
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; CHECK-NEXT: paddd %xmm1, %xmm0
+; CHECK-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; CHECK-NEXT: retq
+  %1 = call i8 @llvm.hsum.i8.v4i8(<4 x i8> %a1)
+  ret i8 %1
+}
+
+declare i16 @llvm.hsum.i16.v4i16(<4 x i16>)
+
+define i16 @test_hsum_int_i16(<4 x i16> %a1) {
+; CHECK-LABEL: test_hsum_int_i16:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT: paddd %xmm0, %xmm1
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; CHECK-NEXT: paddd %xmm1, %xmm0
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: retq
+  %1 = call i16 @llvm.hsum.i16.v4i16(<4 x i16> %a1)
+  ret i16 %1
+}
+
+declare i32 @llvm.hsum.i32.v3i32(<3 x i32>)
+
+define i32 @test1_hsum_int_i32(<3 x i32> %a1) {
+; CHECK-LABEL: test1_hsum_int_i32:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT: paddd %xmm0, %xmm1
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; CHECK-NEXT: paddd %xmm1, %xmm0
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: retq
+  %1 = call i32 @llvm.hsum.i32.v3i32(<3 x i32> %a1)
+  ret i32 %1
+}
+
+declare i32 @llvm.hsum.i32.v4i32(<4 x i32>)
+
+define i32 @test2_hsum_int_i32(<4 x i32> %a1) {
+; CHECK-LABEL: test2_hsum_int_i32:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT: paddd %xmm0, %xmm1
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; CHECK-NEXT: paddd %xmm1, %xmm0
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: retq
+  %1 = call i32 @llvm.hsum.i32.v4i32(<4 x i32> %a1)
+  ret i32 %1
+}
+
+declare i64 @llvm.hsum.i64.v2i64(<2 x i64>)
+
+define i64 @test1_hsum_int_i64(<2 x i64> %a1) {
+; CHECK-LABEL: test1_hsum_int_i64:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT: paddq %xmm0, %xmm1
+; CHECK-NEXT: movd %xmm1, %rax
+; CHECK-NEXT: retq
+  %1 = call i64 @llvm.hsum.i64.v2i64(<2 x i64> %a1)
+  ret i64 %1
+}
Index: test/CodeGen/X86/vec-hadd-int-256.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/vec-hadd-int-256.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=CHECK
+
+declare i64 @llvm.hsum.i64.v4i64(<4 x i64>)
+
+define i64 @test_hsum_int_i64(<4 x i64> %a1) {
+; CHECK-LABEL: test_hsum_int_i64:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; CHECK-NEXT: paddq %xmm1, %xmm2
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT: paddq %xmm0, %xmm1
+; CHECK-NEXT: paddq %xmm2, %xmm1
+; CHECK-NEXT: movd %xmm1, %rax
+; CHECK-NEXT: retq
+  %1 = call i64 @llvm.hsum.i64.v4i64(<4 x i64> %a1)
+  ret i64 %1
+}
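
For reference, a minimal IR example of the intended use of the intrinsic as the
epilogue of a vectorized integer sum reduction (illustrative only, not part of
the patch; the mangled name matches the LangRef and test declarations above,
and %partial.sums stands for a hypothetical accumulator produced by a
vectorized loop body):

  ; Collapse the four accumulator lanes into the final scalar sum.
  define i32 @reduce_add(<4 x i32> %partial.sums) {
    %sum = call i32 @llvm.hsum.i32.v4i32(<4 x i32> %partial.sums)
    ret i32 %sum
  }

  declare i32 @llvm.hsum.i32.v4i32(<4 x i32>)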