Index: docs/LangRef.rst
===================================================================
--- docs/LangRef.rst
+++ docs/LangRef.rst
@@ -10387,6 +10387,55 @@
 
       %1 = select <4 x i1> %ispos, <4 x i32> %sub, <4 x i32> %neg
 
+'``llvm.hadd.*``'
+^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic. The argument is a vector of any integer or floating point data type.
+Floating point data types are only supported in fast-math mode.
+
+.. code-block:: llvm
+
+      declare i32 @llvm.hadd.i32.v4i32(<4 x i32> %a)
+      declare float @llvm.hadd.f32.v4f32(<4 x float> %a)
+
+
+Overview:
+"""""""""
+
+The ``llvm.hadd`` intrinsic returns the horizontal (reduction) sum of the elements of the
+vector operand, treating them as integers or floats.
+
+.. note::
+
+    These intrinsics are primarily used during the code generation stage of
+    compilation. They are generated by compiler passes such as the Loop and
+    SLP vectorizers.
+    The expectation is that frontends should not need to generate these
+    intrinsics themselves.
+
+Arguments:
+""""""""""
+
+The argument is a vector of integers or floating point numbers.
+
+Semantics:
+""""""""""
+
+The expression::
+
+    call i32 @llvm.hadd.i32.v4i32(<4 x i32> %a)
+
+is equivalent to::
+
+    %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+    %2 = add <4 x i32> %a, %1
+    %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+    %4 = add <4 x i32> %2, %3
+    %5 = extractelement <4 x i32> %4, i32 0
+
+
 Half Precision Floating Point Intrinsics
 ----------------------------------------
 
Index: include/llvm/CodeGen/ISDOpcodes.h
===================================================================
--- include/llvm/CodeGen/ISDOpcodes.h
+++ include/llvm/CodeGen/ISDOpcodes.h
@@ -342,6 +342,11 @@
     /// vector. These nodes are generated from llvm.*absdiff* intrinsics.
     SABSDIFF, UABSDIFF,
 
+    /// HADD/FHADD - Horizontal sum across the elements of the sole
+    /// integer or float input vector.
+    /// These nodes are generated from llvm.hadd* intrinsics.
+    HADD, FHADD,
+
     /// Bit counting operators with an undefined result for zero inputs.
     CTTZ_ZERO_UNDEF, CTLZ_ZERO_UNDEF,
 
Index: include/llvm/IR/Intrinsics.td
===================================================================
--- include/llvm/IR/Intrinsics.td
+++ include/llvm/IR/Intrinsics.td
@@ -612,6 +612,11 @@
 def int_uabsdiff : Intrinsic<[llvm_anyvector_ty],
                         [ LLVMMatchType<0>, LLVMMatchType<0> ], [IntrNoMem]>;
 
+// Calculate the horizontal/reduction sum across the elements of the input
+// vector. The result type is overloaded separately (llvm_any_ty), so this
+// definition does not tie it to the vector's element type.
+def int_hadd : Intrinsic<[llvm_any_ty], [llvm_anyvector_ty], [IntrNoMem]>;
+
 //===-------------------------- Masked Intrinsics -------------------------===//
 //
 def int_masked_store : Intrinsic<[], [llvm_anyvector_ty, LLVMPointerTo<0>,
Index: include/llvm/Target/TargetSelectionDAG.td
===================================================================
--- include/llvm/Target/TargetSelectionDAG.td
+++ include/llvm/Target/TargetSelectionDAG.td
@@ -388,6 +388,12 @@
 
 def sabsdiff   : SDNode<"ISD::SABSDIFF"   , SDTIntBinOp>;
 def uabsdiff   : SDNode<"ISD::UABSDIFF"   , SDTIntBinOp>;
+// Horizontal sums map a vector operand to a scalar result, so they cannot
+// use SDTIntUnaryOp, which requires the result and operand types to match.
+def SDTHAdd    : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisVec<1>]>;
+def SDTFHAdd   : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVec<1>]>;
+def hadd       : SDNode<"ISD::HADD"       , SDTHAdd>;
+def fhadd      : SDNode<"ISD::FHADD"      , SDTFHAdd>;
 def sext_inreg : SDNode<"ISD::SIGN_EXTEND_INREG", SDTExtInreg>;
 def bswap      : SDNode<"ISD::BSWAP"      , SDTIntUnaryOp>;
 def ctlz       : SDNode<"ISD::CTLZ"       , SDTIntUnaryOp>;
Index: lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2891,6 +2891,41 @@
   case ISD::BSWAP:
     Results.push_back(ExpandBSWAP(Node->getOperand(0), dl));
     break;
+  case ISD::FHADD:
+  case ISD::HADD: {
+    // Expand a horizontal sum into a series of shuffle+add steps that halve
+    // the number of live lanes, followed by an extract of lane 0.
+    const bool IsInt = Node->getOpcode() == ISD::HADD;
+    // Reassociating the FP additions is only valid under fast-math.
+    assert((IsInt || TM.Options.UnsafeFPMath) &&
+           "Floating point horizontal sum only supported for fast-math");
+    SDValue OpVal = Node->getOperand(0);
+    EVT VT = OpVal.getValueType();
+    unsigned NumElems = VT.getVectorNumElements();
+    EVT EltVT = VT.getVectorElementType();
+    // The halving loop below would silently drop lanes otherwise.
+    assert(NumElems != 0 && (NumElems & (NumElems - 1)) == 0 &&
+           "Horizontal sum expansion requires a power-of-two element count");
+
+    SmallVector<int, 8> ShuffleMask(NumElems, -1);
+    for (unsigned i = NumElems; i != 1; i >>= 1) {
+      // Move the upper half of the live lanes to the lower half. Lanes at or
+      // above i / 2 keep their previous mask entries; that is harmless since
+      // only lane 0 is read at the end.
+      for (unsigned j = 0; j != i / 2; ++j)
+        ShuffleMask[j] = (i / 2 + j);
+      SDValue Shuffle = DAG.getVectorShuffle(VT, dl, OpVal, DAG.getUNDEF(VT),
+                                             &ShuffleMask[0]);
+      OpVal = DAG.getNode(IsInt ? ISD::ADD : ISD::FADD, dl, VT, OpVal, Shuffle);
+    }
+    Tmp1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, OpVal,
+             DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+    // Convert the extracted integer to the node's result type.
+    if (IsInt)
+      Tmp1 = DAG.getAnyExtOrTrunc(Tmp1, dl, Node->getValueType(0));
+    Results.push_back(Tmp1);
+    break;
+  }
   case ISD::FRAMEADDR:
   case ISD::RETURNADDR:
   case ISD::FRAME_TO_ARGS_OFFSET:
Index: lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -890,6 +890,7 @@
   case ISD::SRL:
   case ISD::ROTL:
   case ISD::ROTR: Res = PromoteIntOp_Shift(N); break;
+  case ISD::HADD: Res = PromoteIntOp_HADD(N); break;
   }
 
   // If the result is null, the sub-method took care of registering results etc.
@@ -1223,6 +1224,13 @@
                                 N->getOperand(0).getValueType().getScalarType());
 }
 
+SDValue DAGTypeLegalizer::PromoteIntOp_HADD(SDNode *N) {
+  // Rebuild the node on the promoted vector operand; the opcode and result
+  // type are carried over unchanged.
+  SDLoc dl(N);
+  SDValue PromotedVec = GetPromotedInteger(N->getOperand(0));
+  return DAG.getNode(N->getOpcode(), dl, N->getValueType(0), PromotedVec);
+}
 
 //===----------------------------------------------------------------------===//
 //  Integer Result Expansion
Index: lib/CodeGen/SelectionDAG/LegalizeTypes.h
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -294,6 +294,7 @@
   SDValue PromoteIntOp_ZERO_EXTEND(SDNode *N);
   SDValue PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo);
   SDValue PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_HADD(SDNode *N);
 
   void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code);
 
@@ -639,6 +640,7 @@
   SDValue SplitVecOp_VSELECT(SDNode *N, unsigned OpNo);
   SDValue SplitVecOp_UnaryOp(SDNode *N);
   SDValue SplitVecOp_TruncateHelper(SDNode *N);
+  SDValue SplitVecOp_HADD(SDNode *N);
 
   SDValue SplitVecOp_BITCAST(SDNode *N);
   SDValue SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N);
@@ -710,6 +712,7 @@
   SDValue WidenVecOp_SETCC(SDNode* N);
 
   SDValue WidenVecOp_Convert(SDNode *N);
+  SDValue WidenVecOp_HADD(SDNode *N);
 
   //===--------------------------------------------------------------------===//
   // Vector Widening Utilities Support: LegalizeVectorTypes.cpp
Index: lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -59,6 +59,9 @@
   /// \brief Implements unrolling a VSETCC.
   SDValue UnrollVSETCC(SDValue Op);
 
+  /// \brief Implements unrolling a HADD/FHADD.
+  SDValue UnrollHADD(SDValue Op);
+
   /// \brief Implement expand-based legalization of vector operations.
/// /// This is just a high-level routine to dispatch to specific code paths for @@ -714,6 +717,9 @@ case ISD::UABSDIFF: case ISD::SABSDIFF: return ExpandABSDIFF(Op); + case ISD::HADD: + case ISD::FHADD: + return UnrollHADD(Op); default: return DAG.UnrollVectorOp(Op.getNode()); } @@ -1045,6 +1051,34 @@ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } +SDValue VectorLegalizer::UnrollHADD(SDValue Op) { + if (Op->getOpcode() == ISD::FHADD) + assert(DAG.getTarget().Options.UnsafeFPMath && + "Floating point horizontal sum only supported for fast-math"); + SDLoc dl(Op); + SDValue OpVal = Op.getOperand(0); + EVT VT = OpVal.getValueType(); + unsigned NumElems = VT.getVectorNumElements(); + EVT EltVT = VT.getVectorElementType(); + + SmallVector ShuffleMask(NumElems, -1); + for (unsigned i = NumElems; i != 1; i >>= 1) { + // Move the upper half of the vector to the lower half. + for (unsigned j = 0; j != i / 2; ++j) + ShuffleMask[j] = (i / 2 + j); + SDValue Shuffle = + DAG.getVectorShuffle(VT, dl, OpVal, DAG.getUNDEF(VT), &ShuffleMask[0]); + OpVal = DAG.getNode(Op->getOpcode() == ISD::HADD ? 
ISD::ADD : ISD::FADD, dl, + VT, OpVal, Shuffle); + } + SDValue Tmp1 = DAG.getNode( + ISD::EXTRACT_VECTOR_ELT, dl, EltVT, OpVal, + DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); + if (Op->getOpcode() == ISD::HADD) { + Tmp1 = DAG.getAnyExtOrTrunc(Tmp1, dl, Op.getValueType()); + } + return Tmp1; +} } bool SelectionDAG::LegalizeVectors() { Index: lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -643,6 +643,8 @@ case ISD::SINT_TO_FP: case ISD::TRUNCATE: case ISD::UINT_TO_FP: + case ISD::HADD: + case ISD::FHADD: SplitVecRes_UnaryOp(N, Lo, Hi); break; @@ -1398,6 +1400,10 @@ case ISD::FTRUNC: Res = SplitVecOp_UnaryOp(N); break; + case ISD::HADD: + case ISD::FHADD: + Res = SplitVecOp_HADD(N); + break; } } @@ -1483,6 +1489,18 @@ JoinIntegers(Lo, Hi)); } +SDValue DAGTypeLegalizer::SplitVecOp_HADD(SDNode *N) { + SDValue Lo, Hi; + EVT SubVT = N->getValueType(0); + SDValue OpVal = N->getOperand(0); + SDLoc dl(N); + GetSplitVector(OpVal, Lo, Hi); + Lo = DAG.getNode(N->getOpcode(), dl, SubVT, Lo); + Hi = DAG.getNode(N->getOpcode(), dl, SubVT, Hi); + return DAG.getNode(N->getOpcode() == ISD::HADD ? ISD::ADD : ISD::FADD, dl, + SubVT, Lo, Hi); +} + SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N) { // We know that the extracted result type is legal. EVT SubVT = N->getValueType(0); @@ -2833,6 +2851,10 @@ case ISD::TRUNCATE: Res = WidenVecOp_Convert(N); break; + case ISD::HADD: + case ISD::FHADD: + Res = WidenVecOp_HADD(N); + break; } // If Res is null, the sub-method took care of registering the result. 
@@ -2851,6 +2873,45 @@
   return false;
 }
 
+SDValue DAGTypeLegalizer::WidenVecOp_HADD(SDNode *N) {
+  const bool IsInt = N->getOpcode() == ISD::HADD;
+  // Reassociating the FP additions is only valid under fast-math.
+  assert((IsInt || DAG.getTarget().Options.UnsafeFPMath) &&
+         "Floating point horizontal sum only supported for fast-math");
+  // Since widen introduces undefs, scalarize the horizontal sum for actual
+  // number of vector elements.
+  // TODO: Improve scalarization using vector shift and add on supported
+  // targets.
+  // NOTE(review): extracting from GetWidenedVector(Op) instead of Op would
+  // avoid legalizing every extract a second time - confirm before changing.
+  SDLoc dl(N);
+  SDValue Op = N->getOperand(0);
+  EVT VT = Op.getValueType();
+  unsigned NumElems = VT.getVectorNumElements();
+  EVT EltVT = VT.getVectorElementType();
+  EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+  const unsigned AddOpc = IsInt ? ISD::ADD : ISD::FADD;
+  // Lanes 0 and 1 are read unconditionally below.
+  assert(NumElems >= 2 && "Cannot widen a horizontal sum of a single lane");
+
+  SDValue LHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op,
+                                DAG.getConstant(0, dl, IdxVT));
+  SDValue RHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op,
+                                DAG.getConstant(1, dl, IdxVT));
+  SDValue Sum = DAG.getNode(AddOpc, dl, EltVT, LHSElem, RHSElem);
+  // Fold the remaining original lanes into the running sum.
+  for (unsigned i = 2; i < NumElems; ++i) {
+    LHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op,
+                          DAG.getConstant(i, dl, IdxVT));
+    Sum = DAG.getNode(AddOpc, dl, EltVT, LHSElem, Sum);
+  }
+  // Like the other HADD expansions, convert to the node's result type.
+  if (IsInt)
+    Sum = DAG.getAnyExtOrTrunc(Sum, dl, N->getValueType(0));
+  return Sum;
+}
+
+
 SDValue DAGTypeLegalizer::WidenVecOp_EXTEND(SDNode *N) {
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4663,6 +4663,17 @@
                              getValue(I.getArgOperand(0)),
                              getValue(I.getArgOperand(1))));
     return nullptr;
+  case Intrinsic::hadd: {
+    // The element type of the vector operand selects the FP or integer node.
+    SDValue Arg = getValue(I.getArgOperand(0));
+    EVT Ty = Arg.getValueType();
+    EVT EltVT = Ty.getVectorElementType();
+    setValue(&I, DAG.getNode(
+                 EltVT.isFloatingPoint() ? ISD::FHADD : ISD::HADD,
+                 sdl, TLI.getValueType(DAG.getDataLayout(), I.getType()),
+                 Arg));
+    return nullptr;
+  }
   case Intrinsic::cttz: {
     SDValue Arg = getValue(I.getArgOperand(0));
     ConstantInt *CI = cast<ConstantInt>(I.getArgOperand(1));
Index: lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -162,6 +162,8 @@
   case ISD::FLOG:                       return "flog";
   case ISD::FLOG2:                      return "flog2";
   case ISD::FLOG10:                     return "flog10";
+  case ISD::HADD:                       return "hadd";
+  case ISD::FHADD:                      return "fhadd";
 
   // Binary operators
   case ISD::ADD:                        return "add";
Index: lib/CodeGen/TargetLoweringBase.cpp
===================================================================
--- lib/CodeGen/TargetLoweringBase.cpp
+++ lib/CodeGen/TargetLoweringBase.cpp
@@ -829,6 +829,8 @@
     setOperationAction(ISD::UMULO, VT, Expand);
     setOperationAction(ISD::UABSDIFF, VT, Expand);
     setOperationAction(ISD::SABSDIFF, VT, Expand);
+    setOperationAction(ISD::HADD, VT, Expand);
+    setOperationAction(ISD::FHADD, VT, Expand);
 
     // These library functions default to expand.
     setOperationAction(ISD::FROUND, VT, Expand);
Index: test/CodeGen/X86/hadd_expand.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/hadd_expand.ll
@@ -0,0 +1,95 @@
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s -check-prefix=CHECK
+;
+; Checks the x86-64 code generated for the integer llvm.hadd intrinsic.
+
+declare i8 @llvm.hadd.i8.v4i8(<4 x i8>)
+
+; <4 x i8> input: the expected code reduces with 32-bit lane adds (paddd).
+define i8 @test_hadd_int_i8(<4 x i8> %a1) {
+; CHECK-LABEL: test_hadd_int_i8
+; CHECK: pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT: paddd %xmm0, %xmm1
+; CHECK-NEXT: pshufd $237, %xmm1, %xmm0 # xmm0 = xmm1[1,3,2,3]
+; CHECK-NEXT: paddd %xmm1, %xmm0
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: retq
+  %1 = call i8 @llvm.hadd.i8.v4i8(<4 x i8> %a1)
+  ret i8 %1
+}
+
+declare i16 @llvm.hadd.i16.v4i16(<4 x i16>)
+
+; <4 x i16> input: the expected code also uses 32-bit lane adds (paddd).
+define i16 @test_hadd_int_i16(<4 x i16> %a1) {
+; CHECK-LABEL: test_hadd_int_i16
+; CHECK: pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT: paddd %xmm0, %xmm1
+; CHECK-NEXT: pshufd $237, %xmm1, %xmm0 # xmm0 = xmm1[1,3,2,3]
+; CHECK-NEXT: paddd %xmm1, %xmm0
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: retq
+  %1 = call i16 @llvm.hadd.i16.v4i16(<4 x i16> %a1)
+  ret i16 %1
+}
+
+declare i32 @llvm.hadd.i32.v3i32(<3 x i32>)
+
+; <3 x i32> input: the expected code sums the three lanes with scalar addl.
+define i32 @test1_hadd_int_i32(<3 x i32> %a1) {
+; CHECK-LABEL: test1_hadd_int_i32
+; CHECK: movd %xmm0, %eax
+; CHECK-NEXT: pshufd $229, %xmm0, %xmm1 # xmm1 = xmm0[1,1,2,3]
+; CHECK-NEXT: movd %xmm1, %ecx
+; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: pshufd $78, %xmm0, %xmm0 # xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: retq
+  %1 = call i32 @llvm.hadd.i32.v3i32(<3 x i32> %a1)
+  ret i32 %1
+}
+
+declare i32 @llvm.hadd.i32.v4i32(<4 x i32>)
+
+; <4 x i32> input: two pshufd/paddd steps, then lane 0 is moved to %eax.
+define i32 @test2_hadd_int_i32(<4 x i32> %a1) {
+; CHECK-LABEL: test2_hadd_int_i32
+; CHECK: pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT: paddd %xmm0, %xmm1
+; CHECK-NEXT: pshufd $237, %xmm1, %xmm0 # xmm0 = xmm1[1,3,2,3]
+; CHECK-NEXT: paddd %xmm1, %xmm0
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: retq
+  %1 = call i32 @llvm.hadd.i32.v4i32(<4 x i32> %a1)
+  ret i32 %1
+}
+
+declare i64 @llvm.hadd.i64.v2i64(<2 x i64>)
+
+; <2 x i64> input: a single pshufd/paddq step.
+define i64 @test1_hadd_int_i64(<2 x i64> %a1) {
+; CHECK-LABEL: test1_hadd_int_i64
+; CHECK: pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT: paddq %xmm0, %xmm1
+; CHECK-NEXT: movd %xmm1, %rax
+; CHECK-NEXT: retq
+  %1 = call i64 @llvm.hadd.i64.v2i64(<2 x i64> %a1)
+  ret i64 %1
+}
+
+declare i64 @llvm.hadd.i64.v4i64(<4 x i64>)
+
+; <4 x i64> input: each xmm half is reduced on its own, then joined by addq.
+define i64 @test2_hadd_int_i64(<4 x i64> %a1) {
+; CHECK-LABEL: test2_hadd_int_i64
+; CHECK: pshufd $78, %xmm1, %xmm2 # xmm2 = xmm1[2,3,0,1]
+; CHECK-NEXT: paddq %xmm1, %xmm2
+; CHECK-NEXT: movd %xmm2, %rcx
+; CHECK-NEXT: pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT: paddq %xmm0, %xmm1
+; CHECK-NEXT: movd %xmm1, %rax
+; CHECK-NEXT: addq %rcx, %rax
+; CHECK-NEXT: retq
+  %1 = call i64 @llvm.hadd.i64.v4i64(<4 x i64> %a1)
+  ret i64 %1
+}
Index: test/CodeGen/X86/hadd_float_expand.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/hadd_float_expand.ll
@@ -0,0 +1,73 @@
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -enable-unsafe-fp-math < %s | FileCheck %s -check-prefix=CHECK
+;
+; Checks the x86-64 code generated for the floating point llvm.hadd intrinsic.
+
+declare float @llvm.hadd.f32.v2f32(<2 x float>)
+
+; <2 x float> input: lane 1 is shuffled down and added with scalar addss.
+define float @test1_hadd_float_f32(<2 x float> %a1) {
+; CHECK-LABEL: test1_hadd_float_f32
+; CHECK: movaps %xmm0, %xmm1
+; CHECK-NEXT: shufps $229, %xmm1, %xmm1 # xmm1 = xmm1[1,1,2,3]
+; CHECK-NEXT: addss %xmm0, %xmm1
+; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+  %1 = call float @llvm.hadd.f32.v2f32(<2 x float> %a1)
+  ret float %1
+}
+
+declare float @llvm.hadd.f32.v4f32(<4 x float>)
+
+; <4 x float> input: two shuffle/addps steps.
+define float @test2_hadd_float_f32(<4 x float> %a1) {
+; CHECK-LABEL: test2_hadd_float_f32
+; CHECK: movapd %xmm0, %xmm1
+; CHECK-NEXT: shufpd $1, %xmm1, %xmm1 # xmm1 = xmm1[1,0]
+; CHECK-NEXT: addps %xmm0, %xmm1
+; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: shufps $237, %xmm0, %xmm0 # xmm0 = xmm0[1,3,2,3]
+; CHECK-NEXT: addps %xmm1, %xmm0
+; CHECK-NEXT: retq
+  %1 = call float @llvm.hadd.f32.v4f32(<4 x float> %a1)
+  ret float %1
+}
+
+declare double @llvm.hadd.f64.v2f64(<2 x double>)
+
+; <2 x double> input: a single shufpd/addpd step.
+define double @test1_hadd_float_f64(<2 x double> %a1) {
+; CHECK-LABEL: test1_hadd_float_f64
+; CHECK: movapd %xmm0, %xmm1
+; CHECK-NEXT: shufpd $1, %xmm1, %xmm1 # xmm1 = xmm1[1,0]
+; CHECK-NEXT: addpd %xmm0, %xmm1
+; CHECK-NEXT: movapd %xmm1, %xmm0
+; CHECK-NEXT: retq
+  %1 = call double @llvm.hadd.f64.v2f64(<2 x double> %a1)
+  ret double %1
+}
+
+declare double @llvm.hadd.f64.v8f64(<8 x double>)
+
+; <8 x double> input: the four xmm parts are reduced separately, then joined by addsd.
+define double @test2_hadd_float_f64(<8 x double> %a1) {
+; CHECK-LABEL: test2_hadd_float_f64
+; CHECK: movapd %xmm3, %xmm4
+; CHECK-NEXT: shufpd $1, %xmm4, %xmm4 # xmm4 = xmm4[1,0]
+; CHECK-NEXT: addpd %xmm3, %xmm4
+; CHECK-NEXT: movapd %xmm2, %xmm3
+; CHECK-NEXT: shufpd $1, %xmm3, %xmm3 # xmm3 = xmm3[1,0]
+; CHECK-NEXT: addpd %xmm2, %xmm3
+; CHECK-NEXT: addsd %xmm4, %xmm3
+; CHECK-NEXT: movapd %xmm1, %xmm2
+; CHECK-NEXT: shufpd $1, %xmm2, %xmm2 # xmm2 = xmm2[1,0]
+; CHECK-NEXT: addpd %xmm1, %xmm2
+; CHECK-NEXT: movapd %xmm0, %xmm1
+; CHECK-NEXT: shufpd $1, %xmm1, %xmm1 # xmm1 = xmm1[1,0]
+; CHECK-NEXT: addpd %xmm0, %xmm1
+; CHECK-NEXT: addsd %xmm3, %xmm2
+; CHECK-NEXT: addsd %xmm2, %xmm1
+; CHECK-NEXT: movapd %xmm1, %xmm0
+; CHECK-NEXT: retq
+  %1 = call double @llvm.hadd.f64.v8f64(<8 x double> %a1)
+  ret double %1
+}