Index: docs/LangRef.rst =================================================================== --- docs/LangRef.rst +++ docs/LangRef.rst @@ -9579,6 +9579,54 @@ %r2 = call float @llvm.fmuladd.f32(float %a, float %b, float %c) ; yields float:r2 = (a * b) + c + +'``llvm.hadd.*``' +^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. The operand is a vector of any integer or floating point type, and the result is a scalar. + +.. code-block:: llvm + + declare i32 @llvm.hadd.int.i32.v4i32(<4 x i32> %a) + declare float @llvm.hadd.float.f32.v4f32(<4 x float> %a) + +Overview: +""""""""" + +The ``llvm.hadd`` intrinsics return the horizontal (reduction) sum of the elements of the +vector operand, treating them as integers or floats. + +.. note:: + + These intrinsics are primarily used during the code generation stage of + compilation. They are generated by compiler passes such as the Loop and + SLP vectorizers. + The expectation is that frontends should not need to generate these + intrinsics themselves. + +Arguments: +"""""""""" + +The argument is a vector of integers or floating point numbers. + +Semantics: +"""""""""" + +The expression:: + + call i32 @llvm.hadd.int.i32.v4i32(<4 x i32> %a) + +is equivalent to:: + + %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + %2 = add <4 x i32> %a, %1 + %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + %4 = add <4 x i32> %2, %3 + %5 = extractelement <4 x i32> %4, i32 0 + + Half Precision Floating Point Intrinsics ---------------------------------------- Index: include/llvm/CodeGen/ISDOpcodes.h =================================================================== --- include/llvm/CodeGen/ISDOpcodes.h +++ include/llvm/CodeGen/ISDOpcodes.h @@ -331,6 +331,11 @@ /// Byte Swap and Counting operators. BSWAP, CTTZ, CTLZ, CTPOP, + /// HADD/FHADD - Horizontal sum across the elements of the sole + /// integer or floating point input vector. + /// These nodes are generated from the llvm.hadd.* intrinsics. 
+    HADD, FHADD,
+
     /// Bit counting operators with an undefined result for zero inputs.
     CTTZ_ZERO_UNDEF, CTLZ_ZERO_UNDEF,
Index: include/llvm/IR/Intrinsics.td
===================================================================
--- include/llvm/IR/Intrinsics.td
+++ include/llvm/IR/Intrinsics.td
@@ -595,6 +595,9 @@
 def int_clear_cache : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty],
                                 [], "llvm.clear_cache">;
 
+// Calculate the horizontal/reduction sum across the elements of the input vector.
+def int_hadd_int : Intrinsic<[llvm_anyint_ty], [llvm_anyvector_ty], [IntrNoMem]>;
+def int_hadd_float : Intrinsic<[llvm_anyfloat_ty], [llvm_anyvector_ty], [IntrNoMem]>;
 //===-------------------------- Masked Intrinsics -------------------------===//
 //
 def int_masked_store : Intrinsic<[], [llvm_anyvector_ty, LLVMPointerTo<0>,
Index: include/llvm/Target/TargetSelectionDAG.td
===================================================================
--- include/llvm/Target/TargetSelectionDAG.td
+++ include/llvm/Target/TargetSelectionDAG.td
@@ -386,6 +386,12 @@
 def umin       : SDNode<"ISD::UMIN"      , SDTIntBinOp>;
 def umax       : SDNode<"ISD::UMAX"      , SDTIntBinOp>;
 
+// Horizontal add takes one vector operand and produces a scalar, so the
+// same-type unary profiles do not describe it.
+def SDTHAdd    : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisVec<1>]>;
+def SDTFHAdd   : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVec<1>]>;
+def hadd       : SDNode<"ISD::HADD"      , SDTHAdd>;
+def fhadd      : SDNode<"ISD::FHADD"     , SDTFHAdd>;
 def sext_inreg : SDNode<"ISD::SIGN_EXTEND_INREG", SDTExtInreg>;
 def bswap      : SDNode<"ISD::BSWAP"     , SDTIntUnaryOp>;
 def ctlz       : SDNode<"ISD::CTLZ"      , SDTIntUnaryOp>;
Index: lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2871,6 +2871,41 @@
   case ISD::BSWAP:
     Results.push_back(ExpandBSWAP(Node->getOperand(0), dl));
     break;
+  case ISD::FHADD:
+  case ISD::HADD: {
+    // Expand a horizontal add into shuffle + add steps: each step folds the
+    // upper half of the live lanes onto the lower half, so the total ends up
+    // in lane 0, which is then extracted.
+    SDValue OpVal = Node->getOperand(0);
+    EVT VT = OpVal.getValueType();
+    unsigned NumElems = VT.getVectorNumElements();
+    EVT EltVT = VT.getVectorElementType();
+    // Halving only visits every lane when the lane count is a power of two;
+    // any other count would silently drop elements from the sum.
+    assert((NumElems & (NumElems - 1)) == 0 &&
+           "HADD/FHADD expansion requires a power-of-2 element count");
+    const unsigned AddOpc =
+        Node->getOpcode() == ISD::HADD ? ISD::ADD : ISD::FADD;
+
+    // The mask is deliberately not reset between steps: entries above the
+    // live half only select lanes whose sums are never read.
+    SmallVector<int, 8> ShuffleMask(NumElems, -1);
+    for (unsigned i = NumElems; i != 1; i >>= 1) {
+      // Move the upper half of the vector to the lower half.
+      for (unsigned j = 0; j != i / 2; ++j)
+        ShuffleMask[j] = (i / 2 + j);
+      SDValue Shuffle = DAG.getVectorShuffle(VT, dl, OpVal, DAG.getUNDEF(VT),
+                                             &ShuffleMask[0]);
+      OpVal = DAG.getNode(AddOpc, dl, VT, OpVal, Shuffle);
+    }
+    Tmp1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, OpVal,
+                       DAG.getConstant(0, dl, TLI.getVectorIdxTy()));
+    // The integer lane type may differ from the node's result type.
+    if (Node->getOpcode() == ISD::HADD)
+      Tmp1 = DAG.getAnyExtOrTrunc(Tmp1, dl, Node->getValueType(0));
+    Results.push_back(Tmp1);
+    break;
+  }
   case ISD::FRAMEADDR:
   case ISD::RETURNADDR:
   case ISD::FRAME_TO_ARGS_OFFSET:
Index: lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -884,6 +884,7 @@
   case ISD::SRL:
   case ISD::ROTL:
   case ISD::ROTR: Res = PromoteIntOp_Shift(N); break;
+  case ISD::HADD: Res = PromoteIntOp_HADD(N); break;
   }
 
   // If the result is null, the sub-method took care of registering results etc.
@@ -1217,6 +1218,13 @@
                                 N->getOperand(0).getValueType().getScalarType());
 }
 
+// Promote the vector operand of an HADD node. The rebuilt node keeps the
+// opcode and the result type of the original.
+SDValue DAGTypeLegalizer::PromoteIntOp_HADD(SDNode *N) {
+  SDValue Op = GetPromotedInteger(N->getOperand(0));
+  return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), Op);
+}
+
 //===----------------------------------------------------------------------===//
 //  Integer Result Expansion
Index: lib/CodeGen/SelectionDAG/LegalizeTypes.h
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -294,6 +294,7 @@
   SDValue PromoteIntOp_ZERO_EXTEND(SDNode *N);
   SDValue PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo);
   SDValue PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_HADD(SDNode *N);
 
   void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code);
 
@@ -639,6 +640,7 @@
   SDValue SplitVecOp_VSELECT(SDNode *N, unsigned OpNo);
   SDValue SplitVecOp_UnaryOp(SDNode *N);
   SDValue SplitVecOp_TruncateHelper(SDNode *N);
+  SDValue SplitVecOp_HADD(SDNode *N);
 
   SDValue SplitVecOp_BITCAST(SDNode *N);
   SDValue SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N);
@@ -710,6 +712,7 @@
   SDValue WidenVecOp_SETCC(SDNode* N);
 
   SDValue WidenVecOp_Convert(SDNode *N);
+  SDValue WidenVecOp_FHADD(SDNode *N);
 
   //===--------------------------------------------------------------------===//
   // Vector Widening Utilities Support: LegalizeVectorTypes.cpp
Index: lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -59,6 +59,9 @@
   /// \brief Implements unrolling a VSETCC.
   SDValue UnrollVSETCC(SDValue Op);
 
+  /// \brief Implements unrolling a HADD/FHADD.
+  SDValue UnrollHADD(SDValue Op);
+
   /// \brief Implement expand-based legalization of vector operations.
   ///
   /// This is just a high-level routine to dispatch to specific code paths for
@@ -704,6 +707,9 @@
     return ExpandFNEG(Op);
   case ISD::SETCC:
     return UnrollVSETCC(Op);
+  case ISD::HADD:
+  case ISD::FHADD:
+    return UnrollHADD(Op);
   default:
     return DAG.UnrollVectorOp(Op.getNode());
   }
@@ -1010,6 +1016,33 @@
   return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
 }
 
+SDValue VectorLegalizer::UnrollHADD(SDValue Op) {
+  // Scalarize a horizontal add: extract every lane of the vector operand and
+  // chain scalar adds, starting from lane 0.
+  SDValue OpVal = Op.getOperand(0);
+  // The node's own type is the scalar result, so the lane count and the lane
+  // type must be read from the vector operand.
+  EVT VecVT = OpVal.getValueType();
+  EVT EltVT = VecVT.getVectorElementType();
+  unsigned NumElems = VecVT.getVectorNumElements();
+  bool IsInt = Op->getOpcode() == ISD::HADD;
+  unsigned AddOpc = IsInt ? ISD::ADD : ISD::FADD;
+  SDLoc dl(Op);
+
+  // Seeding the sum with lane 0 keeps single-element vectors in range.
+  SDValue Sum = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, OpVal,
+                            DAG.getConstant(0, dl, TLI.getVectorIdxTy()));
+  for (unsigned i = 1; i < NumElems; ++i) {
+    SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, OpVal,
+                               DAG.getConstant(i, dl, TLI.getVectorIdxTy()));
+    // Accumulate in the lane type so the add's operands and result agree.
+    Sum = DAG.getNode(AddOpc, dl, EltVT, Elem, Sum);
+  }
+  // The integer lane type may differ from the node's result type.
+  if (IsInt)
+    Sum = DAG.getAnyExtOrTrunc(Sum, dl, Op.getValueType());
+  return Sum;
+}
 }
 
 bool SelectionDAG::LegalizeVectors() {
Index: lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1394,6 +1394,10 @@
   case ISD::FTRUNC:
     Res = SplitVecOp_UnaryOp(N);
     break;
+  case ISD::HADD:
+  case ISD::FHADD:
+    Res = SplitVecOp_HADD(N);
+    break;
   }
 }
 
@@ -1479,6 +1485,20 @@
                      JoinIntegers(Lo, Hi));
 }
 
+SDValue DAGTypeLegalizer::SplitVecOp_HADD(SDNode *N) {
+  // Reduce each half of the split operand on its own, then combine the two
+  // partial sums with one scalar add.
+  SDLoc dl(N);
+  EVT ResVT = N->getValueType(0);
+  unsigned Opc = N->getOpcode();
+  SDValue Lo, Hi;
+  GetSplitVector(N->getOperand(0), Lo, Hi);
+  Lo = DAG.getNode(Opc, dl, ResVT, Lo);
+  Hi = DAG.getNode(Opc, dl, ResVT, Hi);
+  return DAG.getNode(Opc == ISD::HADD ? ISD::ADD : ISD::FADD, dl, ResVT, Lo,
+                     Hi);
+}
+
 SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N) {
   // We know that the extracted result type is legal.
   EVT SubVT = N->getValueType(0);
@@ -2824,6 +2844,9 @@
   case ISD::TRUNCATE:
     Res = WidenVecOp_Convert(N);
     break;
+  case ISD::FHADD:
+    Res = WidenVecOp_FHADD(N);
+    break;
   }
 
   // If Res is null, the sub-method took care of registering the result.
@@ -2842,6 +2865,23 @@
   return false;
 }
 
+SDValue DAGTypeLegalizer::WidenVecOp_FHADD(SDNode *N) {
+  SDLoc dl(N);
+  SDValue Op = N->getOperand(0);
+  unsigned NumElems = Op.getValueType().getVectorNumElements();
+  SDValue InOp = GetWidenedVector(Op);
+  EVT WideVT = InOp.getValueType();
+  // The reduction reads every lane of the widened vector, but the lanes past
+  // the original element count carry no defined value. Overwrite them with
+  // -0.0, the identity of floating point addition (+0.0 would turn a -0.0
+  // total into +0.0).
+  SDValue Identity =
+      DAG.getConstantFP(-0.0, dl, WideVT.getVectorElementType());
+  for (unsigned i = NumElems, e = WideVT.getVectorNumElements(); i != e; ++i)
+    InOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, WideVT, InOp, Identity,
+                       DAG.getConstant(i, dl, TLI.getVectorIdxTy()));
+  return DAG.getNode(ISD::FHADD, dl, N->getValueType(0), InOp);
+}
+
 SDValue DAGTypeLegalizer::WidenVecOp_EXTEND(SDNode *N) {
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4621,6 +4621,16 @@
                              getValue(I.getArgOperand(0)).getValueType(),
                              getValue(I.getArgOperand(0))));
     return nullptr;
+  case Intrinsic::hadd_int:
+    setValue(&I, DAG.getNode(ISD::HADD, sdl,
+                             TLI.getValueType(I.getType()),
+                             getValue(I.getArgOperand(0))));
+    return nullptr;
+  case Intrinsic::hadd_float:
+    setValue(&I, DAG.getNode(ISD::FHADD, sdl,
+                             TLI.getValueType(I.getType()),
+                             getValue(I.getArgOperand(0))));
+    return nullptr;
   case Intrinsic::cttz: {
     SDValue Arg = getValue(I.getArgOperand(0));
     ConstantInt *CI = cast<ConstantInt>(I.getArgOperand(1));
Index:
lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -161,6 +161,8 @@
   case ISD::FLOG:                       return "flog";
   case ISD::FLOG2:                      return "flog2";
   case ISD::FLOG10:                     return "flog10";
+  case ISD::HADD:                       return "hadd";
+  case ISD::FHADD:                      return "fhadd";
 
   // Binary operators
   case ISD::ADD:                        return "add";
Index: lib/CodeGen/TargetLoweringBase.cpp
===================================================================
--- lib/CodeGen/TargetLoweringBase.cpp
+++ lib/CodeGen/TargetLoweringBase.cpp
@@ -823,6 +823,8 @@
     setOperationAction(ISD::USUBO, VT, Expand);
     setOperationAction(ISD::SMULO, VT, Expand);
     setOperationAction(ISD::UMULO, VT, Expand);
+    setOperationAction(ISD::HADD, VT, Expand);
+    setOperationAction(ISD::FHADD, VT, Expand);
 
     // These library functions default to expand.
     setOperationAction(ISD::FROUND, VT, Expand);
Index: test/CodeGen/X86/hadd_expand.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/hadd_expand.ll
@@ -0,0 +1,72 @@
+; Integer llvm.hadd reductions; the CHECK lines expect shuffle + add sequences.
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+declare i8 @llvm.hadd.int.i8.v4i8(<4 x i8>)
+
+define i8 @test_hadd_int_i8(<4 x i8> %a1) {
+; CHECK-LABEL: test_hadd_int_i8:
+; CHECK: pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT: paddd %xmm0, %xmm1
+; CHECK-NEXT: pshufd $237, %xmm1, %xmm0 # xmm0 = xmm1[1,3,2,3]
+; CHECK-NEXT: paddd %xmm1, %xmm0
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: retq
+  %1 = call i8 @llvm.hadd.int.i8.v4i8(<4 x i8> %a1)
+  ret i8 %1
+}
+
+declare i16 @llvm.hadd.int.i16.v4i16(<4 x i16>)
+
+define i16 @test_hadd_int_i16(<4 x i16> %a1) {
+; CHECK-LABEL: test_hadd_int_i16:
+; CHECK: pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT: paddd %xmm0, %xmm1
+; CHECK-NEXT: pshufd $237, %xmm1, %xmm0 # xmm0 = xmm1[1,3,2,3]
+; CHECK-NEXT: paddd %xmm1, %xmm0
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: retq
+  %1 = call i16 @llvm.hadd.int.i16.v4i16(<4 x i16> %a1)
+  ret i16 %1
+}
+
+declare i32 @llvm.hadd.int.i32.v4i32(<4 x i32>)
+
+define i32 @test_hadd_int_i32(<4 x i32> %a1) {
+; CHECK-LABEL: test_hadd_int_i32:
+; CHECK: pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT: paddd %xmm0, %xmm1
+; CHECK-NEXT: pshufd $237, %xmm1, %xmm0 # xmm0 = xmm1[1,3,2,3]
+; CHECK-NEXT: paddd %xmm1, %xmm0
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: retq
+  %1 = call i32 @llvm.hadd.int.i32.v4i32(<4 x i32> %a1)
+  ret i32 %1
+}
+
+declare i64 @llvm.hadd.int.i64.v2i64(<2 x i64>)
+
+define i64 @test1_hadd_int_i64(<2 x i64> %a1) {
+; CHECK-LABEL: test1_hadd_int_i64:
+; CHECK: pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT: paddq %xmm0, %xmm1
+; CHECK-NEXT: movd %xmm1, %rax
+; CHECK-NEXT: retq
+  %1 = call i64 @llvm.hadd.int.i64.v2i64(<2 x i64> %a1)
+  ret i64 %1
+}
+
+declare i64 @llvm.hadd.int.i64.v4i64(<4 x i64>)
+
+define i64 @test2_hadd_int_i64(<4 x i64> %a1) {
+; CHECK-LABEL: test2_hadd_int_i64:
+; CHECK: pshufd $78, %xmm1, %xmm2 # xmm2 = xmm1[2,3,0,1]
+; CHECK-NEXT: paddq %xmm1, %xmm2
+; CHECK-NEXT: movd %xmm2, %rcx
+; CHECK-NEXT: pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT: paddq %xmm0, %xmm1
+; CHECK-NEXT: movd %xmm1, %rax
+; CHECK-NEXT: addq %rcx, %rax
+; CHECK-NEXT: retq
+  %1 = call i64 @llvm.hadd.int.i64.v4i64(<4 x i64> %a1)
+  ret i64 %1
+}
Index: test/CodeGen/X86/hadd_float_expand.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/hadd_float_expand.ll
@@ -0,0 +1,73 @@
+; Floating point llvm.hadd reductions; the CHECK lines expect shuffle + add sequences.
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+declare float @llvm.hadd.float.f32.v2f32(<2 x float>)
+
+; NOTE(review): the sequence expected here is identical to the <4 x float> case
+; below, i.e. two shuffle/add steps for a two-element input. Confirm that the
+; extra lanes cannot contribute to the result.
+define float @test1_hadd_float_f32(<2 x float> %a1) {
+; CHECK-LABEL: test1_hadd_float_f32:
+; CHECK: movapd %xmm0, %xmm1
+; CHECK-NEXT: shufpd $1, %xmm1, %xmm1 # xmm1 = xmm1[1,0]
+; CHECK-NEXT: addps %xmm0, %xmm1
+; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: shufps $237, %xmm0, %xmm0 # xmm0 = xmm0[1,3,2,3]
+; CHECK-NEXT: addps %xmm1, %xmm0
+; CHECK-NEXT: retq
+  %1 = call float @llvm.hadd.float.f32.v2f32(<2 x float> %a1)
+  ret float %1
+}
+
+declare float @llvm.hadd.float.f32.v4f32(<4 x float>)
+
+define float @test2_hadd_float_f32(<4 x float> %a1) {
+; CHECK-LABEL: test2_hadd_float_f32:
+; CHECK: movapd %xmm0, %xmm1
+; CHECK-NEXT: shufpd $1, %xmm1, %xmm1 # xmm1 = xmm1[1,0]
+; CHECK-NEXT: addps %xmm0, %xmm1
+; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: shufps $237, %xmm0, %xmm0 # xmm0 = xmm0[1,3,2,3]
+; CHECK-NEXT: addps %xmm1, %xmm0
+; CHECK-NEXT: retq
+  %1 = call float @llvm.hadd.float.f32.v4f32(<4 x float> %a1)
+  ret float %1
+}
+
+declare double @llvm.hadd.float.f64.v2f64(<2 x double>)
+
+define double @test1_hadd_float_f64(<2 x double> %a1) {
+; CHECK-LABEL: test1_hadd_float_f64:
+; CHECK: movapd %xmm0, %xmm1
+; CHECK-NEXT: shufpd $1, %xmm1, %xmm1 # xmm1 = xmm1[1,0]
+; CHECK-NEXT: addpd %xmm0, %xmm1
+; CHECK-NEXT: movapd %xmm1, %xmm0
+; CHECK-NEXT: retq
+  %1 = call double @llvm.hadd.float.f64.v2f64(<2 x double> %a1)
+  ret double %1
+}
+
+declare double @llvm.hadd.float.f64.v8f64(<8 x double>)
+
+define double @test2_hadd_float_f64(<8 x double> %a1) {
+; CHECK-LABEL: test2_hadd_float_f64:
+; CHECK: movapd %xmm3, %xmm4
+; CHECK-NEXT: shufpd $1, %xmm4, %xmm4 # xmm4 = xmm4[1,0]
+; CHECK-NEXT: addpd %xmm3, %xmm4
+; CHECK-NEXT: movapd %xmm2, %xmm3
+; CHECK-NEXT: shufpd $1, %xmm3, %xmm3 # xmm3 = xmm3[1,0]
+; CHECK-NEXT: addpd %xmm2, %xmm3
+; CHECK-NEXT: addsd %xmm4, %xmm3
+; CHECK-NEXT: movapd %xmm1, %xmm2
+; CHECK-NEXT: shufpd $1, %xmm2, %xmm2 # xmm2 = xmm2[1,0]
+; CHECK-NEXT: addpd %xmm1, %xmm2
+; CHECK-NEXT: movapd %xmm0, %xmm1
+; CHECK-NEXT: shufpd $1, %xmm1, %xmm1 # xmm1 = xmm1[1,0]
+; CHECK-NEXT: addpd %xmm0, %xmm1
+; CHECK-NEXT: addsd %xmm2, %xmm1
+; CHECK-NEXT: addsd %xmm3, %xmm1
+; CHECK-NEXT: movapd %xmm1, %xmm0
+; CHECK-NEXT: retq
+  %1 = call double @llvm.hadd.float.f64.v8f64(<8 x double> %a1)
+  ret double %1
+}