Index: include/llvm/CodeGen/MachineValueType.h
===================================================================
--- include/llvm/CodeGen/MachineValueType.h
+++ include/llvm/CodeGen/MachineValueType.h
@@ -196,21 +196,24 @@
   /// is32BitVector - Return true if this is a 32-bit vector type.
   bool is32BitVector() const {
     return (SimpleTy == MVT::v4i8 || SimpleTy == MVT::v2i16 ||
-            SimpleTy == MVT::v1i32);
+            SimpleTy == MVT::v1i32 || SimpleTy == MVT::v2f16 ||
+            SimpleTy == MVT::v1f32);
   }
 
   /// is64BitVector - Return true if this is a 64-bit vector type.
   bool is64BitVector() const {
     return (SimpleTy == MVT::v8i8 || SimpleTy == MVT::v4i16 ||
             SimpleTy == MVT::v2i32 || SimpleTy == MVT::v1i64 ||
-            SimpleTy == MVT::v1f64 || SimpleTy == MVT::v2f32);
+            SimpleTy == MVT::v4f16 || SimpleTy == MVT::v2f32 ||
+            SimpleTy == MVT::v1f64);
   }
 
   /// is128BitVector - Return true if this is a 128-bit vector type.
   bool is128BitVector() const {
     return (SimpleTy == MVT::v16i8 || SimpleTy == MVT::v8i16 ||
             SimpleTy == MVT::v4i32 || SimpleTy == MVT::v2i64 ||
-            SimpleTy == MVT::v4f32 || SimpleTy == MVT::v2f64);
+            SimpleTy == MVT::v8f16 || SimpleTy == MVT::v4f32 ||
+            SimpleTy == MVT::v2f64);
   }
 
   /// is256BitVector - Return true if this is a 256-bit vector type.
Index: lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -4104,12 +4104,13 @@
   MVT OVT = Node->getSimpleValueType(0);
   if (Node->getOpcode() == ISD::UINT_TO_FP ||
       Node->getOpcode() == ISD::SINT_TO_FP ||
-      Node->getOpcode() == ISD::SETCC) {
+      Node->getOpcode() == ISD::SETCC ||
+      Node->getOpcode() == ISD::FNEARBYINT) {
     OVT = Node->getOperand(0).getSimpleValueType();
   }
   MVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), OVT);
   SDLoc dl(Node);
-  SDValue Tmp1, Tmp2, Tmp3;
+  SDValue Tmp1, Tmp2, Tmp3, Tmp4;
   switch (Node->getOpcode()) {
   case ISD::CTTZ:
   case ISD::CTTZ_ZERO_UNDEF:
@@ -4262,9 +4263,13 @@
                                   Tmp1, Tmp2, Node->getOperand(2)));
     break;
   }
+  case ISD::FADD:
+  case ISD::FSUB:
+  case ISD::FMUL:
   case ISD::FDIV:
   case ISD::FREM:
-  case ISD::FPOW: {
+  case ISD::FPOW:
+  case ISD::FCOPYSIGN: {
    Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0));
    Tmp2 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(1));
    Tmp3 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2);
@@ -4273,13 +4278,41 @@
     break;
   }
   case ISD::FLOG2:
+  case ISD::FLOG10:
   case ISD::FEXP2:
+  case ISD::FEXP:
   case ISD::FLOG:
-  case ISD::FEXP: {
+  case ISD::FCEIL:
+  case ISD::FFLOOR:
+  case ISD::FABS:
+  case ISD::FNEARBYINT:
+  case ISD::FNEG:
+  case ISD::FROUND:
+  case ISD::FRINT:
+  case ISD::FSIN:
+  case ISD::FCOS:
+  case ISD::FSQRT:
+  case ISD::FTRUNC: {
     Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0));
     Tmp2 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1);
-    Results.push_back(DAG.getNode(ISD::FP_ROUND, dl, OVT,
-                                  Tmp2, DAG.getIntPtrConstant(0)));
+    Results.push_back(
+        DAG.getNode(ISD::FP_ROUND, dl, OVT, Tmp2, DAG.getIntPtrConstant(0)));
+    break;
+  }
+  case ISD::FPOWI: {
+    Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0));
+    Tmp2 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Node->getOperand(1));
+    Results.push_back(
+        DAG.getNode(ISD::FP_ROUND, dl, OVT, Tmp2, DAG.getIntPtrConstant(0)));
+    break;
+  }
+  case ISD::FMA: {
+    Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0));
+    Tmp2 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(1));
+    Tmp3 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(2));
+    Tmp4 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2, Tmp3);
+    Results.push_back(
+        DAG.getNode(ISD::FP_ROUND, dl, OVT, Tmp4, DAG.getIntPtrConstant(0)));
     break;
   }
   }
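All of the new PromoteNode cases above share one shape: FP_EXTEND each floating-point operand to the promoted type, run the operation there, and round the single FP result back. A minimal standalone sketch of that shape, reusing the same SelectionDAG calls as the hunk (the helper name and free-function framing are illustrative, not part of the patch):

    #include "llvm/CodeGen/SelectionDAG.h"
    using namespace llvm;

    // Promote a unary FP node such as an f16 FSQRT: widen the operand to the
    // promoted type NVT, evaluate there, then round back to the original type.
    static SDValue promoteUnaryFP(SelectionDAG &DAG, SDNode *Node, MVT NVT) {
      SDLoc dl(Node);
      MVT OVT = Node->getSimpleValueType(0);
      SDValue Wide = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0));
      SDValue Res = DAG.getNode(Node->getOpcode(), dl, NVT, Wide);
      return DAG.getNode(ISD::FP_ROUND, dl, OVT, Res, DAG.getIntPtrConstant(0));
    }

The binary group, FPOWI and FMA differ only in how many operands receive the FP_EXTEND (FPOWI's integer exponent is passed through unchanged); FMA's third extended operand is why Tmp4 is now needed.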
Index: lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -1822,6 +1822,12 @@
   SDLoc dl(N);
   EVT VT = N->getValueType(0);
   SDValue Op = N->getOperand(0);
+
+  // There are no libcalls for f16, so f16 -> i* must go via f32
+  if (Op.getValueType() == MVT::f16) {
+    Op = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op);
+  }
+
   RTLIB::Libcall LC = RTLIB::getFPTOSINT(Op.getValueType(), VT);
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-sint conversion!");
   SplitInteger(TLI.makeLibCall(DAG, LC, VT, &Op, 1, true/*irrelevant*/,
@@ -1834,6 +1840,12 @@
   SDLoc dl(N);
   EVT VT = N->getValueType(0);
   SDValue Op = N->getOperand(0);
+
+  // There are no libcalls for f16, so f16 -> i* must go via f32
+  if (Op.getValueType() == MVT::f16) {
+    Op = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op);
+  }
+
   RTLIB::Libcall LC = RTLIB::getFPTOUINT(Op.getValueType(), VT);
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-uint conversion!");
   SplitInteger(TLI.makeLibCall(DAG, LC, VT, &Op, 1, false/*irrelevant*/,
@@ -2691,10 +2703,23 @@
 SDValue DAGTypeLegalizer::ExpandIntOp_SINT_TO_FP(SDNode *N) {
   SDValue Op = N->getOperand(0);
   EVT DstVT = N->getValueType(0);
+  SDLoc dl(N);
+
+  // There are no libcalls for f16, so i* -> f16 must go via f32
+  if (DstVT == MVT::f16)
+    DstVT = MVT::f32;
+
   RTLIB::Libcall LC = RTLIB::getSINTTOFP(Op.getValueType(), DstVT);
   assert(LC != RTLIB::UNKNOWN_LIBCALL &&
          "Don't know how to expand this SINT_TO_FP!");
-  return TLI.makeLibCall(DAG, LC, DstVT, &Op, 1, true, SDLoc(N)).first;
+  SDValue Result =
+      TLI.makeLibCall(DAG, LC, DstVT, &Op, 1, true, SDLoc(N)).first;
+
+  if (DstVT == N->getValueType(0))
+    return Result;
+  else
+    return DAG.getNode(ISD::FP_ROUND, dl, MVT::f16, Result,
+                       DAG.getIntPtrConstant(0));
 }
 
 SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
@@ -2797,6 +2822,10 @@
   EVT DstVT = N->getValueType(0);
   SDLoc dl(N);
 
+  // There are no libcalls for f16, so i* -> f16 must go via f32
+  if (DstVT == MVT::f16)
+    DstVT = MVT::f32;
+
   // The following optimization is valid only if every value in SrcVT (when
   // treated as signed) is representable in DstVT.  Check that the mantissa
   // size of DstVT is >= than the number of bits in SrcVT -1.
@@ -2856,14 +2885,26 @@
                                  MachinePointerInfo::getConstantPool(),
                                  MVT::f32, false, false, false, Alignment);
-    return DAG.getNode(ISD::FADD, dl, DstVT, SignedConv, Fudge);
+    SDValue Result = DAG.getNode(ISD::FADD, dl, DstVT, SignedConv, Fudge);
+
+    if (DstVT == N->getValueType(0))
+      return Result;
+    else
+      return DAG.getNode(ISD::FP_ROUND, dl, MVT::f16, Result,
+                         DAG.getIntPtrConstant(0));
   }
 
   // Otherwise, use a libcall.
   RTLIB::Libcall LC = RTLIB::getUINTTOFP(SrcVT, DstVT);
   assert(LC != RTLIB::UNKNOWN_LIBCALL &&
          "Don't know how to expand this UINT_TO_FP!");
-  return TLI.makeLibCall(DAG, LC, DstVT, &Op, 1, true, dl).first;
+  SDValue Result = TLI.makeLibCall(DAG, LC, DstVT, &Op, 1, true, dl).first;
+
+  if (DstVT == N->getValueType(0))
+    return Result;
+  else
+    return DAG.getNode(ISD::FP_ROUND, dl, MVT::f16, Result,
+                       DAG.getIntPtrConstant(0));
 }
 
 SDValue DAGTypeLegalizer::ExpandIntOp_ATOMIC_STORE(SDNode *N) {
Index: lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -382,14 +382,23 @@
 
   for (unsigned j = 0; j != Op.getNumOperands(); ++j) {
     if (Op.getOperand(j).getValueType().isVector())
-      Operands[j] = DAG.getNode(ISD::BITCAST, dl, NVT, Op.getOperand(j));
+      if (Op.getOperand(j)
+              .getValueType()
+              .getVectorElementType()
+              .isFloatingPoint())
+        Operands[j] = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op.getOperand(j));
+      else
+        Operands[j] = DAG.getNode(ISD::BITCAST, dl, NVT, Op.getOperand(j));
     else
       Operands[j] = Op.getOperand(j);
   }
 
   Op = DAG.getNode(Op.getOpcode(), dl, NVT, Operands);
-
-  return DAG.getNode(ISD::BITCAST, dl, VT, Op);
+  if (VT.isFloatingPoint() ||
+      (VT.isVector() && VT.getVectorElementType().isFloatingPoint()))
+    return DAG.getNode(ISD::FP_ROUND, dl, VT, Op, DAG.getIntPtrConstant(0));
+  else
+    return DAG.getNode(ISD::BITCAST, dl, VT, Op);
 }
 
 SDValue VectorLegalizer::PromoteINT_TO_FP(SDValue Op) {
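One detail in the LegalizeVectorOps hunk is worth spelling out: when the original and promoted vector types are both floating-point, the old BITCAST would reinterpret lane bits rather than convert values, so the patch switches to FP_EXTEND/FP_ROUND for FP element types. A host-side illustration of the difference on a single lane (plain C++, not LLVM code):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
      uint16_t half_one = 0x3C00; // 1.0 in IEEE-754 binary16

      // What BITCAST does: reinterpret the bit pattern as binary32.
      uint32_t w = half_one;
      float reinterpreted;
      std::memcpy(&reinterpreted, &w, sizeof reinterpreted);

      // What FP_EXTEND does: convert the value; binary16 1.0 is exactly
      // binary32 1.0 (0x3F800000).
      float converted = 1.0f;

      std::printf("bitcast %g vs fp_extend %g\n", reinterpreted, converted);
      return 0; // prints a ~2e-41 subnormal for the bitcast, 1 for the conversion
    }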
Index: lib/Target/AArch64/AArch64CallingConvention.td
===================================================================
--- lib/Target/AArch64/AArch64CallingConvention.td
+++ lib/Target/AArch64/AArch64CallingConvention.td
@@ -60,18 +60,18 @@
                                            [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
   CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
                                           [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
-  CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32],
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
            CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
                                    [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
-  CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],
+  CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
            CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
 
   // If more than will fit in registers, pass them on the stack instead.
   CCIfType<[i1, i8, i16, f16], CCAssignToStack<8, 8>>,
   CCIfType<[i32, f32], CCAssignToStack<8, 8>>,
-  CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8],
+  CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16],
            CCAssignToStack<8, 8>>,
-  CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],
+  CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
            CCAssignToStack<16, 16>>
 ]>;
 
@@ -96,10 +96,10 @@
                                            [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
   CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
                                           [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
-  CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32],
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
            CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
                                    [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
-  CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],
+  CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
            CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>
 ]>;
 
@@ -139,19 +139,20 @@
                                            [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
   CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
                                           [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
-  CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32],
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
            CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
                                    [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
-  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
            CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
 
   // If more than will fit in registers, pass them on the stack instead.
   CCIf<"ValVT == MVT::i1 || ValVT == MVT::i8", CCAssignToStack<1, 1>>,
   CCIf<"ValVT == MVT::i16 || ValVT == MVT::f16", CCAssignToStack<2, 2>>,
   CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
-  CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8],
+  CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16],
            CCAssignToStack<8, 8>>,
-  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], CCAssignToStack<16, 16>>
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+           CCAssignToStack<16, 16>>
 ]>;
 
 def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
@@ -165,8 +166,10 @@
 
   // Everything is on the stack.
   // i128 is split to two i64s, and its stack alignment is 16 bytes.
   CCIfType<[i64], CCIfSplit<CCAssignToStackWithShadow<8, 16, [X7]>>>,
-  CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], CCAssignToStack<8, 8>>,
-  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], CCAssignToStack<16, 16>>
+  CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+           CCAssignToStack<8, 8>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+           CCAssignToStack<16, 16>>
 ]>;
 
 // The WebKit_JS calling convention only passes the first argument (the callee)
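The calling-convention change is mechanical: v4f16 joins the other 64-bit short vectors (D registers, 8-byte stack slots) and v8f16 joins the 128-bit ones (Q registers, 16-byte slots), in both the AAPCS and Darwin variants. Seen from C (ACLE intrinsics header; illustrative, not part of the patch):

    #include <arm_neon.h>

    // v4f16 is a 64-bit short vector: under AAPCS64 the argument arrives in
    // d0 and the result is returned in d0, with no memory traffic.
    float16x4_t pass_through(float16x4_t v) { return v; }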
Index: lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -2204,9 +2204,9 @@
       return SelectLoad(Node, 2, AArch64::LD1Twov8b, AArch64::dsub0);
     else if (VT == MVT::v16i8)
       return SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectLoad(Node, 2, AArch64::LD1Twov2s, AArch64::dsub0);
@@ -2222,9 +2222,9 @@
       return SelectLoad(Node, 3, AArch64::LD1Threev8b, AArch64::dsub0);
     else if (VT == MVT::v16i8)
       return SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectLoad(Node, 3, AArch64::LD1Threev2s, AArch64::dsub0);
@@ -2240,9 +2240,9 @@
       return SelectLoad(Node, 4, AArch64::LD1Fourv8b, AArch64::dsub0);
     else if (VT == MVT::v16i8)
       return SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectLoad(Node, 4, AArch64::LD1Fourv2s, AArch64::dsub0);
@@ -2258,9 +2258,9 @@
       return SelectLoad(Node, 2, AArch64::LD2Twov8b, AArch64::dsub0);
     else if (VT == MVT::v16i8)
       return SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectLoad(Node, 2, AArch64::LD2Twov2s, AArch64::dsub0);
@@ -2276,9 +2276,9 @@
       return SelectLoad(Node, 3, AArch64::LD3Threev8b, AArch64::dsub0);
     else if (VT == MVT::v16i8)
       return SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectLoad(Node, 3, AArch64::LD3Threev2s, AArch64::dsub0);
@@ -2294,9 +2294,9 @@
       return SelectLoad(Node, 4, AArch64::LD4Fourv8b, AArch64::dsub0);
     else if (VT == MVT::v16i8)
       return SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectLoad(Node, 4, AArch64::LD4Fourv2s, AArch64::dsub0);
@@ -2312,9 +2312,9 @@
       return SelectLoad(Node, 2, AArch64::LD2Rv8b, AArch64::dsub0);
     else if (VT == MVT::v16i8)
       return SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectLoad(Node, 2, AArch64::LD2Rv2s, AArch64::dsub0);
@@ -2330,9 +2330,9 @@
       return SelectLoad(Node, 3, AArch64::LD3Rv8b, AArch64::dsub0);
     else if (VT == MVT::v16i8)
       return SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectLoad(Node, 3, AArch64::LD3Rv2s, AArch64::dsub0);
@@ -2348,9 +2348,9 @@
       return SelectLoad(Node, 4, AArch64::LD4Rv8b, AArch64::dsub0);
     else if (VT == MVT::v16i8)
       return SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectLoad(Node, 4, AArch64::LD4Rv2s, AArch64::dsub0);
@@ -2364,7 +2364,8 @@
   case Intrinsic::aarch64_neon_ld2lane:
     if (VT == MVT::v16i8 || VT == MVT::v8i8)
       return SelectLoadLane(Node, 2, AArch64::LD2i8);
-    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+             VT == MVT::v8f16)
       return SelectLoadLane(Node, 2, AArch64::LD2i16);
     else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
              VT == MVT::v2f32)
@@ -2376,7 +2377,8 @@
   case Intrinsic::aarch64_neon_ld3lane:
     if (VT == MVT::v16i8 || VT == MVT::v8i8)
       return SelectLoadLane(Node, 3, AArch64::LD3i8);
-    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+             VT == MVT::v8f16)
       return SelectLoadLane(Node, 3, AArch64::LD3i16);
     else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
              VT == MVT::v2f32)
@@ -2388,7 +2390,8 @@
   case Intrinsic::aarch64_neon_ld4lane:
     if (VT == MVT::v16i8 || VT == MVT::v8i8)
       return SelectLoadLane(Node, 4, AArch64::LD4i8);
-    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+             VT == MVT::v8f16)
       return SelectLoadLane(Node, 4, AArch64::LD4i16);
     else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
              VT == MVT::v2f32)
@@ -2448,9 +2451,9 @@
       return SelectStore(Node, 2, AArch64::ST1Twov8b);
     else if (VT == MVT::v16i8)
       return SelectStore(Node, 2, AArch64::ST1Twov16b);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectStore(Node, 2, AArch64::ST1Twov4h);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectStore(Node, 2, AArch64::ST1Twov8h);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectStore(Node, 2, AArch64::ST1Twov2s);
@@ -2467,9 +2470,9 @@
       return SelectStore(Node, 3, AArch64::ST1Threev8b);
     else if (VT == MVT::v16i8)
       return SelectStore(Node, 3, AArch64::ST1Threev16b);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectStore(Node, 3, AArch64::ST1Threev4h);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectStore(Node, 3, AArch64::ST1Threev8h);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectStore(Node, 3, AArch64::ST1Threev2s);
@@ -2486,9 +2489,9 @@
       return SelectStore(Node, 4, AArch64::ST1Fourv8b);
     else if (VT == MVT::v16i8)
       return SelectStore(Node, 4, AArch64::ST1Fourv16b);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectStore(Node, 4, AArch64::ST1Fourv4h);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectStore(Node, 4, AArch64::ST1Fourv8h);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectStore(Node, 4, AArch64::ST1Fourv2s);
@@ -2505,9 +2508,9 @@
       return SelectStore(Node, 2, AArch64::ST2Twov8b);
     else if (VT == MVT::v16i8)
       return SelectStore(Node, 2, AArch64::ST2Twov16b);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectStore(Node, 2, AArch64::ST2Twov4h);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectStore(Node, 2, AArch64::ST2Twov8h);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectStore(Node, 2, AArch64::ST2Twov2s);
@@ -2524,9 +2527,9 @@
       return SelectStore(Node, 3, AArch64::ST3Threev8b);
     else if (VT == MVT::v16i8)
       return SelectStore(Node, 3, AArch64::ST3Threev16b);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectStore(Node, 3, AArch64::ST3Threev4h);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectStore(Node, 3, AArch64::ST3Threev8h);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectStore(Node, 3, AArch64::ST3Threev2s);
@@ -2543,9 +2546,9 @@
       return SelectStore(Node, 4, AArch64::ST4Fourv8b);
     else if (VT == MVT::v16i8)
       return SelectStore(Node, 4, AArch64::ST4Fourv16b);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectStore(Node, 4, AArch64::ST4Fourv4h);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectStore(Node, 4, AArch64::ST4Fourv8h);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectStore(Node, 4, AArch64::ST4Fourv2s);
@@ -2560,7 +2563,8 @@
   case Intrinsic::aarch64_neon_st2lane: {
     if (VT == MVT::v16i8 || VT == MVT::v8i8)
       return SelectStoreLane(Node, 2, AArch64::ST2i8);
-    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+             VT == MVT::v8f16)
       return SelectStoreLane(Node, 2, AArch64::ST2i16);
     else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
              VT == MVT::v2f32)
@@ -2573,7 +2577,8 @@
   case Intrinsic::aarch64_neon_st3lane: {
     if (VT == MVT::v16i8 || VT == MVT::v8i8)
       return SelectStoreLane(Node, 3, AArch64::ST3i8);
-    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+             VT == MVT::v8f16)
       return SelectStoreLane(Node, 3, AArch64::ST3i16);
     else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
              VT == MVT::v2f32)
@@ -2586,7 +2591,8 @@
   case Intrinsic::aarch64_neon_st4lane: {
     if (VT == MVT::v16i8 || VT == MVT::v8i8)
       return SelectStoreLane(Node, 4, AArch64::ST4i8);
-    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+             VT == MVT::v8f16)
       return SelectStoreLane(Node, 4, AArch64::ST4i16);
     else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
              VT == MVT::v2f32)
@@ -2603,9 +2609,9 @@
       return SelectPostLoad(Node, 2, AArch64::LD2Twov8b_POST, AArch64::dsub0);
     else if (VT == MVT::v16i8)
       return SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
      return SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectPostLoad(Node, 2, AArch64::LD2Twov2s_POST, AArch64::dsub0);
@@ -2622,9 +2628,9 @@
       return SelectPostLoad(Node, 3, AArch64::LD3Threev8b_POST, AArch64::dsub0);
     else if (VT == MVT::v16i8)
       return SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectPostLoad(Node, 3, AArch64::LD3Threev2s_POST, AArch64::dsub0);
@@ -2641,9 +2647,9 @@
       return SelectPostLoad(Node, 4, AArch64::LD4Fourv8b_POST, AArch64::dsub0);
     else if (VT == MVT::v16i8)
       return SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectPostLoad(Node, 4, AArch64::LD4Fourv2s_POST, AArch64::dsub0);
@@ -2660,9 +2666,9 @@
       return SelectPostLoad(Node, 2, AArch64::LD1Twov8b_POST, AArch64::dsub0);
     else if (VT == MVT::v16i8)
       return SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectPostLoad(Node, 2, AArch64::LD1Twov2s_POST, AArch64::dsub0);
@@ -2679,9 +2685,9 @@
       return SelectPostLoad(Node, 3, AArch64::LD1Threev8b_POST, AArch64::dsub0);
     else if (VT == MVT::v16i8)
       return SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectPostLoad(Node, 3, AArch64::LD1Threev2s_POST, AArch64::dsub0);
@@ -2698,9 +2704,9 @@
       return SelectPostLoad(Node, 4, AArch64::LD1Fourv8b_POST, AArch64::dsub0);
     else if (VT == MVT::v16i8)
       return SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectPostLoad(Node, 4, AArch64::LD1Fourv4h_POST, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectPostLoad(Node, 4, AArch64::LD1Fourv2s_POST, AArch64::dsub0);
@@ -2717,9 +2723,9 @@
       return SelectPostLoad(Node, 1, AArch64::LD1Rv8b_POST, AArch64::dsub0);
     else if (VT == MVT::v16i8)
       return SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectPostLoad(Node, 1, AArch64::LD1Rv2s_POST, AArch64::dsub0);
@@ -2736,9 +2742,9 @@
       return SelectPostLoad(Node, 2, AArch64::LD2Rv8b_POST, AArch64::dsub0);
     else if (VT == MVT::v16i8)
       return SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectPostLoad(Node, 2, AArch64::LD2Rv2s_POST, AArch64::dsub0);
@@ -2755,9 +2761,9 @@
       return SelectPostLoad(Node, 3, AArch64::LD3Rv8b_POST, AArch64::dsub0);
     else if (VT == MVT::v16i8)
       return SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectPostLoad(Node, 3, AArch64::LD3Rv2s_POST, AArch64::dsub0);
@@ -2774,9 +2780,9 @@
       return SelectPostLoad(Node, 4, AArch64::LD4Rv8b_POST, AArch64::dsub0);
     else if (VT == MVT::v16i8)
       return SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectPostLoad(Node, 4, AArch64::LD4Rv2s_POST, AArch64::dsub0);
@@ -2791,7 +2797,8 @@
   case AArch64ISD::LD1LANEpost: {
     if (VT == MVT::v16i8 || VT == MVT::v8i8)
       return SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST);
-    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+             VT == MVT::v8f16)
       return SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST);
     else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
              VT == MVT::v2f32)
@@ -2804,7 +2811,8 @@
   case AArch64ISD::LD2LANEpost: {
     if (VT == MVT::v16i8 || VT == MVT::v8i8)
       return SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST);
-    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+             VT == MVT::v8f16)
       return SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST);
     else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
              VT == MVT::v2f32)
@@ -2817,7 +2825,8 @@
   case AArch64ISD::LD3LANEpost: {
     if (VT == MVT::v16i8 || VT == MVT::v8i8)
       return SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST);
-    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+             VT == MVT::v8f16)
       return SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST);
     else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
              VT == MVT::v2f32)
@@ -2830,7 +2839,8 @@
   case AArch64ISD::LD4LANEpost: {
     if (VT == MVT::v16i8 || VT == MVT::v8i8)
       return SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST);
-    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+             VT == MVT::v8f16)
       return SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST);
     else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
              VT == MVT::v2f32)
@@ -2846,9 +2856,9 @@
       return SelectPostStore(Node, 2, AArch64::ST2Twov8b_POST);
     else if (VT == MVT::v16i8)
       return SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectPostStore(Node, 2, AArch64::ST2Twov2s_POST);
@@ -2866,9 +2876,9 @@
       return SelectPostStore(Node, 3, AArch64::ST3Threev8b_POST);
     else if (VT == MVT::v16i8)
       return SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectPostStore(Node, 3, AArch64::ST3Threev2s_POST);
@@ -2886,9 +2896,9 @@
       return SelectPostStore(Node, 4, AArch64::ST4Fourv8b_POST);
     else if (VT == MVT::v16i8)
       return SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectPostStore(Node, 4, AArch64::ST4Fourv2s_POST);
@@ -2906,9 +2916,9 @@
       return SelectPostStore(Node, 2, AArch64::ST1Twov8b_POST);
     else if (VT == MVT::v16i8)
       return SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectPostStore(Node, 2, AArch64::ST1Twov2s_POST);
@@ -2926,9 +2936,9 @@
       return SelectPostStore(Node, 3, AArch64::ST1Threev8b_POST);
     else if (VT == MVT::v16i8)
       return SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectPostStore(Node, 3, AArch64::ST1Threev2s_POST);
@@ -2946,9 +2956,9 @@
       return SelectPostStore(Node, 4, AArch64::ST1Fourv8b_POST);
     else if (VT == MVT::v16i8)
       return SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectPostStore(Node, 4, AArch64::ST1Fourv2s_POST);
@@ -2964,7 +2974,8 @@
     VT = Node->getOperand(1).getValueType();
     if (VT == MVT::v16i8 || VT == MVT::v8i8)
       return SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST);
-    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+             VT == MVT::v8f16)
       return SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST);
     else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
              VT == MVT::v2f32)
@@ -2978,7 +2989,8 @@
     VT = Node->getOperand(1).getValueType();
     if (VT == MVT::v16i8 || VT == MVT::v8i8)
       return SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST);
-    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+             VT == MVT::v8f16)
       return SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST);
     else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
              VT == MVT::v2f32)
@@ -2992,7 +3004,8 @@
     VT = Node->getOperand(1).getValueType();
     if (VT == MVT::v16i8 || VT == MVT::v8i8)
       return SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST);
-    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+             VT == MVT::v8f16)
       return SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST);
     else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
              VT == MVT::v2f32)
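Every selector change above is the same substitution: the structured NEON loads and stores care only about lane size, so the f16 vector types reuse the .4h/.8h instruction forms already selected for v4i16/v8i16. For instance (illustrative ACLE intrinsic use; vld2_f16 lowers to the aarch64_neon_ld2 intrinsic handled above):

    #include <arm_neon.h>

    // Expected to select LD2 { v0.4h, v1.4h }, [x0] once v4f16 is handled.
    float16x4x2_t load_deinterleaved(const float16_t *p) {
      return vld2_f16(p);
    }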
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -106,6 +106,7 @@
     addDRTypeForNEON(MVT::v2i32);
     addDRTypeForNEON(MVT::v1i64);
     addDRTypeForNEON(MVT::v1f64);
+    addDRTypeForNEON(MVT::v4f16);
 
     addQRTypeForNEON(MVT::v4f32);
     addQRTypeForNEON(MVT::v2f64);
@@ -113,6 +114,7 @@
     addQRTypeForNEON(MVT::v8i16);
     addQRTypeForNEON(MVT::v4i32);
     addQRTypeForNEON(MVT::v2i64);
+    addQRTypeForNEON(MVT::v8f16);
   }
 
   // Compute derived properties from the register classes
@@ -278,6 +280,138 @@
   setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
   setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 
+  // f16 is storage-only, so all operations get promoted to f32
+  setOperationAction(ISD::FABS, MVT::f16, Promote);
+  setOperationAction(ISD::FADD, MVT::f16, Promote);
+  setOperationAction(ISD::FCEIL, MVT::f16, Promote);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
+  setOperationAction(ISD::FCOS, MVT::f16, Promote);
+  setOperationAction(ISD::FDIV, MVT::f16, Promote);
+  setOperationAction(ISD::FFLOOR, MVT::f16, Promote);
+  setOperationAction(ISD::FMA, MVT::f16, Promote);
+  setOperationAction(ISD::FMUL, MVT::f16, Promote);
+  setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote);
+  setOperationAction(ISD::FNEG, MVT::f16, Promote);
+  setOperationAction(ISD::FPOW, MVT::f16, Promote);
+  setOperationAction(ISD::FPOWI, MVT::f16, Promote);
+  setOperationAction(ISD::FREM, MVT::f16, Promote);
+  setOperationAction(ISD::FROUND, MVT::f16, Promote);
+  setOperationAction(ISD::FRINT, MVT::f16, Promote);
+  setOperationAction(ISD::FSIN, MVT::f16, Promote);
+  setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
+  setOperationAction(ISD::FSQRT, MVT::f16, Promote);
+  setOperationAction(ISD::FSUB, MVT::f16, Promote);
+  setOperationAction(ISD::FTRUNC, MVT::f16, Promote);
+  setOperationAction(ISD::SETCC, MVT::f16, Promote);
+  setOperationAction(ISD::BR_CC, MVT::f16, Promote);
+  setOperationAction(ISD::SELECT, MVT::f16, Promote);
+  setOperationAction(ISD::SELECT_CC, MVT::f16, Promote);
+  setOperationAction(ISD::FP_EXTEND, MVT::f16, Promote);
+  setOperationAction(ISD::FEXP, MVT::f16, Promote);
+  setOperationAction(ISD::FEXP2, MVT::f16, Promote);
+  setOperationAction(ISD::FLOG, MVT::f16, Promote);
+  setOperationAction(ISD::FLOG2, MVT::f16, Promote);
+  setOperationAction(ISD::FLOG10, MVT::f16, Promote);
+
+  // v4f16 is also a storage-only type, so promote it to v4f32
+  setOperationAction(ISD::FABS, MVT::v4f16, Promote);
+  setOperationAction(ISD::FADD, MVT::v4f16, Promote);
+  setOperationAction(ISD::FCEIL, MVT::v4f16, Promote);
+  setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Promote);
+  setOperationAction(ISD::FCOS, MVT::v4f16, Promote);
+  setOperationAction(ISD::FDIV, MVT::v4f16, Promote);
+  setOperationAction(ISD::FFLOOR, MVT::v4f16, Promote);
+  setOperationAction(ISD::FMA, MVT::v4f16, Promote);
+  setOperationAction(ISD::FMUL, MVT::v4f16, Promote);
+  setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Promote);
+  setOperationAction(ISD::FNEG, MVT::v4f16, Promote);
+  setOperationAction(ISD::FPOW, MVT::v4f16, Promote);
+  setOperationAction(ISD::FPOWI, MVT::v4f16, Promote);
+  setOperationAction(ISD::FREM, MVT::v4f16, Promote);
+  setOperationAction(ISD::FROUND, MVT::v4f16, Promote);
+  setOperationAction(ISD::FRINT, MVT::v4f16, Promote);
+  setOperationAction(ISD::FSIN, MVT::v4f16, Promote);
+  setOperationAction(ISD::FSINCOS, MVT::v4f16, Promote);
+  setOperationAction(ISD::FSQRT, MVT::v4f16, Promote);
+  setOperationAction(ISD::FSUB, MVT::v4f16, Promote);
+  setOperationAction(ISD::FTRUNC, MVT::v4f16, Promote);
+  setOperationAction(ISD::SETCC, MVT::v4f16, Promote);
+  setOperationAction(ISD::BR_CC, MVT::v4f16, Promote);
+  setOperationAction(ISD::SELECT, MVT::v4f16, Promote);
+  setOperationAction(ISD::SELECT_CC, MVT::v4f16, Promote);
+  setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote);
+  setOperationAction(ISD::FEXP, MVT::v4f16, Promote);
+  setOperationAction(ISD::FEXP2, MVT::v4f16, Promote);
+  setOperationAction(ISD::FLOG, MVT::v4f16, Promote);
+  setOperationAction(ISD::FLOG2, MVT::v4f16, Promote);
+  setOperationAction(ISD::FLOG10, MVT::v4f16, Promote);
+
+  AddPromotedToType(ISD::FABS, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FCEIL, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FCOPYSIGN, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FCOS, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FFLOOR, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FMA, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FNEARBYINT, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FNEG, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FPOW, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FPOWI, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FREM, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FROUND, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FRINT, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FSIN, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FSINCOS, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FSQRT, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FTRUNC, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::SETCC, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::BR_CC, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::SELECT, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::SELECT_CC, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FEXP, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FEXP2, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FLOG, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FLOG2, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FLOG10, MVT::v4f16, MVT::v4f32);
+
+  // v8f16 is also a storage-only type, so expand it
+  setOperationAction(ISD::FABS, MVT::v8f16, Expand);
+  setOperationAction(ISD::FADD, MVT::v8f16, Expand);
+  setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
+  setOperationAction(ISD::FCOS, MVT::v8f16, Expand);
+  setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
+  setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
+  setOperationAction(ISD::FMA, MVT::v8f16, Expand);
+  setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
+  setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
+  setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
+  setOperationAction(ISD::FPOW, MVT::v8f16, Expand);
+  setOperationAction(ISD::FPOWI, MVT::v8f16, Expand);
+  setOperationAction(ISD::FREM, MVT::v8f16, Expand);
+  setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
+  setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
+  setOperationAction(ISD::FSIN, MVT::v8f16, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand);
+  setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
+  setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
+  setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
+  setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
+  setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
+  setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
+  setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
+  setOperationAction(ISD::FEXP, MVT::v8f16, Expand);
+  setOperationAction(ISD::FEXP2, MVT::v8f16, Expand);
+  setOperationAction(ISD::FLOG, MVT::v8f16, Expand);
+  setOperationAction(ISD::FLOG2, MVT::v8f16, Expand);
+  setOperationAction(ISD::FLOG10, MVT::v8f16, Expand);
+
   // AArch64 has implementations of a lot of rounding-like FP operations.
   static MVT RoundingTypes[] = { MVT::f32, MVT::f64};
   for (unsigned I = 0; I < array_lengthof(RoundingTypes); ++I) {
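The three blocks above are deliberately exhaustive, but the policy is compact: scalar f16 and v4f16 are Promoted (one widening FCVT each way, with v4f16 mapped onto v4f32), while v8f16 is Expanded, because its promoted form would be v8f32, which does not fit in a single NEON register. The same registrations could be expressed as a loop; a condensed sketch (inside the AArch64TargetLowering constructor; opcode list shortened for illustration):

    static const unsigned FPOpcodes[] = {ISD::FADD, ISD::FSUB, ISD::FMUL,
                                         ISD::FDIV, ISD::FSQRT};
    for (unsigned Opc : FPOpcodes) {
      setOperationAction(Opc, MVT::f16, Promote);   // f16 computed as f32
      setOperationAction(Opc, MVT::v4f16, Promote); // v4f16 computed as v4f32
      AddPromotedToType(Opc, MVT::v4f16, MVT::v4f32);
      setOperationAction(Opc, MVT::v8f16, Expand);  // no v8f32 register type
    }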
@@ -535,8 +669,12 @@
     setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand);
     setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand);
 
+    // NEON has no direct f16 <-> i* conversions, so all of these need custom
+    // lowering.
     setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom);
     setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom);
+    setOperationAction(ISD::SINT_TO_FP, VT.getSimpleVT(), Custom);
+    setOperationAction(ISD::UINT_TO_FP, VT.getSimpleVT(), Custom);
 
     if (Subtarget->isLittleEndian()) {
       for (unsigned im = (unsigned)ISD::PRE_INC;
@@ -1390,9 +1528,9 @@
   // in the cost tables.
   EVT InVT = Op.getOperand(0).getValueType();
   EVT VT = Op.getValueType();
+  SDLoc dl(Op);
 
   if (VT.getSizeInBits() < InVT.getSizeInBits()) {
-    SDLoc dl(Op);
     SDValue Cv =
         DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
                     Op.getOperand(0));
@@ -1400,8 +1538,16 @@
 
   if (VT.getSizeInBits() > InVT.getSizeInBits()) {
-    SDLoc dl(Op);
-    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v2f64, Op.getOperand(0));
+    MVT ExtVT =
+        MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
+                         VT.getVectorNumElements());
+    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
+    return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
+  }
+
+  if (32 > InVT.getScalarSizeInBits()) {
+    MVT ExtVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements());
+    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
     return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
   }
 
@@ -1414,6 +1560,13 @@
   if (Op.getOperand(0).getValueType().isVector())
     return LowerVectorFP_TO_INT(Op, DAG);
 
+  // f16 -> i* must go via f32
+  if (Op.getOperand(0).getValueType() == MVT::f16) {
+    SDLoc dl(Op);
+    SDValue Tmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0));
+    return DAG.getNode(Op->getOpcode(), dl, Op->getValueType(0), Tmp);
+  }
+
   if (Op.getOperand(0).getValueType() != MVT::f128) {
     // It's legal except when f128 is involved
     return Op;
@@ -1450,6 +1603,12 @@
     return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0));
   }
 
+  if (VT.getScalarSizeInBits() < 32) {
+    MVT CastVT = MVT::getVectorVT(MVT::f32, InVT.getVectorNumElements());
+    In = DAG.getNode(Op.getOpcode(), dl, CastVT, In);
+    return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0));
+  }
+
   if (VT.getSizeInBits() > InVT.getSizeInBits()) {
     unsigned CastOpc =
         Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
@@ -1470,6 +1629,13 @@
   if (Op.getOperand(0).getValueType() == MVT::i128)
     return SDValue();
 
+  // i* -> f16 must go via f32
+  if (Op.getValueType() == MVT::f16) {
+    SDLoc dl(Op);
+    Op = DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0));
+    return DAG.getNode(ISD::FP_ROUND, dl, MVT::f16, Op, DAG.getIntPtrConstant(0));
+  }
+
   // Other conversions are legal, unless it's to the completely software-based
   // fp128.
   if (Op.getValueType() != MVT::f128)
@@ -3247,7 +3413,7 @@
   // Handle f128 first, because it will result in a comparison of some RTLIB
   // call result against zero.
   if (LHS.getValueType() == MVT::f128) {
-    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
+    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
 
     // If softenSetCCOperands returned a scalar, we need to compare the result
     // against zero to select between true and false values.
@@ -3355,6 +3521,12 @@
     return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
   }
 
+  // Extend f16 -> f32 before comparing
+  if (LHS.getValueType() == MVT::f16) {
+    LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
+    RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
+  }
+
   // Now we know we're dealing with FP values.
   assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
   assert(LHS.getValueType() == RHS.getValueType());
@@ -4659,7 +4831,8 @@
       VT.getVectorElementType() == MVT::f32)
     return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
   // vrev <4 x i16> -> REV32
-  if (VT.getVectorElementType() == MVT::i16)
+  if (VT.getVectorElementType() == MVT::i16 ||
+      VT.getVectorElementType() == MVT::f16)
     return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
   // vrev <4 x i8> -> REV16
   assert(VT.getVectorElementType() == MVT::i8);
@@ -4779,7 +4952,7 @@
 static unsigned getDUPLANEOp(EVT EltType) {
   if (EltType == MVT::i8)
     return AArch64ISD::DUPLANE8;
-  if (EltType == MVT::i16)
+  if (EltType == MVT::i16 || EltType == MVT::f16)
     return AArch64ISD::DUPLANE16;
   if (EltType == MVT::i32 || EltType == MVT::f32)
     return AArch64ISD::DUPLANE32;
@@ -4909,7 +5082,8 @@
   SDValue SrcLaneV = DAG.getConstant(SrcLane, MVT::i64);
 
   EVT ScalarVT = VT.getVectorElementType();
-  if (ScalarVT.getSizeInBits() < 32)
+
+  if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger())
     ScalarVT = MVT::i32;
 
   return DAG.getNode(
@@ -5668,11 +5842,12 @@
   // Insertion/extraction are legal for V128 types.
   if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
-      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
+      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
+      VT == MVT::v8f16)
     return Op;
 
   if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
-      VT != MVT::v1i64 && VT != MVT::v2f32)
+      VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
     return SDValue();
 
   // For V64 types, we perform insertion by expanding the value
@@ -5901,6 +6076,15 @@
 static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
                                     AArch64CC::CondCode CC, bool NoNans, EVT VT,
                                     SDLoc dl, SelectionDAG &DAG) {
+  if (LHS.getValueType().getVectorElementType() == MVT::f16) {
+    assert(LHS.getValueType() == RHS.getValueType());
+    MVT ExtVT =
+        MVT::getVectorVT(MVT::f32, LHS.getValueType().getVectorNumElements());
+    LHS = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, LHS);
+    RHS = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, RHS);
+    VT = MVT::getVectorVT(MVT::i32, LHS.getValueType().getVectorNumElements());
+  }
+
   EVT SrcVT = LHS.getValueType();
 
   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
@@ -6006,7 +6190,8 @@
   }
 
   assert(LHS.getValueType().getVectorElementType() == MVT::f32 ||
-         LHS.getValueType().getVectorElementType() == MVT::f64);
+         LHS.getValueType().getVectorElementType() == MVT::f64 ||
+         LHS.getValueType().getVectorElementType() == MVT::f16);
 
   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
   // clean. Some of them require two branches to implement.
@@ -8067,6 +8252,7 @@
   SDLoc DL(N);
   SDValue Op = N->getOperand(0);
 
+  assert(Op.getValueType() == MVT::f16 && "Inconsistent bitcast? Only 16-bit types should be i16 or f16");
   Op = SDValue(
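A correctness note on the f32 detours in this file: every binary16 value is exactly representable in binary32, so the FP_EXTEND inserted before compares, selects and fp-to-int conversions can never change the outcome; only the int-to-f16 direction adds a rounding step, which the trailing FP_ROUND performs. The exactness claim is easy to check exhaustively on a host compiler that provides the storage-only __fp16 type (AArch64 Clang does):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
      unsigned mismatches = 0;
      for (uint32_t u = 0; u < 0x10000; ++u) {
        uint16_t bits = (uint16_t)u;
        __fp16 h;
        std::memcpy(&h, &bits, sizeof h);
        float f = (float)h;      // the FP_EXTEND direction
        __fp16 back = (__fp16)f; // and back again
        uint16_t out;
        std::memcpy(&out, &back, sizeof out);
        bool isNaN = (u & 0x7C00) == 0x7C00 && (u & 0x03FF) != 0;
        if (!isNaN && out != bits) // NaN payloads excluded from the check
          ++mismatches;
      }
      std::printf("mismatches: %u\n", mismatches); // expect 0
      return mismatches != 0;
    }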
Index: lib/Target/AArch64/AArch64InstrFormats.td
===================================================================
--- lib/Target/AArch64/AArch64InstrFormats.td
+++ lib/Target/AArch64/AArch64InstrFormats.td
@@ -5262,6 +5262,10 @@
   def v2i64 : BaseSIMDZipVector<0b111, opc, V128, asm, ".2d", OpNode, v2i64>;
 
+  def : Pat<(v4f16 (OpNode V64:$Rn, V64:$Rm)),
+            (!cast<Instruction>(NAME#"v4i16") V64:$Rn, V64:$Rm)>;
+  def : Pat<(v8f16 (OpNode V128:$Rn, V128:$Rm)),
+            (!cast<Instruction>(NAME#"v8i16") V128:$Rn, V128:$Rm)>;
   def : Pat<(v2f32 (OpNode V64:$Rn, V64:$Rm)),
             (!cast<Instruction>(NAME#"v2i32") V64:$Rn, V64:$Rm)>;
   def : Pat<(v4f32 (OpNode V128:$Rn, V128:$Rm)),
             (!cast<Instruction>(NAME#"v4i32") V128:$Rn, V128:$Rm)>;
Index: lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.td
+++ lib/Target/AArch64/AArch64InstrInfo.td
@@ -1356,6 +1356,8 @@
             (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
   def : Pat<(v2i32 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
             (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+  def : Pat<(v4f16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+            (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
 }
 def : Pat<(v1f64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
           (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
@@ -1377,6 +1379,8 @@
             (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
   def : Pat<(v2i64 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
             (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(v8f16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+            (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
 }
 def : Pat<(f128 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
           (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
@@ -1513,6 +1517,8 @@
             (LDURDi GPR64sp:$Rn, simm9:$offset)>;
   def : Pat<(v8i8 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
             (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v4f16 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+            (LDURDi GPR64sp:$Rn, simm9:$offset)>;
 }
 def : Pat<(v1f64 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
           (LDURDi GPR64sp:$Rn, simm9:$offset)>;
@@ -1533,6 +1539,8 @@
             (LDURQi GPR64sp:$Rn, simm9:$offset)>;
   def : Pat<(v16i8 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
             (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v8f16 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+            (LDURQi GPR64sp:$Rn, simm9:$offset)>;
 }
 
 // anyext -> zext
@@ -1829,6 +1837,7 @@
   defm : VecROStorePat;
   defm : VecROStorePat;
   defm : VecROStorePat;
+  defm : VecROStorePat<ro64, v4f16, FPR64, STRDroW, STRDroX>;
 }
 
 defm : VecROStorePat;
@@ -1843,6 +1852,7 @@
   defm : VecROStorePat;
   defm : VecROStorePat;
   defm : VecROStorePat;
+  defm : VecROStorePat<ro128, v8f16, FPR128, STRQroW, STRQroX>;
 }
 } // AddedComplexity = 10
@@ -1893,6 +1903,9 @@
   def : Pat<(store (v2i32 FPR64:$Rt),
                    (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
             (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+  def : Pat<(store (v4f16 FPR64:$Rt),
+                   (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+            (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
 }
 def : Pat<(store (v1f64 FPR64:$Rt),
                  (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
@@ -1922,6 +1935,9 @@
   def : Pat<(store (v2i64 FPR128:$Rt),
                    (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
             (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(store (v8f16 FPR128:$Rt),
+                   (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+            (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
 }
 def : Pat<(store (f128 FPR128:$Rt),
                  (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
@@ -1984,6 +2000,9 @@
   def : Pat<(store (v2i32 FPR64:$Rt),
                    (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
             (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v4f16 FPR64:$Rt),
+                   (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+            (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
 }
 def : Pat<(store (v1f64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
           (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
@@ -2014,6 +2033,9 @@
   def : Pat<(store (v2f64 FPR128:$Rt),
                    (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
             (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v8f16 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
 }
 
 // unscaled i64 truncating stores
@@ -2090,6 +2112,8 @@
           (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
 def : Pat<(pre_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
           (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v4f16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
 
 def : Pat<(pre_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
           (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
@@ -2103,6 +2127,8 @@
           (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
 def : Pat<(pre_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
           (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v8f16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
 
 //---
 // (immediate post-indexed)
@@ -2140,6 +2166,8 @@
           (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
 def : Pat<(post_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
           (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v4f16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
 
 def : Pat<(post_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
           (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
@@ -2153,6 +2181,8 @@
           (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
 def : Pat<(post_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
           (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v8f16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
 
 //===----------------------------------------------------------------------===//
 // Load/store exclusive instructions.
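The load/store patterns are the storage half of "storage-only": a half vector in memory is just 8 or 16 bytes, so the existing FPR64/FPR128 forms (LDR/STR, the unscaled LDUR/STUR, and the pre/post-indexed variants) apply unchanged. A quick way to see them fire (illustrative; exact codegen may vary):

    #include <arm_neon.h>

    // With the v4f16 patterns above this copy should become
    //   ldr d0, [x1]
    //   str d0, [x0]
    // rather than bouncing through integer registers.
    void copy4h(float16_t *dst, const float16_t *src) {
      vst1_f16(dst, vld1_f16(src));
    }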
@@ -2413,6 +2443,11 @@ (i64 2))))), (FCVTLv4i32 V128:$Rn)>; +def : Pat<(v4f32 (fextend (v4f16 V64:$Rn))), (FCVTLv4i16 V64:$Rn)>; +def : Pat<(v4f32 (fextend (v4f16 (extract_subvector (v8f16 V128:$Rn), + (i64 4))))), + (FCVTLv8i16 V128:$Rn)>; + defm FCVTMS : SIMDTwoVectorFPToInt<0,0,0b11011, "fcvtms",int_aarch64_neon_fcvtms>; defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_aarch64_neon_fcvtmu>; defm FCVTNS : SIMDTwoVectorFPToInt<0,0,0b11010, "fcvtns",int_aarch64_neon_fcvtns>; @@ -2424,6 +2459,7 @@ (v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn)))), (FCVTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; def : Pat<(v2f32 (fround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>; +def : Pat<(v4f16 (fround (v4f32 V128:$Rn))), (FCVTNv4i16 V128:$Rn)>; def : Pat<(concat_vectors V64:$Rd, (v2f32 (fround (v2f64 V128:$Rn)))), (FCVTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_aarch64_neon_fcvtps>; @@ -2506,6 +2542,10 @@ defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_aarch64_neon_usqadd>; defm XTN : SIMDMixedTwoVector<0, 0b10010, "xtn", trunc>; +def : Pat<(v4f16 (AArch64rev32 V64:$Rn)), (REV32v4i16 V64:$Rn)>; +def : Pat<(v4f16 (AArch64rev64 V64:$Rn)), (REV64v4i16 V64:$Rn)>; +def : Pat<(v8f16 (AArch64rev32 V128:$Rn)), (REV32v8i16 V128:$Rn)>; +def : Pat<(v8f16 (AArch64rev64 V128:$Rn)), (REV64v8i16 V128:$Rn)>; def : Pat<(v2f32 (AArch64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>; def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>; @@ -3184,6 +3224,10 @@ (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; def : Pat<(v2f64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; +def : Pat<(v4f16 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))), + (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>; +def : Pat<(v8f16 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), + (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; // We use EXT to handle extract_subvector to copy the upper 64-bits of a // 128-bit vector. 
@@ -3195,6 +3239,8 @@ (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 1))), (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; +def : Pat<(v4f16 (extract_subvector V128:$Rn, (i64 4))), + (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 2))), (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 1))), @@ -3307,6 +3353,19 @@ (v2f64 (DUPv2i64lane (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rn, dsub), (i64 0)))>; +def : Pat<(v4f16 (AArch64dup (f16 FPR16:$Rn))), + (v4f16 (DUPv4i16lane + (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub), + (i64 0)))>; +def : Pat<(v8f16 (AArch64dup (f16 FPR16:$Rn))), + (v8f16 (DUPv8i16lane + (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub), + (i64 0)))>; + +def : Pat<(v4f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)), + (DUPv4i16lane V128:$Rn, VectorIndexH:$imm)>; +def : Pat<(v8f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)), + (DUPv8i16lane V128:$Rn, VectorIndexH:$imm)>; def : Pat<(v2f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)), (DUPv2i32lane V128:$Rn, VectorIndexS:$imm)>; @@ -3428,6 +3487,23 @@ def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))), (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>; +def : Pat<(v4f16 (vector_insert (v4f16 V64:$Rn), + (f16 FPR16:$Rm), (i64 VectorIndexS:$imm))), + (EXTRACT_SUBREG + (INSvi16lane + (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), V64:$Rn, dsub)), + VectorIndexS:$imm, + (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)), + (i64 0)), + dsub)>; + +def : Pat<(v8f16 (vector_insert (v8f16 V128:$Rn), + (f16 FPR16:$Rm), (i64 VectorIndexH:$imm))), + (INSvi16lane + V128:$Rn, VectorIndexH:$imm, + (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)), + (i64 0))>; + def : Pat<(v2f32 (vector_insert (v2f32 V64:$Rn), (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))), (EXTRACT_SUBREG @@ -3508,6 +3584,7 @@ dsub)>; } +defm : Neon_INS_elt_pattern<v8f16, v4f16, f16, INSvi16lane>; defm : Neon_INS_elt_pattern; defm : Neon_INS_elt_pattern; defm : Neon_INS_elt_pattern; @@ -3547,6 +3624,7 @@ def : ConcatPat<v4i32, v2i32>; def : ConcatPat<v4f32, v2f32>; def : ConcatPat<v8i16, v4i16>; +def : ConcatPat<v8f16, v4f16>; def : ConcatPat<v16i8, v8i8>; // If the high lanes are undef, though, we can just ignore them: @@ -4861,6 +4939,7 @@ def : Pat<(v8i8 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v4f16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))), @@ -4869,6 +4948,8 @@ (COPY_TO_REGCLASS V64:$Vn, GPR64)>; def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))), (COPY_TO_REGCLASS V64:$Vn, GPR64)>; def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))), (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))), (COPY_TO_REGCLASS V64:$Vn, GPR64)>; def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))), @@ -4881,6 +4962,8 @@ (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; +def : Pat<(v4f16 (bitconvert GPR64:$Xn)), + (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; @@ -4890,6 +4973,8 @@
(REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))), (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; +def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))), + (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))), (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; } @@ -4918,6 +5003,7 @@ def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>; +def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>; } let Predicates = [IsBE] in { @@ -4927,6 +5013,8 @@ (v1i64 (REV64v4i16 FPR64:$src))>; def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), (v1i64 (REV64v8i8 FPR64:$src))>; +def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))), + (v1i64 (REV64v4i16 FPR64:$src))>; def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 (REV64v2i32 FPR64:$src))>; } @@ -4939,6 +5027,7 @@ def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))), (v2i32 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))), @@ -4951,6 +5040,8 @@ (v2i32 (REV64v2i32 FPR64:$src))>; def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 (REV64v2i32 FPR64:$src))>; +def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))), + (v2i32 (REV32v4i16 FPR64:$src))>; } def : Pat<(v2i32 (bitconvert (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>; @@ -4959,6 +5050,7 @@ def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>; def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>; def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v4f16 FPR64:$src))), (v4i16 FPR64:$src)>; def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>; def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), (v4i16 FPR64:$src)>; } @@ -4971,6 +5063,8 @@ (v4i16 (REV16v8i8 FPR64:$src))>; def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), (v4i16 (REV64v4i16 FPR64:$src))>; +def : Pat<(v4i16 (bitconvert (v4f16 FPR64:$src))), + (v4i16 FPR64:$src)>; def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), (v4i16 (REV32v4i16 FPR64:$src))>; def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), @@ -4978,12 +5072,41 @@ } let Predicates = [IsLE] in { +def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4f16 (bitconvert (v2i32 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4f16 (bitconvert (v8i8 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4f16 (bitconvert (f64 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))), (v4f16 FPR64:$src)>; +} +let Predicates = [IsBE] in { +// For casts between types with the same element size (v4f16 <-> v4i16) the +// in-register lane layout is unchanged on big-endian, so no REV is needed; +// for 32-bit <-> 16-bit elements the correct swap is REV32 over .4h lanes. +def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))), + (v4f16 (REV64v4i16 FPR64:$src))>; +def : Pat<(v4f16 (bitconvert (v2i32 FPR64:$src))), + (v4f16 (REV32v4i16 FPR64:$src))>; +def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), + (v4f16 FPR64:$src)>; +def : Pat<(v4f16 (bitconvert (v8i8 FPR64:$src))), + (v4f16 (REV16v8i8 FPR64:$src))>; +def : Pat<(v4f16 (bitconvert
(f64 FPR64:$src))), + (v4f16 (REV64v4i16 FPR64:$src))>; +def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))), + (v4f16 (REV32v4i16 FPR64:$src))>; +def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))), + (v4f16 (REV64v4i16 FPR64:$src))>; +} + +let Predicates = [IsLE] in { def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v8i8 (bitconvert (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v8i8 (bitconvert (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v4f16 FPR64:$src))), (v8i8 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), @@ -4998,6 +5121,8 @@ (v8i8 (REV32v8i8 FPR64:$src))>; def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), (v8i8 (REV64v8i8 FPR64:$src))>; +def : Pat<(v8i8 (bitconvert (v4f16 FPR64:$src))), + (v8i8 (REV16v8i8 FPR64:$src))>; } let Predicates = [IsLE] in { @@ -5005,6 +5130,7 @@ def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))), (f64 FPR64:$src)>; def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), (f64 FPR64:$src)>; def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), (f64 FPR64:$src)>; +def : Pat<(f64 (bitconvert (v4f16 FPR64:$src))), (f64 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))), @@ -5015,6 +5141,8 @@ (f64 (REV64v2i32 FPR64:$src))>; def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), (f64 (REV64v8i8 FPR64:$src))>; +def : Pat<(f64 (bitconvert (v4f16 FPR64:$src))), + (f64 (REV64v4i16 FPR64:$src))>; } def : Pat<(f64 (bitconvert (v1i64 FPR64:$src))), (f64 FPR64:$src)>; def : Pat<(f64 (bitconvert (v1f64 FPR64:$src))), (f64 FPR64:$src)>; @@ -5024,6 +5152,7 @@ def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), (v1f64 FPR64:$src)>; def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))), (v1f64 FPR64:$src)>; def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>; +def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))), (v1f64 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), @@ -5034,6 +5163,8 @@ (v1f64 (REV64v8i8 FPR64:$src))>; def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 (REV64v2i32 FPR64:$src))>; +def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))), + (v1f64 (REV64v4i16 FPR64:$src))>; } def : Pat<(v1f64 (bitconvert (v1i64 FPR64:$src))), (v1f64 FPR64:$src)>; def : Pat<(v1f64 (bitconvert (f64 FPR64:$src))), (v1f64 FPR64:$src)>; @@ -5044,6 +5175,7 @@ def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>; def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), (v2f32 FPR64:$src)>; def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 FPR64:$src)>; +def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))), (v2f32 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))), @@ -5056,6 +5188,8 @@ (v2f32 (REV64v2i32 FPR64:$src))>; def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 (REV64v2i32 FPR64:$src))>; +def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))), + (v2f32 (REV32v4i16 FPR64:$src))>; } def : Pat<(v2f32 (bitconvert (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>; @@ -5065,6 +5199,7 @@ def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), (f128 FPR128:$src)>; def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 FPR128:$src)>; def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), (f128 FPR128:$src)>; +def : Pat<(f128 (bitconvert (v8f16
FPR128:$src))), (f128 FPR128:$src)>; def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), (f128 FPR128:$src)>; } let Predicates = [IsBE] in { @@ -5136,6 +5271,7 @@ def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))), (v2i64 FPR128:$src)>; } let Predicates = [IsBE] in { def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), @@ -5149,6 +5285,8 @@ (v2i64 (REV64v16i8 FPR128:$src))>; def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 (REV64v4i32 FPR128:$src))>; +def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))), + (v2i64 (REV64v8i16 FPR128:$src))>; } def : Pat<(v2i64 (bitconvert (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>; @@ -5158,6 +5296,7 @@ def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))), (v4i32 FPR128:$src)>; } let Predicates = [IsBE] in { def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), @@ -5172,6 +5311,8 @@ (v4i32 (REV32v16i8 FPR128:$src))>; def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 (REV64v4i32 FPR128:$src))>; +def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))), + (v4i32 (REV32v8i16 FPR128:$src))>; } def : Pat<(v4i32 (bitconvert (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>; @@ -5182,6 +5323,7 @@ def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v8f16 FPR128:$src))), (v8i16 FPR128:$src)>; } let Predicates = [IsBE] in { def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), @@ -5198,6 +5340,36 @@ (v8i16 (REV64v8i16 FPR128:$src))>; def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), (v8i16 (REV32v8i16 FPR128:$src))>; +// v8i16 <-> v8f16 casts have identical lane layout, so they are no-ops on +// big-endian as well. +def : Pat<(v8i16 (bitconvert (v8f16 FPR128:$src))), + (v8i16 FPR128:$src)>; +} + +let Predicates = [IsLE] in { +def : Pat<(v8f16 (bitconvert (f128 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8f16 (bitconvert (v2i64 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8f16 (bitconvert (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8f16 (bitconvert (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v8f16 (bitconvert (f128 FPR128:$src))), + (v8f16 (EXTv16i8 (REV64v8i16 FPR128:$src), + (REV64v8i16 FPR128:$src), + (i32 8)))>; +def : Pat<(v8f16 (bitconvert (v2i64 FPR128:$src))), + (v8f16 (REV64v8i16 FPR128:$src))>; +def : Pat<(v8f16 (bitconvert (v4i32 FPR128:$src))), + (v8f16 (REV32v8i16 FPR128:$src))>; +def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), + (v8f16 FPR128:$src)>; +def : Pat<(v8f16 (bitconvert (v16i8 FPR128:$src))), + (v8f16 (REV16v16i8 FPR128:$src))>; +def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))), + (v8f16 (REV64v8i16 FPR128:$src))>; +def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))), + (v8f16 (REV32v8i16 FPR128:$src))>; } let Predicates = [IsLE] in { @@ -5207,6 +5379,7 @@ def : Pat<(v16i8 (bitconvert
(v8i16 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))), (v16i8 FPR128:$src)>; } let Predicates = [IsBE] in { def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), @@ -5223,6 +5396,8 @@ (v16i8 (REV64v16i8 FPR128:$src))>; def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 (REV32v16i8 FPR128:$src))>; +def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))), + (v16i8 (REV16v16i8 FPR128:$src))>; } def : Pat<(v8i8 (extract_subvector (v16i8 FPR128:$Rn), (i64 1))), @@ -5246,6 +5421,8 @@ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR64:$src, dsub)>; def : Pat<(insert_subvector undef, (v4i16 FPR64:$src), (i32 0)), (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>; +def : Pat<(insert_subvector undef, (v4f16 FPR64:$src), (i32 0)), + (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR64:$src, dsub)>; def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (i32 0)), (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>; Index: lib/Target/AArch64/AArch64RegisterInfo.td =================================================================== --- lib/Target/AArch64/AArch64RegisterInfo.td +++ lib/Target/AArch64/AArch64RegisterInfo.td @@ -390,13 +390,14 @@ } def FPR32 : RegisterClass<"AArch64", [f32, i32], 32,(sequence "S%u", 0, 31)>; def FPR64 : RegisterClass<"AArch64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32, - v1i64], + v1i64, v4f16], 64, (sequence "D%u", 0, 31)>; // We don't (yet) have an f128 legal type, so don't use that here. We // normalize 128-bit vectors to v2f64 for arg passing and such, so use // that here. def FPR128 : RegisterClass<"AArch64", - [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128], + [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128, + v8f16], 128, (sequence "Q%u", 0, 31)>; // The lower 16 vector registers. 
Some instructions can only take registers Index: test/CodeGen/AArch64/fp16-instructions.ll =================================================================== --- /dev/null +++ test/CodeGen/AArch64/fp16-instructions.ll @@ -0,0 +1,164 @@ +; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s + +define half @add_h(half %a, half %b) { +entry: +; CHECK-LABEL: add_h: +; CHECK: fcvt +; CHECK: fcvt +; CHECK: fadd +; CHECK: fcvt + %0 = fadd half %a, %b + ret half %0 +} + + +define half @sub_h(half %a, half %b) { +entry: +; CHECK-LABEL: sub_h: +; CHECK: fcvt +; CHECK: fcvt +; CHECK: fsub +; CHECK: fcvt + %0 = fsub half %a, %b + ret half %0 +} + + +define half @mul_h(half %a, half %b) { +entry: +; CHECK-LABEL: mul_h: +; CHECK: fcvt +; CHECK: fcvt +; CHECK: fmul +; CHECK: fcvt + %0 = fmul half %a, %b + ret half %0 +} + + +define half @div_h(half %a, half %b) { +entry: +; CHECK-LABEL: div_h: +; CHECK: fcvt +; CHECK: fcvt +; CHECK: fdiv +; CHECK: fcvt + %0 = fdiv half %a, %b + ret half %0 +} + + +define half @rem_h(half %a, half %b) { +entry: +; CHECK-LABEL: rem_h: +; CHECK: fcvt +; CHECK: fcvt +; CHECK: bl fmodf +; CHECK: fcvt + %0 = frem half %a, %b + ret half %0 +} + + +define half @load_h(half* %a) { +entry: +; CHECK-LABEL: load_h: +; CHECK: ldr h + %0 = load half* %a, align 4 + ret half %0 +} + + +define void @store_h(half* %a, half %b) { +entry: +; CHECK-LABEL: store_h: +; CHECK: str h + store half %b, half* %a, align 4 + ret void +} + +define half @s_to_h(float %a) { +; CHECK-LABEL: s_to_h: +; CHECK: fcvt + %1 = fptrunc float %a to half + ret half %1 +} + +define half @d_to_h(double %a) { +; CHECK-LABEL: d_to_h: +; CHECK: fcvt + %1 = fptrunc double %a to half + ret half %1 +} + +define float @h_to_s(half %a) { +; CHECK-LABEL: h_to_s: +; CHECK: fcvt + %1 = fpext half %a to float + ret float %1 +} + +define double @h_to_d(half %a) { +; CHECK-LABEL: h_to_d: +; CHECK: fcvt + %1 = fpext half %a to double + ret double %1 +} + +define i32 @f_to_si(half %a) { +; CHECK-LABEL: f_to_si: +; CHECK: fcvt + %1 = fptosi half %a to i32 + ret i32 %1 +} + + +define i32 @f_to_ui(half %a) { +; CHECK-LABEL: f_to_ui: +; CHECK: fcvt + %1 = fptoui half %a to i32 + ret i32 %1 +} + + +define half @si_to_h(i32 %a) { +; CHECK-LABEL: si_to_h: +; CHECK: fcvt + %1 = sitofp i32 %a to half + ret half %1 +} + + +define half @ui_to_h(i32 %a) { +; CHECK-LABEL: ui_to_h: +; CHECK: fcvt + %1 = uitofp i32 %a to half + ret half %1 +} + + +define half @bitcast_i_to_h(i16 %a) { +; CHECK-LABEL: bitcast_i_to_h: +; CHECK: fmov + %1 = bitcast i16 %a to half + ret half %1 +} + + +define i16 @bitcast_h_to_i(half %a) { +; CHECK-LABEL: bitcast_h_to_i: +; CHECK: fmov + %1 = bitcast half %a to i16 + ret i16 %1 +} + + +define half @select_h(half %a, half %b, i1 %c) { +; CHECK-LABEL: select_h: +; CHECK: fcvt +; CHECK: fcvt +; CHECK: fcsel +; CHECK: fcvt + %1 = select i1 %c, half %a, half %b + ret half %1 +} Index: test/CodeGen/AArch64/fp16-intrinsics.ll =================================================================== --- /dev/null +++ test/CodeGen/AArch64/fp16-intrinsics.ll @@ -0,0 +1,228 @@ +; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s + +declare half @llvm.sqrt.f16(half %Val) +define half @sqrt_h(half %a) { +; CHECK-LABEL: sqrt_h: +; CHECK: fcvt +; CHECK: fsqrt +; CHECK: fcvt + %1 = call half @llvm.sqrt.f16(half %a) + ret half %1 +} + + + +declare half @llvm.sin.f16(half %Val) +define half @sin_h(half %a) { +; CHECK-LABEL: sin_h: +; CHECK: fcvt +; CHECK: bl sinf +; CHECK: fcvt + %1 = call half @llvm.sin.f16(half %a) + ret half 
%1 +} + + +declare half @llvm.cos.f16(half %Val) +define half @cos_h(half %a) { +; CHECK-LABEL: cos_h: +; CHECK: fcvt +; CHECK: bl cosf +; CHECK: fcvt + %1 = call half @llvm.cos.f16(half %a) + ret half %1 +} + + +declare half @llvm.pow.f16(half %Val, half %power) +define half @pow_h(half %a, half %b) { +; CHECK-LABEL: pow_h: +; CHECK: fcvt +; CHECK: fcvt +; CHECK: bl powf +; CHECK: fcvt + %1 = call half @llvm.pow.f16(half %a, half %b) + ret half %1 +} + + +declare half @llvm.fma.f16(half %a, half %b, half %c) +define half @fma_h(half %a, half %b, half %c) { +; CHECK-LABEL: fma_h: +; CHECK: fcvt +; CHECK: fcvt +; CHECK: fcvt +; CHECK: fmadd +; CHECK: fcvt + %1 = call half @llvm.fma.f16(half %a, half %b, half %c) + ret half %1 +} + + +declare half @llvm.fabs.f16(half %Val) +define half @abs_h(half %a) { +; CHECK-LABEL: abs_h: +; CHECK: fcvt +; CHECK: fabs +; CHECK: fcvt + %1 = call half @llvm.fabs.f16(half %a) + ret half %1 +} + + + + +declare half @llvm.floor.f16(half %Val) +define half @floor_h(half %a) { +; CHECK-LABEL: floor_h: +; CHECK: fcvt +; CHECK: frintm +; CHECK: fcvt +; CHECK: frintx + %1 = call half @llvm.floor.f16(half %a) + ret half %1 +} + + +declare half @llvm.ceil.f16(half %Val) +define half @ceil_h(half %a) { +; CHECK-LABEL: ceil_h: +; CHECK: fcvt +; CHECK: frintp +; CHECK: fcvt +; CHECK: frintx + %1 = call half @llvm.ceil.f16(half %a) + ret half %1 +} + + +declare half @llvm.trunc.f16(half %Val) +define half @trunc_h(half %a) { +; CHECK-LABEL: trunc_h: +; CHECK: fcvt +; CHECK: frintz +; CHECK: fcvt +; CHECK: frintx + %1 = call half @llvm.trunc.f16(half %a) + ret half %1 +} + + +declare half @llvm.rint.f16(half %Val) +define half @rint_h(half %a) { +; CHECK-LABEL: rint_h: +; CHECK: fcvt +; CHECK: frintx +; CHECK: fcvt + %1 = call half @llvm.rint.f16(half %a) + ret half %1 +} + + +declare half @llvm.nearbyint.f16(half %Val) +define half @nearbyint_h(half %a) { +; CHECK-LABEL: nearbyint_h: +; CHECK: fcvt +; CHECK: frinti +; CHECK: fcvt + %1 = call half @llvm.nearbyint.f16(half %a) + ret half %1 +} + + +declare half @llvm.round.f16(half %Val) +define half @round_h(half %a) { +; CHECK-LABEL: round_h: +; CHECK: fcvt +; CHECK: frinta +; CHECK: fcvt +; CHECK: frintx + %1 = call half @llvm.round.f16(half %a) + ret half %1 +} + + +declare half @llvm.fmuladd.f16(half %a, half %b, half %c) +define half @fmuladd_h(half %a, half %b, half %c) { +; CHECK-LABEL: fmuladd_h: +; CHECK: fcvt +; CHECK: fcvt +; CHECK: fmul +; CHECK: fadd +; CHECK: fcvt + %1 = call half @llvm.fmuladd.f16(half %a, half %b, half %c) + ret half %1 +} + +declare half @llvm.powi.f16(half %Val, i32 %power) +define half @powi_h(half %a, i32 %b) { +; CHECK-LABEL: powi_h: +; CHECK: fcvt +; CHECK: bl __powisf2 +; CHECK: fcvt + %1 = call half @llvm.powi.f16(half %a, i32 %b) + ret half %1 +} + +declare half @llvm.exp.f16(half %Val) +define half @exp_h(half %a) { +; CHECK-LABEL: exp_h: +; CHECK: fcvt +; CHECK: bl expf +; CHECK: fcvt + %1 = call half @llvm.exp.f16(half %a) + ret half %1 +} + + +declare half @llvm.exp2.f16(half %Val) +define half @exp2_h(half %a) { +; CHECK-LABEL: exp2_h: +; CHECK: fcvt +; CHECK: bl exp2f +; CHECK: fcvt + %1 = call half @llvm.exp2.f16(half %a) + ret half %1 +} + +declare half @llvm.log.f16(half %Val) +define half @log_h(half %a) { +; CHECK-LABEL: log_h: +; CHECK: fcvt +; CHECK: bl logf +; CHECK: fcvt + %1 = call half @llvm.log.f16(half %a) + ret half %1 +} + + +declare half @llvm.log10.f16(half %Val) +define half @log10_h(half %a) { +; CHECK-LABEL: log10_h: +; CHECK: fcvt +; CHECK: bl log10f +; 
CHECK: fcvt + %1 = call half @llvm.log10.f16(half %a) + ret half %1 +} + +declare half @llvm.log2.f16(half %Val) +define half @log2_h(half %a) { +; CHECK-LABEL: log2_h: +; CHECK: fcvt +; CHECK: bl log2f +; CHECK: fcvt + %1 = call half @llvm.log2.f16(half %a) + ret half %1 +} + +declare half @llvm.copysign.f16(half %Mag, half %Sgn) +define half @copysign_h(half %a, half %b) { +; CHECK-LABEL: copysign_h: +; CHECK: fcvt +; CHECK: movi +; CHECK: bit +; CHECK: fcvt + %1 = call half @llvm.copysign.f16(half %a, half %b) + ret half %1 +} Index: test/CodeGen/AArch64/fp16-v4-instructions.ll =================================================================== --- /dev/null +++ test/CodeGen/AArch64/fp16-v4-instructions.ll @@ -0,0 +1,172 @@ +; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s + +define <4 x half> @add_h(<4 x half> %a, <4 x half> %b) { +entry: +; CHECK-LABEL: add_h: +; CHECK: fcvtl +; CHECK: fcvtl +; CHECK: fadd +; CHECK: fcvtn + %0 = fadd <4 x half> %a, %b + ret <4 x half> %0 +} + + +define <4 x half> @sub_h(<4 x half> %a, <4 x half> %b) { +entry: +; CHECK-LABEL: sub_h: +; CHECK: fcvtl +; CHECK: fcvtl +; CHECK: fsub +; CHECK: fcvtn + %0 = fsub <4 x half> %a, %b + ret <4 x half> %0 +} + + +define <4 x half> @mul_h(<4 x half> %a, <4 x half> %b) { +entry: +; CHECK-LABEL: mul_h: +; CHECK: fcvtl +; CHECK: fcvtl +; CHECK: fmul +; CHECK: fcvtn + %0 = fmul <4 x half> %a, %b + ret <4 x half> %0 +} + + +define <4 x half> @div_h(<4 x half> %a, <4 x half> %b) { +entry: +; CHECK-LABEL: div_h: +; CHECK: fcvtl +; CHECK: fcvtl +; CHECK: fdiv +; CHECK: fcvtn + %0 = fdiv <4 x half> %a, %b + ret <4 x half> %0 +} + + +define <4 x half> @rem_h(<4 x half> %a, <4 x half> %b) { +entry: +; CHECK-LABEL: rem_h: +; CHECK: fcvtl +; CHECK: fcvtl +; CHECK: bl fmodf +; CHECK: bl fmodf +; CHECK: bl fmodf +; CHECK: bl fmodf +; CHECK: fcvtn + %0 = frem <4 x half> %a, %b + ret <4 x half> %0 +} + + +define <4 x half> @load_h(<4 x half>* %a) { +entry: +; CHECK-LABEL: load_h: +; CHECK: ldr d + %0 = load <4 x half>* %a, align 4 + ret <4 x half> %0 +} + + +define void @store_h(<4 x half>* %a, <4 x half> %b) { +entry: +; CHECK-LABEL: store_h: +; CHECK: str d + store <4 x half> %b, <4 x half>* %a, align 4 + ret void +} + +define <4 x half> @s_to_h(<4 x float> %a) { +; CHECK-LABEL: s_to_h: +; CHECK: fcvtn + %1 = fptrunc <4 x float> %a to <4 x half> + ret <4 x half> %1 +} + +define <4 x half> @d_to_h(<4 x double> %a) { +; CHECK-LABEL: d_to_h: +; CHECK: fcvt +; CHECK: fcvt +; CHECK: fcvt +; CHECK: fcvt + %1 = fptrunc <4 x double> %a to <4 x half> + ret <4 x half> %1 +} + +define <4 x float> @h_to_s(<4 x half> %a) { +; CHECK-LABEL: h_to_s: +; CHECK: fcvtl + %1 = fpext <4 x half> %a to <4 x float> + ret <4 x float> %1 +} + +define <4 x double> @h_to_d(<4 x half> %a) { +; CHECK-LABEL: h_to_d: +; CHECK: fcvt +; CHECK: fcvt +; CHECK: fcvt +; CHECK: fcvt + %1 = fpext <4 x half> %a to <4 x double> + ret <4 x double> %1 +} + +define <4 x i32> @f_to_si(<4 x half> %a) { +; CHECK-LABEL: f_to_si: +; CHECK: fcvtl + %1 = fptosi <4 x half> %a to <4 x i32> + ret <4 x i32> %1 +} + + +define <4 x i32> @f_to_ui(<4 x half> %a) { +; CHECK-LABEL: f_to_ui: +; CHECK: fcvtl + %1 = fptoui <4 x half> %a to <4 x i32> + ret <4 x i32> %1 +} + + +define <4 x half> @si_to_h(<4 x i32> %a) { +; CHECK-LABEL: si_to_h: +; CHECK: fcvtn + %1 = sitofp <4 x i32> %a to <4 x half> + ret <4 x half> %1 +} + + +define <4 x half> @ui_to_h(<4 x i32> %a) { +; CHECK-LABEL: ui_to_h: +; CHECK: fcvtn + %1 = uitofp <4 x i32> %a to <4 x half> + ret <4 x half> %1 +} + + +define <4 
x half> @bitcast_i_to_h(<4 x i16> %a) { +; CHECK-LABEL: bitcast_i_to_h: +; CHECK-NOT: fmov + %1 = bitcast <4 x i16> %a to <4 x half> + ret <4 x half> %1 +} + + +define <4 x i16> @bitcast_h_to_i(<4 x half> %a) { +; CHECK-LABEL: bitcast_h_to_i: +; CHECK-NOT: fmov + %1 = bitcast <4 x half> %a to <4 x i16> + ret <4 x i16> %1 +} + + +define <4 x half> @select_h(<4 x half> %a, <4 x half> %b, i1 %c) { +; CHECK-LABEL: select_h: +; CHECK: fcvtl +; CHECK: fcvtl +; CHECK: fcvtn + %1 = select i1 %c, <4 x half> %a, <4 x half> %b + ret <4 x half> %1 +} Index: test/CodeGen/AArch64/fp16-v8-instructions.ll =================================================================== --- /dev/null +++ test/CodeGen/AArch64/fp16-v8-instructions.ll @@ -0,0 +1,341 @@ +; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s + +define <8 x half> @add_h(<8 x half> %a, <8 x half> %b) { +entry: +; CHECK-LABEL: add_h: +; CHECK: fcvt +; CHECK: fcvt +; CHECK-DAG: fadd +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fadd +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fadd +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fadd +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fadd +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fadd +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fadd +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fadd +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK: fcvt + %0 = fadd <8 x half> %a, %b + ret <8 x half> %0 +} + + +define <8 x half> @sub_h(<8 x half> %a, <8 x half> %b) { +entry: +; CHECK-LABEL: sub_h: +; CHECK: fcvt +; CHECK: fcvt +; CHECK-DAG: fsub +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fsub +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fsub +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fsub +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fsub +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fsub +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fsub +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fsub +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK: fcvt + %0 = fsub <8 x half> %a, %b + ret <8 x half> %0 +} + + +define <8 x half> @mul_h(<8 x half> %a, <8 x half> %b) { +entry: +; CHECK-LABEL: mul_h: +; CHECK: fcvt +; CHECK: fcvt +; CHECK-DAG: fmul +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fmul +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fmul +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fmul +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fmul +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fmul +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fmul +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fmul +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK: fcvt + %0 = fmul <8 x half> %a, %b + ret <8 x half> %0 +} + + +define <8 x half> @div_h(<8 x half> %a, <8 x half> %b) { +entry: +; CHECK-LABEL: div_h: +; CHECK: fcvt +; CHECK: fcvt +; CHECK-DAG: fdiv +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fdiv +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fdiv +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fdiv +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fdiv +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fdiv +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fdiv +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; 
CHECK-DAG: fdiv +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK: fcvt + %0 = fdiv <8 x half> %a, %b + ret <8 x half> %0 +} + + +define <8 x half> @rem_h(<8 x half> %a, <8 x half> %b) { +entry: +; CHECK-LABEL: rem_h: +; CHECK: fcvt s +; CHECK: fcvt s +; CHECK-DAG: bl fmodf +; CHECK-DAG: fcvt h +; CHECK-DAG: fcvt s +; CHECK-DAG: fcvt s +; CHECK-DAG: bl fmodf +; CHECK-DAG: fcvt h +; CHECK-DAG: fcvt s +; CHECK-DAG: fcvt s +; CHECK-DAG: bl fmodf +; CHECK-DAG: fcvt h +; CHECK-DAG: fcvt s +; CHECK-DAG: fcvt s +; CHECK-DAG: bl fmodf +; CHECK-DAG: fcvt h +; CHECK-DAG: fcvt s +; CHECK-DAG: fcvt s +; CHECK-DAG: bl fmodf +; CHECK-DAG: fcvt h +; CHECK-DAG: fcvt s +; CHECK-DAG: fcvt s +; CHECK-DAG: bl fmodf +; CHECK-DAG: fcvt h +; CHECK-DAG: fcvt s +; CHECK-DAG: fcvt s +; CHECK-DAG: bl fmodf +; CHECK-DAG: fcvt h +; CHECK-DAG: fcvt s +; CHECK-DAG: fcvt s +; CHECK-DAG: bl fmodf +; CHECK: fcvt h + %0 = frem <8 x half> %a, %b + ret <8 x half> %0 +} + + +define <8 x half> @load_h(<8 x half>* %a) { +entry: +; CHECK-LABEL: load_h: +; CHECK: ldr q + %0 = load <8 x half>* %a, align 4 + ret <8 x half> %0 +} + + +define void @store_h(<8 x half>* %a, <8 x half> %b) { +entry: +; CHECK-LABEL: store_h: +; CHECK: str q + store <8 x half> %b, <8 x half>* %a, align 4 + ret void +} + +define <8 x half> @s_to_h(<8 x float> %a) { +; CHECK-LABEL: s_to_h: +; CHECK: fcvtn +; CHECK: fcvtn +; CHECK: ins + %1 = fptrunc <8 x float> %a to <8 x half> + ret <8 x half> %1 +} + +define <8 x half> @d_to_h(<8 x double> %a) { +; CHECK-LABEL: d_to_h: +; CHECK-DAG: fcvt h +; CHECK-DAG: fcvt h +; CHECK-DAG: fcvt h +; CHECK-DAG: fcvt h +; CHECK-DAG: fcvt h +; CHECK-DAG: fcvt h +; CHECK-DAG: fcvt h +; CHECK-DAG: fcvt h +; CHECK-DAG: ins v{{[0-9]+}}.d +; CHECK-DAG: ins v{{[0-9]+}}.d +; CHECK-DAG: ins v{{[0-9]+}}.d +; CHECK-DAG: ins v{{[0-9]+}}.d +; CHECK-DAG: ins v{{[0-9]+}}.h +; CHECK-DAG: ins v{{[0-9]+}}.h +; CHECK-DAG: ins v{{[0-9]+}}.h +; CHECK-DAG: ins v{{[0-9]+}}.h +; CHECK-DAG: ins v{{[0-9]+}}.h +; CHECK-DAG: ins v{{[0-9]+}}.h +; CHECK-DAG: ins v{{[0-9]+}}.h +; CHECK-DAG: ins v{{[0-9]+}}.h + %1 = fptrunc <8 x double> %a to <8 x half> + ret <8 x half> %1 +} + +define <8 x float> @h_to_s(<8 x half> %a) { +; CHECK-LABEL: h_to_s: +; CHECK-DAG: fcvtl2 +; CHECK-DAG: fcvtl + %1 = fpext <8 x half> %a to <8 x float> + ret <8 x float> %1 +} + +define <8 x double> @h_to_d(<8 x half> %a) { +; CHECK-LABEL: h_to_d: +; CHECK-DAG: fcvt d +; CHECK-DAG: fcvt d +; CHECK-DAG: fcvt d +; CHECK-DAG: fcvt d +; CHECK-DAG: fcvt d +; CHECK-DAG: fcvt d +; CHECK-DAG: fcvt d +; CHECK-DAG: fcvt d +; CHECK-DAG: ins +; CHECK-DAG: ins +; CHECK-DAG: ins +; CHECK-DAG: ins + %1 = fpext <8 x half> %a to <8 x double> + ret <8 x double> %1 +} + +define <8 x i32> @f_to_si(<8 x half> %a) { +; CHECK-LABEL: f_to_si: +; CHECK-DAG: fcvtl2 +; CHECK-DAG: fcvtl +; CHECK-DAG: fcvtzs +; CHECK-DAG: fcvtzs + %1 = fptosi <8 x half> %a to <8 x i32> + ret <8 x i32> %1 +} + + +define <8 x i32> @f_to_ui(<8 x half> %a) { +; CHECK-LABEL: f_to_ui: +; CHECK-DAG: fcvtl2 +; CHECK-DAG: fcvtl +; CHECK-DAG: fcvtzu +; CHECK-DAG: fcvtzu + %1 = fptoui <8 x half> %a to <8 x i32> + ret <8 x i32> %1 +} + + +define <8 x half> @si_to_h(<8 x i32> %a) { +; CHECK-LABEL: si_to_h: +; CHECK: scvtf +; CHECK: scvtf +; CHECK: fcvtn +; CHECK: fcvtn +; CHECK: ins + %1 = sitofp <8 x i32> %a to <8 x half> + ret <8 x half> %1 +} + + +define <8 x half> @ui_to_h(<8 x i32> %a) { +; CHECK-LABEL: ui_to_h: +; CHECK: ucvtf +; CHECK: ucvtf 
+; CHECK: fcvtn +; CHECK: fcvtn +; CHECK: ins + %1 = uitofp <8 x i32> %a to <8 x half> + ret <8 x half> %1 +} + + +define <8 x half> @bitcast_i_to_h(<8 x i16> %a) { +; CHECK-LABEL: bitcast_i_to_h: +; CHECK-NOT: fmov + %1 = bitcast <8 x i16> %a to <8 x half> + ret <8 x half> %1 +} + + +define <8 x i16> @bitcast_h_to_i(<8 x half> %a) { +; CHECK-LABEL: bitcast_h_to_i: +; CHECK-NOT: fmov + %1 = bitcast <8 x half> %a to <8 x i16> + ret <8 x i16> %1 +} + Index: test/CodeGen/AArch64/fp16-vector-shuffle.ll =================================================================== --- /dev/null +++ test/CodeGen/AArch64/fp16-vector-shuffle.ll @@ -0,0 +1,301 @@ +; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s + +; float16x4_t select_64(float16x4_t a, float16x4_t b, uint16x4_t c) { return vbsl_u16(c, a, b); } +define <4 x half> @select_64(<4 x half> %a, <4 x half> %b, <4 x i16> %c) #0 { +; CHECK-LABEL: select_64: +; CHECK: bsl +entry: + %0 = bitcast <4 x half> %a to <4 x i16> + %1 = bitcast <4 x half> %b to <4 x i16> + %vbsl3.i = and <4 x i16> %0, %c + %2 = xor <4 x i16> %c, <i16 -1, i16 -1, i16 -1, i16 -1> + %vbsl4.i = and <4 x i16> %1, %2 + %vbsl5.i = or <4 x i16> %vbsl3.i, %vbsl4.i + %3 = bitcast <4 x i16> %vbsl5.i to <4 x half> + ret <4 x half> %3 +} + +; float16x8_t select_128(float16x8_t a, float16x8_t b, uint16x8_t c) { return vbslq_u16(c, a, b); } +define <8 x half> @select_128(<8 x half> %a, <8 x half> %b, <8 x i16> %c) #0 { +; CHECK-LABEL: select_128: +; CHECK: bsl +entry: + %0 = bitcast <8 x half> %a to <8 x i16> + %1 = bitcast <8 x half> %b to <8 x i16> + %vbsl3.i = and <8 x i16> %0, %c + %2 = xor <8 x i16> %c, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> + %vbsl4.i = and <8 x i16> %1, %2 + %vbsl5.i = or <8 x i16> %vbsl3.i, %vbsl4.i + %3 = bitcast <8 x i16> %vbsl5.i to <8 x half> + ret <8 x half> %3 +} + +; float16x4_t lane_64_64(float16x4_t a, float16x4_t b) { +; return vcopy_lane_s16(a, 1, b, 2); +; } +define <4 x half> @lane_64_64(<4 x half> %a, <4 x half> %b) #0 { +; CHECK-LABEL: lane_64_64: +; CHECK: ins +entry: + %0 = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 6, i32 2, i32 3> + ret <4 x half> %0 +} + +; float16x8_t lane_128_64(float16x8_t a, float16x4_t b) { +; return vcopyq_lane_s16(a, 1, b, 2); +; } +define <8 x half> @lane_128_64(<8 x half> %a, <4 x half> %b) #0 { +; CHECK-LABEL: lane_128_64: +; CHECK: ins +entry: + %0 = bitcast <4 x half> %b to <4 x i16> + %vget_lane = extractelement <4 x i16> %0, i32 2 + %1 = bitcast <8 x half> %a to <8 x i16> + %vset_lane = insertelement <8 x i16> %1, i16 %vget_lane, i32 1 + %2 = bitcast <8 x i16> %vset_lane to <8 x half> + ret <8 x half> %2 +} + +; float16x4_t lane_64_128(float16x4_t a, float16x8_t b) { +; return vcopy_laneq_s16(a, 3, b, 5); +; } +define <4 x half> @lane_64_128(<4 x half> %a, <8 x half> %b) #0 { +; CHECK-LABEL: lane_64_128: +; CHECK: ins +entry: + %0 = bitcast <8 x half> %b to <8 x i16> + %vgetq_lane = extractelement <8 x i16> %0, i32 5 + %1 = bitcast <4 x half> %a to <4 x i16> + %vset_lane = insertelement <4 x i16> %1, i16 %vgetq_lane, i32 3 + %2 = bitcast <4 x i16> %vset_lane to <4 x half> + ret <4 x half> %2 +} + +; float16x8_t lane_128_128(float16x8_t a, float16x8_t b) { +; return vcopyq_laneq_s16(a, 3, b, 5); +; } +define <8 x half> @lane_128_128(<8 x half> %a, <8 x half> %b) #0 { +; CHECK-LABEL: lane_128_128: +; CHECK: ins +entry: + %0 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 4, i32 5, i32 6, i32 7> + ret <8 x half> %0 +} + +; float16x4_t ext_64(float16x4_t a, float16x4_t b) { +; return vext_s16(a, b, 3); +; } +define <4 x half> @ext_64(<4 x half> %a, <4 x half> %b) #0 { +; CHECK-LABEL: ext_64: +; CHECK: ext +entry: + %0 = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6> + ret <4 x half> %0 +} + +; float16x8_t ext_128(float16x8_t a, float16x8_t b) { +; return vextq_s16(a, b, 3); +; } +define <8 x half> @ext_128(<8 x half> %a, <8 x half> %b) #0 { +; CHECK-LABEL: ext_128: +; CHECK: ext +entry: + %0 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10> + ret <8 x half> %0 +} + +; float16x4_t rev32_64(float16x4_t a) { +; return vrev32_s16(a); +; } +define <4 x half> @rev32_64(<4 x half> %a) #0 { +entry: +; CHECK-LABEL: rev32_64: +; CHECK: rev32 + %0 = shufflevector <4 x half> %a, <4 x half> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2> + ret <4 x half> %0 +} + +; float16x4_t rev64_64(float16x4_t a) { +; return vrev64_s16(a); +; } +define <4 x half> @rev64_64(<4 x half> %a) #0 { +entry: +; CHECK-LABEL: rev64_64: +; CHECK: rev64 + %0 = shufflevector <4 x half> %a, <4 x half> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> + ret <4 x half> %0 +} + +; float16x8_t rev32_128(float16x8_t a) { +; return vrev32q_s16(a); +; } +define <8 x half> @rev32_128(<8 x half> %a) #0 { +entry: +; CHECK-LABEL: rev32_128: +; CHECK: rev32 + %0 = shufflevector <8 x half> %a, <8 x half> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> + ret <8 x half> %0 +} + +; float16x8_t rev64_128(float16x8_t a) { +; return vrev64q_s16(a); +; } +define <8 x half> @rev64_128(<8 x half> %a) #0 { +entry: +; CHECK-LABEL: rev64_128: +; CHECK: rev64 + %0 = shufflevector <8 x half> %a, <8 x half> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> + ret <8 x half> %0 +} + +; float16x4_t create_64(long long a) { return vcreate_f16(a); } +define <4 x half> @create_64(i64 %a) #0 { +; CHECK-LABEL: create_64: +; CHECK: fmov +entry: + %0 = bitcast i64 %a to <4 x half> + ret <4 x half> %0 +} + +; float16x4_t dup_64(__fp16 a) { return vdup_n_f16(a); } +define <4 x half> @dup_64(half %a) #0 { +; CHECK-LABEL: dup_64: +; CHECK: dup +entry: + %vecinit = insertelement <4 x half> undef, half %a, i32 0 + %vecinit1 = insertelement <4 x half> %vecinit, half %a, i32 1 + %vecinit2 = insertelement <4 x half> %vecinit1, half %a, i32 2 + %vecinit3 = insertelement <4 x half> %vecinit2, half %a, i32 3 + ret <4 x half> %vecinit3 +} + +; float16x8_t dup_128(__fp16 a) { return vdupq_n_f16(a); } +define <8 x half> @dup_128(half %a) #0 { +entry: +; CHECK-LABEL: dup_128: +; CHECK: dup + %vecinit = insertelement <8 x half> undef, half %a, i32 0 + %vecinit1 = insertelement <8 x half> %vecinit, half %a, i32 1 + %vecinit2 = insertelement <8 x half> %vecinit1, half %a, i32 2 + %vecinit3 = insertelement <8 x half> %vecinit2, half %a, i32 3 + %vecinit4 = insertelement <8 x half> %vecinit3, half %a, i32 4 + %vecinit5 = insertelement <8 x half> %vecinit4, half %a, i32 5 + %vecinit6 = insertelement <8 x half> %vecinit5, half %a, i32 6 + %vecinit7 = insertelement <8 x half> %vecinit6, half %a, i32 7 + ret <8 x half> %vecinit7 +} + +; float16x4_t dup_lane_64(float16x4_t a) { return vdup_lane_f16(a, 2); } +define <4 x half> @dup_lane_64(<4 x half> %a) #0 { +entry: +; CHECK-LABEL: dup_lane_64: +; CHECK: dup + %shuffle = shufflevector <4 x half> %a, <4 x half> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2> + ret <4 x half> %shuffle +} + +; float16x8_t dup_lane_128(float16x4_t a) { return vdupq_lane_f16(a, 2); } +define <8 x half> @dup_lane_128(<4 x half> %a) #0 { +entry: +; CHECK-LABEL: dup_lane_128: +; CHECK: dup + %shuffle = shufflevector <4 x half> %a, <4 x half> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> + ret <8 x half> %shuffle +} + +; float16x4_t dup_laneq_64(float16x8_t a) { return vdup_laneq_f16(a, 2); } +define <4 x half> @dup_laneq_64(<8 x half> %a) #0 { +entry: +; CHECK-LABEL: dup_laneq_64: +; CHECK: dup + %shuffle = shufflevector <8 x half> %a, <8 x half> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2> + ret <4 x half> %shuffle +} + +; float16x8_t dup_laneq_128(float16x8_t a) { return vdupq_laneq_f16(a, 2); } +define <8 x half> @dup_laneq_128(<8 x half> %a) #0 { +entry: +; CHECK-LABEL: dup_laneq_128: +; CHECK: dup + %shuffle = shufflevector <8 x half> %a, <8 x half> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> + ret <8 x half> %shuffle +} + +; float16x8_t vcombine(float16x4_t a, float16x4_t b) { return vcombine_f16(a, b); } +define <8 x half> @vcombine(<4 x half> %a, <4 x half> %b) #0 { +entry: +; CHECK-LABEL: vcombine: +; CHECK: ins + %shuffle.i = shufflevector <4 x half> %a, <4 x half> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x half> %shuffle.i +} + +; float16x4_t get_high(float16x8_t a) { return vget_high_f16(a); } +define <4 x half> @get_high(<8 x half> %a) #0 { +; CHECK-LABEL: get_high: +; CHECK: ext +entry: + %shuffle.i = shufflevector <8 x half> %a, <8 x half> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + ret <4 x half> %shuffle.i +} + + +; float16x4_t get_low(float16x8_t a) { return vget_low_f16(a); } +define <4 x half> @get_low(<8 x half> %a) #0 { +; CHECK-LABEL: get_low: +; CHECK-NOT: ext +entry: + %shuffle.i = shufflevector <8 x half> %a, <8 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x half> %shuffle.i +} + +; float16x4_t set_lane_64(float16x4_t a, __fp16 b) { return vset_lane_f16(b, a, 2); } +define <4 x half> @set_lane_64(<4 x half> %a, half %b) #0 { +; CHECK-LABEL: set_lane_64: +; CHECK: fmov +; CHECK: ins +entry: + %0 = bitcast half %b to i16 + %1 = bitcast <4 x half> %a to <4 x i16> + %vset_lane = insertelement <4 x i16> %1, i16 %0, i32 2 + %2 = bitcast <4 x i16> %vset_lane to <4 x half> + ret <4 x half> %2 +} + + +; float16x8_t set_lane_128(float16x8_t a, __fp16 b) { return vsetq_lane_f16(b, a, 2); } +define <8 x half> @set_lane_128(<8 x half> %a, half %b) #0 { +; CHECK-LABEL: set_lane_128: +; CHECK: fmov +; CHECK: ins +entry: + %0 = bitcast half %b to i16 + %1 = bitcast <8 x half> %a to <8 x i16> + %vset_lane = insertelement <8 x i16> %1, i16 %0, i32 2 + %2 = bitcast <8 x i16> %vset_lane to <8 x half> + ret <8 x half> %2 +} + +; __fp16 get_lane_64(float16x4_t a) { return vget_lane_f16(a, 2); } +define half @get_lane_64(<4 x half> %a) #0 { +; CHECK-LABEL: get_lane_64: +; CHECK: umov +; CHECK: fmov +entry: + %0 = bitcast <4 x half> %a to <4 x i16> + %vget_lane = extractelement <4 x i16> %0, i32 2 + %1 = bitcast i16 %vget_lane to half + ret half %1 +} + +; __fp16 get_lane_128(float16x8_t a) { return vgetq_lane_f16(a, 2); } +define half @get_lane_128(<8 x half> %a) #0 { +; CHECK-LABEL: get_lane_128: +; CHECK: umov +; CHECK: fmov +entry: + %0 = bitcast <8 x half> %a to <8 x i16> + %vgetq_lane = extractelement <8 x i16> %0, i32 2 + %1 = bitcast i16 %vgetq_lane to half + ret half %1 +}
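
For anyone trying the patch locally, the promoted-arithmetic path is easy to exercise outside the lit tests; a minimal sketch (hypothetical file name vec.ll, not part of the patch) that should produce the fcvtl/fadd/fcvtn sequence checked in fp16-v4-instructions.ll:

; Run with: llc < vec.ll -mtriple=aarch64-none-eabi
; fadd on <4 x half> has no native instruction here, so it is promoted to
; <4 x float>: expect fcvtl, fadd, fcvtn in the output.
define <4 x half> @demo_fadd(<4 x half> %a, <4 x half> %b) {
  %sum = fadd <4 x half> %a, %b
  ret <4 x half> %sum
}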