Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -8739,7 +8739,8 @@ for (unsigned i = 0, e = Ins.size(); i != e; ++i) { assert(InVals[i].getNode() && "LowerFormalArguments emitted a null value!"); - assert(EVT(Ins[i].VT) == InVals[i].getValueType() && + assert((InVals[i].getValueType() == MVT::f16 || + EVT(Ins[i].VT) == InVals[i].getValueType()) && "LowerFormalArguments emitted a value with the wrong type!"); } }); Index: lib/Target/ARM/ARMCallingConv.td =================================================================== --- lib/Target/ARM/ARMCallingConv.td +++ lib/Target/ARM/ARMCallingConv.td @@ -156,6 +156,8 @@ // Handles byval parameters. CCIfByVal>, + CCIfType<[f16], CCBitConvertToType>, + // The 'nest' parameter, if any, is passed in R12. CCIfNest>, @@ -187,6 +189,9 @@ CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>, CCIfType<[f32], CCBitConvertToType>, + + CCIfType<[f16], CCBitConvertToType>, + CCDelegateTo ]>; @@ -214,8 +219,8 @@ CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>, CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, - CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, - S9, S10, S11, S12, S13, S14, S15]>>, + CCIfType<[f16, f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, + S9, S10, S11, S12, S13, S14, S15]>>, CCDelegateTo ]>; @@ -232,8 +237,8 @@ CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>, CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, - CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, - S9, S10, S11, S12, S13, S14, S15]>>, + CCIfType<[f16, f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, + S9, S10, S11, S12, S13, S14, S15]>>, CCDelegateTo ]>; Index: lib/Target/ARM/ARMISelLowering.h =================================================================== --- lib/Target/ARM/ARMISelLowering.h +++ lib/Target/ARM/ARMISelLowering.h @@ -662,7 +662,8 @@ SDValue &Chain) const; SDValue LowerREM(SDNode *N, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) const; SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; Index: lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- lib/Target/ARM/ARMISelLowering.cpp +++ lib/Target/ARM/ARMISelLowering.cpp @@ -522,6 +522,112 @@ addRegisterClass(MVT::f64, &ARM::DPRRegClass); } + // The HPR registerclass and f16 type are added as a legal type when: + // - FullFP16 is enabled, which means support for the Armv8.2-A FP16 instructions, + // - FP16 is enabled, which means support for the f16 <-> f32 conversion + // instructions, which are a VFP3 extension and part of VFP4. + // + // It's obvious why f16 is legal for the former case, but the latter is perhaps the + // more interesting one. Making fp16 legal for FP16, results in + // f16 LOADs/STOREs while we don't have instructions for them. So the approach is + // to custom lower f16 LOAD/STORE nodes. 
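+  // To illustrate the approach (the node numbers and exact DAG shapes below
+  // are just an example of the lowering implemented further down in
+  // LowerLOAD), an f16 load feeding f32 arithmetic:
+  //
+  //   t5: f16,ch = load t0, t2, undef:i32
+  //   t6: f32 = fp_extend t5
+  //
+  // is expected to become a zero-extending half-word integer load plus an
+  // FP16_TO_FP node:
+  //
+  //   t8: i32,ch = load t0, t2, undef:i32
+  //   t9: f32 = fp16_to_fp t8
+  //
+  // which should then select to an LDRH followed by a VMOV and a
+  // VCVTB.F32.F16.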
+  //
+  // The reason to make f16 legal when FP16 is supported is:
+  // - It avoids very early legalization/combining of f16 arguments to f32
+  //   types, which would wrongly reinterpret the upper 16 bits of the 32-bit
+  //   registers as part of the value. Making f16 legal is easier and cleaner
+  //   than trying to undo that early legalization/combining.
+  // - As a consequence, the isel DAGs are in a more 'normal form', i.e. they
+  //   rely less on the special nodes FP_TO_FP16 and FP16_TO_FP, which perform
+  //   float up/down conversions but produce/consume i32 values by moving
+  //   between integer and float registers. Instead, FP_EXTEND and FP_ROUND
+  //   nodes are introduced, so this is more of a clean-up than e.g. a
+  //   correctness fix. Unfortunately, the FP16_TO_FP nodes cannot be removed
+  //   completely; see e.g. the BITCAST and f16 LOAD lowering below.
+  // - When these FP_EXTEND and FP_ROUND nodes are introduced by the legalizer
+  //   and the FP16 conversion instructions are not available, they are custom
+  //   lowered to the EABI calls __aeabi_h2f and __aeabi_f2h.
+  //
+  if (Subtarget->hasFP16() || Subtarget->hasFullFP16()) {
+    addRegisterClass(MVT::f16, &ARM::HPRRegClass);
+  }
+
+  if (!Subtarget->hasFullFP16()) {
+    setOperationAction(ISD::SELECT, MVT::f16, Promote);
+    setOperationAction(ISD::SELECT_CC, MVT::f16, Promote);
+    setOperationAction(ISD::SETCC, MVT::f16, Promote);
+    setOperationAction(ISD::BR_CC, MVT::f16, Custom);
+    setOperationAction(ISD::FADD, MVT::f16, Promote);
+    setOperationAction(ISD::FSUB, MVT::f16, Promote);
+    setOperationAction(ISD::FMUL, MVT::f16, Promote);
+    setOperationAction(ISD::FDIV, MVT::f16, Promote);
+    setOperationAction(ISD::FREM, MVT::f16, Promote);
+    setOperationAction(ISD::FMA, MVT::f16, Promote);
+    setOperationAction(ISD::FNEG, MVT::f16, Promote);
+    setOperationAction(ISD::FABS, MVT::f16, Promote);
+    setOperationAction(ISD::FCEIL, MVT::f16, Promote);
+    setOperationAction(ISD::FSQRT, MVT::f16, Promote);
+    setOperationAction(ISD::FFLOOR, MVT::f16, Promote);
+    setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote);
+    setOperationAction(ISD::FRINT, MVT::f16, Promote);
+    setOperationAction(ISD::FROUND, MVT::f16, Promote);
+    setOperationAction(ISD::FTRUNC, MVT::f16, Promote);
+    setOperationAction(ISD::FPOWI, MVT::f16, Promote);
+    setOperationAction(ISD::FPOW, MVT::f16, Promote);
+    setOperationAction(ISD::FEXP, MVT::f16, Promote);
+    setOperationAction(ISD::FEXP2, MVT::f16, Promote);
+    setOperationAction(ISD::FSIN, MVT::f16, Promote);
+    setOperationAction(ISD::FCOS, MVT::f16, Promote);
+    setOperationAction(ISD::FLOG, MVT::f16, Promote);
+    setOperationAction(ISD::FLOG2, MVT::f16, Promote);
+    setOperationAction(ISD::FLOG10, MVT::f16, Promote);
+    setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
+    setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom);
+
+    // When we don't have FullFP16 support, and thus don't have f16 load/store
+    // instructions, we create half-word integer load/stores.
+    //
+    // Input IR like this, for example:
+    //
+    //   %1 = load i16, i16 * ...
+    //   %2 = tail call float @llvm.convert.from.fp16.f32(i16 ...)
+    //   .. = fadd %2 ..
+    //
+    // gets combined very early into f16 loads when the f16 type is legal. So
+    // we custom lower these f16 loads and stores, using integer loads and
+    // stores instead. This matches the storage-only semantics of __fp16,
+    // where arithmetic is done in single precision but results are written
+    // back in half precision. IR like the example above can be generated from
+    // the use of __fp16.
+    setOperationAction(ISD::LOAD, MVT::f16, Custom);
+    setOperationAction(ISD::STORE, MVT::f16, Custom);
+
+    // This is a cleanup. We unfortunately need an FP_TO_FP16 node to create
+    // a truncating i32 -> i16 integer store.
+    setOperationAction(ISD::FP_TO_FP16, MVT::i32, Custom);
+
+    // Another case arises from the use of __fp16 and passing halves as i16:
+    // when function arguments are passed as i16 but converted to f32 or f64
+    // in the function body, an i16 truncate, an f16 bitcast, and an FP_EXTEND
+    // are generated. When f16 is not a legal type, the f16 bitcast is
+    // legalized to FP16_TO_FP. But when f16 is a legal type this does not
+    // happen, and the truncate results in generated code with stack
+    // loads/stores. We want to avoid this, and custom lower the
+    // truncate/bitcast to FP16_TO_FP.
+    setOperationAction(ISD::BITCAST, MVT::i16, Custom);
+
+    // Create a libcall for the f64 -> f16 conversion if necessary.
+    setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
+
+    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+  }
+
   for (MVT VT : MVT::vector_valuetypes()) {
     for (MVT InnerVT : MVT::vector_valuetypes()) {
       setTruncStoreAction(VT, InnerVT, Expand);
@@ -707,6 +813,7 @@
   setTargetDAGCombine(ISD::FP_TO_UINT);
   setTargetDAGCombine(ISD::FDIV);
   setTargetDAGCombine(ISD::LOAD);
+  setTargetDAGCombine(ISD::BITCAST);
   // It is legal to extload from v4i8 to v4i16 or v4i32.
for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16, @@ -1502,6 +1609,7 @@ case ISD::SETUGE: return ARMCC::HS; case ISD::SETULT: return ARMCC::LO; case ISD::SETULE: return ARMCC::LS; + case ISD::SETOLT: return ARMCC::MI; } } @@ -3683,7 +3791,9 @@ } else { const TargetRegisterClass *RC; - if (RegVT == MVT::f32) + if (RegVT == MVT::f16) { + RC = &ARM::HPRRegClass; + } else if (RegVT == MVT::f32) RC = &ARM::SPRRegClass; else if (RegVT == MVT::f64) RC = &ARM::DPRRegClass; @@ -3706,7 +3816,21 @@ switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; + case CCValAssign::AExt: + if (VA.getValVT() == MVT::f16) { + ArgValue = DAG.getNode(ISD::BITCAST, dl, MVT::f32, ArgValue); + ArgValue = DAG.getFPExtendOrRound(ArgValue, dl, MVT::f16); + break; + } + assert(0 && "Unknown loc info!"); + break; case CCValAssign::BCvt: + if (Ins[VA.getValNo()].ArgVT == MVT::f16 && + !Subtarget->isTargetHardFloat()) { + ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, ArgValue); + ArgValue = DAG.getNode(ISD::BITCAST, dl, MVT::f16, ArgValue); + break; + } ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); break; case CCValAssign::SExt: @@ -4534,6 +4658,12 @@ Chain, Dest, ARMcc, CCR, Cmp); } + if (LHS.getValueType() == MVT::f16) { + LHS = DAG.getNode(ISD::FP16_TO_FP, SDLoc(Op), MVT::f32, LHS); + assert(RHS.getValueType() == MVT::f16); + RHS = DAG.getNode(ISD::FP16_TO_FP, SDLoc(Op), MVT::f32, RHS); + } + assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); if (getTargetMachine().Options.UnsafeFPMath && @@ -4620,15 +4750,49 @@ EVT VT = Op.getValueType(); if (VT.isVector()) return LowerVectorFP_TO_INT(Op, DAG); - if (Subtarget->isFPOnlySP() && Op.getOperand(0).getValueType() == MVT::f64) { + + const EVT OpType = Op.getOperand(0).getValueType(); + + if (VT == MVT::i64 && OpType == MVT::f16) { + SDValue Cvt; + if (Op.getOperand(0).getOpcode() == ISD::LOAD) { + LoadSDNode *LD = cast(Op.getOperand(0).getNode()); + SDValue NewLD = DAG.getLoad(ISD::UNINDEXED, ISD::ZEXTLOAD, MVT::i32, + SDLoc(Op), LD->getOperand(0), LD->getBasePtr(), + LD->getOffset(), MVT::i16, LD->getMemOperand()); + Cvt = DAG.getNode(ISD::FP16_TO_FP, SDLoc(Op), MVT::f32, NewLD); + } else + Cvt = DAG.getNode(ISD::FP16_TO_FP, SDLoc(Op), MVT::f32, Op.getOperand(0)); + + SDValue Fp2int = DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, Cvt); + + const EVT OpType = Fp2int.getOperand(0).getValueType(); + RTLIB::Libcall LC; if (Op.getOpcode() == ISD::FP_TO_SINT) - LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), - Op.getValueType()); + LC = RTLIB::getFPTOSINT(OpType, VT); else - LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), - Op.getValueType()); - return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), + LC = RTLIB::getFPTOUINT(OpType, VT); + + return makeLibCall(DAG, LC, VT, Fp2int.getOperand(0), + /*isSigned*/ false, SDLoc(Op)).first; + } + + if (OpType == MVT::f16) { + SDValue Cvt = DAG.getNode(ISD::FP16_TO_FP, SDLoc(Op), + MVT::f32, Op.getOperand(0)); + return DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, Cvt); + } + + if ((Subtarget->isFPOnlySP() && OpType == MVT::f64) || + Op.getValueType() == MVT::i64) { + RTLIB::Libcall LC; + if (Op.getOpcode() == ISD::FP_TO_SINT) + LC = RTLIB::getFPTOSINT(OpType, VT); + else + LC = RTLIB::getFPTOUINT(OpType, VT); + + return makeLibCall(DAG, LC, VT, Op.getOperand(0), /*isSigned*/ false, SDLoc(Op)).first; } @@ -4669,10 +4833,30 @@ } SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG 
&DAG) const { + if (!Subtarget->hasFullFP16() && Op.getValueType() == MVT::f16 && + Op.getOperand(0).getValueType() == MVT::i32) { + // Legalize: + // t2: f16 = [su]int_to_fp t1 + // to: + // t2: f32 = [su]int_to_fp t1 + // t3: f16 = fp_round t2 + SDValue I2F = DAG.getNode(Op.getOpcode(), SDLoc(Op), MVT::f32, + Op.getOperand(0)); + return DAG.getFPExtendOrRound(I2F, SDLoc(Op), MVT::f16); + } + EVT VT = Op.getValueType(); if (VT.isVector()) return LowerVectorINT_TO_FP(Op, DAG); - if (Subtarget->isFPOnlySP() && Op.getValueType() == MVT::f64) { + + const bool IsF16Write = Op.getValueType() == MVT::f16; + const bool IsI64Read = Op.getOperand(0).getValueType() == MVT::i64; + + if (IsF16Write) + Op = DAG.getNode(Op.getOpcode(), SDLoc(Op), MVT::f32, Op.getOperand(0)); + + if ((Subtarget->isFPOnlySP() && Op.getValueType() == MVT::f64) || + IsF16Write || IsI64Read) { RTLIB::Libcall LC; if (Op.getOpcode() == ISD::SINT_TO_FP) LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), @@ -4692,6 +4876,13 @@ SDValue Tmp0 = Op.getOperand(0); SDValue Tmp1 = Op.getOperand(1); SDLoc dl(Op); + + if (Tmp0.getValueType() == MVT::f16 && Tmp1.getValueType() == MVT::f16) { + Tmp0 = DAG.getNode(ISD::FP16_TO_FP, SDLoc(Op), MVT::f32, Tmp0); + Tmp1 = DAG.getNode(ISD::FP16_TO_FP, SDLoc(Op), MVT::f32, Tmp1); + Op = DAG.getNode(ISD::FCOPYSIGN, SDLoc(Op), MVT::f32, Tmp0, Tmp1); + } + EVT VT = Op.getValueType(); EVT SrcVT = Tmp1.getValueType(); bool InGPR = Tmp0.getOpcode() == ISD::BITCAST || @@ -4908,8 +5099,96 @@ // source or destination of the bit convert. EVT SrcVT = Op.getValueType(); EVT DstVT = N->getValueType(0); - assert((SrcVT == MVT::i64 || DstVT == MVT::i64) && - "ExpandBITCAST called for non-i64 type"); + + if (SrcVT == MVT::i16 && DstVT == MVT::f16) { + // Handle @llvm.convert.from.fp16.f64(i16 %in), which generates IR like: + // + // t2: i32,ch = CopyFromReg t0, ... + // t3: i16 = truncate t2 + // t4: f16 = bitcast t3 + // t5: f64 = fp_extend t4 + // + // We want to custom lower the truncate->bitcast->fp_extend pattern to + // just a fp16_to_fp node: + // + // t2: i32,ch = CopyFromReg t0, Register:i32 %vreg0 + // tx: f64 = fp16_to_fp t2 + // + // This avoids stack loads/stores code generation for the bitcast node, + // and thus just generates a mov and convert. + // + if (Op.getOpcode() != ISD::TRUNCATE) + return SDValue(); + + auto BitcastUse = N->use_begin(); + + if (N->use_size() == 1 && BitcastUse->getOpcode() == ISD::FP_EXTEND) { + SDValue Cvt = DAG.getNode(ISD::FP16_TO_FP, SDLoc(Op), + BitcastUse->getValueType(0), Op.getOperand(0)); + DAG.ReplaceAllUsesWith(*BitcastUse, Cvt.getNode()); + return Cvt; + } + + // If the use of the bitcast is not an extend, it's a data processing + // instructions, and we want to convert its operand to f32: + // + // t9: i16 = truncate t5 + // t10: f16 = bitcast t9 + // t11: f16 = fadd t8, t10 + // + SDValue Cvt = DAG.getNode(ISD::FP16_TO_FP, SDLoc(Op), + MVT::f32, Op.getOperand(0)); + return Cvt; + } + + if (SrcVT == MVT::f16 && DstVT == MVT::i16) { + // Very similarly for e.g. 
f64, we want to transform:
+    //
+    //   t2: f64,ch = CopyFromReg t0, Register:f64 %vreg0
+    //   t4: f16 = fp_round t2, TargetConstant:i32<0>
+    //   t5: i16 = bitcast t4
+    //   t6: i32 = any_extend t5
+    //
+    // into:
+    //
+    //   t2: f64,ch = CopyFromReg t0, Register:f64 %vreg0
+    //   t13: i32 = fp_to_fp16 t2
+    //   t15: i32 = and t13, Constant:i32<65535>
+    //
+    if (Op.getOpcode() == ISD::FP_ROUND) {
+      auto FPAnyExtend = N->use_begin();
+      if (N->use_size() != 1 || FPAnyExtend->getOpcode() != ISD::ANY_EXTEND)
+        return SDValue();
+      SDValue Cvt = DAG.getNode(ISD::FP_TO_FP16, SDLoc(Op),
+                                MVT::i32, Op.getOperand(0));
+      SDValue And = DAG.getNode(ISD::AND, SDLoc(Op), MVT::i32, Cvt,
+                                DAG.getConstant(65535, SDLoc(Op), MVT::i32));
+      DAG.ReplaceAllUsesWith(*FPAnyExtend, And.getNode());
+      return And;
+    }
+
+    // Custom lower the f16 -> i16 -> i32 -> f32 conversion pattern:
+    //
+    //   t12: i16 = bitcast t11
+    //   t13: i32 = zero_extend t12
+    //   t14: f32 = bitcast t13
+    //
+    auto ZeroExtend = N->use_begin();
+    if (N->use_size() != 1 || ZeroExtend->getOpcode() != ISD::ZERO_EXTEND)
+      return SDValue();
+    auto BitCast = ZeroExtend->use_begin();
+    if (BitCast->use_size() != 1 || BitCast->getOpcode() != ISD::BITCAST)
+      return SDValue();
+    if (BitCast->getValueType(0) != MVT::f32)
+      return SDValue();
+
+    SDValue V = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, Op);
+    DAG.ReplaceAllUsesWith(*BitCast, V.getNode());
+    return V;
+  }
+
+  if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
+    return SDValue();
 
   // Turn i64->f64 into VMOVDRR.
   if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
@@ -7771,10 +8050,214 @@
   return !CI.second.getNode() ? DAG.getRoot() : CI.first;
 }
 
+// This is a cleanup for the (corner) case where a load instruction directly
+// feeds a store. For a load -> store chain, when the f16 store is legalized
+// first, we unfortunately need to introduce a helper FP_TO_FP16 node in order
+// to create a truncating i32 -> i16 integer store; this node models a
+// conversion from a float to an integer type, which allows us to create an
+// integer store. This FP_TO_FP16 node needs to be cleaned up though, as it
+// should not lead to any code generation. When it is not a load/store chain,
+// there will be f16 data-processing instructions between the loads/stores;
+// their f16 operands will have been legalized, and FP_EXTEND and FP_ROUND
+// nodes will have been introduced.
+// We want to transform this:
+//
+//   t12: i32,ch = load t0, t2, undef:i32
+//   t10: i32 = fp_to_fp16 t12
+//   t11: ch = store t12:1, t10, t4, undef:i32
+//
+// into:
+//
+//   t12: i32,ch = load t0, t2, undef:i32
+//   t11: ch = store t12:1, t12, t4, undef:i32
+//
+// so that we just generate LDRH and STRH half-word integer loads/stores.
+//
+static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG,
+                               const ARMSubtarget *Subtarget) {
+  if (!Op.hasOneUse())
+    return SDValue();
+
+  auto Use = Op.getNode()->use_begin();
+  if (Use->getOpcode() != ISD::STORE) {
+    DEBUG(dbgs() << "LowerFP_TO_FP16: use is not a store, not cleaning it up\n");
+    return SDValue();
+  }
+
+  // Return the operand so that the uses get properly updated/replaced.
+  return Op.getOperand(0);
+}
+
+static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG,
+                          const ARMSubtarget *Subtarget) {
+  assert(!Subtarget->hasFullFP16());
+  SDValue F16Op = Op.getOperand(1);
+  assert(F16Op.getValueType() == MVT::f16);
+  SDNode *N = Op.getNode();
+  StoreSDNode *ST = cast<StoreSDNode>(N);
+
+  DEBUG(dbgs() << "Creating truncating i16 store for: "; F16Op.dump());
+  SDValue Fp2fp16 = DAG.getNode(ISD::FP_TO_FP16, SDLoc(Op), MVT::i32, F16Op);
+  SDValue NewST = DAG.getTruncStore(Op.getOperand(0), SDLoc(Op), Fp2fp16,
+                                    ST->getBasePtr(), MVT::i16,
+                                    ST->getMemOperand());
+  DEBUG(dbgs() << "New i16 store: "; NewST.dump());
+  DAG.ReplaceAllUsesOfValueWith(Op, NewST);
+  return NewST;
+}
+
+static SDNode *IsF16LoadStoreChain(SDNode *N) {
+  assert(N->getOpcode() == ISD::LOAD);
+
+  if (N->getNumValues() != 2) {
+    DEBUG(dbgs() << "expecting 2 values\n");
+    return nullptr;
+  }
+
+  if (N->use_size() != 2)
+    return nullptr;
+
+  // We expect the LD node of an LD->ST chain to have 2 uses:
+  //
+  // 1) the bitcast node, which feeds an extend to i32
+  // 2) the ST node
+
+  bool UseIsAStore = false;
+  bool UseIsABitCastAndExtend = false;
+  SDNode *ZEXT = nullptr;
+
+  for (auto U : N->uses()) {
+    switch (U->getOpcode()) {
+    default: return nullptr;
+    case ISD::STORE:
+      DEBUG(dbgs() << "Found a ST as a use: "; U->dump());
+      UseIsAStore = true;
+      continue;
+    case ISD::BITCAST:
+      DEBUG(dbgs() << "Found a BITCAST as a use: "; U->dump());
+      // Bail out if the bitcast has more uses, because then it is
+      // not a simple LD-ST chain.
+      if (!U->hasOneUse())
+        return nullptr;
+      ZEXT = *U->use_begin();
+      if (ZEXT->getOpcode() != ISD::ZERO_EXTEND)
+        return nullptr;
+      UseIsABitCastAndExtend = true;
+      break;
+    }
+  }
+
+  if (!UseIsAStore || !UseIsABitCastAndExtend || !ZEXT->hasOneUse())
+    return nullptr;
+
+  return ZEXT;
+}
+
+static SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG,
+                         const ARMSubtarget *Subtarget) {
+  assert(!Subtarget->hasFullFP16() && Op.getValueType() == MVT::f16);
+
+  SDNode *N = Op.getNode();
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  SDValue NewLD = DAG.getLoad(ISD::UNINDEXED, ISD::ZEXTLOAD, MVT::i32,
+                              SDLoc(Op), Op.getOperand(0), LD->getBasePtr(),
+                              LD->getOffset(), MVT::i16, LD->getMemOperand());
+
+  DEBUG(dbgs() << "Custom lowering f16 load: "; Op.dump();
+        dbgs() << "Creating new i32 load: "; NewLD.dump());
+
+  // Now the trickier part: fixing up the DAG, i.e. replacing the uses of the
+  // old load node. There are a few corner cases. Take this example:
+  //
+  //   t1: f16,ch = LD2
+  //   t2: ch = ST2 t1
+  //
+  // The ST2 could have been legalized first, introducing a bitcast, an
+  // extend, and a truncating integer store, so that the DAG looks like this:
+  //
+  //   t1: f16 = LD2
+  //   t2: i16 = bitcast t1
+  //   t3: i32 = zero_extend t2
+  //   t4: ch = ST2 t3
+  //
+  // We want to avoid code generation for these bitcast and extend nodes by
+  // making the load a direct producer of the store. To achieve this, we need
+  // to replace uses, but we can't use t1 because that expects f16 values.
+  // Instead we look for the extend, and replace its uses with the new i32
+  // load node.
+  //
+  // Case I: Load -> Store
+  //
+  SDNode *From = IsF16LoadStoreChain(N);
+  if (From) {
+    // Replace the chain.
+    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLD.getValue(1));
+    // Replace the uses of the i32 zero_extend.
+    DAG.ReplaceAllUsesWith(From, NewLD.getNode());
+    return NewLD;
+  }
+
+  // Case II: Load -> FP_EXTEND
+  //
+  // The other special case is an f16 load producing a value for an fpextend
+  // node.
+ // We cannot simply replace the uses because of type mismatches, so we work + // around that by creating a FP16_TO_FP node to model the extend, and then + // replace the uses of the extend. + // + SDNode * FPExtend = nullptr; + for (auto U : N->uses()) { + if (U->getOpcode() == ISD::FP_EXTEND) + FPExtend = U; + } + + if (FPExtend != nullptr) { + DEBUG(dbgs() << "Creating i32 -> f32 node\n"); + SDValue F16ToF = DAG.getNode(ISD::FP16_TO_FP, SDLoc(Op), + FPExtend->getValueType(0), NewLD); + DAG.ReplaceAllUsesWith(FPExtend, F16ToF.getNode()); + return NewLD; + } + + // Case III: Load -> CopyToReg + // + // When a load produces a value for a copytoreg, we need to make sure to upconvert + // the f16 to f32. I.e., we want to transform this: + // + // t5: f16,ch = load t0, t2, undef:i32 + // t7: ch = CopyToReg t0, Register:f16 %0, t5 + // + // into: + // t8: i32,ch = load t0, t2, undef:i32 + // t9: f32 = fp16_to_fp t8 + // t11: ch,glue = CopyToReg t0, Register:f32 %4, t9 + // + auto Copy2Reg = *N->use_begin(); + if (N->hasOneUse() && Copy2Reg->getOpcode() == ISD::CopyToReg) { + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &RI = MF.getRegInfo(); + unsigned VReg = RI.createVirtualRegister(&ARM::SPRRegClass); + + SDValue F16ToF = DAG.getNode(ISD::FP16_TO_FP, SDLoc(Op), + MVT::f32, NewLD); + SDValue C2R = DAG.getCopyToReg(Copy2Reg->getOperand(0), SDLoc(Copy2Reg), + VReg, F16ToF); + + DEBUG(dbgs() << "Old CopyToReg: "; Copy2Reg->dump(); + dbgs() << "New CopyToReg: "; C2R->dump()); + + DAG.ReplaceAllUsesWith(Copy2Reg, C2R.getNode()); + return NewLD; + } + return NewLD; +} + SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { DEBUG(dbgs() << "Lowering node: "; Op.dump()); switch (Op.getOpcode()) { default: llvm_unreachable("Don't know how to custom lower this!"); + case ISD::LOAD: return LowerLOAD(Op, DAG, Subtarget); + case ISD::STORE: return LowerSTORE(Op, DAG, Subtarget); + case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG, Subtarget); case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); @@ -7850,7 +8333,7 @@ if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment()) return LowerDYNAMIC_STACKALLOC(Op, DAG); llvm_unreachable("Don't know how to custom lower this!"); - case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); + case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG, Subtarget); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); case ISD::FPOWI: return LowerFPOWI(Op, *Subtarget, DAG); case ARMISD::WIN__DBZCHK: return SDValue(); @@ -7897,6 +8380,14 @@ switch (N->getOpcode()) { default: llvm_unreachable("Don't know how to custom expand this!"); + case ISD::UINT_TO_FP: + case ISD::SINT_TO_FP: + Res = LowerINT_TO_FP(SDValue(N, 0), DAG); + break; + case ISD::FP_TO_UINT: + case ISD::FP_TO_SINT: + Res = LowerFP_TO_INT(SDValue(N, 0), DAG); + break; case ISD::READ_REGISTER: ExpandREAD_REGISTER(N, Results, DAG); break; @@ -13531,8 +14022,8 @@ } SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { - assert(Op.getValueType() == MVT::f64 && Subtarget->isFPOnlySP() && - "Unexpected type for custom-lowering FP_EXTEND"); + if (Op.getOperand(0).getValueType() == MVT::i32) + return SDValue(); RTLIB::Libcall LC; LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); @@ -13542,17 +14033,34 @@ SDLoc(Op)).first; } -SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) 
const {
-  assert(Op.getOperand(0).getValueType() == MVT::f64 &&
-         Subtarget->isFPOnlySP() &&
-         "Unexpected type for custom-lowering FP_ROUND");
+SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG,
+                                         const ARMSubtarget *ST) const {
+  EVT DstType = Op.getValueType();
+  const EVT SrcType = Op.getOperand(0).getValueType();
+  const bool F32ToF16 = (SrcType == MVT::f32 && DstType == MVT::f16);
+  const bool F64ToF16 = (SrcType == MVT::f64 && DstType == MVT::f16);
+
+  // 1) fptrunc float to half
+  //    Supported: FP16
+  //
+  // 2) fptrunc double to half
+  //    Supported: V8
+  //
+  if ((F32ToF16 && ST->hasFP16()) ||                        // case 1)
+      (F64ToF16 && ST->hasFPARMv8() && !ST->isFPOnlySP()))  // case 2)
+    return Op;
 
   RTLIB::Libcall LC;
   LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
 
-  SDValue SrcVal = Op.getOperand(0);
-  return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
-                     SDLoc(Op)).first;
+
+  SDValue SrcVal = Op.getOperand(0);
+  if (DstType == MVT::f16)
+    DstType = MVT::i32;
+
+  SDValue NewNode = makeLibCall(DAG, LC, DstType, SrcVal,
+                                /*isSigned*/ false, SDLoc(Op)).first;
+  DEBUG(dbgs() << "New node: "; NewNode.dump());
+  return NewNode;
 }
 
 bool
Index: lib/Target/ARM/ARMInstrVFP.td
===================================================================
--- lib/Target/ARM/ARMInstrVFP.td
+++ lib/Target/ARM/ARMInstrVFP.td
@@ -69,10 +69,19 @@
   let ParserMatchClass = FPImmOperand;
 }
 
+def alignedload16 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 2;
+}]>;
+
 def alignedload32 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
   return cast<LoadSDNode>(N)->getAlignment() >= 4;
 }]>;
 
+def alignedstore16 : PatFrag<(ops node:$val, node:$ptr),
+                             (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() >= 2;
+}]>;
+
 def alignedstore32 : PatFrag<(ops node:$val, node:$ptr),
                              (store node:$val, node:$ptr), [{
   return cast<StoreSDNode>(N)->getAlignment() >= 4;
@@ -113,9 +122,9 @@
   let D = VFPNeonDomain;
 }
 
-def VLDRH : AHI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5fp16:$addr),
+def VLDRH : AHI5<0b1101, 0b01, (outs HPR:$Sd), (ins addrmode5fp16:$addr),
                  IIC_fpLoad16, "vldr", ".16\t$Sd, $addr",
-                 []>,
+                 [(set HPR:$Sd, (alignedload16 addrmode5:$addr))]>,
             Requires<[HasFullFP16]>;
 
 } // End of 'let canFoldAsLoad = 1, isReMaterializable = 1 in'
@@ -132,9 +141,9 @@
   let D = VFPNeonDomain;
 }
 
-def VSTRH : AHI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5fp16:$addr),
+def VSTRH : AHI5<0b1101, 0b00, (outs), (ins HPR:$Sd, addrmode5fp16:$addr),
                  IIC_fpStore16, "vstr", ".16\t$Sd, $addr",
-                 []>,
+                 [(alignedstore16 HPR:$Sd, addrmode5:$addr)]>,
             Requires<[HasFullFP16]>;
 
 //===----------------------------------------------------------------------===//
@@ -335,9 +344,9 @@
 
 let TwoOperandAliasConstraint = "$Sn = $Sd" in
 def VADDH : AHbI<0b11100, 0b11, 0, 0,
-                 (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                 (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
                  IIC_fpALU16, "vadd", ".f16\t$Sd, $Sn, $Sm",
-                 []>,
+                 [(set HPR:$Sd, (fadd HPR:$Sn, HPR:$Sm))]>,
             Sched<[WriteFPALU32]>;
 
 let TwoOperandAliasConstraint = "$Dn = $Dd" in
@@ -360,9 +369,9 @@
 
 let TwoOperandAliasConstraint = "$Sn = $Sd" in
 def VSUBH : AHbI<0b11100, 0b11, 1, 0,
-                 (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                 (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
                  IIC_fpALU16, "vsub", ".f16\t$Sd, $Sn, $Sm",
-                 []>,
+                 [(set HPR:$Sd, (fsub HPR:$Sn, HPR:$Sm))]>,
             Sched<[WriteFPALU32]>;
 
 let TwoOperandAliasConstraint = "$Dn = $Dd" in
@@ -659,16 +668,15 @@
 }
 
 // Between half, single and double-precision. For disassembly only.
- -def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), +def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins HPR:$Sm), /* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$Sd, $Sm", - [/* For disassembly only; pattern left blank */]>, + [(set SPR:$Sd, (fpextend HPR:$Sm))]>, Requires<[HasFP16]>, Sched<[WriteFPCVT]>; -def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), +def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs HPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm", - [/* For disassembly only; pattern left blank */]>, + [(set HPR:$Sd, (fpround SPR:$Sm))]>, Requires<[HasFP16]>, Sched<[WriteFPCVT]>; @@ -698,9 +706,10 @@ } def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0, - (outs SPR:$Sd), (ins DPR:$Dm), + (outs HPR:$Sd), (ins DPR:$Dm), NoItinerary, "vcvtb", ".f16.f64\t$Sd, $Dm", - []>, Requires<[HasFPARMv8, HasDPVFP]> { + [(set HPR:$Sd, (fpround DPR:$Dm))]>, + Requires<[HasFPARMv8, HasDPVFP]> { // Instruction operands. bits<5> Sd; bits<5> Dm; @@ -739,15 +748,19 @@ let Inst{5} = Dm{4}; } +// f16 -> f32 conversions +def : Pat<(fp_to_f16 HPR:$a), + (i32 (COPY_TO_REGCLASS HPR:$a, GPR))>; def : Pat<(fp_to_f16 SPR:$a), (i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>; -def : Pat<(fp_to_f16 (f64 DPR:$a)), - (i32 (COPY_TO_REGCLASS (VCVTBDH DPR:$a), GPR))>; - +// f32 -> f16 conversions def : Pat<(f16_to_fp GPR:$a), - (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>; + (VCVTBHS (COPY_TO_REGCLASS GPR:$a, HPR))>; +// f16 <-> f64 conversions +def : Pat<(fp_to_f16 (f64 DPR:$a)), + (i32 (COPY_TO_REGCLASS (VCVTBDH DPR:$a), GPR))>; def : Pat<(f64 (f16_to_fp GPR:$a)), (VCVTBHD (COPY_TO_REGCLASS GPR:$a, SPR))>; @@ -1290,6 +1303,9 @@ let D = VFPNeonA8Domain; } +def : VFPNoNEONPat<(f16 (sint_to_fp GPR:$a)), + (VCVTBSH (VSITOS (COPY_TO_REGCLASS GPR:$a, SPR)))>; + def : VFPNoNEONPat<(f32 (sint_to_fp GPR:$a)), (VSITOS (COPY_TO_REGCLASS GPR:$a, SPR))>; Index: lib/Target/ARM/ARMRegisterInfo.td =================================================================== --- lib/Target/ARM/ARMRegisterInfo.td +++ lib/Target/ARM/ARMRegisterInfo.td @@ -307,6 +307,23 @@ let DiagnosticString = "operand must be a register in range [s0, s31]"; } +// Half-precision (FullFP16) register class. It's exactly the same as the +// single-precision class, using the same S-registers. Each instruction that generates a +// FP16 result writes that to the bottom 16 bits of the associated 32-bit Floating-point +// register and the top 16 bits of the 32-bit floating-point register are written to 0. +// A different register class is added, as opposed to adding f16 to SPR, to avoid +// modifying and adding type information to the rules. 
+def HPR : RegisterClass<"ARM", [f16], 32, (sequence "S%u", 0, 31)> { + let AltOrders = [(add (decimate HPR, 2), SPR), + (add (decimate HPR, 4), + (decimate HPR, 2), + (decimate (rotl HPR, 1), 4), + (decimate (rotl HPR, 1), 2))]; + let AltOrderSelect = [{ + return 1 + MF.getSubtarget().useStride4VFPs(MF); + }]; +} + // Subset of SPR which can be used as a source of NEON scalars for 16-bit // operations def SPR_8 : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 15)> { Index: lib/Target/ARM/Disassembler/ARMDisassembler.cpp =================================================================== --- lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -158,6 +158,8 @@ uint64_t Address, const void *Decoder); static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeHPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder); static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo, @@ -182,6 +184,8 @@ uint64_t Address, const void *Decoder); static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeHPRRegListOperand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); static DecodeStatus DecodeSPRRegListOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val, @@ -996,6 +1000,11 @@ return MCDisassembler::Success; } +static DecodeStatus DecodeHPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + return DecodeSPRRegisterClass(Inst, RegNo, Address, Decoder); +} + static const uint16_t DPRDecoderTable[] = { ARM::D0, ARM::D1, ARM::D2, ARM::D3, ARM::D4, ARM::D5, ARM::D6, ARM::D7, @@ -1253,6 +1262,11 @@ return S; } +static DecodeStatus DecodeHPRRegListOperand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + return DecodeSPRRegListOperand(Inst, Val, Address, Decoder); +} + static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; Index: test/CodeGen/ARM/GlobalISel/arm-unsupported.ll =================================================================== --- test/CodeGen/ARM/GlobalISel/arm-unsupported.ll +++ test/CodeGen/ARM/GlobalISel/arm-unsupported.ll @@ -43,7 +43,7 @@ } define half @test_half(half %a, half %b) { -; CHECK: remark: {{.*}} unable to lower arguments: half (half, half)* +; CHECK: remark: {{.*}} unable to legalize instruction: %{{.}}:_(s16) = G_FADD %{{.}}, %{{.}} ; CHECK-LABEL: warning: Instruction selection used fallback path for test_half %res = fadd half %a, %b ret half %res Index: test/CodeGen/ARM/fp16-args.ll =================================================================== --- test/CodeGen/ARM/fp16-args.ll +++ test/CodeGen/ARM/fp16-args.ll @@ -33,9 +33,7 @@ ; HARD: vcvtb.f32.f16 {{s[0-9]+}}, s0 ; HARD: vadd.f32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} ; HARD: vcvtb.f16.f32 [[SREG:s[0-9]+]], {{s[0-9]+}} -; HARD-NEXT: vmov [[REG0:r[0-9]+]], [[SREG]] -; HARD-NEXT: uxth [[REG1:r[0-9]+]], [[REG0]] -; HARD-NEXT: vmov s0, [[REG1]] +; HARD-NEXT: vcvtb.f32.f16 [[SREG]], [[SREG]] ; CHECK: bx lr } Index: test/CodeGen/ARM/fp16-instructions.ll 
=================================================================== --- /dev/null +++ test/CodeGen/ARM/fp16-instructions.ll @@ -0,0 +1,51 @@ +; RUN: llc < %s -mtriple=arm-none-eabi -float-abi=soft | FileCheck %s --check-prefix=CHECK-SOFT +; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+vfp4 -float-abi=hard | FileCheck %s --check-prefix=CHECK-FP16 +; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+neon,+fullfp16 -float-abi=hard | FileCheck %s --check-prefix=CHECK-FULLFP16 + +define half @Sub(half %a, half %b) local_unnamed_addr { +entry: +;CHECK-SOFT-LABEL: Sub: +;CHECK-SOFT: bl __aeabi_h2f +;CHECK-SOFT: bl __aeabi_h2f +;CHECK-SOFT: bl __aeabi_fsub +;CHECK-SOFT: bl __aeabi_f2h + +;CHECK-FP16-LABEL: Sub: +;CHECK-FP16: vcvtb.f32.f16 s2, s1 +;CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0 +;CHECK-FP16-NEXT: vsub.f32 s0, s0, s2 +;CHECK-FP16-NEXT: vcvtb.f16.f32 s0, s0 +;CHECK-FP16-NEXT: mov pc, lr + +;CHECK-FULLFP16-LABEL: Sub: +;CHECK-FULLFP16: vsub.f16 s0, s0, s1 +;CHECK-FULLFP16-NEXT: mov pc, lr + + %sub = fsub half %a, %b + ret half %sub +} + +define half @Add(half %a, half %b) local_unnamed_addr { +entry: +;CHECK-SOFT-LABEL: Add: +;CHECK-SOFT: bl __aeabi_h2f +;CHECK-SOFT: bl __aeabi_h2f +;CHECK-SOFT: bl __aeabi_fadd +;CHECK-SOFT: bl __aeabi_f2h + +;CHECK-FP16-LABEL: Add: +;CHECK-FP16: vcvtb.f32.f16 s2, s1 +;CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0 +;CHECK-FP16-NEXT: vadd.f32 s0, s0, s2 +;CHECK-FP16-NEXT: vcvtb.f16.f32 s0, s0 +;CHECK-FP16-NEXT: mov pc, lr + +;CHECK-FULLFP16-LABEL: Add: +;CHECK-FULLFP16: vadd.f16 s0, s0, s1 +;CHECK-FULLFP16-NEXT: mov pc, lr + + %add = fadd half %a, %b + ret half %add +} + + Index: test/CodeGen/ARM/fp16-v3.ll =================================================================== --- test/CodeGen/ARM/fp16-v3.ll +++ test/CodeGen/ARM/fp16-v3.ll @@ -11,12 +11,15 @@ ; CHECK: vadd.f32 [[SREG5:s[0-9]+]], [[SREG4]], [[SREG1]] ; CHECK-NEXT: vcvtb.f16.f32 [[SREG6:s[0-9]+]], [[SREG5]] ; CHECK-NEXT: vmov [[RREG1:r[0-9]+]], [[SREG6]] -; CHECK-DAG: uxth [[RREG2:r[0-9]+]], [[RREG1]] -; CHECK-DAG: pkhbt [[RREG3:r[0-9]+]], [[RREG1]], [[RREG1]], lsl #16 +; The next store/load pair, is a result of code generation for a bitcast, +; and is a minor performance regression that needs looking into. +; CHECK-NEXT: strh +; CHECK-NEXT: ldrh [[RREG2:r[0-9]+]], [sp, #2] ; CHECK-DAG: strh [[RREG1]], [r0, #4] +; CHECK-DAG: orr [[RREG3:r[0-9]+]], [[RREG2]], [[RREG2]], lsl #16 ; CHECK-DAG: vmov [[DREG:d[0-9]+]], [[RREG3]], [[RREG2]] ; CHECK-DAG: vst1.32 {[[DREG]][0]}, [r0:32] -; CHECK-NEXT: bx lr +; CHECK: bx lr define void @test_vec3(<3 x half>* %arr, i32 %i) #0 { %H = sitofp i32 %i to half %S = fadd half %H, 0xH4A00