Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -8723,7 +8723,8 @@
     for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
       assert(InVals[i].getNode() &&
              "LowerFormalArguments emitted a null value!");
-      assert(EVT(Ins[i].VT) == InVals[i].getValueType() &&
+      assert((InVals[i].getValueType() == MVT::f16 ||
+              EVT(Ins[i].VT) == InVals[i].getValueType()) &&
              "LowerFormalArguments emitted a value with the wrong type!");
     }
   });
Index: lib/Target/ARM/ARMCallingConv.td
===================================================================
--- lib/Target/ARM/ARMCallingConv.td
+++ lib/Target/ARM/ARMCallingConv.td
@@ -156,6 +156,8 @@
   // Handles byval parameters.
   CCIfByVal>,
 
+  CCIfType<[f16], CCBitConvertToType>,
+
   // The 'nest' parameter, if any, is passed in R12.
   CCIfNest>,

@@ -187,6 +189,9 @@
   CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>,
   CCIfType<[f32], CCBitConvertToType>,
+
+  CCIfType<[f16], CCBitConvertToType>,
+
   CCDelegateTo
 ]>;

@@ -214,8 +219,8 @@
   CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
   CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
-  CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
-                                 S9, S10, S11, S12, S13, S14, S15]>>,
+  CCIfType<[f16, f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
+                                      S9, S10, S11, S12, S13, S14, S15]>>,
   CCDelegateTo
 ]>;

@@ -232,8 +237,8 @@
   CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
   CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
-  CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
-                                 S9, S10, S11, S12, S13, S14, S15]>>,
+  CCIfType<[f16, f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
+                                      S9, S10, S11, S12, S13, S14, S15]>>,
   CCDelegateTo
 ]>;

Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -530,6 +530,83 @@
     addRegisterClass(MVT::f64, &ARM::DPRRegClass);
   }

+  // The hard float ABI defaults to VFP4, which supports the storage-only
+  // half-precision conversion instructions (they are an extension for VFP3).
+  // We make f16 a legal type for this case, and not only when FullFP16 is
+  // supported (Armv8.2-A), for a few reasons. The AAPCS specifies that half
+  // floats sit in the lower 16 bits of the single-precision registers, while
+  // the upper 16 bits contain unspecified values. By making f16 a legal type,
+  // we avoid early legalization of arguments to f32 types, which would result
+  // in incorrectly interpreting the upper 16 bits. Another reason is that this
+  // avoids the obscure FP16_TO_FP and FP_TO_FP16 nodes. Instead, if
+  // instruction operands need promotion to f32 types, the 'normal' FP_EXTEND
+  // and FP_ROUND nodes will be introduced.
+  if (Subtarget->hasFP16() || Subtarget->hasFullFP16()) {
+    addRegisterClass(MVT::f16, &ARM::HPRRegClass);
+  }
+
+  if (!Subtarget->hasFullFP16()) {
+    setOperationAction(ISD::SELECT, MVT::f16, Promote);
+    setOperationAction(ISD::SELECT_CC, MVT::f16, Promote);
+    setOperationAction(ISD::SETCC, MVT::f16, Promote);
+    setOperationAction(ISD::BR_CC, MVT::f16, Promote);
+    setOperationAction(ISD::FADD, MVT::f16, Promote);
+    setOperationAction(ISD::FSUB, MVT::f16, Promote);
+    setOperationAction(ISD::FMUL, MVT::f16, Promote);
+    setOperationAction(ISD::FDIV, MVT::f16, Promote);
+    setOperationAction(ISD::FREM, MVT::f16, Promote);
+    setOperationAction(ISD::FMA, MVT::f16, Promote);
+    setOperationAction(ISD::FNEG, MVT::f16, Promote);
+    setOperationAction(ISD::FABS, MVT::f16, Promote);
+    setOperationAction(ISD::FCEIL, MVT::f16, Promote);
+    setOperationAction(ISD::FSQRT, MVT::f16, Promote);
+    setOperationAction(ISD::FFLOOR, MVT::f16, Promote);
+    setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote);
+    setOperationAction(ISD::FRINT, MVT::f16, Promote);
+    setOperationAction(ISD::FROUND, MVT::f16, Promote);
+    setOperationAction(ISD::FTRUNC, MVT::f16, Promote);
+    setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
+
+    // When we don't have FullFP16 support, and thus don't have FP16 load/store
+    // instructions, we create half-word integer loads/stores instead.
+    //
+    // Input IR like this, for example:
+    //
+    //   %1 = load i16, i16 * ...
+    //   %2 = tail call float @llvm.convert.from.fp16.f32(i16 ...)
+    //   .. = fadd %2 ..
+    //
+    // gets combined very early to f16 loads when f16 types are legal. So we
+    // custom lower these f16 loads and stores, using integer loads and
+    // stores. This matches the storage-only semantics of __fp16, where
+    // arithmetic is done in single-precision, but results are written back to
+    // half-precision. IR like the example above can be generated by use of
+    // __fp16.
+    setOperationAction(ISD::LOAD, MVT::f16, Custom);
+    setOperationAction(ISD::STORE, MVT::f16, Custom);
+
+    // We unfortunately need an FP_TO_FP16 helper node to create a truncating
+    // i32 -> i16 integer store; this node is cleaned up again later, see
+    // LowerFP_TO_FP16.
+    setOperationAction(ISD::FP_TO_FP16, MVT::i32, Custom);
+
+    // Another case arising from the use of __fp16 is passing halves as i16,
+    // i.e. when function arguments are passed as i16, but converted to f32 or
+    // f64 in the function body; an i16 truncate, f16 bitcast, and an FP_EXTEND
+    // are generated. When f16 is not a legal type, the f16 bitcast is
+    // legalized to FP16_TO_FP. But when f16 is a legal type, this does not
+    // happen, and the truncate results in code generation and stack
+    // loads/stores. We want to avoid this, and custom lower the
+    // truncate/bitcast to FP16_TO_FP.
+    if (Subtarget->isTargetHardFloat())
+      setOperationAction(ISD::BITCAST, MVT::i16, Custom);
+  }
+
+  // Create f2h and h2f conversion EABI libcalls.
+  if (!Subtarget->hasFP16()) {
+    setOperationAction(ISD::FP_EXTEND, MVT::f16, Custom);
+    setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
+    setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
+  }
+
   for (MVT VT : MVT::vector_valuetypes()) {
     for (MVT InnerVT : MVT::vector_valuetypes()) {
       setTruncStoreAction(VT, InnerVT, Expand);
@@ -715,6 +792,7 @@
   setTargetDAGCombine(ISD::FP_TO_UINT);
   setTargetDAGCombine(ISD::FDIV);
   setTargetDAGCombine(ISD::LOAD);
+  setTargetDAGCombine(ISD::BITCAST);
 
   // It is legal to extload from v4i8 to v4i16 or v4i32.
   for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
@@ -3699,7 +3777,9 @@
       } else {
         const TargetRegisterClass *RC;
-        if (RegVT == MVT::f32)
+        if (RegVT == MVT::f16) {
+          RC = &ARM::HPRRegClass;
+        } else if (RegVT == MVT::f32)
           RC = &ARM::SPRRegClass;
         else if (RegVT == MVT::f64)
           RC = &ARM::DPRRegClass;
@@ -3723,6 +3803,12 @@
         default: llvm_unreachable("Unknown loc info!");
         case CCValAssign::Full: break;
         case CCValAssign::BCvt:
+          if (Ins[VA.getValNo()].ArgVT == MVT::f16 &&
+              !Subtarget->isTargetHardFloat()) {
+            ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, ArgValue);
+            ArgValue = DAG.getNode(ISD::BITCAST, dl, MVT::f16, ArgValue);
+            break;
+          }
           ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
           break;
         case CCValAssign::SExt:
@@ -4917,8 +5003,68 @@
   // source or destination of the bit convert.
   EVT SrcVT = Op.getValueType();
   EVT DstVT = N->getValueType(0);
-  assert((SrcVT == MVT::i64 || DstVT == MVT::i64) &&
-         "ExpandBITCAST called for non-i64 type");
+
+  if (SrcVT == MVT::i16 && DstVT == MVT::f16) {
+    // Handle @llvm.convert.from.fp16.f64(i16 %in), which generates a DAG like:
+    //
+    //   t2: i32,ch = CopyFromReg t0, ...
+    //   t3: i16 = truncate t2
+    //   t4: f16 = bitcast t3
+    //   t5: f64 = fp_extend t4
+    //
+    // We want to custom lower the truncate->bitcast->fp_extend pattern to
+    // just an fp16_to_fp node:
+    //
+    //   t2: i32,ch = CopyFromReg t0, Register:i32 %vreg0
+    //   tx: f64 = fp16_to_fp t2
+    //
+    // This avoids generating stack loads/stores for the bitcast node, and
+    // thus just generates a mov and a convert.
+    if (Op.getOpcode() != ISD::TRUNCATE)
+      return SDValue();
+
+    auto FPExtend = N->use_begin();
+    if (N->use_size() != 1 || FPExtend->getOpcode() != ISD::FP_EXTEND)
+      return SDValue();
+
+    SDValue Cvt = DAG.getNode(ISD::FP16_TO_FP, SDLoc(Op),
+                              FPExtend->getValueType(0), Op.getOperand(0));
+    DAG.ReplaceAllUsesWith(*FPExtend, Cvt.getNode());
+    return Cvt;
+  }
+
+  if (SrcVT == MVT::f16 && DstVT == MVT::i16) {
+    // Very similarly, for f64 we want to transform:
+    //
+    //   t2: f64,ch = CopyFromReg t0, Register:f64 %vreg0
+    //   t4: f16 = fp_round t2, TargetConstant:i32<0>
+    //   t5: i16 = bitcast t4
+    //   t6: i32 = any_extend t5
+    //
+    // into:
+    //
+    //   t2: f64,ch = CopyFromReg t0, Register:f64 %vreg0
+    //   t13: i32 = fp_to_fp16 t2
+    //   t15: i32 = and t13, Constant:i32<65535>
+
+    if (Op.getOpcode() != ISD::FP_ROUND)
+      return SDValue();
+
+    auto FPAnyExtend = N->use_begin();
+    if (N->use_size() != 1 || FPAnyExtend->getOpcode() != ISD::ANY_EXTEND)
+      return SDValue();
+
+    SDValue Cvt = DAG.getNode(ISD::FP_TO_FP16, SDLoc(Op),
+                              MVT::i32, Op.getOperand(0));
+    SDValue And = DAG.getNode(ISD::AND, SDLoc(Op), MVT::i32, Cvt,
+                              DAG.getConstant(65535, SDLoc(Op), MVT::i32));
+
+    DAG.ReplaceAllUsesWith(*FPAnyExtend, And.getNode());
+    return And;
+  }
+
+  if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
+    return SDValue();
 
   // Turn i64->f64 into VMOVDRR.
   if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
@@ -7780,10 +7926,198 @@
     return !CI.second.getNode() ? DAG.getRoot() : CI.first;
 }
 
+// This is a cleanup for the (corner) case when a load instruction directly
+// feeds a store. For a load -> store chain, when the f16 store is legalized
+// first, we unfortunately need to introduce a helper FP_TO_FP16 node in order
+// to create a truncating i32 -> i16 integer store; this node models a
+// conversion from a float to an integer type, which allows us to create an
+// integer store. This FP_TO_FP16 needs to be cleaned up though, as it should
+// not lead to any code generation.
+// When it is not a load/store chain, there will be f16 data processing
+// instructions between the loads/stores; the f16 operands of those
+// instructions will have been legalized, and FP_EXTEND and FP_ROUND nodes
+// will have been introduced.
+//
+// We want to transform this:
+//
+//   t12: i32,ch = load t0, t2, undef:i32
+//   t10: i32 = fp_to_fp16 t12
+//   t11: ch = store t12:1, t10, t4, undef:i32
+//
+// into:
+//
+//   t12: i32,ch = load t0, t2, undef:i32
+//   t11: ch = store t12:1, t12, t4, undef:i32
+//
+// so that we just generate LDRH and STRH half-word integer loads/stores.
+//
+static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG,
+                               const ARMSubtarget *Subtarget) {
+  if (!Op.hasOneUse())
+    return SDValue();
+
+  auto Use = Op.getNode()->use_begin();
+  if (Use->getOpcode() != ISD::STORE) {
+    DEBUG(dbgs() << "LowerFP_TO_FP16: use not a store, not cleaning it up\n");
+    return SDValue();
+  }
+
+  SDValue Load = Op.getOperand(0);
+  if (Load.getOpcode() != ISD::LOAD) {
+    DEBUG(dbgs() << "LowerFP_TO_FP16: operand not a load, not cleaning it up\n");
+    return SDValue();
+  }
+
+  DAG.ReplaceAllUsesOfValueWith(Op, Load);
+  return Load;
+}
+
+static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG,
+                          const ARMSubtarget *Subtarget) {
+  assert(!Subtarget->hasFullFP16());
+  SDValue F16Op = Op.getOperand(1);
+  assert(F16Op.getValueType() == MVT::f16);
+  SDNode *N = Op.getNode();
+  StoreSDNode *ST = cast<StoreSDNode>(N);
+
+  DEBUG(dbgs() << "Creating truncating i16 store for: "; F16Op.dump());
+  SDValue Fp2fp16 = DAG.getNode(ISD::FP_TO_FP16, SDLoc(Op), MVT::i32, F16Op);
+  SDValue NewST = DAG.getTruncStore(Op.getOperand(0), SDLoc(Op), Fp2fp16,
+                                    ST->getBasePtr(), MVT::i16,
+                                    ST->getMemOperand());
+  DEBUG(dbgs() << "New i16 store: "; NewST.dump());
+  DAG.ReplaceAllUsesOfValueWith(Op, NewST);
+  return NewST;
+}
+
+static SDNode *IsF16LoadStoreChain(SDNode *N) {
+  assert(N->getOpcode() == ISD::LOAD);
+
+  if (N->getNumValues() != 2) {
+    DEBUG(dbgs() << "expecting 2 values\n");
+    return nullptr;
+  }
+
+  if (N->use_size() != 2)
+    return nullptr;
+
+  // We expect the LD node of a LD->ST chain to have 2 uses:
+  //
+  // 1) the bitcast node, which feeds an extend to i32
+  // 2) the ST node
+
+  bool UseIsAStore = false;
+  bool UseIsABitCastAndExtend = false;
+  SDNode *ZEXT = nullptr;
+
+  for (auto U : N->uses()) {
+    switch (U->getOpcode()) {
+    default: return nullptr;
+    case ISD::STORE:
+      DEBUG(dbgs() << "Found a ST as a use: "; U->dump());
+      UseIsAStore = true;
+      continue;
+    case ISD::BITCAST:
+      DEBUG(dbgs() << "Found a BITCAST as a use: "; U->dump());
+      // Bail out if the bitcast has more uses, because then it is
+      // not a simple LD->ST chain.
+      if (!U->hasOneUse())
+        return nullptr;
+      ZEXT = *U->use_begin();
+      if (ZEXT->getOpcode() != ISD::ZERO_EXTEND)
+        return nullptr;
+      UseIsABitCastAndExtend = true;
+      break;
+    }
+  }
+
+  if (!UseIsAStore || !UseIsABitCastAndExtend)
+    return nullptr;
+
+  if (!ZEXT->hasOneUse())
+    return nullptr;
+
+  return ZEXT;
+}
+
+static SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG,
+                         const ARMSubtarget *Subtarget) {
+  assert(!Subtarget->hasFullFP16() && Op.getValueType() == MVT::f16);
+  DEBUG(dbgs() << "Lowering f16 load, creating an i32 load for: "; Op.dump());
+
+  // Input DAG:
+  //
+  //   tx: f16,ch = LD2
+  //   t.: ch = ST2 tx
+  //
+  // If ST2 is legalized first, a bitcast and extend are introduced to create a
+  // truncating integer store:
+  //
+  //   tx: f16 = LD2
+  //   ty: i16 = bitcast tx
+  //   tz: i32 = zero_extend ty
+  //   t.: ch = ST2 tz
+  //
+  // Now we pick up the LD for legalization, and want to create:
+  //
+  //   tx: i32 = LD2
+  //   t.: ch = ST2 tx, ...
+  //
+  // To achieve this, we need to:
+  //
+  // 1) Create the widening i32 LD,
+  // 2) Be careful how we replace nodes:
+  //    uses of tx expect f16 values, so we can't replace uses of 'tx' with
+  //    the new i32 node: there is an f16 <-> i32 type mismatch. What we need
+  //    to do is replace uses of 'tz' with this new node.
+
+  // 1) Create the new i32 load:
+  SDNode *N = Op.getNode();
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  SDValue NewLD = DAG.getLoad(ISD::UNINDEXED, ISD::ZEXTLOAD, MVT::i32,
+                              SDLoc(Op), Op.getOperand(0), LD->getBasePtr(),
+                              LD->getOffset(), MVT::i16, LD->getMemOperand());
+  DEBUG(dbgs() << "New i32 load: "; NewLD.dump());
+
+  // 2) Fix up the DAG.
+  //
+  // Case I: Load -> FP_EXTEND
+  SDNode *FPExtend = nullptr;
+  for (auto U : N->uses()) {
+    if (U->getOpcode() == ISD::FP_EXTEND)
+      FPExtend = U;
+  }
+
+  if (FPExtend != nullptr && !Subtarget->hasFullFP16()) {
+    DEBUG(dbgs() << "Creating i32 -> f32 bitcast\n");
+    SDValue NewBitcast = DAG.getNode(ISD::FP16_TO_FP, SDLoc(Op),
+                                     FPExtend->getValueType(0), NewLD);
+    DAG.ReplaceAllUsesWith(FPExtend, NewBitcast.getNode());
+    return NewLD;
+  }
+
+  // Case II: Load -> Store
+  SDNode *From = IsF16LoadStoreChain(N);
+  if (From) {
+    // Replace the chain.
+    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLD.getValue(1));
+    // Replace the uses of the i32 zero extend.
+    DAG.ReplaceAllUsesWith(From, NewLD.getNode());
+    return NewLD;
+  }
+
+  // Case III: Load -> ret (copytoreg)
+  //
+  // The load feeds a return node, and we don't need special casing
+  // to fix up the uses of the old load node.
+  return NewLD;
+}
+
 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   DEBUG(dbgs() << "Lowering node: "; Op.dump());
   switch (Op.getOpcode()) {
   default: llvm_unreachable("Don't know how to custom lower this!");
+  case ISD::LOAD:          return LowerLOAD(Op, DAG, Subtarget);
+  case ISD::STORE:         return LowerSTORE(Op, DAG, Subtarget);
+  case ISD::FP_TO_FP16:    return LowerFP_TO_FP16(Op, DAG, Subtarget);
   case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
   case ISD::ConstantPool:  return LowerConstantPool(Op, DAG);
   case ISD::BlockAddress:  return LowerBlockAddress(Op, DAG);
@@ -13407,8 +13741,8 @@
 }
 
 SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
-  assert(Op.getValueType() == MVT::f64 && Subtarget->isFPOnlySP() &&
-         "Unexpected type for custom-lowering FP_EXTEND");
+  if (Op.getOperand(0).getValueType() == MVT::i32)
+    return SDValue();
 
   RTLIB::Libcall LC;
   LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
@@ -13419,16 +13753,14 @@
 }
 
 SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
-  assert(Op.getOperand(0).getValueType() == MVT::f64 &&
-         Subtarget->isFPOnlySP() &&
-         "Unexpected type for custom-lowering FP_ROUND");
-
   RTLIB::Libcall LC;
   LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
 
   SDValue SrcVal = Op.getOperand(0);
-  return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
-                     SDLoc(Op)).first;
+  SDValue NewNode = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
+                                /*isSigned*/ false, SDLoc(Op)).first;
+  DEBUG(dbgs() << "New node: "; NewNode.dump());
+  return NewNode;
 }
 
 bool
Index: lib/Target/ARM/ARMInstrVFP.td
===================================================================
--- lib/Target/ARM/ARMInstrVFP.td
+++ lib/Target/ARM/ARMInstrVFP.td
@@ -69,10 +69,19 @@
   let ParserMatchClass = FPImmOperand;
 }
 
+def alignedload16 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 2;
+}]>;
+
 def alignedload32 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
   return cast<LoadSDNode>(N)->getAlignment() >= 4;
 }]>;
 
+def alignedstore16 : PatFrag<(ops node:$val, node:$ptr),
+                             (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() >= 2;
+}]>;
+
 def alignedstore32 : PatFrag<(ops node:$val, node:$ptr),
                              (store node:$val, node:$ptr), [{
   return cast<StoreSDNode>(N)->getAlignment() >= 4;
@@ -113,9 +122,9 @@
   let D = VFPNeonDomain;
 }
 
-def VLDRH : AHI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5fp16:$addr),
+def VLDRH : AHI5<0b1101, 0b01, (outs HPR:$Sd), (ins addrmode5fp16:$addr),
                  IIC_fpLoad16, "vldr", ".16\t$Sd, $addr",
-                 []>,
+                 [(set HPR:$Sd, (alignedload16 addrmode5:$addr))]>,
             Requires<[HasFullFP16]>;
 
 } // End of 'let canFoldAsLoad = 1, isReMaterializable = 1 in'
@@ -132,9 +141,9 @@
   let D = VFPNeonDomain;
 }
 
-def VSTRH : AHI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5fp16:$addr),
+def VSTRH : AHI5<0b1101, 0b00, (outs), (ins HPR:$Sd, addrmode5fp16:$addr),
                  IIC_fpStore16, "vstr", ".16\t$Sd, $addr",
-                 []>,
+                 [(alignedstore16 HPR:$Sd, addrmode5:$addr)]>,
             Requires<[HasFullFP16]>;
 
 //===----------------------------------------------------------------------===//
@@ -355,9 +364,9 @@
 let TwoOperandAliasConstraint = "$Sn = $Sd" in
 def VADDH  : AHbI<0b11100, 0b11, 0, 0,
-                  (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                  (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
                   IIC_fpALU16, "vadd", ".f16\t$Sd, $Sn, $Sm",
-                  []>,
+                  [(set HPR:$Sd, (fadd HPR:$Sn, HPR:$Sm))]>,
              Sched<[WriteFPALU32]>;
 
 let TwoOperandAliasConstraint = "$Dn = $Dd" in
@@ -380,9 +389,9 @@
 let TwoOperandAliasConstraint = "$Sn = $Sd" in
 def VSUBH  : AHbI<0b11100, 0b11, 1, 0,
-                  (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                  (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
                   IIC_fpALU16, "vsub", ".f16\t$Sd, $Sn, $Sm",
-                  []>,
+                  [(set HPR:$Sd, (fsub HPR:$Sn, HPR:$Sm))]>,
             Sched<[WriteFPALU32]>;
 
 let TwoOperandAliasConstraint = "$Dn = $Dd" in
@@ -679,16 +688,15 @@
 }
 
 // Between half, single and double-precision. For disassembly only.
-
-def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
+def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins HPR:$Sm),
                  /* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$Sd, $Sm",
-                 [/* For disassembly only; pattern left blank */]>,
+                 [(set SPR:$Sd, (fpextend HPR:$Sm))]>,
                  Requires<[HasFP16]>,
              Sched<[WriteFPCVT]>;
 
-def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
+def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs HPR:$Sd), (ins SPR:$Sm),
                  /* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm",
-                 [/* For disassembly only; pattern left blank */]>,
+                 [(set HPR:$Sd, (fpround SPR:$Sm))]>,
                  Requires<[HasFP16]>,
             Sched<[WriteFPCVT]>;
 
@@ -718,9 +726,10 @@
 }
 
 def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0,
-                   (outs SPR:$Sd), (ins DPR:$Dm),
+                   (outs HPR:$Sd), (ins DPR:$Dm),
                    NoItinerary, "vcvtb", ".f16.f64\t$Sd, $Dm",
-                   []>, Requires<[HasFPARMv8, HasDPVFP]> {
+                   [(set HPR:$Sd, (fpround DPR:$Dm))]>,
+              Requires<[HasFPARMv8, HasDPVFP]> {
   // Instruction operands.
   bits<5> Sd;
   bits<5> Dm;
@@ -759,15 +768,20 @@
     let Inst{5} = Dm{4};
 }
 
+
+// f32 -> f16 conversions
+def : Pat<(fp_to_f16 HPR:$a),
+          (i32 (COPY_TO_REGCLASS HPR:$a, GPR))>;
 def : Pat<(fp_to_f16 SPR:$a),
           (i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>;
 
-def : Pat<(fp_to_f16 (f64 DPR:$a)),
-          (i32 (COPY_TO_REGCLASS (VCVTBDH DPR:$a), GPR))>;
-
+// f16 -> f32 conversions
 def : Pat<(f16_to_fp GPR:$a),
-          (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>;
+          (VCVTBHS (COPY_TO_REGCLASS GPR:$a, HPR))>;
 
+// f16 <-> f64 conversions
+def : Pat<(fp_to_f16 (f64 DPR:$a)),
+          (i32 (COPY_TO_REGCLASS (VCVTBDH DPR:$a), GPR))>;
 def : Pat<(f64 (f16_to_fp GPR:$a)),
           (VCVTBHD (COPY_TO_REGCLASS GPR:$a, SPR))>;
 
Index: lib/Target/ARM/ARMRegisterInfo.td
===================================================================
--- lib/Target/ARM/ARMRegisterInfo.td
+++ lib/Target/ARM/ARMRegisterInfo.td
@@ -307,6 +307,23 @@
   let DiagnosticString = "operand must be a register in range [s0, s31]";
 }
 
+// Half-precision (FullFP16) register class. It is exactly the same as the
+// single-precision class, using the same S-registers. Each instruction that
+// generates an FP16 result writes it to the bottom 16 bits of the associated
+// 32-bit floating-point register, and the top 16 bits of that register are
+// written to 0. A different register class is added, as opposed to adding f16
+// to SPR, to avoid modifying and adding type information to the existing
+// selection rules.
+def HPR : RegisterClass<"ARM", [f16], 32, (sequence "S%u", 0, 31)> {
+  let AltOrders = [(add (decimate HPR, 2), SPR),
+                   (add (decimate HPR, 4),
+                        (decimate HPR, 2),
+                        (decimate (rotl HPR, 1), 4),
+                        (decimate (rotl HPR, 1), 2))];
+  let AltOrderSelect = [{
+    return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs(MF);
+  }];
+}
+
 // Subset of SPR which can be used as a source of NEON scalars for 16-bit
 // operations
 def SPR_8 : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 15)> {
Index: lib/Target/ARM/Disassembler/ARMDisassembler.cpp
===================================================================
--- lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -158,6 +158,8 @@
                                            uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo,
                                            uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeHPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                           uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo,
                                            uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo,
                                            uint64_t Address, const void *Decoder);
@@ -182,6 +184,8 @@
                                            uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Val,
                                          uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeHPRRegListOperand(MCInst &Inst, unsigned Val,
+                                            uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeSPRRegListOperand(MCInst &Inst, unsigned Val,
                                             uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val,
                                             uint64_t Address, const void *Decoder);
@@ -996,6 +1000,11 @@
   return MCDisassembler::Success;
 }
 
+static DecodeStatus DecodeHPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                           uint64_t Address, const void *Decoder) {
+  return DecodeSPRRegisterClass(Inst, RegNo, Address, Decoder);
+}
+
 static const uint16_t DPRDecoderTable[] = {
     ARM::D0, ARM::D1, ARM::D2, ARM::D3,
     ARM::D4, ARM::D5, ARM::D6, ARM::D7,
@@ -1253,6 +1262,11 @@
   return S;
 }
 
+static DecodeStatus DecodeHPRRegListOperand(MCInst &Inst, unsigned Val,
+                                            uint64_t Address, const void *Decoder) {
+  return DecodeSPRRegListOperand(Inst, Val, Address, Decoder);
+}
+
 static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val,
                                             uint64_t Address, const void *Decoder) {
   DecodeStatus S = MCDisassembler::Success;
Index: test/CodeGen/ARM/fp16-instructions.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/fp16-instructions.ll
@@ -0,0 +1,51 @@
+; RUN: llc < %s -mtriple=arm-none-eabi -float-abi=soft | FileCheck %s --check-prefix=CHECK-SOFT
+; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+vfp4 -float-abi=hard | FileCheck %s --check-prefix=CHECK-FP16
+; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+neon,+fullfp16 -float-abi=hard | FileCheck %s --check-prefix=CHECK-FULLFP16
+
+define half @Sub(half %a, half %b) local_unnamed_addr {
+entry:
+;CHECK-SOFT-LABEL: Sub:
+;CHECK-SOFT: bl __aeabi_h2f
+;CHECK-SOFT: bl __aeabi_h2f
+;CHECK-SOFT: bl __aeabi_fsub
+;CHECK-SOFT: bl __aeabi_f2h
+
+;CHECK-FP16-LABEL: Sub:
+;CHECK-FP16:      vcvtb.f32.f16 s2, s1
+;CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0
+;CHECK-FP16-NEXT: vsub.f32 s0, s0, s2
+;CHECK-FP16-NEXT: vcvtb.f16.f32 s0, s0
+;CHECK-FP16-NEXT: mov pc, lr
+
+;CHECK-FULLFP16-LABEL: Sub:
+;CHECK-FULLFP16:      vsub.f16 s0, s0, s1
+;CHECK-FULLFP16-NEXT: mov pc, lr
+
+  %sub = fsub half %a, %b
+  ret half %sub
+}
+
+define half @Add(half %a, half %b) local_unnamed_addr {
+entry:
+;CHECK-SOFT-LABEL: Add:
+;CHECK-SOFT: bl __aeabi_h2f
+;CHECK-SOFT: bl __aeabi_h2f
+;CHECK-SOFT: bl __aeabi_fadd
+;CHECK-SOFT: bl __aeabi_f2h
+
+;CHECK-FP16-LABEL: Add:
+;CHECK-FP16:      vcvtb.f32.f16 s2, s1
+;CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0
+;CHECK-FP16-NEXT: vadd.f32 s0, s0, s2
+;CHECK-FP16-NEXT: vcvtb.f16.f32 s0, s0
+;CHECK-FP16-NEXT: mov pc, lr
+
+;CHECK-FULLFP16-LABEL: Add:
+;CHECK-FULLFP16:      vadd.f16 s0, s0, s1
+;CHECK-FULLFP16-NEXT: mov pc, lr
+
+  %add = fadd half %a, %b
+  ret half %add
+}
+
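Not part of the patch, just for illustration: two minimal IR sketches of the cases the new custom lowering targets. The first is the load -> store chain handled by LowerLOAD/LowerSTORE/LowerFP_TO_FP16 (per the comments above, without FullFP16 this should come out as plain half-word integer loads/stores rather than h2f/f2h conversions); the second is the __fp16-passed-as-i16 pattern that the new i16 BITCAST custom lowering rewrites to fp16_to_fp. The function names and the RUN line below are illustrative assumptions, not taken from the patch or its tests.

; Illustrative sketch only -- assumed RUN line mirroring the CHECK-FP16
; configuration above.
; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+vfp4 -float-abi=hard | FileCheck %s

; Case 1: a half load that directly feeds a half store (the LD->ST chain
; recognised by IsF16LoadStoreChain); per the comments in LowerLOAD/LowerSTORE,
; this is expected to become an ldrh/strh pair.
define void @CopyHalf(half* %src, half* %dst) {
entry:
  %val = load half, half* %src
  store half %val, half* %dst
  ret void
}

; Case 2: __fp16 passed as i16 and widened in the function body, which
; produces the truncate/bitcast/fp_extend pattern discussed in ExpandBITCAST.
declare double @llvm.convert.from.fp16.f64(i16)

define double @HalfArgToDouble(i16 %h) {
entry:
  %d = call double @llvm.convert.from.fp16.f64(i16 %h)
  ret double %d
}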