Index: lib/Target/ARM/ARMBaseInstrInfo.cpp
===================================================================
--- lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -2409,6 +2409,14 @@
       NumBits = 8;
       Scale = 4;
       break;
+    case ARMII::AddrMode5FP16:
+      ImmIdx = FrameRegIdx+1;
+      InstrOffs = ARM_AM::getAM5Offset(MI.getOperand(ImmIdx).getImm());
+      if (ARM_AM::getAM5Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+        InstrOffs *= -1;
+      NumBits = 8;
+      Scale = 2;
+      break;
     default:
       llvm_unreachable("Unsupported addressing mode!");
     }
Index: lib/Target/ARM/ARMCallingConv.td
===================================================================
--- lib/Target/ARM/ARMCallingConv.td
+++ lib/Target/ARM/ARMCallingConv.td
@@ -156,6 +156,8 @@
   // Handles byval parameters.
   CCIfByVal<CCPassByVal<4, 4>>,
 
+  CCIfType<[f16], CCBitConvertToType<f32>>,
+
   // The 'nest' parameter, if any, is passed in R12.
   CCIfNest<CCAssignToReg<[R12]>>,
 
@@ -187,6 +189,9 @@
   CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>,
   CCIfType<[f32], CCBitConvertToType<i32>>,
+
+  CCIfType<[f16], CCBitConvertToType<f32>>,
+
   CCDelegateTo<RetCC_ARM_AAPCS_Common>
 ]>;
 
@@ -214,8 +219,8 @@
   CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
   CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
-  CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
-                                 S9, S10, S11, S12, S13, S14, S15]>>,
+  CCIfType<[f16, f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
+                                      S9, S10, S11, S12, S13, S14, S15]>>,
   CCDelegateTo<CC_ARM_AAPCS_Common>
 ]>;
 
@@ -232,8 +237,8 @@
   CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
   CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
-  CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
-                                 S9, S10, S11, S12, S13, S14, S15]>>,
+  CCIfType<[f16, f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
+                                      S9, S10, S11, S12, S13, S14, S15]>>,
   CCDelegateTo<RetCC_ARM_AAPCS_Common>
 ]>;
Index: lib/Target/ARM/ARMISelDAGToDAG.cpp
===================================================================
--- lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -118,8 +118,8 @@
                              SDValue &Offset, SDValue &Opc);
   bool SelectAddrMode3Offset(SDNode *Op, SDValue N,
                              SDValue &Offset, SDValue &Opc);
-  bool SelectAddrMode5(SDValue N, SDValue &Base,
-                       SDValue &Offset);
+  bool SelectAddrMode5(SDValue N, SDValue &Base, SDValue &Offset);
+  bool SelectAddrMode5FP16(SDValue N, SDValue &Base, SDValue &Offset);
   bool SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,SDValue &Align);
   bool SelectAddrMode6Offset(SDNode *Op, SDValue N, SDValue &Offset);
 
@@ -932,6 +932,53 @@
   return true;
 }
 
+bool ARMDAGToDAGISel::SelectAddrMode5FP16(SDValue N,
+                                          SDValue &Base, SDValue &Offset) {
+  if (!CurDAG->isBaseWithConstantOffset(N)) {
+    Base = N;
+    if (N.getOpcode() == ISD::FrameIndex) {
+      int FI = cast<FrameIndexSDNode>(N)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(
+          FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+    } else if (N.getOpcode() == ARMISD::Wrapper &&
+               N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress &&
+               N.getOperand(0).getOpcode() != ISD::TargetExternalSymbol &&
+               N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) {
+      Base = N.getOperand(0);
+    }
+    Offset = CurDAG->getTargetConstant(ARM_AM::getAM5FP16Opc(ARM_AM::add, 0),
+                                       SDLoc(N), MVT::i32);
+    return true;
+  }
+
+  // If the RHS is +/- imm8, fold into addr mode.
+  int RHSC;
+  if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/2,
+                              -256 + 1, 256, RHSC)) {
+    Base = N.getOperand(0);
+    if (Base.getOpcode() == ISD::FrameIndex) {
+      int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(
+          FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+    }
+
+    ARM_AM::AddrOpc AddSub = ARM_AM::add;
+    if (RHSC < 0) {
+      AddSub = ARM_AM::sub;
+      RHSC = -RHSC;
+    }
+    Offset = CurDAG->getTargetConstant(ARM_AM::getAM5FP16Opc(AddSub, RHSC),
+                                       SDLoc(N), MVT::i32);
+    return true;
+  }
+
+  Base = N;
+  Offset = CurDAG->getTargetConstant(ARM_AM::getAM5FP16Opc(ARM_AM::add, 0),
+                                     SDLoc(N), MVT::i32);
+  return true;
+}
+
 bool ARMDAGToDAGISel::SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,
                                       SDValue &Align) {
   Addr = N;
Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -522,6 +522,13 @@
     addRegisterClass(MVT::f64, &ARM::DPRRegClass);
   }
 
+  if (Subtarget->hasFullFP16()) {
+    addRegisterClass(MVT::f16, &ARM::HPRRegClass);
+    // Clean up bitcasts of incoming arguments if the hard-float ABI is
+    // enabled.
+    if (Subtarget->isTargetHardFloat())
+      setOperationAction(ISD::BITCAST, MVT::i16, Custom);
+  }
+
   for (MVT VT : MVT::vector_valuetypes()) {
     for (MVT InnerVT : MVT::vector_valuetypes()) {
       setTruncStoreAction(VT, InnerVT, Expand);
@@ -2473,12 +2480,37 @@
     assert(VA.isRegLoc() && "Can only return in registers!");
 
     SDValue Arg = OutVals[realRVLocIdx];
+    bool ReturnF16 = false;
+
+    if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) {
+      // Half-precision return values can be returned like this:
+      //
+      // t11: f16 = fadd ...
+      // t12: i16 = bitcast t11
+      // t13: i32 = zero_extend t12
+      // t14: f32 = bitcast t13
+      //
+      // To avoid code generation for the bitcasts, we simply set Arg to the
+      // node that produces the f16 value, t11 in this case.
+      //
+      if (Arg.getValueType() == MVT::f32) {
+        SDValue ZE = Arg.getOperand(0);
+        if (ZE.getOpcode() == ISD::ZERO_EXTEND &&
+            ZE.getValueType() == MVT::i32) {
+          SDValue BC = ZE.getOperand(0);
+          if (BC.getOpcode() == ISD::BITCAST &&
+              BC.getValueType() == MVT::i16) {
+            Arg = BC.getOperand(0);
+            ReturnF16 = true;
+          }
+        }
+      }
+    }
 
     switch (VA.getLocInfo()) {
     default: llvm_unreachable("Unknown loc info!");
     case CCValAssign::Full: break;
     case CCValAssign::BCvt:
-      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
+      if (!ReturnF16)
+        Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
       break;
     }
 
@@ -2526,7 +2558,8 @@
     // Guarantee that all emitted copies are
     // stuck together, avoiding something bad.
     Flag = Chain.getValue(1);
-    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(),
+                                     ReturnF16 ? MVT::f16 : VA.getLocVT()));
   }
   const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
   const MCPhysReg *I =
@@ -3683,7 +3716,10 @@
       } else {
         const TargetRegisterClass *RC;
-        if (RegVT == MVT::f32)
+
+        if (RegVT == MVT::f16)
+          RC = &ARM::HPRRegClass;
+        else if (RegVT == MVT::f32)
           RC = &ARM::SPRRegClass;
         else if (RegVT == MVT::f64)
           RC = &ARM::DPRRegClass;
@@ -4899,7 +4935,8 @@
 /// use a VMOVDRR or VMOVRRD node.  This should not be done when the non-i64
 /// operand type is illegal (e.g., v2f32 for a target that doesn't support
 /// vectors), since the legalizer won't know what to do with that.
-static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
+static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
+                             const ARMSubtarget *Subtarget) {
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDLoc dl(N);
   SDValue Op = N->getOperand(0);
@@ -4908,6 +4945,37 @@
   // source or destination of the bit convert.
   EVT SrcVT = Op.getValueType();
   EVT DstVT = N->getValueType(0);
+
+  // Half-precision arguments can be passed in like this:
+  //
+  // t4: f32,ch = CopyFromReg t0, Register:f32 %0
+  // t8: i32 = bitcast t4
+  // t9: i16 = truncate t8
+  // t10: f16 = bitcast t9 <~~~~ SDNode N
+  //
+  // but we want to avoid code generation for the bitcast, so transform this
+  // into:
+  //
+  // t18: f16 = CopyFromReg t0, Register:f32 %0
+  //
+  if (SrcVT == MVT::i16 && DstVT == MVT::f16) {
+    if (Op.getOpcode() != ISD::TRUNCATE)
+      return SDValue();
+
+    SDValue Bitcast = Op.getOperand(0);
+    if (Bitcast.getOpcode() != ISD::BITCAST ||
+        Bitcast.getValueType() != MVT::i32)
+      return SDValue();
+
+    SDValue Copy = Bitcast.getOperand(0);
+    if (Copy.getOpcode() != ISD::CopyFromReg ||
+        Copy.getValueType() != MVT::f32)
+      return SDValue();
+
+    SDValue Ops[] = { Copy->getOperand(0), Copy->getOperand(1) };
+    return DAG.getNode(ISD::CopyFromReg, SDLoc(Copy), MVT::f16, Ops);
+  }
+
   assert((SrcVT == MVT::i64 || DstVT == MVT::i64) &&
          "ExpandBITCAST called for non-i64 type");
 
@@ -7799,7 +7867,7 @@
   case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
                                                                Subtarget);
-  case ISD::BITCAST:   return ExpandBITCAST(Op.getNode(), DAG);
+  case ISD::BITCAST:   return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
   case ISD::SHL:
   case ISD::SRL:
   case ISD::SRA:       return LowerShift(Op.getNode(), DAG, Subtarget);
@@ -7901,7 +7969,7 @@
     ExpandREAD_REGISTER(N, Results, DAG);
     break;
   case ISD::BITCAST:
-    Res = ExpandBITCAST(N, DAG);
+    Res = ExpandBITCAST(N, DAG, Subtarget);
     break;
   case ISD::SRL:
   case ISD::SRA:
Index: lib/Target/ARM/ARMInstrFormats.td
===================================================================
--- lib/Target/ARM/ARMInstrFormats.td
+++ lib/Target/ARM/ARMInstrFormats.td
@@ -108,6 +108,7 @@
 def AddrModeT2_pc   : AddrMode<14>;
 def AddrModeT2_i8s4 : AddrMode<15>;
 def AddrMode_i12    : AddrMode<16>;
+def AddrMode5FP16   : AddrMode<17>;
 
 // Load / store index mode.
 class IndexMode<bits<2> val> {
@@ -1527,7 +1528,7 @@
 class AHI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
            InstrItinClass itin,
            string opc, string asm, list<dag> pattern>
-  : VFPI<oops, iops, AddrMode5, 4, IndexModeNone,
+  : VFPI<oops, iops, AddrMode5FP16, 4, IndexModeNone,
          VFPLdStFrm, itin, opc, asm, "", pattern> {
   list<Predicate> Predicates = [HasFullFP16];
Index: lib/Target/ARM/ARMInstrVFP.td
===================================================================
--- lib/Target/ARM/ARMInstrVFP.td
+++ lib/Target/ARM/ARMInstrVFP.td
@@ -69,10 +69,19 @@
   let ParserMatchClass = FPImmOperand;
 }
 
+def alignedload16 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 2;
+}]>;
+
 def alignedload32 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
   return cast<LoadSDNode>(N)->getAlignment() >= 4;
 }]>;
 
+def alignedstore16 : PatFrag<(ops node:$val, node:$ptr),
+                             (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() >= 2;
+}]>;
+
 def alignedstore32 : PatFrag<(ops node:$val, node:$ptr),
                              (store node:$val, node:$ptr), [{
   return cast<StoreSDNode>(N)->getAlignment() >= 4;
@@ -113,9 +122,9 @@
   let D = VFPNeonDomain;
 }
 
-def VLDRH : AHI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5fp16:$addr),
+def VLDRH : AHI5<0b1101, 0b01, (outs HPR:$Sd), (ins addrmode5fp16:$addr),
                  IIC_fpLoad16, "vldr", ".16\t$Sd, $addr",
-                 []>,
+                 [(set HPR:$Sd, (alignedload16 addrmode5fp16:$addr))]>,
             Requires<[HasFullFP16]>;
 
 } // End of 'let canFoldAsLoad = 1, isReMaterializable = 1 in'
@@ -132,9 +141,9 @@
   let D = VFPNeonDomain;
 }
 
-def VSTRH : AHI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5fp16:$addr),
+def VSTRH : AHI5<0b1101, 0b00, (outs), (ins HPR:$Sd, addrmode5fp16:$addr),
                  IIC_fpStore16, "vstr", ".16\t$Sd, $addr",
-                 []>,
+                 [(alignedstore16 HPR:$Sd, addrmode5fp16:$addr)]>,
             Requires<[HasFullFP16]>;
 
 //===----------------------------------------------------------------------===//
@@ -335,9 +344,9 @@
 
 let TwoOperandAliasConstraint = "$Sn = $Sd" in
 def VADDH  : AHbI<0b11100, 0b11, 0, 0,
-                  (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                  (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
                   IIC_fpALU16, "vadd", ".f16\t$Sd, $Sn, $Sm",
-                  []>,
+                  [(set HPR:$Sd, (fadd HPR:$Sn, HPR:$Sm))]>,
              Sched<[WriteFPALU32]>;
 
 let TwoOperandAliasConstraint = "$Dn = $Dd" in
@@ -360,9 +369,9 @@
 
 let TwoOperandAliasConstraint = "$Sn = $Sd" in
 def VSUBH  : AHbI<0b11100, 0b11, 1, 0,
-                  (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                  (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
                   IIC_fpALU16, "vsub", ".f16\t$Sd, $Sn, $Sm",
-                  []>,
+                  [(set HPR:$Sd, (fsub HPR:$Sn, HPR:$Sm))]>,
             Sched<[WriteFPALU32]>;
 
 let TwoOperandAliasConstraint = "$Dn = $Dd" in
@@ -658,17 +667,19 @@
   let Predicates = [HasVFP2, HasDPVFP];
 }
 
-// Between half, single and double-precision.  For disassembly only.
-
+// Between half, single and double-precision.
 def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
                  /* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$Sd, $Sm",
-                 [/* For disassembly only; pattern left blank */]>,
+                 [/* intentionally left blank, see the fpextend pattern below */]>,
                  Requires<[HasFP16]>,
              Sched<[WriteFPCVT]>;
 
+def : Pat<(f32 (fpextend HPR:$Sm)),
+          (VCVTBHS (COPY_TO_REGCLASS HPR:$Sm, SPR))>;
+
 def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
                  /* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm",
-                 [/* For disassembly only; pattern left blank */]>,
+                 []>,
                  Requires<[HasFP16]>,
              Sched<[WriteFPCVT]>;
 
@@ -698,9 +709,12 @@
 }
 
 def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0,
+                   //(outs HPR:$Sd), (ins DPR:$Dm),
                    (outs SPR:$Sd), (ins DPR:$Dm),
                    NoItinerary, "vcvtb", ".f16.f64\t$Sd, $Dm",
-                   []>, Requires<[HasFPARMv8, HasDPVFP]> {
+                   //[(set HPR:$Sd, (fpround DPR:$Dm))]>,
+                   [(set SPR:$Sd, (fpround DPR:$Dm))]>,
+                   Requires<[HasFPARMv8, HasDPVFP]> {
   // Instruction operands.
   bits<5> Sd;
   bits<5> Dm;
Index: lib/Target/ARM/ARMRegisterInfo.td
===================================================================
--- lib/Target/ARM/ARMRegisterInfo.td
+++ lib/Target/ARM/ARMRegisterInfo.td
@@ -307,6 +307,18 @@
   let DiagnosticString = "operand must be a register in range [s0, s31]";
 }
 
+def HPR : RegisterClass<"ARM", [f16], 32, (sequence "S%u", 0, 31)> {
+  let AltOrders = [(add (decimate HPR, 2), SPR),
+                   (add (decimate HPR, 4),
+                        (decimate HPR, 2),
+                        (decimate (rotl HPR, 1), 4),
+                        (decimate (rotl HPR, 1), 2))];
+  let AltOrderSelect = [{
+    return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs(MF);
+  }];
+  let DiagnosticString = "operand must be a register in range [s0, s31]";
+}
+
 // Subset of SPR which can be used as a source of NEON scalars for 16-bit
 // operations
 def SPR_8 : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 15)> {
Index: lib/Target/ARM/Disassembler/ARMDisassembler.cpp
===================================================================
--- lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -158,6 +158,8 @@
                                                uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo,
                                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeHPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                           uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo,
                                            uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo,
                                            uint64_t Address, const void *Decoder);
@@ -996,6 +998,11 @@
   return MCDisassembler::Success;
 }
 
+static DecodeStatus DecodeHPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  return DecodeSPRRegisterClass(Inst, RegNo, Address, Decoder);
+}
+
 static const uint16_t DPRDecoderTable[] = {
   ARM::D0, ARM::D1, ARM::D2, ARM::D3,
   ARM::D4, ARM::D5, ARM::D6, ARM::D7,
Index: lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
===================================================================
--- lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
+++ lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
@@ -186,7 +186,8 @@
     AddrModeT2_so   = 13,
     AddrModeT2_pc   = 14, // +/- i12 for pc relative data
     AddrModeT2_i8s4 = 15, // i8 * 4
-    AddrMode_i12    = 16
+    AddrMode_i12    = 16,
+    AddrMode5FP16   = 17  // i8 * 2
   };
 
   inline static const char *AddrModeToString(AddrMode addrmode) {
@@ -197,6 +198,7 @@
     case AddrMode3:         return "AddrMode3";
     case AddrMode4:         return "AddrMode4";
     case AddrMode5:         return "AddrMode5";
+    case AddrMode5FP16:     return "AddrMode5FP16";
     case AddrMode6:         return "AddrMode6";
     case AddrModeT1_1:      return "AddrModeT1_1";
     case AddrModeT1_2:      return "AddrModeT1_2";
@@ -268,7 +270,7 @@
 
     //===------------------------------------------------------------------===//
     // This four-bit field describes the addressing mode used.
-    AddrModeMask  = 0x1f, // The AddrMode enums are declared in ARMBaseInfo.h
+    AddrModeMask  = 0x3f, // The AddrMode enums are declared in ARMBaseInfo.h
 
     // IndexMode - Unindex, pre-indexed, or post-indexed are valid for load
    // and store ops only.  Generic "updating" flag is used for ld/st multiple.
Index: test/CodeGen/ARM/GlobalISel/arm-unsupported.ll
===================================================================
--- test/CodeGen/ARM/GlobalISel/arm-unsupported.ll
+++ test/CodeGen/ARM/GlobalISel/arm-unsupported.ll
@@ -43,7 +43,7 @@
 }
 
 define half @test_half(half %a, half %b) {
-; CHECK: remark: {{.*}} unable to lower arguments: half (half, half)*
+; CHECK: remark: {{.*}} unable to legalize instruction: %{{.}}:_(s16) = G_FADD %{{.}}, %{{.}}
 ; CHECK-LABEL: warning: Instruction selection used fallback path for test_half
   %res = fadd half %a, %b
   ret half %res
Index: test/CodeGen/ARM/fp16-instructions.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/fp16-instructions.ll
@@ -0,0 +1,72 @@
+; SOFT:
+; RUN: llc < %s -mtriple=arm-none-eabi -float-abi=soft | FileCheck %s --check-prefix=CHECK-SOFT
+
+; SOFTFP:
+; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+vfp3 | FileCheck %s --check-prefix=CHECK-SOFTFP-VFP3
+; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+vfp4 | FileCheck %s --check-prefix=CHECK-SOFTFP-FP16
+; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+fullfp16 | FileCheck %s --check-prefix=CHECK-SOFTFP-FULLFP16
+
+; HARD:
+; RUN: llc < %s -mtriple=arm-none-eabihf -mattr=+vfp3 | FileCheck %s --check-prefix=CHECK-HARDFP-VFP3
+; RUN: llc < %s -mtriple=arm-none-eabihf -mattr=+vfp4 | FileCheck %s --check-prefix=CHECK-HARDFP-FP16
+; RUN: llc < %s -mtriple=arm-none-eabihf -mattr=+fullfp16 | FileCheck %s --check-prefix=CHECK-HARDFP-FULLFP16
+
+define float @Add(float %a.coerce, float %b.coerce) local_unnamed_addr {
+entry:
+  %0 = bitcast float %a.coerce to i32
+  %tmp.0.extract.trunc = trunc i32 %0 to i16
+  %1 = bitcast i16 %tmp.0.extract.trunc to half
+  %2 = bitcast float %b.coerce to i32
+  %tmp1.0.extract.trunc = trunc i32 %2 to i16
+  %3 = bitcast i16 %tmp1.0.extract.trunc to half
+  %add = fadd half %1, %3
+  %4 = bitcast half %add to i16
+  %tmp4.0.insert.ext = zext i16 %4 to i32
+  %5 = bitcast i32 %tmp4.0.insert.ext to float
+  ret float %5
+
+; CHECK-SOFT: bl __aeabi_h2f
+; CHECK-SOFT: bl __aeabi_h2f
+; CHECK-SOFT: bl __aeabi_fadd
+; CHECK-SOFT: bl __aeabi_f2h
+
+; CHECK-SOFTFP-VFP3: bl __aeabi_h2f
+; CHECK-SOFTFP-VFP3: bl __aeabi_h2f
+; CHECK-SOFTFP-VFP3: vadd.f32
+; CHECK-SOFTFP-VFP3: bl __aeabi_f2h
+
+; CHECK-SOFTFP-FP16: vmov [[S2:s[0-9]]], r1
+; CHECK-SOFTFP-FP16: vmov [[S0:s[0-9]]], r0
+; CHECK-SOFTFP-FP16: vcvtb.f32.f16 [[S2]], [[S2]]
+; CHECK-SOFTFP-FP16: vcvtb.f32.f16 [[S0]], [[S0]]
+; CHECK-SOFTFP-FP16: vadd.f32 [[S0]], [[S0]], [[S2]]
+; CHECK-SOFTFP-FP16: vcvtb.f16.f32 [[S0]], [[S0]]
+; CHECK-SOFTFP-FP16: vmov r0, s0
+
+; CHECK-SOFTFP-FULLFP16: strh r1, {{.*}}
+; CHECK-SOFTFP-FULLFP16: strh r0, {{.*}}
+; CHECK-SOFTFP-FULLFP16: vldr.16 [[S0:s[0-9]]], {{.*}}
+; CHECK-SOFTFP-FULLFP16: vldr.16 [[S2:s[0-9]]], {{.*}}
+; CHECK-SOFTFP-FULLFP16: vadd.f16 [[S0]], [[S2]], [[S0]]
+; CHECK-SOFTFP-FULLFP16: vstr.16 [[S2:s[0-9]]], {{.*}}
+; CHECK-SOFTFP-FULLFP16: ldrh r0, {{.*}}
+; CHECK-SOFTFP-FULLFP16: mov pc, lr
+
+; CHECK-HARDFP-VFP3: vmov r{{.}}, s0
+; CHECK-HARDFP-VFP3: vmov{{.*}}, s1
+; CHECK-HARDFP-VFP3: bl __aeabi_h2f
+; CHECK-HARDFP-VFP3: bl __aeabi_h2f
+; CHECK-HARDFP-VFP3: vadd.f32
+; CHECK-HARDFP-VFP3: bl __aeabi_f2h
+; CHECK-HARDFP-VFP3: vmov s0, r0
+
+; CHECK-HARDFP-FP16: vcvtb.f32.f16 [[S2:s[0-9]]], s1
+; CHECK-HARDFP-FP16: vcvtb.f32.f16 [[S0:s[0-9]]], s0
+; CHECK-HARDFP-FP16: vadd.f32 [[S0]], [[S0]], [[S2]]
+; CHECK-HARDFP-FP16: vcvtb.f16.f32 [[S0]], [[S0]]
+
+; CHECK-HARDFP-FULLFP16:      vadd.f16 s0, s0, s1
+; CHECK-HARDFP-FULLFP16-NEXT: mov pc, lr
+
+}
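
A note on the AddrMode5FP16 immediate used throughout this patch: vldr.16/vstr.16 encode an 8-bit offset that is scaled by 2, so reachable addresses are base +/- 510 bytes in steps of 2. That is why SelectAddrMode5FP16 only folds offsets whose scaled value fits in [-255, 255], and why the ARMBaseInstrInfo.cpp hunk decodes with NumBits = 8 and Scale = 2. The self-contained C++ sketch below illustrates the assumed encode/decode round trip; the helper names mirror ARM_AM::getAM5FP16Opc, getAM5FP16Offset, and getAM5FP16Op from ARMAddressingModes.h, but this is an illustrative model, not the in-tree implementation.

    // am5fp16_demo.cpp -- illustrative only; models the packing the patch
    // assumes: bit 8 holds the add/sub flag, bits 7..0 hold |offset| / 2.
    #include <cassert>
    #include <cstdio>

    enum AddrOpc { add = 0, sub = 1 };

    static unsigned getAM5FP16Opc(AddrOpc Opc, unsigned Imm8) {
      assert(Imm8 < 256 && "scaled offset must fit in 8 bits");
      return (static_cast<unsigned>(Opc) << 8) | Imm8;
    }

    static unsigned getAM5FP16Offset(unsigned AM5Opc) { return AM5Opc & 0xFF; }

    static AddrOpc getAM5FP16Op(unsigned AM5Opc) {
      return ((AM5Opc >> 8) & 1) ? sub : add;
    }

    int main() {
      // Encode a byte offset of -20 for a vldr.16: record the sign and divide
      // by 2, which is what SelectAddrMode5FP16 does once
      // isScaledConstantInRange has checked divisibility and range.
      int ByteOffset = -20;
      AddrOpc Op = ByteOffset < 0 ? sub : add;
      unsigned Imm8 = (ByteOffset < 0 ? -ByteOffset : ByteOffset) / 2;
      unsigned Enc = getAM5FP16Opc(Op, Imm8);

      // Decode it again, mirroring the ARMBaseInstrInfo.cpp hunk
      // (NumBits = 8, Scale = 2).
      int Decoded = getAM5FP16Offset(Enc) * 2;
      if (getAM5FP16Op(Enc) == sub)
        Decoded = -Decoded;

      assert(Decoded == ByteOffset);
      printf("0x%03x -> %d bytes\n", Enc, Decoded);
      return 0;
    }

Running the sketch prints "0x10a -> -20 bytes": the sub flag in bit 8 plus an imm8 of 10, i.e. 20 bytes subtracted from the base register.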