Index: llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp =================================================================== --- llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -2409,6 +2409,14 @@ NumBits = 8; Scale = 4; break; + case ARMII::AddrMode5FP16: + ImmIdx = FrameRegIdx+1; + InstrOffs = ARM_AM::getAM5Offset(MI.getOperand(ImmIdx).getImm()); + if (ARM_AM::getAM5Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub) + InstrOffs *= -1; + NumBits = 8; + Scale = 2; + break; default: llvm_unreachable("Unsupported addressing mode!"); } Index: llvm/trunk/lib/Target/ARM/ARMCallingConv.td =================================================================== --- llvm/trunk/lib/Target/ARM/ARMCallingConv.td +++ llvm/trunk/lib/Target/ARM/ARMCallingConv.td @@ -187,6 +187,7 @@ CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>, CCIfType<[f32], CCBitConvertToType>, + CCDelegateTo ]>; @@ -233,7 +234,7 @@ CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>, CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, - S9, S10, S11, S12, S13, S14, S15]>>, + S9, S10, S11, S12, S13, S14, S15]>>, CCDelegateTo ]>; Index: llvm/trunk/lib/Target/ARM/ARMISelDAGToDAG.cpp =================================================================== --- llvm/trunk/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ llvm/trunk/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -118,8 +118,10 @@ SDValue &Offset, SDValue &Opc); bool SelectAddrMode3Offset(SDNode *Op, SDValue N, SDValue &Offset, SDValue &Opc); - bool SelectAddrMode5(SDValue N, SDValue &Base, - SDValue &Offset); + bool IsAddressingMode5(SDValue N, SDValue &Base, SDValue &Offset, + int Lwb, int Upb, bool FP16); + bool SelectAddrMode5(SDValue N, SDValue &Base, SDValue &Offset); + bool SelectAddrMode5FP16(SDValue N, SDValue &Base, SDValue &Offset); bool SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,SDValue &Align); bool SelectAddrMode6Offset(SDNode *Op, SDValue N, SDValue &Offset); @@ -886,8 +888,8 @@ return true; } -bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N, - SDValue &Base, SDValue &Offset) { +bool ARMDAGToDAGISel::IsAddressingMode5(SDValue N, SDValue &Base, SDValue &Offset, + int Lwb, int Upb, bool FP16) { if (!CurDAG->isBaseWithConstantOffset(N)) { Base = N; if (N.getOpcode() == ISD::FrameIndex) { @@ -907,8 +909,9 @@ // If the RHS is +/- imm8, fold into addr mode. int RHSC; - if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/4, - -256 + 1, 256, RHSC)) { + const int Scale = FP16 ? 2 : 4; + + if (isScaledConstantInRange(N.getOperand(1), Scale, Lwb, Upb, RHSC)) { Base = N.getOperand(0); if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast(Base)->getIndex(); @@ -921,17 +924,43 @@ AddSub = ARM_AM::sub; RHSC = -RHSC; } - Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(AddSub, RHSC), - SDLoc(N), MVT::i32); + + if (FP16) + Offset = CurDAG->getTargetConstant(ARM_AM::getAM5FP16Opc(AddSub, RHSC), + SDLoc(N), MVT::i32); + else + Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(AddSub, RHSC), + SDLoc(N), MVT::i32); + return true; } Base = N; - Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0), - SDLoc(N), MVT::i32); + + if (FP16) + Offset = CurDAG->getTargetConstant(ARM_AM::getAM5FP16Opc(ARM_AM::add, 0), + SDLoc(N), MVT::i32); + else + Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0), + SDLoc(N), MVT::i32); + return true; } +bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N, + SDValue &Base, SDValue &Offset) { + int Lwb = -256 + 1; + int Upb = 256; + return IsAddressingMode5(N, Base, Offset, Lwb, Upb, /*FP16=*/ false); +} + +bool ARMDAGToDAGISel::SelectAddrMode5FP16(SDValue N, + SDValue &Base, SDValue &Offset) { + int Lwb = -512 + 1; + int Upb = 512; + return IsAddressingMode5(N, Base, Offset, Lwb, Upb, /*FP16=*/ true); +} + bool ARMDAGToDAGISel::SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr, SDValue &Align) { Addr = N; Index: llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp +++ llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp @@ -522,6 +522,13 @@ addRegisterClass(MVT::f64, &ARM::DPRRegClass); } + if (Subtarget->hasFullFP16()) { + addRegisterClass(MVT::f16, &ARM::HPRRegClass); + // Clean up bitcast of incoming arguments if hard float abi is enabled. + if (Subtarget->isTargetHardFloat()) + setOperationAction(ISD::BITCAST, MVT::i16, Custom); + } + for (MVT VT : MVT::vector_valuetypes()) { for (MVT InnerVT : MVT::vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); @@ -2474,12 +2481,37 @@ assert(VA.isRegLoc() && "Can only return in registers!"); SDValue Arg = OutVals[realRVLocIdx]; + bool ReturnF16 = false; + + if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) { + // Half-precision return values can be returned like this: + // + // t11 f16 = fadd ... + // t12: i16 = bitcast t11 + // t13: i32 = zero_extend t12 + // t14: f32 = bitcast t13 + // + // to avoid code generation for bitcasts, we simply set Arg to the node + // that produces the f16 value, t11 in this case. + // + if (Arg.getValueType() == MVT::f32) { + SDValue ZE = Arg.getOperand(0); + if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) { + SDValue BC = ZE.getOperand(0); + if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) { + Arg = BC.getOperand(0); + ReturnF16 = true; + } + } + } + } switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::BCvt: - Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); + if (!ReturnF16) + Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); break; } @@ -2527,7 +2559,8 @@ // Guarantee that all emitted copies are // stuck together, avoiding something bad. Flag = Chain.getValue(1); - RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), + ReturnF16 ? MVT::f16 : VA.getLocVT())); } const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); const MCPhysReg *I = @@ -3684,7 +3717,10 @@ } else { const TargetRegisterClass *RC; - if (RegVT == MVT::f32) + + if (RegVT == MVT::f16) + RC = &ARM::HPRRegClass; + else if (RegVT == MVT::f32) RC = &ARM::SPRRegClass; else if (RegVT == MVT::f64) RC = &ARM::DPRRegClass; @@ -5024,6 +5060,37 @@ // source or destination of the bit convert. EVT SrcVT = Op.getValueType(); EVT DstVT = N->getValueType(0); + + // Half-precision arguments can be passed in like this: + // + // t4: f32,ch = CopyFromReg t0, Register:f32 %1 + // t8: i32 = bitcast t4 + // t9: i16 = truncate t8 + // t10: f16 = bitcast t9 <~~~~ SDNode N + // + // but we want to avoid code generation for the bitcast, so transform this + // into: + // + // t18: f16 = CopyFromReg t0, Register:f32 %0 + // + if (SrcVT == MVT::i16 && DstVT == MVT::f16) { + if (Op.getOpcode() != ISD::TRUNCATE) + return SDValue(); + + SDValue Bitcast = Op.getOperand(0); + if (Bitcast.getOpcode() != ISD::BITCAST || + Bitcast.getValueType() != MVT::i32) + return SDValue(); + + SDValue Copy = Bitcast.getOperand(0); + if (Copy.getOpcode() != ISD::CopyFromReg || + Copy.getValueType() != MVT::f32) + return SDValue(); + + SDValue Ops[] = { Copy->getOperand(0), Copy->getOperand(1) }; + return DAG.getNode(ISD::CopyFromReg, SDLoc(Copy), MVT::f16, Ops); + } + assert((SrcVT == MVT::i64 || DstVT == MVT::i64) && "ExpandBITCAST called for non-i64 type"); Index: llvm/trunk/lib/Target/ARM/ARMInstrFormats.td =================================================================== --- llvm/trunk/lib/Target/ARM/ARMInstrFormats.td +++ llvm/trunk/lib/Target/ARM/ARMInstrFormats.td @@ -108,6 +108,7 @@ def AddrModeT2_pc : AddrMode<14>; def AddrModeT2_i8s4 : AddrMode<15>; def AddrMode_i12 : AddrMode<16>; +def AddrMode5FP16 : AddrMode<17>; // Load / store index mode. class IndexMode val> { @@ -1527,7 +1528,7 @@ class AHI5 opcod1, bits<2> opcod2, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list pattern> - : VFPI { list Predicates = [HasFullFP16]; Index: llvm/trunk/lib/Target/ARM/ARMInstrVFP.td =================================================================== --- llvm/trunk/lib/Target/ARM/ARMInstrVFP.td +++ llvm/trunk/lib/Target/ARM/ARMInstrVFP.td @@ -69,10 +69,19 @@ let ParserMatchClass = FPImmOperand; } +def alignedload16 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return cast(N)->getAlignment() >= 2; +}]>; + def alignedload32 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ return cast(N)->getAlignment() >= 4; }]>; +def alignedstore16 : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return cast(N)->getAlignment() >= 2; +}]>; + def alignedstore32 : PatFrag<(ops node:$val, node:$ptr), (store node:$val, node:$ptr), [{ return cast(N)->getAlignment() >= 4; @@ -113,9 +122,9 @@ let D = VFPNeonDomain; } -def VLDRH : AHI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5fp16:$addr), +def VLDRH : AHI5<0b1101, 0b01, (outs HPR:$Sd), (ins addrmode5fp16:$addr), IIC_fpLoad16, "vldr", ".16\t$Sd, $addr", - []>, + [(set HPR:$Sd, (alignedload16 addrmode5fp16:$addr))]>, Requires<[HasFullFP16]>; } // End of 'let canFoldAsLoad = 1, isReMaterializable = 1 in' @@ -132,9 +141,9 @@ let D = VFPNeonDomain; } -def VSTRH : AHI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5fp16:$addr), +def VSTRH : AHI5<0b1101, 0b00, (outs), (ins HPR:$Sd, addrmode5fp16:$addr), IIC_fpStore16, "vstr", ".16\t$Sd, $addr", - []>, + [(alignedstore16 HPR:$Sd, addrmode5fp16:$addr)]>, Requires<[HasFullFP16]>; //===----------------------------------------------------------------------===// @@ -335,9 +344,9 @@ let TwoOperandAliasConstraint = "$Sn = $Sd" in def VADDH : AHbI<0b11100, 0b11, 0, 0, - (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), + (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpALU16, "vadd", ".f16\t$Sd, $Sn, $Sm", - []>, + [(set HPR:$Sd, (fadd HPR:$Sn, HPR:$Sm))]>, Sched<[WriteFPALU32]>; let TwoOperandAliasConstraint = "$Dn = $Dd" in @@ -360,9 +369,9 @@ let TwoOperandAliasConstraint = "$Sn = $Sd" in def VSUBH : AHbI<0b11100, 0b11, 1, 0, - (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), + (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpALU16, "vsub", ".f16\t$Sd, $Sn, $Sm", - []>, + [(set HPR:$Sd, (fsub HPR:$Sn, HPR:$Sm))]>, Sched<[WriteFPALU32]>; let TwoOperandAliasConstraint = "$Dn = $Dd" in @@ -658,17 +667,19 @@ let Predicates = [HasVFP2, HasDPVFP]; } -// Between half, single and double-precision. For disassembly only. - +// Between half, single and double-precision. def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$Sd, $Sm", - [/* For disassembly only; pattern left blank */]>, + [ /* intentionally left blank, see rule below */ ]>, Requires<[HasFP16]>, Sched<[WriteFPCVT]>; +def : Pat<(f32 (fpextend HPR:$Sm)), + (VCVTBHS (COPY_TO_REGCLASS HPR:$Sm, SPR))>; + def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm", - [/* For disassembly only; pattern left blank */]>, + []>, Requires<[HasFP16]>, Sched<[WriteFPCVT]>; Index: llvm/trunk/lib/Target/ARM/ARMRegisterInfo.td =================================================================== --- llvm/trunk/lib/Target/ARM/ARMRegisterInfo.td +++ llvm/trunk/lib/Target/ARM/ARMRegisterInfo.td @@ -307,6 +307,18 @@ let DiagnosticString = "operand must be a register in range [s0, s31]"; } +def HPR : RegisterClass<"ARM", [f16], 32, (sequence "S%u", 0, 31)> { + let AltOrders = [(add (decimate HPR, 2), SPR), + (add (decimate HPR, 4), + (decimate HPR, 2), + (decimate (rotl HPR, 1), 4), + (decimate (rotl HPR, 1), 2))]; + let AltOrderSelect = [{ + return 1 + MF.getSubtarget().useStride4VFPs(MF); + }]; + let DiagnosticString = "operand must be a register in range [s0, s31]"; +} + // Subset of SPR which can be used as a source of NEON scalars for 16-bit // operations def SPR_8 : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 15)> { Index: llvm/trunk/lib/Target/ARM/Disassembler/ARMDisassembler.cpp =================================================================== --- llvm/trunk/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ llvm/trunk/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -158,6 +158,8 @@ uint64_t Address, const void *Decoder); static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeHPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder); static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo, @@ -996,6 +998,11 @@ return MCDisassembler::Success; } +static DecodeStatus DecodeHPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + return DecodeSPRRegisterClass(Inst, RegNo, Address, Decoder); +} + static const uint16_t DPRDecoderTable[] = { ARM::D0, ARM::D1, ARM::D2, ARM::D3, ARM::D4, ARM::D5, ARM::D6, ARM::D7, Index: llvm/trunk/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h =================================================================== --- llvm/trunk/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h +++ llvm/trunk/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h @@ -186,7 +186,8 @@ AddrModeT2_so = 13, AddrModeT2_pc = 14, // +/- i12 for pc relative data AddrModeT2_i8s4 = 15, // i8 * 4 - AddrMode_i12 = 16 + AddrMode_i12 = 16, + AddrMode5FP16 = 17 // i8 * 2 }; inline static const char *AddrModeToString(AddrMode addrmode) { @@ -197,6 +198,7 @@ case AddrMode3: return "AddrMode3"; case AddrMode4: return "AddrMode4"; case AddrMode5: return "AddrMode5"; + case AddrMode5FP16: return "AddrMode5FP16"; case AddrMode6: return "AddrMode6"; case AddrModeT1_1: return "AddrModeT1_1"; case AddrModeT1_2: return "AddrModeT1_2"; Index: llvm/trunk/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll =================================================================== --- llvm/trunk/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll +++ llvm/trunk/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll @@ -43,7 +43,7 @@ } define half @test_half(half %a, half %b) { -; CHECK: remark: {{.*}} unable to lower arguments: half (half, half)* +; CHECK: remark: {{.*}} unable to lower arguments: half (half, half)* (in function: test_half) ; CHECK-LABEL: warning: Instruction selection used fallback path for test_half %res = fadd half %a, %b ret half %res Index: llvm/trunk/test/CodeGen/ARM/fp16-instructions.ll =================================================================== --- llvm/trunk/test/CodeGen/ARM/fp16-instructions.ll +++ llvm/trunk/test/CodeGen/ARM/fp16-instructions.ll @@ -0,0 +1,72 @@ +; SOFT: +; RUN: llc < %s -mtriple=arm-none-eabi -float-abi=soft | FileCheck %s --check-prefix=CHECK-SOFT + +; SOFTFP: +; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+vfp3 | FileCheck %s --check-prefix=CHECK-SOFTFP-VFP3 +; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+vfp4 | FileCheck %s --check-prefix=CHECK-SOFTFP-FP16 +; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+fullfp16 | FileCheck %s --check-prefix=CHECK-SOFTFP-FULLFP16 + +; HARD: +; RUN: llc < %s -mtriple=arm-none-eabihf -mattr=+vfp3 | FileCheck %s --check-prefix=CHECK-HARDFP-VFP3 +; RUN: llc < %s -mtriple=arm-none-eabihf -mattr=+vfp4 | FileCheck %s --check-prefix=CHECK-HARDFP-FP16 +; RUN: llc < %s -mtriple=arm-none-eabihf -mattr=+fullfp16 | FileCheck %s --check-prefix=CHECK-HARDFP-FULLFP16 + +define float @Add(float %a.coerce, float %b.coerce) local_unnamed_addr { +entry: + %0 = bitcast float %a.coerce to i32 + %tmp.0.extract.trunc = trunc i32 %0 to i16 + %1 = bitcast i16 %tmp.0.extract.trunc to half + %2 = bitcast float %b.coerce to i32 + %tmp1.0.extract.trunc = trunc i32 %2 to i16 + %3 = bitcast i16 %tmp1.0.extract.trunc to half + %add = fadd half %1, %3 + %4 = bitcast half %add to i16 + %tmp4.0.insert.ext = zext i16 %4 to i32 + %5 = bitcast i32 %tmp4.0.insert.ext to float + ret float %5 + +; CHECK-SOFT: bl __aeabi_h2f +; CHECK-SOFT: bl __aeabi_h2f +; CHECK-SOFT: bl __aeabi_fadd +; CHECK-SOFT: bl __aeabi_f2h + +; CHECK-SOFTFP-VFP3: bl __aeabi_h2f +; CHECK-SOFTFP-VFP3: bl __aeabi_h2f +; CHECK-SOFTFP-VFP3: vadd.f32 +; CHECK-SOFTFP-VFP3: bl __aeabi_f2h + +; CHECK-SOFTFP-FP16: vmov [[S2:s[0-9]]], r1 +; CHECK-SOFTFP-FP16: vmov [[S0:s[0-9]]], r0 +; CHECK-SOFTFP-FP16: vcvtb.f32.f16 [[S2]], [[S2]] +; CHECK-SOFTFP-FP16: vcvtb.f32.f16 [[S0]], [[S0]] +; CHECK-SOFTFP-FP16: vadd.f32 [[S0]], [[S0]], [[S2]] +; CHECK-SOFTFP-FP16: vcvtb.f16.f32 [[S0]], [[S0]] +; CHECK-SOFTFP-FP16: vmov r0, s0 + +; CHECK-SOFTFP-FULLFP16: strh r1, {{.*}} +; CHECK-SOFTFP-FULLFP16: strh r0, {{.*}} +; CHECK-SOFTFP-FULLFP16: vldr.16 [[S0:s[0-9]]], {{.*}} +; CHECK-SOFTFP-FULLFP16: vldr.16 [[S2:s[0-9]]], {{.*}} +; CHECK-SOFTFP-FULLFP16: vadd.f16 [[S0]], [[S2]], [[S0]] +; CHECK-SOFTFP-FULLFP16: vstr.16 [[S2:s[0-9]]], {{.*}} +; CHECK-SOFTFP-FULLFP16: ldrh r0, {{.*}} +; CHECK-SOFTFP-FULLFP16: mov pc, lr + +; CHECK-HARDFP-VFP3: vmov r{{.}}, s0 +; CHECK-HARDFP-VFP3: vmov{{.*}}, s1 +; CHECK-HARDFP-VFP3: bl __aeabi_h2f +; CHECK-HARDFP-VFP3: bl __aeabi_h2f +; CHECK-HARDFP-VFP3: vadd.f32 +; CHECK-HARDFP-VFP3: bl __aeabi_f2h +; CHECK-HARDFP-VFP3: vmov s0, r0 + +; CHECK-HARDFP-FP16: vcvtb.f32.f16 [[S2:s[0-9]]], s1 +; CHECK-HARDFP-FP16: vcvtb.f32.f16 [[S0:s[0-9]]], s0 +; CHECK-HARDFP-FP16: vadd.f32 [[S0]], [[S0]], [[S2]] +; CHECK-HARDFP-FP16: vcvtb.f16.f32 [[S0]], [[S0]] + +; CHECK-HARDFP-FULLFP16: vadd.f16 s0, s0, s1 +; CHECK-HARDFP-FULLFP16-NEXT: mov pc, lr + +} +