Index: lib/Target/ARM/ARMCallingConv.td =================================================================== --- lib/Target/ARM/ARMCallingConv.td +++ lib/Target/ARM/ARMCallingConv.td @@ -20,7 +20,7 @@ // Handles byval parameters. CCIfByVal>, - + CCIfType<[i1, i8, i16], CCPromoteToType>, // Pass SwiftSelf in a callee saved register. @@ -214,8 +214,8 @@ CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>, CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, - CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, - S9, S10, S11, S12, S13, S14, S15]>>, + CCIfType<[f16, f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, + S9, S10, S11, S12, S13, S14, S15]>>, CCDelegateTo ]>; @@ -232,7 +232,7 @@ CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>, CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, - CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, + CCIfType<[f16, f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15]>>, CCDelegateTo ]>; Index: lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- lib/Target/ARM/ARMISelLowering.cpp +++ lib/Target/ARM/ARMISelLowering.cpp @@ -530,6 +530,9 @@ addRegisterClass(MVT::f64, &ARM::DPRRegClass); } + if (Subtarget->hasFullFP16()) + addRegisterClass(MVT::f16, &ARM::HPRRegClass); + for (MVT VT : MVT::vector_valuetypes()) { for (MVT InnerVT : MVT::vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); @@ -3699,7 +3702,9 @@ } else { const TargetRegisterClass *RC; - if (RegVT == MVT::f32) + if (RegVT == MVT::f16) + RC = &ARM::HPRRegClass; + else if (RegVT == MVT::f32) RC = &ARM::SPRRegClass; else if (RegVT == MVT::f64) RC = &ARM::DPRRegClass; Index: lib/Target/ARM/ARMInstrVFP.td =================================================================== --- lib/Target/ARM/ARMInstrVFP.td +++ lib/Target/ARM/ARMInstrVFP.td @@ -355,9 +355,9 @@ let TwoOperandAliasConstraint = "$Sn = $Sd" in 
def VADDH : AHbI<0b11100, 0b11, 0, 0, - (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), + (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpALU16, "vadd", ".f16\t$Sd, $Sn, $Sm", - []>, + [(set HPR:$Sd, (fadd HPR:$Sn, HPR:$Sm))]>, Sched<[WriteFPALU32]>; let TwoOperandAliasConstraint = "$Dn = $Dd" in @@ -380,9 +380,9 @@ let TwoOperandAliasConstraint = "$Sn = $Sd" in def VSUBH : AHbI<0b11100, 0b11, 1, 0, - (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), + (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpALU16, "vsub", ".f16\t$Sd, $Sn, $Sm", - []>, + [(set HPR:$Sd, (fsub HPR:$Sn, HPR:$Sm))]>, Sched<[WriteFPALU32]>; let TwoOperandAliasConstraint = "$Dn = $Dd" in Index: lib/Target/ARM/ARMRegisterInfo.td =================================================================== --- lib/Target/ARM/ARMRegisterInfo.td +++ lib/Target/ARM/ARMRegisterInfo.td @@ -296,6 +296,23 @@ }]; } +// Half-precision (FullFP16) register class. It's exactly the same as the +// single-precision class, using the same S-registers. Each instruction that generates an +// FP16 result writes it to the bottom 16 bits of the associated 32-bit floating-point +// register and writes 0 to the top 16 bits of that register. +// A different register class is added, as opposed to adding f16 to SPR, to avoid +// modifying and adding type information to the rules. 
+def HPR : RegisterClass<"ARM", [f16], 32, (sequence "S%u", 0, 31)> { + let AltOrders = [(add (decimate SPR, 2), SPR), + (add (decimate SPR, 4), + (decimate SPR, 2), + (decimate (rotl SPR, 1), 4), + (decimate (rotl SPR, 1), 2))]; + let AltOrderSelect = [{ + return 1 + MF.getSubtarget().useStride4VFPs(MF); + }]; +} + // Subset of SPR which can be used as a source of NEON scalars for 16-bit // operations def SPR_8 : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 15)>; Index: lib/Target/ARM/Disassembler/ARMDisassembler.cpp =================================================================== --- lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -158,6 +158,8 @@ uint64_t Address, const void *Decoder); static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeHPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder); static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo, @@ -182,6 +184,8 @@ uint64_t Address, const void *Decoder); static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeHPRRegListOperand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); static DecodeStatus DecodeSPRRegListOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val, @@ -996,6 +1000,11 @@ return MCDisassembler::Success; } +static DecodeStatus DecodeHPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + return DecodeSPRRegisterClass(Inst, RegNo, Address, Decoder); +} + static const uint16_t DPRDecoderTable[] = { ARM::D0, ARM::D1, ARM::D2, ARM::D3, ARM::D4, ARM::D5, ARM::D6, 
ARM::D7, @@ -1253,6 +1262,11 @@ return S; } +static DecodeStatus DecodeHPRRegListOperand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + return DecodeSPRRegListOperand(Inst, Val, Address, Decoder); +} + static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; Index: test/CodeGen/ARM/fp16-instructions.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM/fp16-instructions.ll @@ -0,0 +1,43 @@ +; RUN: llc < %s -mtriple=arm-none-eabi -float-abi=soft | FileCheck %s --check-prefix=CHECK-SOFT +; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+vfp4 -float-abi=hard | FileCheck %s --check-prefix=CHECK-FP16 +; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+neon,+fullfp16 -float-abi=hard | FileCheck %s --check-prefix=CHECK-FULLFP16 + +define half @Sub(half %a, half %b) local_unnamed_addr { +entry: +;CHECK-SOFT-LABEL: Sub: +;CHECK-SOFT: bl __aeabi_h2f +;CHECK-SOFT: bl __aeabi_h2f +;CHECK-SOFT: bl __aeabi_fsub +;CHECK-SOFT: bl __aeabi_f2h + +;CHECK-FP16-LABEL: Sub: +;CHECK-FP16: vsub.f32 s0, s0, s2 +;CHECK-FP16-NEXT: mov pc, lr + +;CHECK-FULLFP16-LABEL: Sub: +;CHECK-FULLFP16: vsub.f16 s0, s0, s1 +;CHECK-FULLFP16-NEXT: mov pc, lr + + %sub = fsub half %a, %b + ret half %sub +} + +define half @Add(half %a, half %b) local_unnamed_addr { +entry: +;CHECK-SOFT-LABEL: Add: +;CHECK-SOFT: bl __aeabi_h2f +;CHECK-SOFT: bl __aeabi_h2f +;CHECK-SOFT: bl __aeabi_fadd +;CHECK-SOFT: bl __aeabi_f2h + +;CHECK-FP16-LABEL: Add: +;CHECK-FP16: vadd.f32 s0, s0, s2 +;CHECK-FP16-NEXT: mov pc, lr + +;CHECK-FULLFP16-LABEL: Add: +;CHECK-FULLFP16: vadd.f16 s0, s0, s1 +;CHECK-FULLFP16-NEXT: mov pc, lr + + %add = fadd half %a, %b + ret half %add +}