Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -524,9 +524,8 @@
 
   if (Subtarget->hasFullFP16()) {
     addRegisterClass(MVT::f16, &ARM::HPRRegClass);
-    // Clean up bitcast of incoming arguments if hard float abi is enabled.
-    if (Subtarget->isTargetHardFloat())
-      setOperationAction(ISD::BITCAST, MVT::i16, Custom);
+    setOperationAction(ISD::BITCAST, MVT::i16, Custom);
+    setOperationAction(ISD::BITCAST, MVT::f16, Custom);
   }
 
   for (MVT VT : MVT::vector_valuetypes()) {
@@ -5061,38 +5060,78 @@
   EVT SrcVT = Op.getValueType();
   EVT DstVT = N->getValueType(0);
 
-  // Half-precision arguments can be passed in like this:
-  //
-  //   t4: f32,ch = CopyFromReg t0, Register:f32 %1
-  //     t8: i32 = bitcast t4
-  //   t9: i16 = truncate t8
-  //   t10: f16 = bitcast t9    <~~~~ SDNode N
-  //
-  // but we want to avoid code generation for the bitcast, so transform this
-  // into:
-  //
-  //   t18: f16 = CopyFromReg t0, Register:f32 %0
-  //
+
+  // Half-precision arguments: avoid stack stores/loads
   if (SrcVT == MVT::i16 && DstVT == MVT::f16) {
     if (Op.getOpcode() != ISD::TRUNCATE)
       return SDValue();
 
+    // Transform this:
+    //
+    //   t4: f32,ch = CopyFromReg t0, Register:f32 %1
+    //     t8: i32 = bitcast t4
+    //   t9: i16 = truncate t8    <~~~~ Op
+    //   t10: f16 = bitcast t9    <~~~~ SDNode N
+    //
+    // into an f16 copy from reg:
+    //
+    //   t18: f16 = CopyFromReg t0, Register:f32 %0
+    //
     SDValue Bitcast = Op.getOperand(0);
-    if (Bitcast.getOpcode() != ISD::BITCAST ||
-        Bitcast.getValueType() != MVT::i32)
-      return SDValue();
+    if (Bitcast.getOpcode() == ISD::BITCAST &&
+        Bitcast.getValueType() == MVT::i32) {
+
+      SDValue Copy = Bitcast.getOperand(0);
+      if (Copy.getOpcode() != ISD::CopyFromReg ||
+          Copy.getValueType() != MVT::f32)
        return SDValue();
 
-    SDValue Copy = Bitcast.getOperand(0);
-    if (Copy.getOpcode() != ISD::CopyFromReg ||
-        Copy.getValueType() != MVT::f32)
+      SDValue Ops[] = { Copy->getOperand(0), Copy->getOperand(1) };
+      return DAG.getNode(ISD::CopyFromReg, SDLoc(Copy), MVT::f16, Ops);
+    }
+
+    // And for FullFP16 we can have this:
+    //
+    //   t5: i32,ch = CopyFromReg t0, Register:i32 %1
+    //   t9: i16 = truncate t5    <~~~~ Op
+    //   t10: f16 = bitcast t9    <~~~~ SDNode N
+    //   t11: f16 = fadd t8, t10
+    //
+    SDValue Copy = Op.getOperand(0);
+    if (Copy.getOpcode() == ISD::CopyFromReg &&
+        Copy.getValueType() == MVT::i32) {
+      // We use FP16_TO_FP just to model a GPR -> HPR move
+      return DAG.getNode(ISD::FP16_TO_FP, SDLoc(Op),
+                         MVT::f32, Op.getOperand(0));
+    }
+    return SDValue();
+  }
+
+  // Half-precision return values: avoid stack stores/loads
+  if (SrcVT == MVT::f16 && DstVT == MVT::i16) {
+    //
+    //   t11: f16 = fadd t8, t10
+    //   t12: i16 = bitcast t11      <~~~ SDNode N
+    //   t13: i32 = zero_extend t12
+    //   t16: ch,glue = CopyToReg t0, Register:i32 %r0, t13
+    //
+    auto ZeroExtend = N->use_begin();
+    if (N->use_size() != 1 || ZeroExtend->getOpcode() != ISD::ZERO_EXTEND ||
+        ZeroExtend->getValueType(0) != MVT::i32)
       return SDValue();
 
-    SDValue Ops[] = { Copy->getOperand(0), Copy->getOperand(1) };
-    return DAG.getNode(ISD::CopyFromReg, SDLoc(Copy), MVT::f16, Ops);
+    auto Copy = ZeroExtend->use_begin();
+    if (Copy->getOpcode() == ISD::CopyToReg) {
+      // We use FP_TO_FP16 just to model a HPR -> GPR move
+      SDValue Cvt = DAG.getNode(ISD::FP_TO_FP16, SDLoc(Op), MVT::i32, Op);
+      DAG.ReplaceAllUsesWith(*ZeroExtend, &Cvt);
+      return Cvt;
+    }
+    return SDValue();
   }
 
-  assert((SrcVT == MVT::i64 || DstVT == MVT::i64) &&
-         "ExpandBITCAST called for non-i64 type");
+  if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
+    return SDValue();
 
   // Turn i64->f64 into VMOVDRR.
   if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
Index: lib/Target/ARM/ARMInstrVFP.td
===================================================================
--- lib/Target/ARM/ARMInstrVFP.td
+++ lib/Target/ARM/ARMInstrVFP.td
@@ -750,6 +750,13 @@
   let Inst{5} = Dm{4};
 }
 
+let Predicates = [HasFullFP16] in {
+  def : Pat<(f16_to_fp GPR:$a),
+            (f32 (COPY_TO_REGCLASS GPR:$a, HPR))>;
+  def : Pat<(fp_to_f16 HPR:$a),
+            (i32 (COPY_TO_REGCLASS HPR:$a, GPR))>;
+}
+
 def : Pat<(fp_to_f16 SPR:$a),
           (i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>;
 
Index: test/CodeGen/ARM/fp16-instructions.ll
===================================================================
--- test/CodeGen/ARM/fp16-instructions.ll
+++ test/CodeGen/ARM/fp16-instructions.ll
@@ -43,14 +43,11 @@
 ; CHECK-SOFTFP-FP16:  vcvtb.f16.f32 [[S0]], [[S0]]
 ; CHECK-SOFTFP-FP16:  vmov r0, s0
 
-; CHECK-SOFTFP-FULLFP16:       strh r1, {{.*}}
-; CHECK-SOFTFP-FULLFP16:       strh r0, {{.*}}
-; CHECK-SOFTFP-FULLFP16:       vldr.16 [[S0:s[0-9]]], {{.*}}
-; CHECK-SOFTFP-FULLFP16:       vldr.16 [[S2:s[0-9]]], {{.*}}
-; CHECK-SOFTFP-FULLFP16:       vadd.f16 [[S0]], [[S2]], [[S0]]
-; CHECK-SOFTFP-FULLFP16:       vstr.16 [[S2:s[0-9]]], {{.*}}
-; CHECK-SOFTFP-FULLFP16:       ldrh r0, {{.*}}
-; CHECK-SOFTFP-FULLFP16:       mov pc, lr
+; CHECK-SOFTFP-FULLFP16:       vmov [[S0:s[0-9]]], r1
+; CHECK-SOFTFP-FULLFP16:       vmov [[S2:s[0-9]]], r0
+; CHECK-SOFTFP-FULLFP16:       vadd.f16 [[S0]], [[S2]], [[S0]]
+; CHECK-SOFTFP-FULLFP16-NEXT:  vmov r0, s0
+; CHECK-SOFTFP-FULLFP16-NEXT:  mov pc, lr
 
 ; CHECK-HARDFP-VFP3: vmov r{{.}}, s0
 ; CHECK-HARDFP-VFP3: vmov{{.*}}, s1
@@ -69,4 +66,3 @@
 ; CHECK-HARDFP-FULLFP16-NEXT:  mov pc, lr
 }
 
-
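
Note: as a reference for the CHECK lines above, IR along the lines of the
function below exercises both new paths in ExpandBITCAST (a sketch only;
the actual function bodies in fp16-instructions.ll are not shown in this
diff). With a soft-float ABI and -mattr=+fullfp16, the two half arguments
arrive in r0/r1 as integer values and the result is returned through r0,
which is what produces the truncate/bitcast chain on the arguments and the
bitcast/zero_extend chain on the return value:

  ; Hypothetical test body; the real function names in the test may differ.
  define half @Add(half %a, half %b) {
  entry:
    %add = fadd half %a, %b
    ret half %add
  }

With this patch those chains are modelled as plain GPR<->HPR moves (via
FP16_TO_FP/FP_TO_FP16 and the new COPY_TO_REGCLASS patterns), so the
updated CHECK-SOFTFP-FULLFP16 lines expect vmov instead of the old
strh/vldr/vstr/ldrh stack traffic.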