diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -764,6 +764,8 @@
     SDValue LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed) const;
     void ExpandDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed,
                            SmallVectorImpl<SDValue> &Results) const;
+    SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
+                          const ARMSubtarget *Subtarget) const;
     SDValue LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, bool Signed,
                                    SDValue &Chain) const;
     SDValue LowerREM(SDNode *N, SelectionDAG &DAG) const;
@@ -787,6 +789,11 @@
 
     bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
                                     EVT VT) const override;
+    SDValue MoveToHPR(const SDLoc &dl, SelectionDAG &DAG, MVT LocVT, MVT ValVT,
+                      SDValue Val) const;
+    SDValue MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG, MVT LocVT,
+                        MVT ValVT, SDValue Val) const;
+
     SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;
 
     SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -721,10 +721,14 @@
     setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
     setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
+  }
 
-    // For the time being bfloat is only supported when fullfp16 is present.
-    if (Subtarget->hasBF16())
-      addRegisterClass(MVT::bf16, &ARM::HPRRegClass);
+  if (Subtarget->hasBF16()) {
+    addRegisterClass(MVT::bf16, &ARM::HPRRegClass);
+    if (!Subtarget->hasFullFP16()) {
+      setAllExpand(MVT::bf16);
+      setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
+    }
   }
 
   for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
@@ -2006,6 +2010,35 @@
   }
 }
 
+SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG,
+                                     MVT LocVT, MVT ValVT, SDValue Val) const {
+  Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()),
+                    Val);
+  if (Subtarget->hasFullFP16()) {
+    Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val);
+  } else {
+    Val = DAG.getNode(ISD::TRUNCATE, dl,
+                      MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
+    Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val);
+  }
+  return Val;
+}
+
+SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG,
+                                       MVT LocVT, MVT ValVT,
+                                       SDValue Val) const {
+  if (Subtarget->hasFullFP16()) {
+    Val = DAG.getNode(ARMISD::VMOVrh, dl,
+                      MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
+  } else {
+    Val = DAG.getNode(ISD::BITCAST, dl,
+                      MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
+    Val = DAG.getNode(ISD::ZERO_EXTEND, dl,
+                      MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
+  }
+  return DAG.getNode(ISD::BITCAST, dl, LocVT, Val);
+}
+
 /// LowerCallResult - Lower the result values of a call into the
 /// appropriate copies out of appropriate physical registers.
 SDValue ARMTargetLowering::LowerCallResult(
@@ -2087,13 +2120,8 @@
     // had been copied to the LSBs of a 32-bit register.
    // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
     if (VA.needsCustom() &&
-        (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) {
-      assert(Subtarget->hasFullFP16() &&
-             "Lowering half precision fp return without full fp16 support");
-      Val = DAG.getNode(ISD::BITCAST, dl,
-                        MVT::getIntegerVT(VA.getLocVT().getSizeInBits()), Val);
-      Val = DAG.getNode(ARMISD::VMOVhr, dl, VA.getValVT(), Val);
-    }
+        (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
+      Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val);
 
     InVals.push_back(Val);
   }
@@ -2268,11 +2296,7 @@
       // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
       if (VA.needsCustom() &&
           (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) {
-        assert(Subtarget->hasFullFP16() &&
-               "Lowering half precision fp argument without full fp16 support");
-        Arg = DAG.getNode(ARMISD::VMOVrh, dl,
-                          MVT::getIntegerVT(VA.getLocVT().getSizeInBits()), Arg);
-        Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
+        Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
       } else {
         // f16 arguments could have been extended prior to argument lowering.
         // Mask them arguments if this is a CMSE nonsecure call.
@@ -2956,12 +2980,7 @@
     auto RetVT = Outs[realRVLocIdx].ArgVT;
     if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) {
       if (VA.needsCustom() && VA.getValVT() == MVT::f16) {
-        assert(Subtarget->hasFullFP16() &&
-               "Lowering f16 type argument without full fp16 support");
-        Arg =
-            DAG.getNode(ARMISD::VMOVrh, dl,
-                        MVT::getIntegerVT(VA.getLocVT().getSizeInBits()), Arg);
-        Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
+        Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
       } else {
         auto LocBits = VA.getLocVT().getSizeInBits();
         auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits());
@@ -4331,14 +4350,8 @@
         // had been copied to the LSBs of a 32-bit register.
         // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
         if (VA.needsCustom() &&
-            (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) {
-          assert(Subtarget->hasFullFP16() &&
-                 "Lowering half precision fp argument without full fp16 support");
-          ArgValue = DAG.getNode(ISD::BITCAST, dl,
-                                 MVT::getIntegerVT(VA.getLocVT().getSizeInBits()),
-                                 ArgValue);
-          ArgValue = DAG.getNode(ARMISD::VMOVhr, dl, VA.getValVT(), ArgValue);
-        }
+            (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
+          ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue);
 
         InVals.push_back(ArgValue);
       } else { // VA.isRegLoc()
@@ -5918,8 +5931,8 @@
 /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
 /// operand type is illegal (e.g., v2f32 for a target that doesn't support
 /// vectors), since the legalizer won't know what to do with that.
-static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
-                             const ARMSubtarget *Subtarget) {
+SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
+                                         const ARMSubtarget *Subtarget) const {
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDLoc dl(N);
   SDValue Op = N->getOperand(0);
@@ -5929,21 +5942,16 @@
   EVT SrcVT = Op.getValueType();
   EVT DstVT = N->getValueType(0);
 
-  if (SrcVT == MVT::i16 && (DstVT == MVT::f16 || DstVT == MVT::bf16)) {
-    if (!Subtarget->hasFullFP16())
-      return SDValue();
-    // (b)f16 bitcast i16 -> VMOVhr
-    return DAG.getNode(ARMISD::VMOVhr, SDLoc(N), DstVT,
-                       DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op));
-  }
+  if ((SrcVT == MVT::i16 || SrcVT == MVT::i32) &&
+      (DstVT == MVT::f16 || DstVT == MVT::bf16))
+    return MoveToHPR(SDLoc(N), DAG, MVT::i32, DstVT.getSimpleVT(),
+                     DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op));
 
-  if ((SrcVT == MVT::f16 || SrcVT == MVT::bf16) && DstVT == MVT::i16) {
-    if (!Subtarget->hasFullFP16())
-      return SDValue();
-    // i16 bitcast (b)f16 -> VMOVrh
-    return DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i16,
-                       DAG.getNode(ARMISD::VMOVrh, SDLoc(N), MVT::i32, Op));
-  }
+  if ((DstVT == MVT::i16 || DstVT == MVT::i32) &&
+      (SrcVT == MVT::f16 || SrcVT == MVT::bf16))
+    return DAG.getNode(
+        ISD::TRUNCATE, SDLoc(N), DstVT,
+        MoveFromHPR(SDLoc(N), DAG, MVT::i32, SrcVT.getSimpleVT(), Op));
 
   if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
     return SDValue();
diff --git a/llvm/lib/Target/ARM/ARMInstrVFP.td b/llvm/lib/Target/ARM/ARMInstrVFP.td
--- a/llvm/lib/Target/ARM/ARMInstrVFP.td
+++ b/llvm/lib/Target/ARM/ARMInstrVFP.td
@@ -163,10 +163,20 @@
 
 } // End of 'let canFoldAsLoad = 1, isReMaterializable = 1 in'
 
-def : FPRegs16Pat<(f16 (alignedload16 addrmode5fp16:$addr)),
-                  (VLDRH addrmode5fp16:$addr)>;
-def : FPRegs16Pat<(bf16 (alignedload16 addrmode5fp16:$addr)),
-                  (VLDRH addrmode5fp16:$addr)>;
+foreach fptype = [f16, bf16] in {
+  def : Pat<(fptype (alignedload16 addrmode5fp16:$addr)),
+            (VLDRH addrmode5fp16:$addr)> {
+    let Predicates = [HasFPRegs16];
+  }
+  def : Pat<(fptype (alignedload16 addrmode3:$addr)),
+            (COPY_TO_REGCLASS (LDRH addrmode3:$addr), HPR)> {
+    let Predicates = [HasNoFPRegs16, IsARM];
+  }
+  def : Pat<(fptype (alignedload16 t2addrmode_imm12:$addr)),
+            (COPY_TO_REGCLASS (t2LDRHi12 t2addrmode_imm12:$addr), HPR)> {
+    let Predicates = [HasNoFPRegs16, IsThumb];
+  }
+}
 
 def VSTRD : ADI5<0b1101, 0b00, (outs), (ins DPR:$Dd, addrmode5:$addr),
                  IIC_fpStore64, "vstr", "\t$Dd, $addr",
@@ -188,10 +198,20 @@
                   []>,
             Requires<[HasFPRegs16]>;
 
-def : FPRegs16Pat<(alignedstore16 (f16 HPR:$Sd), addrmode5fp16:$addr),
-                  (VSTRH (f16 HPR:$Sd), addrmode5fp16:$addr)>;
-def : FPRegs16Pat<(alignedstore16 (bf16 HPR:$Sd), addrmode5fp16:$addr),
-                  (VSTRH (bf16 HPR:$Sd), addrmode5fp16:$addr)>;
+foreach fptype = [f16, bf16] in {
+  def : Pat<(alignedstore16 (fptype HPR:$Sd), addrmode5fp16:$addr),
+            (VSTRH (fptype HPR:$Sd), addrmode5fp16:$addr)> {
+    let Predicates = [HasFPRegs16];
+  }
+  def : Pat<(alignedstore16 (fptype HPR:$Sd), addrmode3:$addr),
+            (STRH (COPY_TO_REGCLASS $Sd, GPR), addrmode3:$addr)> {
+    let Predicates = [HasNoFPRegs16, IsARM];
+  }
+  def : Pat<(alignedstore16 (fptype HPR:$Sd), t2addrmode_imm12:$addr),
+            (t2STRHi12 (COPY_TO_REGCLASS $Sd, GPR), t2addrmode_imm12:$addr)> {
+    let Predicates = [HasNoFPRegs16, IsThumb];
+  }
+}
 
 //===----------------------------------------------------------------------===//
 // Load / store multiple Instructions.
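A note on the data movement above: with fullfp16, MoveToHPR/MoveFromHPR become a single VMOVhr/VMOVrh between a core register and an S register; without it, the (b)f16 value travels purely as an integer bit pattern in the low half of its 32-bit location, which is also all the GPR-based load/store patterns rely on. A minimal scalar sketch of the two no-fullfp16 paths, in plain C++ standing in for the SelectionDAG node sequences (the function names are illustrative only, not part of the patch):

  #include <cstdint>

  // MoveFromHPR without fullfp16: bitcast the 16-bit FP value to i16, then
  // zero-extend into the 32-bit location type. The value is treated as an
  // opaque bag of 16 bits throughout, so no FP hardware is needed.
  uint32_t moveFromHPRBits(uint16_t halfBits) {
    return halfBits; // ISD::BITCAST to i16, then ISD::ZERO_EXTEND to i32
  }

  // MoveToHPR without fullfp16: truncate the 32-bit location value to 16
  // bits and reinterpret those bits as the (b)f16 value.
  uint16_t moveToHPRBits(uint32_t locBits) {
    return static_cast<uint16_t>(locBits); // ISD::TRUNCATE, then ISD::BITCAST
  }

This is why, in the test added below, the BASE checks use plain uxth/strh/ldrh where the FULLFP16 checks use vmov.f16/vldr.16/vstr.16.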
diff --git a/llvm/lib/Target/ARM/ARMPredicates.td b/llvm/lib/Target/ARM/ARMPredicates.td
--- a/llvm/lib/Target/ARM/ARMPredicates.td
+++ b/llvm/lib/Target/ARM/ARMPredicates.td
@@ -44,6 +44,9 @@
 def HasFPRegs16      : Predicate<"Subtarget->hasFPRegs16()">,
                        AssemblerPredicate<(all_of FeatureFPRegs16),
                                           "16-bit fp registers">;
+def HasNoFPRegs16    : Predicate<"!Subtarget->hasFPRegs16()">,
+                       AssemblerPredicate<(all_of (not FeatureFPRegs16)),
+                                          "16-bit fp registers">;
 def HasFPRegs64      : Predicate<"Subtarget->hasFPRegs64()">,
                        AssemblerPredicate<(all_of FeatureFPRegs64),
                                           "64-bit fp registers">;
diff --git a/llvm/test/CodeGen/ARM/arm-bf16-pcs.ll b/llvm/test/CodeGen/ARM/arm-bf16-pcs.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/arm-bf16-pcs.ll
@@ -0,0 +1,319 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple armv8.6a-arm-none-eabi -o - %s | FileCheck %s --check-prefix=BASE --check-prefix=BASE-ARM
+; RUN: llc -mtriple thumbv8.6a-arm-none-eabi -o - %s | FileCheck %s --check-prefix=BASE --check-prefix=BASE-THUMB
+; RUN: llc -mtriple armv8.6a-arm-none-eabi -mattr=+fullfp16 -o - %s | FileCheck %s --check-prefix=FULLFP16 --check-prefix=FULLFP16-ARM
+; RUN: llc -mtriple thumbv8.6a-arm-none-eabi -mattr=+fullfp16 -o - %s | FileCheck %s --check-prefix=FULLFP16 --check-prefix=FULLFP16-THUMB
+
+define bfloat @bf_load_soft(bfloat* %p) {
+; BASE-LABEL: bf_load_soft:
+; BASE:       @ %bb.0:
+; BASE-NEXT:    ldrh r0, [r0]
+; BASE-NEXT:    bx lr
+;
+; FULLFP16-LABEL: bf_load_soft:
+; FULLFP16:       @ %bb.0:
+; FULLFP16-NEXT:    vldr.16 s0, [r0]
+; FULLFP16-NEXT:    vmov r0, s0
+; FULLFP16-NEXT:    bx lr
+  %f = load bfloat, bfloat* %p, align 2
+  ret bfloat %f
+}
+
+define arm_aapcs_vfpcc bfloat @bf_load_hard(bfloat* %p) {
+; BASE-LABEL: bf_load_hard:
+; BASE:       @ %bb.0:
+; BASE-NEXT:    ldrh r0, [r0]
+; BASE-NEXT:    vmov s0, r0
+; BASE-NEXT:    bx lr
+;
+; FULLFP16-LABEL: bf_load_hard:
+; FULLFP16:       @ %bb.0:
+; FULLFP16-NEXT:    vldr.16 s0, [r0]
+; FULLFP16-NEXT:    bx lr
+  %f = load bfloat, bfloat* %p, align 2
+  ret bfloat %f
+}
+
+define void @bf_store_soft(bfloat* %p, bfloat %f) {
+; BASE-LABEL: bf_store_soft:
+; BASE:       @ %bb.0:
+; BASE-NEXT:    strh r1, [r0]
+; BASE-NEXT:    bx lr
+;
+; FULLFP16-LABEL: bf_store_soft:
+; FULLFP16:       @ %bb.0:
+; FULLFP16-NEXT:    vmov.f16 s0, r1
+; FULLFP16-NEXT:    vstr.16 s0, [r0]
+; FULLFP16-NEXT:    bx lr
+  store bfloat %f, bfloat* %p, align 2
+  ret void
+}
+
+define arm_aapcs_vfpcc void @bf_store_hard(bfloat* %p, bfloat %f) {
+; BASE-LABEL: bf_store_hard:
+; BASE:       @ %bb.0:
+; BASE-NEXT:    vmov r1, s0
+; BASE-NEXT:    strh r1, [r0]
+; BASE-NEXT:    bx lr
+;
+; FULLFP16-LABEL: bf_store_hard:
+; FULLFP16:       @ %bb.0:
+; FULLFP16-NEXT:    vstr.16 s0, [r0]
+; FULLFP16-NEXT:    bx lr
+  store bfloat %f, bfloat* %p, align 2
+  ret void
+}
+
+define i32 @bf_to_int_soft(bfloat %f) {
+; BASE-LABEL: bf_to_int_soft:
+; BASE:       @ %bb.0:
+; BASE-NEXT:    uxth r0, r0
+; BASE-NEXT:    bx lr
+;
+; FULLFP16-LABEL: bf_to_int_soft:
+; FULLFP16:       @ %bb.0:
+; FULLFP16-NEXT:    vmov.f16 s0, r0
+; FULLFP16-NEXT:    vmov.f16 r0, s0
+; FULLFP16-NEXT:    bx lr
+  %h = bitcast bfloat %f to i16
+  %w = zext i16 %h to i32
+  ret i32 %w
+}
+
+define arm_aapcs_vfpcc i32 @bf_to_int_hard(bfloat %f) {
+; BASE-LABEL: bf_to_int_hard:
+; BASE:       @ %bb.0:
+; BASE-NEXT:    vmov r0, s0
+; BASE-NEXT:    uxth r0, r0
+; BASE-NEXT:    bx lr
+;
+; FULLFP16-LABEL: bf_to_int_hard:
+; FULLFP16:       @ %bb.0:
+; FULLFP16-NEXT:    vmov.f16 r0, s0
+; FULLFP16-NEXT:    bx lr
+  %h = bitcast bfloat %f to i16
+  %w = zext i16 %h to i32
+  ret i32 %w
+}
+
+define bfloat @bf_from_int_soft(i32 %w) {
+; BASE-ARM-LABEL: bf_from_int_soft:
+; BASE-ARM:       @ %bb.0:
+; BASE-ARM-NEXT:    .pad #4
+; BASE-ARM-NEXT:    sub sp, sp, #4
+; BASE-ARM-NEXT:    strh r0, [sp, #2]
+; BASE-ARM-NEXT:    ldrh r0, [sp, #2]
+; BASE-ARM-NEXT:    add sp, sp, #4
+; BASE-ARM-NEXT:    bx lr
+;
+; BASE-THUMB-LABEL: bf_from_int_soft:
+; BASE-THUMB:       @ %bb.0:
+; BASE-THUMB-NEXT:    .pad #4
+; BASE-THUMB-NEXT:    sub sp, #4
+; BASE-THUMB-NEXT:    strh.w r0, [sp, #2]
+; BASE-THUMB-NEXT:    ldrh.w r0, [sp, #2]
+; BASE-THUMB-NEXT:    add sp, #4
+; BASE-THUMB-NEXT:    bx lr
+;
+; FULLFP16-LABEL: bf_from_int_soft:
+; FULLFP16:       @ %bb.0:
+; FULLFP16-NEXT:    vmov.f16 s0, r0
+; FULLFP16-NEXT:    vmov r0, s0
+; FULLFP16-NEXT:    bx lr
+  %h = trunc i32 %w to i16
+  %f = bitcast i16 %h to bfloat
+  ret bfloat %f
+}
+
+define arm_aapcs_vfpcc bfloat @bf_from_int_hard(i32 %w) {
+; BASE-ARM-LABEL: bf_from_int_hard:
+; BASE-ARM:       @ %bb.0:
+; BASE-ARM-NEXT:    .pad #4
+; BASE-ARM-NEXT:    sub sp, sp, #4
+; BASE-ARM-NEXT:    strh r0, [sp, #2]
+; BASE-ARM-NEXT:    ldrh r0, [sp, #2]
+; BASE-ARM-NEXT:    vmov s0, r0
+; BASE-ARM-NEXT:    add sp, sp, #4
+; BASE-ARM-NEXT:    bx lr
+;
+; BASE-THUMB-LABEL: bf_from_int_hard:
+; BASE-THUMB:       @ %bb.0:
+; BASE-THUMB-NEXT:    .pad #4
+; BASE-THUMB-NEXT:    sub sp, #4
+; BASE-THUMB-NEXT:    strh.w r0, [sp, #2]
+; BASE-THUMB-NEXT:    ldrh.w r0, [sp, #2]
+; BASE-THUMB-NEXT:    vmov s0, r0
+; BASE-THUMB-NEXT:    add sp, #4
+; BASE-THUMB-NEXT:    bx lr
+;
+; FULLFP16-LABEL: bf_from_int_hard:
+; FULLFP16:       @ %bb.0:
+; FULLFP16-NEXT:    vmov.f16 s0, r0
+; FULLFP16-NEXT:    bx lr
+  %h = trunc i32 %w to i16
+  %f = bitcast i16 %h to bfloat
+  ret bfloat %f
+}
+
+define bfloat @test_fncall_soft(bfloat %bf, bfloat (bfloat, bfloat)* %f) {
+; BASE-ARM-LABEL: test_fncall_soft:
+; BASE-ARM:       @ %bb.0:
+; BASE-ARM-NEXT:    .save {r4, r5, r11, lr}
+; BASE-ARM-NEXT:    push {r4, r5, r11, lr}
+; BASE-ARM-NEXT:    .pad #8
+; BASE-ARM-NEXT:    sub sp, sp, #8
+; BASE-ARM-NEXT:    uxth r5, r0
+; BASE-ARM-NEXT:    mov r4, r1
+; BASE-ARM-NEXT:    mov r0, r5
+; BASE-ARM-NEXT:    mov r1, r5
+; BASE-ARM-NEXT:    blx r4
+; BASE-ARM-NEXT:    strh r0, [sp, #6]
+; BASE-ARM-NEXT:    uxth r1, r0
+; BASE-ARM-NEXT:    mov r0, r5
+; BASE-ARM-NEXT:    blx r4
+; BASE-ARM-NEXT:    ldrh r0, [sp, #6]
+; BASE-ARM-NEXT:    add sp, sp, #8
+; BASE-ARM-NEXT:    pop {r4, r5, r11, pc}
+;
+; BASE-THUMB-LABEL: test_fncall_soft:
+; BASE-THUMB:       @ %bb.0:
+; BASE-THUMB-NEXT:    .save {r4, r5, r7, lr}
+; BASE-THUMB-NEXT:    push {r4, r5, r7, lr}
+; BASE-THUMB-NEXT:    .pad #8
+; BASE-THUMB-NEXT:    sub sp, #8
+; BASE-THUMB-NEXT:    uxth r5, r0
+; BASE-THUMB-NEXT:    mov r4, r1
+; BASE-THUMB-NEXT:    mov r0, r5
+; BASE-THUMB-NEXT:    mov r1, r5
+; BASE-THUMB-NEXT:    blx r4
+; BASE-THUMB-NEXT:    uxth r1, r0
+; BASE-THUMB-NEXT:    strh.w r0, [sp, #6]
+; BASE-THUMB-NEXT:    mov r0, r5
+; BASE-THUMB-NEXT:    blx r4
+; BASE-THUMB-NEXT:    ldrh.w r0, [sp, #6]
+; BASE-THUMB-NEXT:    add sp, #8
+; BASE-THUMB-NEXT:    pop {r4, r5, r7, pc}
+;
+; FULLFP16-ARM-LABEL: test_fncall_soft:
+; FULLFP16-ARM:       @ %bb.0:
+; FULLFP16-ARM-NEXT:    .save {r4, r5, r11, lr}
+; FULLFP16-ARM-NEXT:    push {r4, r5, r11, lr}
+; FULLFP16-ARM-NEXT:    .vsave {d8}
+; FULLFP16-ARM-NEXT:    vpush {d8}
+; FULLFP16-ARM-NEXT:    vmov.f16 s0, r0
+; FULLFP16-ARM-NEXT:    mov r4, r1
+; FULLFP16-ARM-NEXT:    vmov.f16 r5, s0
+; FULLFP16-ARM-NEXT:    mov r0, r5
+; FULLFP16-ARM-NEXT:    mov r1, r5
+; FULLFP16-ARM-NEXT:    blx r4
+; FULLFP16-ARM-NEXT:    vmov.f16 s16, r0
+; FULLFP16-ARM-NEXT:    mov r0, r5
+; FULLFP16-ARM-NEXT:    vmov.f16 r1, s16
+; FULLFP16-ARM-NEXT:    blx r4
+; FULLFP16-ARM-NEXT:    vmov r0, s16
+; FULLFP16-ARM-NEXT:    vpop {d8}
+; FULLFP16-ARM-NEXT:    pop {r4, r5, r11, pc}
+;
+; FULLFP16-THUMB-LABEL: test_fncall_soft:
+; FULLFP16-THUMB:       @ %bb.0:
+; FULLFP16-THUMB-NEXT:    .save {r4, r5, r7, lr}
+; FULLFP16-THUMB-NEXT:    push {r4, r5, r7, lr}
+; FULLFP16-THUMB-NEXT:    .vsave {d8}
+; FULLFP16-THUMB-NEXT:    vpush {d8}
+; FULLFP16-THUMB-NEXT:    vmov.f16 s0, r0
+; FULLFP16-THUMB-NEXT:    mov r4, r1
+; FULLFP16-THUMB-NEXT:    vmov.f16 r5, s0
+; FULLFP16-THUMB-NEXT:    mov r0, r5
+; FULLFP16-THUMB-NEXT:    mov r1, r5
+; FULLFP16-THUMB-NEXT:    blx r4
+; FULLFP16-THUMB-NEXT:    vmov.f16 s16, r0
+; FULLFP16-THUMB-NEXT:    mov r0, r5
+; FULLFP16-THUMB-NEXT:    vmov.f16 r1, s16
+; FULLFP16-THUMB-NEXT:    blx r4
+; FULLFP16-THUMB-NEXT:    vmov r0, s16
+; FULLFP16-THUMB-NEXT:    vpop {d8}
+; FULLFP16-THUMB-NEXT:    pop {r4, r5, r7, pc}
+  %call = tail call bfloat %f(bfloat %bf, bfloat %bf)
+  %call1 = tail call bfloat %f(bfloat %bf, bfloat %call)
+  ret bfloat %call
+}
+
+define arm_aapcs_vfpcc bfloat @test_fncall_hard(bfloat %bf, bfloat (bfloat, bfloat)* %f) {
+; BASE-ARM-LABEL: test_fncall_hard:
+; BASE-ARM:       @ %bb.0:
+; BASE-ARM-NEXT:    .save {r4, lr}
+; BASE-ARM-NEXT:    push {r4, lr}
+; BASE-ARM-NEXT:    .vsave {d8}
+; BASE-ARM-NEXT:    vpush {d8}
+; BASE-ARM-NEXT:    .pad #8
+; BASE-ARM-NEXT:    sub sp, sp, #8
+; BASE-ARM-NEXT:    mov r4, r0
+; BASE-ARM-NEXT:    vmov r0, s0
+; BASE-ARM-NEXT:    uxth r0, r0
+; BASE-ARM-NEXT:    vmov s16, r0
+; BASE-ARM-NEXT:    vmov.f32 s0, s16
+; BASE-ARM-NEXT:    vmov.f32 s1, s16
+; BASE-ARM-NEXT:    blx r4
+; BASE-ARM-NEXT:    vmov r0, s0
+; BASE-ARM-NEXT:    vmov.f32 s0, s16
+; BASE-ARM-NEXT:    strh r0, [sp, #6]
+; BASE-ARM-NEXT:    uxth r0, r0
+; BASE-ARM-NEXT:    vmov s1, r0
+; BASE-ARM-NEXT:    blx r4
+; BASE-ARM-NEXT:    ldrh r0, [sp, #6]
+; BASE-ARM-NEXT:    vmov s0, r0
+; BASE-ARM-NEXT:    add sp, sp, #8
+; BASE-ARM-NEXT:    vpop {d8}
+; BASE-ARM-NEXT:    pop {r4, pc}
+;
+; BASE-THUMB-LABEL: test_fncall_hard:
+; BASE-THUMB:       @ %bb.0:
+; BASE-THUMB-NEXT:    .save {r4, lr}
+; BASE-THUMB-NEXT:    push {r4, lr}
+; BASE-THUMB-NEXT:    .vsave {d8}
+; BASE-THUMB-NEXT:    vpush {d8}
+; BASE-THUMB-NEXT:    .pad #8
+; BASE-THUMB-NEXT:    sub sp, #8
+; BASE-THUMB-NEXT:    mov r4, r0
+; BASE-THUMB-NEXT:    vmov r0, s0
+; BASE-THUMB-NEXT:    uxth r0, r0
+; BASE-THUMB-NEXT:    vmov s16, r0
+; BASE-THUMB-NEXT:    vmov.f32 s0, s16
+; BASE-THUMB-NEXT:    vmov.f32 s1, s16
+; BASE-THUMB-NEXT:    blx r4
+; BASE-THUMB-NEXT:    vmov r0, s0
+; BASE-THUMB-NEXT:    vmov.f32 s0, s16
+; BASE-THUMB-NEXT:    strh.w r0, [sp, #6]
+; BASE-THUMB-NEXT:    uxth r0, r0
+; BASE-THUMB-NEXT:    vmov s1, r0
+; BASE-THUMB-NEXT:    blx r4
+; BASE-THUMB-NEXT:    ldrh.w r0, [sp, #6]
+; BASE-THUMB-NEXT:    vmov s0, r0
+; BASE-THUMB-NEXT:    add sp, #8
+; BASE-THUMB-NEXT:    vpop {d8}
+; BASE-THUMB-NEXT:    pop {r4, pc}
+;
+; FULLFP16-LABEL: test_fncall_hard:
+; FULLFP16:       @ %bb.0:
+; FULLFP16-NEXT:    .save {r4, lr}
+; FULLFP16-NEXT:    push {r4, lr}
+; FULLFP16-NEXT:    .vsave {d8, d9}
+; FULLFP16-NEXT:    vpush {d8, d9}
+; FULLFP16-NEXT:    mov r4, r0
+; FULLFP16-NEXT:    vmov.f16 r0, s0
+; FULLFP16-NEXT:    vmov s16, r0
+; FULLFP16-NEXT:    vmov.f32 s0, s16
+; FULLFP16-NEXT:    vmov.f32 s1, s16
+; FULLFP16-NEXT:    blx r4
+; FULLFP16-NEXT:    vmov.f16 r0, s0
+; FULLFP16-NEXT:    vmov.f32 s18, s0
+; FULLFP16-NEXT:    vmov.f32 s0, s16
+; FULLFP16-NEXT:    vmov s1, r0
+; FULLFP16-NEXT:    blx r4
+; FULLFP16-NEXT:    vmov.f32 s0, s18
+; FULLFP16-NEXT:    vpop {d8, d9}
+; FULLFP16-NEXT:    pop {r4, pc}
+  %call = tail call arm_aapcs_vfpcc bfloat %f(bfloat %bf, bfloat %bf)
+  %call1 = tail call arm_aapcs_vfpcc bfloat %f(bfloat %bf, bfloat %call)
+  ret bfloat %call
+}
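Since the CHECK blocks above are autogenerated (see the NOTE header), they are best refreshed by rerunning utils/update_llc_test_checks.py on llvm/test/CodeGen/ARM/arm-bf16-pcs.ll against a rebuilt llc, rather than edited by hand. The soft/hard test pairs cover both sides of the procedure call standard: under the soft-float ABI a bfloat is passed and returned in the low 16 bits of a core register (hence the uxth/strh/ldrh sequences in the BASE checks), while under the hard-float ABI it lives in the low half of s0, reaching a core register only through a vmov when 16-bit FP registers are unavailable.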