diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3586,6 +3586,24 @@ // the SelectionDAGBuilder code knows how to lower these. // + /// Target-specific splitting of values into parts that fit a register + /// storing a legal type + virtual bool splitValueIntoRegisterParts(SelectionDAG &DAG, const SDLoc &DL, + SDValue Val, SDValue *Parts, + unsigned NumParts, MVT PartVT, + Optional CC) const { + return false; + } + + /// Target-specific combining of register parts into its original value + virtual SDValue + joinRegisterPartsIntoValue(SelectionDAG &DAG, const SDLoc &DL, + const SDValue *Parts, unsigned NumParts, + MVT PartVT, EVT ValueVT, + Optional CC) const { + return SDValue(); + } + /// This hook must be implemented to lower the incoming (formal) arguments, /// described by the Ins array, into the specified DAG. The implementation /// should fill in the InVals array with legal-type argument values, and diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -205,12 +205,17 @@ MVT PartVT, EVT ValueVT, const Value *V, Optional CC = None, Optional AssertOp = None) { + // Let the target assemble the parts if it wants to + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (SDValue Val = TLI.joinRegisterPartsIntoValue(DAG, DL, Parts, NumParts, + PartVT, ValueVT, CC)) + return Val; + if (ValueVT.isVector()) return getCopyFromPartsVector(DAG, DL, Parts, NumParts, PartVT, ValueVT, V, CC); assert(NumParts > 0 && "No parts to assemble!"); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Val = Parts[0]; if (NumParts > 1) { @@ -512,6 +517,11 @@ const Value *V, Optional CallConv = None, ISD::NodeType ExtendKind = ISD::ANY_EXTEND) { + // Let the target split the parts if it wants to + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.splitValueIntoRegisterParts(DAG, DL, Val, Parts, NumParts, PartVT, + CallConv)) + return; EVT ValueVT = Val.getValueType(); // Handle the vector case separately. @@ -913,7 +923,6 @@ if (ExtendKind == ISD::ANY_EXTEND && TLI.isZExtFree(Val, RegisterVT)) ExtendKind = ISD::ZERO_EXTEND; - getCopyToParts(DAG, dl, Val.getValue(Val.getResNo() + Value), &Parts[Part], NumParts, RegisterVT, V, CallConv, ExtendKind); Part += NumParts; diff --git a/llvm/lib/Target/ARM/ARMCallingConv.cpp b/llvm/lib/Target/ARM/ARMCallingConv.cpp --- a/llvm/lib/Target/ARM/ARMCallingConv.cpp +++ b/llvm/lib/Target/ARM/ARMCallingConv.cpp @@ -13,8 +13,11 @@ #include "ARM.h" #include "ARMCallingConv.h" +#include "ARMISelLowering.h" #include "ARMSubtarget.h" #include "ARMRegisterInfo.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/MC/MCRegister.h" using namespace llvm; // APCS f64 is in register pairs, possibly split to stack @@ -280,5 +283,35 @@ return true; } +static bool CustomAssignInRegList(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, CCState &State, + ArrayRef RegList) { + unsigned Reg = State.AllocateReg(RegList); + if (Reg) { + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + } else { + State.addLoc(CCValAssign::getCustomMem( + ValNo, ValVT, State.AllocateStack(4, 4), LocVT, LocInfo)); + } + return true; +} + +static bool CC_ARM_AAPCS_Custom_f16(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State) { + // f16 arguments are extended to i32 and assigned to a register in [r0, r3] + return CustomAssignInRegList(ValNo, ValVT, MVT::i32, LocInfo, State, + RRegList); +} + +static bool CC_ARM_AAPCS_VFP_Custom_f16(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, + CCState &State) { + // f16 arguments are extended to f32 and assigned to a register in [s0, s15] + return CustomAssignInRegList(ValNo, ValVT, MVT::f32, LocInfo, State, + SRegList); +} + // Include the table generated calling convention implementations. #include "ARMGenCallingConv.inc" diff --git a/llvm/lib/Target/ARM/ARMCallingConv.td b/llvm/lib/Target/ARM/ARMCallingConv.td --- a/llvm/lib/Target/ARM/ARMCallingConv.td +++ b/llvm/lib/Target/ARM/ARMCallingConv.td @@ -176,6 +176,7 @@ CCIfType<[f64, v2f64], CCCustom<"CC_ARM_AAPCS_Custom_f64">>, CCIfType<[f32], CCBitConvertToType>, + CCIfType<[f16], CCCustom<"CC_ARM_AAPCS_Custom_f16">>, CCDelegateTo ]>; @@ -193,6 +194,7 @@ CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>, CCIfType<[f32], CCBitConvertToType>, + CCIfType<[f16], CCCustom<"CC_ARM_AAPCS_Custom_f16">>, CCDelegateTo ]>; @@ -224,6 +226,7 @@ CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15]>>, + CCIfType<[f16], CCCustom<"CC_ARM_AAPCS_VFP_Custom_f16">>, CCDelegateTo ]>; @@ -243,6 +246,7 @@ CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15]>>, + CCIfType<[f16], CCCustom<"CC_ARM_AAPCS_VFP_Custom_f16">>, CCDelegateTo ]>; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -806,6 +806,15 @@ MachineBasicBlock *Entry, const SmallVectorImpl &Exits) const override; + bool + splitValueIntoRegisterParts(SelectionDAG &DAG, const SDLoc &DL, SDValue Val, + SDValue *Parts, unsigned NumParts, MVT PartVT, + Optional CC) const override; + + SDValue joinRegisterPartsIntoValue(SelectionDAG &DAG, const SDLoc &DL, + const SDValue *Parts, unsigned NumParts, + MVT PartVT, EVT ValueVT, Optional CC) const override; + SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -2024,7 +2024,8 @@ } SDValue Val; - if (VA.needsCustom()) { + if (VA.needsCustom() && + ((VA.getLocVT() == MVT::f64) || (VA.getLocVT() == MVT::v2f64))) { // Handle f64 or half of a v2f64. SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); @@ -2073,6 +2074,17 @@ break; } + // f16 arguments have their size extended to 4 bytes and passed as if they + // had been copied to the LSBs of a 32-bit register. + // For that, we extend it to f32 (hard ABI) or i32 (soft ABI) + if (VA.needsCustom() && VA.getValVT() == MVT::f16) { + Val = DAG.getNode(ISD::BITCAST, dl, + MVT::getIntegerVT(VA.getLocVT().getSizeInBits()), Val); + Val = DAG.getNode(ISD::TRUNCATE, dl, + MVT::getIntegerVT(VA.getValVT().getSizeInBits()), Val); + Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val); + } + InVals.push_back(Val); } @@ -2241,9 +2253,19 @@ break; } + // f16 arguments have their size extended to 4 bytes and passed as if they + // had been copied to the LSBs of a 32-bit register. + // For that, we extend it to f32 (hard ABI) or i32 (soft ABI) + if (VA.needsCustom() && VA.getValVT() == MVT::f16) { + Arg = DAG.getNode(ISD::BITCAST, dl, + MVT::getIntegerVT(VA.getValVT().getSizeInBits()), Arg); + Arg = DAG.getNode(ISD::ANY_EXTEND, dl, + MVT::getIntegerVT(VA.getLocVT().getSizeInBits()), Arg); + Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); + } + // f64 and v2f64 might be passed in i32 pairs and must be split into pieces - if (VA.needsCustom()) { - if (VA.getLocVT() == MVT::v2f64) { + if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) { SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, DAG.getConstant(0, dl, MVT::i32)); SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, @@ -2262,10 +2284,9 @@ MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1, dl, DAG, VA, Flags)); } - } else { + } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) { PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); - } } else if (VA.isRegLoc()) { if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() && Outs[0].VT == MVT::i32) { @@ -2755,7 +2776,8 @@ ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; if (VA.getLocInfo() == CCValAssign::Indirect) return false; - if (VA.needsCustom()) { + if (VA.needsCustom() && + ((RegVT == MVT::f64) || (RegVT == MVT::v2f64))) { // f64 and vector types are split into multiple registers or // register/stack-slot combinations. The types will not match // the registers; give up on memory f64 refs until we figure @@ -2907,23 +2929,24 @@ break; } - if (VA.needsCustom()) { + if (VA.needsCustom() && + (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) { if (VA.getLocVT() == MVT::v2f64) { // Extract the first half and return it in two registers. SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, - DAG.getConstant(0, dl, MVT::i32)); + DAG.getConstant(0, dl, MVT::i32)); SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl, - DAG.getVTList(MVT::i32, MVT::i32), Half); + DAG.getVTList(MVT::i32, MVT::i32), Half); Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), - HalfGPRs.getValue(isLittleEndian ? 0 : 1), - Flag); + HalfGPRs.getValue(isLittleEndian ? 0 : 1), + Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); VA = RVLocs[++i]; // skip ahead to next loc Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), - HalfGPRs.getValue(isLittleEndian ? 1 : 0), - Flag); + HalfGPRs.getValue(isLittleEndian ? 1 : 0), + Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); VA = RVLocs[++i]; // skip ahead to next loc @@ -4080,6 +4103,40 @@ AFI->setVarArgsFrameIndex(FrameIndex); } +bool ARMTargetLowering::splitValueIntoRegisterParts( + SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts, + unsigned NumParts, MVT PartVT, Optional CC) const { + bool IsABIRegCopy = CC.hasValue(); + EVT ValueVT = Val.getValueType(); + if (IsABIRegCopy && ValueVT == MVT::f16 && PartVT == MVT::f32) { + unsigned ValueBits = ValueVT.getSizeInBits(); + unsigned PartBits = PartVT.getSizeInBits(); + Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val); + Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val); + Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val); + Parts[0] = Val; + return true; + } + return false; +} + +SDValue ARMTargetLowering::joinRegisterPartsIntoValue( + SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts, + MVT PartVT, EVT ValueVT, Optional CC) const { + bool IsABIRegCopy = CC.hasValue(); + if (IsABIRegCopy && ValueVT == MVT::f16 && PartVT == MVT::f32) { + unsigned ValueBits = ValueVT.getSizeInBits(); + unsigned PartBits = PartVT.getSizeInBits(); + SDValue Val = Parts[0]; + + Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val); + Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val); + Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); + return Val; + } + return SDValue(); +} + SDValue ARMTargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &dl, @@ -4152,33 +4209,32 @@ if (VA.isRegLoc()) { EVT RegVT = VA.getLocVT(); - if (VA.needsCustom()) { + if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) { // f64 and vector types are split up into multiple registers or // combinations of registers and stack slots. - if (VA.getLocVT() == MVT::v2f64) { - SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i], - Chain, DAG, dl); - VA = ArgLocs[++i]; // skip ahead to next loc - SDValue ArgValue2; - if (VA.isMemLoc()) { - int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true); - SDValue FIN = DAG.getFrameIndex(FI, PtrVT); - ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN, - MachinePointerInfo::getFixedStack( - DAG.getMachineFunction(), FI)); - } else { - ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], - Chain, DAG, dl); - } - ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); - ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, - ArgValue, ArgValue1, - DAG.getIntPtrConstant(0, dl)); - ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, - ArgValue, ArgValue2, - DAG.getIntPtrConstant(1, dl)); - } else - ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); + SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i], + Chain, DAG, dl); + VA = ArgLocs[++i]; // skip ahead to next loc + SDValue ArgValue2; + if (VA.isMemLoc()) { + int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true); + SDValue FIN = DAG.getFrameIndex(FI, PtrVT); + ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN, + MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), FI)); + } else { + ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], + Chain, DAG, dl); + } + ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); + ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, + ArgValue, ArgValue1, + DAG.getIntPtrConstant(0, dl)); + ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, + ArgValue, ArgValue2, + DAG.getIntPtrConstant(1, dl)); + } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) { + ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); } else { const TargetRegisterClass *RC; @@ -4229,6 +4285,19 @@ break; } + // f16 arguments have their size extended to 4 bytes and passed as if they + // had been copied to the LSBs of a 32-bit register. + // For that, we extend it to f32 (hard ABI) or i32 (soft ABI) + if (VA.needsCustom() && VA.getValVT() == MVT::f16) { + ArgValue = DAG.getNode(ISD::BITCAST, dl, + MVT::getIntegerVT(VA.getLocVT().getSizeInBits()), + ArgValue); + ArgValue = DAG.getNode(ISD::TRUNCATE, dl, + MVT::getIntegerVT(VA.getValVT().getSizeInBits()), + ArgValue); + ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); + } + InVals.push_back(ArgValue); } else { // VA.isRegLoc() // sanity check diff --git a/llvm/test/CodeGen/ARM/fp16-args.ll b/llvm/test/CodeGen/ARM/fp16-args.ll --- a/llvm/test/CodeGen/ARM/fp16-args.ll +++ b/llvm/test/CodeGen/ARM/fp16-args.ll @@ -1,41 +1,46 @@ -; RUN: llc -float-abi soft -mattr=+fp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=SOFT -; RUN: llc -float-abi hard -mattr=+fp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=HARD +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=armv7a--none-eabi -float-abi soft -mattr=+fp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=SOFT +; RUN: llc -mtriple=armv7a--none-eabi -float-abi hard -mattr=+fp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=HARD +; RUN: llc -mtriple=armv7a--none-eabi -float-abi soft -mattr=+fullfp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=FULL-SOFT +; RUN: llc -mtriple=armv7a--none-eabi -float-abi hard -mattr=+fullfp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=FULL-HARD +; RUN: llc -mtriple=armv7aeb--none-eabi -float-abi soft -mattr=+fp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=SOFT +; RUN: llc -mtriple=armv7aeb--none-eabi -float-abi hard -mattr=+fp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=HARD +; RUN: llc -mtriple=armv7aeb--none-eabi -float-abi soft -mattr=+fullfp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=FULL-SOFT +; RUN: llc -mtriple=armv7aeb--none-eabi -float-abi hard -mattr=+fullfp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=FULL-HARD -target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" -target triple = "armv7a--none-eabi" - -define float @foo(float %a.coerce, float %b.coerce) { +define half @foo(half %a, half %b) { +; SOFT-LABEL: foo: +; SOFT: @ %bb.0: @ %entry +; SOFT-NEXT: vmov s2, r1 +; SOFT-NEXT: vmov s0, r0 +; SOFT-NEXT: vcvtb.f32.f16 s2, s2 +; SOFT-NEXT: vcvtb.f32.f16 s0, s0 +; SOFT-NEXT: vadd.f32 s0, s0, s2 +; SOFT-NEXT: vcvtb.f16.f32 s0, s0 +; SOFT-NEXT: vmov r0, s0 +; SOFT-NEXT: bx lr +; +; HARD-LABEL: foo: +; HARD: @ %bb.0: @ %entry +; HARD-NEXT: vcvtb.f32.f16 s2, s1 +; HARD-NEXT: vcvtb.f32.f16 s0, s0 +; HARD-NEXT: vadd.f32 s0, s0, s2 +; HARD-NEXT: vcvtb.f16.f32 s0, s0 +; HARD-NEXT: bx lr +; +; FULL-SOFT-LABEL: foo: +; FULL-SOFT: @ %bb.0: @ %entry +; FULL-SOFT-NEXT: vmov.f16 s0, r1 +; FULL-SOFT-NEXT: vmov.f16 s2, r0 +; FULL-SOFT-NEXT: vadd.f16 s0, s2, s0 +; FULL-SOFT-NEXT: vmov r0, s0 +; FULL-SOFT-NEXT: bx lr +; +; FULL-HARD-LABEL: foo: +; FULL-HARD: @ %bb.0: @ %entry +; FULL-HARD-NEXT: vadd.f16 s0, s0, s1 +; FULL-HARD-NEXT: bx lr entry: - %0 = bitcast float %a.coerce to i32 - %tmp.0.extract.trunc = trunc i32 %0 to i16 - %1 = bitcast i16 %tmp.0.extract.trunc to half - %2 = bitcast float %b.coerce to i32 - %tmp1.0.extract.trunc = trunc i32 %2 to i16 - %3 = bitcast i16 %tmp1.0.extract.trunc to half - %4 = fadd half %1, %3 - %5 = bitcast half %4 to i16 - %tmp5.0.insert.ext = zext i16 %5 to i32 - %6 = bitcast i32 %tmp5.0.insert.ext to float - ret float %6 -; CHECK: foo: - -; SOFT: vmov {{s[0-9]+}}, r1 -; SOFT: vmov {{s[0-9]+}}, r0 -; SOFT: vcvtb.f32.f16 {{s[0-9]+}}, {{s[0-9]+}} -; SOFT: vcvtb.f32.f16 {{s[0-9]+}}, {{s[0-9]+}} -; SOFT: vadd.f32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} -; SOFT: vcvtb.f16.f32 {{s[0-9]+}}, {{s[0-9]+}} -; SOFT: vmov r0, {{s[0-9]+}} - -; HARD-NOT: vmov -; HARD-NOT: uxth -; HARD: vcvtb.f32.f16 {{s[0-9]+}}, s1 -; HARD: vcvtb.f32.f16 {{s[0-9]+}}, s0 -; HARD: vadd.f32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} -; HARD: vcvtb.f16.f32 [[SREG:s[0-9]+]], {{s[0-9]+}} -; HARD-NEXT: vmov [[REG0:r[0-9]+]], [[SREG]] -; HARD-NEXT: uxth [[REG1:r[0-9]+]], [[REG0]] -; HARD-NEXT: vmov s0, [[REG1]] - -; CHECK: bx lr + %0 = fadd half %a, %b + ret half %0 } diff --git a/llvm/test/CodeGen/ARM/fp16-bitcast.ll b/llvm/test/CodeGen/ARM/fp16-bitcast.ll --- a/llvm/test/CodeGen/ARM/fp16-bitcast.ll +++ b/llvm/test/CodeGen/ARM/fp16-bitcast.ll @@ -1,71 +1,115 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple thumbv8m.main-arm-unknown-eabi -mattr=+vfp4d16sp < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-VFPV4 -; RUN: llc -mtriple thumbv8.1m.main-arm-unknown-eabi -mattr=+fullfp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP16 +; RUN: llc -mtriple thumbv8m.main-arm-unknown-eabi --float-abi=soft -mattr=+vfp4d16sp < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-VFPV4-SOFT +; RUN: llc -mtriple thumbv8.1m.main-arm-unknown-eabi --float-abi=soft -mattr=+fullfp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP16-SOFT +; RUN: llc -mtriple thumbv8m.main-arm-unknown-eabi --float-abi=hard -mattr=+vfp4d16sp < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-VFPV4-HARD +; RUN: llc -mtriple thumbv8.1m.main-arm-unknown-eabi --float-abi=hard -mattr=+fullfp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP16-HARD target triple = "thumbv8.1m.main-arm-unknown-eabi" define float @add(float %a, float %b) { -; CHECK-LABEL: add: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov s0, r1 -; CHECK-NEXT: vmov s2, r0 -; CHECK-NEXT: vadd.f32 s0, s2, s0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: bx lr +; CHECK-VFPV4-SOFT-LABEL: add: +; CHECK-VFPV4-SOFT: @ %bb.0: @ %entry +; CHECK-VFPV4-SOFT-NEXT: vmov s0, r1 +; CHECK-VFPV4-SOFT-NEXT: vmov s2, r0 +; CHECK-VFPV4-SOFT-NEXT: vadd.f32 s0, s2, s0 +; CHECK-VFPV4-SOFT-NEXT: vmov r0, s0 +; CHECK-VFPV4-SOFT-NEXT: bx lr +; +; CHECK-FP16-SOFT-LABEL: add: +; CHECK-FP16-SOFT: @ %bb.0: @ %entry +; CHECK-FP16-SOFT-NEXT: vmov s0, r1 +; CHECK-FP16-SOFT-NEXT: vmov s2, r0 +; CHECK-FP16-SOFT-NEXT: vadd.f32 s0, s2, s0 +; CHECK-FP16-SOFT-NEXT: vmov r0, s0 +; CHECK-FP16-SOFT-NEXT: bx lr +; +; CHECK-VFPV4-HARD-LABEL: add: +; CHECK-VFPV4-HARD: @ %bb.0: @ %entry +; CHECK-VFPV4-HARD-NEXT: vadd.f32 s0, s0, s1 +; CHECK-VFPV4-HARD-NEXT: bx lr +; +; CHECK-FP16-HARD-LABEL: add: +; CHECK-FP16-HARD: @ %bb.0: @ %entry +; CHECK-FP16-HARD-NEXT: vadd.f32 s0, s0, s1 +; CHECK-FP16-HARD-NEXT: bx lr entry: %add = fadd float %a, %b ret float %add } -define i32 @addf16(i32 %a.coerce, i32 %b.coerce) { -; CHECK-VFPV4-LABEL: addf16: -; CHECK-VFPV4: @ %bb.0: @ %entry -; CHECK-VFPV4-NEXT: vmov s2, r1 -; CHECK-VFPV4-NEXT: vmov s0, r0 -; CHECK-VFPV4-NEXT: vcvtb.f32.f16 s2, s2 -; CHECK-VFPV4-NEXT: vcvtb.f32.f16 s0, s0 -; CHECK-VFPV4-NEXT: vadd.f32 s0, s0, s2 -; CHECK-VFPV4-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-VFPV4-NEXT: vmov r0, s0 -; CHECK-VFPV4-NEXT: uxth r0, r0 -; CHECK-VFPV4-NEXT: bx lr +define half @addf16(half %a, half %b) { +; CHECK-VFPV4-SOFT-LABEL: addf16: +; CHECK-VFPV4-SOFT: @ %bb.0: @ %entry +; CHECK-VFPV4-SOFT-NEXT: vmov s2, r1 +; CHECK-VFPV4-SOFT-NEXT: vmov s0, r0 +; CHECK-VFPV4-SOFT-NEXT: vcvtb.f32.f16 s2, s2 +; CHECK-VFPV4-SOFT-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-VFPV4-SOFT-NEXT: vadd.f32 s0, s0, s2 +; CHECK-VFPV4-SOFT-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-VFPV4-SOFT-NEXT: vmov r0, s0 +; CHECK-VFPV4-SOFT-NEXT: bx lr ; -; CHECK-FP16-LABEL: addf16: -; CHECK-FP16: @ %bb.0: @ %entry -; CHECK-FP16-NEXT: vmov.f16 s0, r1 -; CHECK-FP16-NEXT: vmov.f16 s2, r0 -; CHECK-FP16-NEXT: vadd.f16 s0, s2, s0 -; CHECK-FP16-NEXT: vmov.f16 r0, s0 -; CHECK-FP16-NEXT: bx lr +; CHECK-FP16-SOFT-LABEL: addf16: +; CHECK-FP16-SOFT: @ %bb.0: @ %entry +; CHECK-FP16-SOFT-NEXT: vmov.f16 s0, r1 +; CHECK-FP16-SOFT-NEXT: vmov.f16 s2, r0 +; CHECK-FP16-SOFT-NEXT: vadd.f16 s0, s2, s0 +; CHECK-FP16-SOFT-NEXT: vmov r0, s0 +; CHECK-FP16-SOFT-NEXT: bx lr +; +; CHECK-VFPV4-HARD-LABEL: addf16: +; CHECK-VFPV4-HARD: @ %bb.0: @ %entry +; CHECK-VFPV4-HARD-NEXT: vcvtb.f32.f16 s2, s1 +; CHECK-VFPV4-HARD-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-VFPV4-HARD-NEXT: vadd.f32 s0, s0, s2 +; CHECK-VFPV4-HARD-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-VFPV4-HARD-NEXT: bx lr +; +; CHECK-FP16-HARD-LABEL: addf16: +; CHECK-FP16-HARD: @ %bb.0: @ %entry +; CHECK-FP16-HARD-NEXT: vadd.f16 s0, s0, s1 +; CHECK-FP16-HARD-NEXT: bx lr entry: - %tmp.0.extract.trunc = trunc i32 %a.coerce to i16 - %0 = bitcast i16 %tmp.0.extract.trunc to half - %tmp1.0.extract.trunc = trunc i32 %b.coerce to i16 - %1 = bitcast i16 %tmp1.0.extract.trunc to half - %add = fadd half %0, %1 - %2 = bitcast half %add to i16 - %tmp4.0.insert.ext = zext i16 %2 to i32 - ret i32 %tmp4.0.insert.ext + %add = fadd half %a, %b + ret half %add } define half @load_i16(i16 *%hp) { -; CHECK-VFPV4-LABEL: load_i16: -; CHECK-VFPV4: @ %bb.0: @ %entry -; CHECK-VFPV4-NEXT: vmov.f32 s0, #1.000000e+00 -; CHECK-VFPV4-NEXT: ldrh r0, [r0] -; CHECK-VFPV4-NEXT: vmov s2, r0 -; CHECK-VFPV4-NEXT: vcvtb.f32.f16 s2, s2 -; CHECK-VFPV4-NEXT: vadd.f32 s0, s2, s0 -; CHECK-VFPV4-NEXT: vmov r0, s0 -; CHECK-VFPV4-NEXT: bx lr +; CHECK-VFPV4-SOFT-LABEL: load_i16: +; CHECK-VFPV4-SOFT: @ %bb.0: @ %entry +; CHECK-VFPV4-SOFT-NEXT: vmov.f32 s0, #1.000000e+00 +; CHECK-VFPV4-SOFT-NEXT: ldrh r0, [r0] +; CHECK-VFPV4-SOFT-NEXT: vmov s2, r0 +; CHECK-VFPV4-SOFT-NEXT: vcvtb.f32.f16 s2, s2 +; CHECK-VFPV4-SOFT-NEXT: vadd.f32 s0, s2, s0 +; CHECK-VFPV4-SOFT-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-VFPV4-SOFT-NEXT: vmov r0, s0 +; CHECK-VFPV4-SOFT-NEXT: bx lr +; +; CHECK-FP16-SOFT-LABEL: load_i16: +; CHECK-FP16-SOFT: @ %bb.0: @ %entry +; CHECK-FP16-SOFT-NEXT: vldr.16 s2, [r0] +; CHECK-FP16-SOFT-NEXT: vmov.f16 s0, #1.000000e+00 +; CHECK-FP16-SOFT-NEXT: vadd.f16 s0, s2, s0 +; CHECK-FP16-SOFT-NEXT: vmov r0, s0 +; CHECK-FP16-SOFT-NEXT: bx lr +; +; CHECK-VFPV4-HARD-LABEL: load_i16: +; CHECK-VFPV4-HARD: @ %bb.0: @ %entry +; CHECK-VFPV4-HARD-NEXT: vmov.f32 s0, #1.000000e+00 +; CHECK-VFPV4-HARD-NEXT: ldrh r0, [r0] +; CHECK-VFPV4-HARD-NEXT: vmov s2, r0 +; CHECK-VFPV4-HARD-NEXT: vcvtb.f32.f16 s2, s2 +; CHECK-VFPV4-HARD-NEXT: vadd.f32 s0, s2, s0 +; CHECK-VFPV4-HARD-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-VFPV4-HARD-NEXT: bx lr ; -; CHECK-FP16-LABEL: load_i16: -; CHECK-FP16: @ %bb.0: @ %entry -; CHECK-FP16-NEXT: vldr.16 s2, [r1] -; CHECK-FP16-NEXT: vmov.f16 s0, #1.000000e+00 -; CHECK-FP16-NEXT: vadd.f16 s0, s2, s0 -; CHECK-FP16-NEXT: vstr.16 s0, [r0] -; CHECK-FP16-NEXT: bx lr +; CHECK-FP16-HARD-LABEL: load_i16: +; CHECK-FP16-HARD: @ %bb.0: @ %entry +; CHECK-FP16-HARD-NEXT: vldr.16 s2, [r0] +; CHECK-FP16-HARD-NEXT: vmov.f16 s0, #1.000000e+00 +; CHECK-FP16-HARD-NEXT: vadd.f16 s0, s2, s0 +; CHECK-FP16-HARD-NEXT: bx lr entry: %h = load i16, i16 *%hp, align 2 %hc = bitcast i16 %h to half diff --git a/llvm/test/CodeGen/ARM/fp16-promote.ll b/llvm/test/CodeGen/ARM/fp16-promote.ll --- a/llvm/test/CodeGen/ARM/fp16-promote.ll +++ b/llvm/test/CodeGen/ARM/fp16-promote.ll @@ -933,7 +933,6 @@ } ; CHECK-ALL-LABEL: test_struct_return: -; CHECK-FP16: vcvtb.f32.f16 ; CHECK-VFP-LIBCALL: bl __aeabi_h2f ; CHECK-NOVFP-DAG: ldr ; CHECK-NOVFP-DAG: ldrh diff --git a/llvm/test/CodeGen/ARM/fp16-vminmaxnm-safe.ll b/llvm/test/CodeGen/ARM/fp16-vminmaxnm-safe.ll --- a/llvm/test/CodeGen/ARM/fp16-vminmaxnm-safe.ll +++ b/llvm/test/CodeGen/ARM/fp16-vminmaxnm-safe.ll @@ -2,266 +2,235 @@ ; RUN: llc < %s -mtriple=armv8-eabi -mattr=+fullfp16 | FileCheck %s ; RUN: llc < %s -mtriple thumbv7a -mattr=+fullfp16 | FileCheck %s -; TODO: we can't pass half-precision arguments as "half" types yet. We do -; that for the time being by passing "float %f.coerce" and the necessary -; bitconverts/truncates. In these tests we pass i16 and use 1 bitconvert, which -; is the shortest way to get a half type. But when we can pass half types, we -; want to use that here. - -define half @fp16_vminnm_o(i16 signext %a, i16 signext %b) { +define half @fp16_vminnm_o(half %a, half %b) { ; CHECK-LABEL: fp16_vminnm_o: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f16 s0, r2 -; CHECK-NEXT: vmov.f16 s2, r1 +; CHECK-NEXT: vmov.f16 s0, r1 +; CHECK-NEXT: vmov.f16 s2, r0 ; CHECK-NEXT: vcmp.f16 s0, s2 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselgt.f16 s0, s2, s0 -; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bx lr entry: - %0 = bitcast i16 %a to half - %1 = bitcast i16 %b to half - %cmp = fcmp olt half %0, %1 - %cond = select i1 %cmp, half %0, half %1 + %cmp = fcmp olt half %a, %b + %cond = select i1 %cmp, half %a, half %b ret half %cond } -define half @fp16_vminnm_o_rev(i16 signext %a, i16 signext %b) { +define half @fp16_vminnm_o_rev(half %a, half %b) { ; CHECK-LABEL: fp16_vminnm_o_rev: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f16 s0, r2 -; CHECK-NEXT: vmov.f16 s2, r1 +; CHECK-NEXT: vmov.f16 s0, r1 +; CHECK-NEXT: vmov.f16 s2, r0 ; CHECK-NEXT: vcmp.f16 s2, s0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselgt.f16 s0, s2, s0 -; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bx lr entry: - %0 = bitcast i16 %a to half - %1 = bitcast i16 %b to half - %cmp = fcmp ogt half %0, %1 - %cond = select i1 %cmp, half %0, half %1 + %cmp = fcmp ogt half %a, %b + %cond = select i1 %cmp, half %a, half %b ret half %cond } -define half @fp16_vminnm_u(i16 signext %a, i16 signext %b) { +define half @fp16_vminnm_u(half %a, half %b) { ; CHECK-LABEL: fp16_vminnm_u: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f16 s0, r1 -; CHECK-NEXT: vmov.f16 s2, r2 +; CHECK-NEXT: vmov.f16 s0, r0 +; CHECK-NEXT: vmov.f16 s2, r1 ; CHECK-NEXT: vcmp.f16 s0, s2 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselge.f16 s0, s2, s0 -; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bx lr entry: - %0 = bitcast i16 %a to half - %1 = bitcast i16 %b to half - %cmp = fcmp ult half %0, %1 - %cond = select i1 %cmp, half %0, half %1 + %cmp = fcmp ult half %a, %b + %cond = select i1 %cmp, half %a, half %b ret half %cond } -define half @fp16_vminnm_ule(i16 signext %a, i16 signext %b) { +define half @fp16_vminnm_ule(half %a, half %b) { ; CHECK-LABEL: fp16_vminnm_ule: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f16 s0, r1 -; CHECK-NEXT: vmov.f16 s2, r2 +; CHECK-NEXT: vmov.f16 s0, r0 +; CHECK-NEXT: vmov.f16 s2, r1 ; CHECK-NEXT: vcmp.f16 s0, s2 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselgt.f16 s0, s2, s0 -; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bx lr entry: - %0 = bitcast i16 %a to half - %1 = bitcast i16 %b to half - %cmp = fcmp ule half %0, %1 - %cond = select i1 %cmp, half %0, half %1 + %cmp = fcmp ule half %a, %b + %cond = select i1 %cmp, half %a, half %b ret half %cond } -define half @fp16_vminnm_u_rev(i16 signext %a, i16 signext %b) { +define half @fp16_vminnm_u_rev(half %a, half %b) { ; CHECK-LABEL: fp16_vminnm_u_rev: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f16 s0, r2 -; CHECK-NEXT: vmov.f16 s2, r1 +; CHECK-NEXT: vmov.f16 s0, r1 +; CHECK-NEXT: vmov.f16 s2, r0 ; CHECK-NEXT: vcmp.f16 s0, s2 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselge.f16 s0, s2, s0 -; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bx lr entry: - %0 = bitcast i16 %a to half - %1 = bitcast i16 %b to half - %cmp = fcmp ugt half %0, %1 - %cond = select i1 %cmp, half %1, half %0 + %cmp = fcmp ugt half %a, %b + %cond = select i1 %cmp, half %b, half %a ret half %cond } -define half @fp16_vmaxnm_o(i16 signext %a, i16 signext %b) { +define half @fp16_vmaxnm_o(half %a, half %b) { ; CHECK-LABEL: fp16_vmaxnm_o: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f16 s0, r2 -; CHECK-NEXT: vmov.f16 s2, r1 +; CHECK-NEXT: vmov.f16 s0, r1 +; CHECK-NEXT: vmov.f16 s2, r0 ; CHECK-NEXT: vcmp.f16 s2, s0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselgt.f16 s0, s2, s0 -; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bx lr entry: - %0 = bitcast i16 %a to half - %1 = bitcast i16 %b to half - %cmp = fcmp ogt half %0, %1 - %cond = select i1 %cmp, half %0, half %1 + %cmp = fcmp ogt half %a, %b + %cond = select i1 %cmp, half %a, half %b ret half %cond } -define half @fp16_vmaxnm_oge(i16 signext %a, i16 signext %b) { +define half @fp16_vmaxnm_oge(half %a, half %b) { ; CHECK-LABEL: fp16_vmaxnm_oge: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f16 s0, r2 -; CHECK-NEXT: vmov.f16 s2, r1 +; CHECK-NEXT: vmov.f16 s0, r1 +; CHECK-NEXT: vmov.f16 s2, r0 ; CHECK-NEXT: vcmp.f16 s2, s0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselge.f16 s0, s2, s0 -; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bx lr entry: - %0 = bitcast i16 %a to half - %1 = bitcast i16 %b to half - %cmp = fcmp oge half %0, %1 - %cond = select i1 %cmp, half %0, half %1 + %cmp = fcmp oge half %a, %b + %cond = select i1 %cmp, half %a, half %b ret half %cond } -define half @fp16_vmaxnm_o_rev(i16 signext %a, i16 signext %b) { +define half @fp16_vmaxnm_o_rev(half %a, half %b) { ; CHECK-LABEL: fp16_vmaxnm_o_rev: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f16 s0, r1 -; CHECK-NEXT: vmov.f16 s2, r2 +; CHECK-NEXT: vmov.f16 s0, r0 +; CHECK-NEXT: vmov.f16 s2, r1 ; CHECK-NEXT: vcmp.f16 s2, s0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselgt.f16 s0, s2, s0 -; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bx lr entry: - %0 = bitcast i16 %a to half - %1 = bitcast i16 %b to half - %cmp = fcmp olt half %0, %1 - %cond = select i1 %cmp, half %1, half %0 + %cmp = fcmp olt half %a, %b + %cond = select i1 %cmp, half %b, half %a ret half %cond } -define half @fp16_vmaxnm_ole_rev(i16 signext %a, i16 signext %b) { +define half @fp16_vmaxnm_ole_rev(half %a, half %b) { ; CHECK-LABEL: fp16_vmaxnm_ole_rev: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f16 s0, r1 -; CHECK-NEXT: vmov.f16 s2, r2 +; CHECK-NEXT: vmov.f16 s0, r0 +; CHECK-NEXT: vmov.f16 s2, r1 ; CHECK-NEXT: vcmp.f16 s2, s0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselge.f16 s0, s2, s0 -; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bx lr entry: - %0 = bitcast i16 %a to half - %1 = bitcast i16 %b to half - %cmp = fcmp ole half %0, %1 - %cond = select i1 %cmp, half %1, half %0 + %cmp = fcmp ole half %a, %b + %cond = select i1 %cmp, half %b, half %a ret half %cond } -define half @fp16_vmaxnm_u(i16 signext %a, i16 signext %b) { +define half @fp16_vmaxnm_u(half %a, half %b) { ; CHECK-LABEL: fp16_vmaxnm_u: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f16 s0, r1 -; CHECK-NEXT: vmov.f16 s2, r2 +; CHECK-NEXT: vmov.f16 s0, r0 +; CHECK-NEXT: vmov.f16 s2, r1 ; CHECK-NEXT: vcmp.f16 s2, s0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselge.f16 s0, s2, s0 -; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bx lr entry: - %0 = bitcast i16 %a to half - %1 = bitcast i16 %b to half - %cmp = fcmp ugt half %0, %1 - %cond = select i1 %cmp, half %0, half %1 + %cmp = fcmp ugt half %a, %b + %cond = select i1 %cmp, half %a, half %b ret half %cond } -define half @fp16_vmaxnm_uge(i16 signext %a, i16 signext %b) { +define half @fp16_vmaxnm_uge(half %a, half %b) { ; CHECK-LABEL: fp16_vmaxnm_uge: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f16 s0, r1 -; CHECK-NEXT: vmov.f16 s2, r2 +; CHECK-NEXT: vmov.f16 s0, r0 +; CHECK-NEXT: vmov.f16 s2, r1 ; CHECK-NEXT: vcmp.f16 s2, s0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselgt.f16 s0, s2, s0 -; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bx lr entry: - %0 = bitcast i16 %a to half - %1 = bitcast i16 %b to half - %cmp = fcmp uge half %0, %1 - %cond = select i1 %cmp, half %0, half %1 + %cmp = fcmp uge half %a, %b + %cond = select i1 %cmp, half %a, half %b ret half %cond } -define half @fp16_vmaxnm_u_rev(i16 signext %a, i16 signext %b) { +define half @fp16_vmaxnm_u_rev(half %a, half %b) { ; CHECK-LABEL: fp16_vmaxnm_u_rev: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f16 s0, r2 -; CHECK-NEXT: vmov.f16 s2, r1 +; CHECK-NEXT: vmov.f16 s0, r1 +; CHECK-NEXT: vmov.f16 s2, r0 ; CHECK-NEXT: vcmp.f16 s2, s0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselge.f16 s0, s2, s0 -; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bx lr entry: - %0 = bitcast i16 %a to half - %1 = bitcast i16 %b to half - %cmp = fcmp ult half %0, %1 - %cond = select i1 %cmp, half %1, half %0 + %cmp = fcmp ult half %a, %b + %cond = select i1 %cmp, half %b, half %a ret half %cond } ; known non-NaNs -define half @fp16_vminnm_NNNo(i16 signext %a) { +define half @fp16_vminnm_NNNo(half %a) { ; CHECK-LABEL: fp16_vminnm_NNNo: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f16 s0, r1 +; CHECK-NEXT: vmov.f16 s0, r0 ; CHECK-NEXT: vmov.f16 s2, #1.200000e+01 ; CHECK-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NEXT: vldr.16 s2, .LCPI12_0 ; CHECK-NEXT: vcmp.f16 s0, s2 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselgt.f16 s0, s2, s0 -; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 1 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI12_0: ; CHECK-NEXT: .short 0x5040 @ half 34 entry: - %0 = bitcast i16 %a to half - %cmp1 = fcmp olt half %0, 12. - %cond1 = select i1 %cmp1, half %0, half 12. + %cmp1 = fcmp olt half %a, 12. + %cond1 = select i1 %cmp1, half %a, half 12. %cmp2 = fcmp olt half 34., %cond1 %cond2 = select i1 %cmp2, half 34., half %cond1 ret half %cond2 } -define half @fp16_vminnm_NNNo_rev(i16 signext %a) { +define half @fp16_vminnm_NNNo_rev(half %a) { ; CHECK-LABEL: fp16_vminnm_NNNo_rev: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldr.16 s2, .LCPI13_0 -; CHECK-NEXT: vmov.f16 s0, r1 +; CHECK-NEXT: vmov.f16 s0, r0 ; CHECK-NEXT: vcmp.f16 s0, s2 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselgt.f16 s0, s2, s0 ; CHECK-NEXT: vldr.16 s2, .LCPI13_1 ; CHECK-NEXT: vminnm.f16 s0, s0, s2 -; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 1 ; CHECK-NEXT: @ %bb.1: @@ -270,50 +239,48 @@ ; CHECK-NEXT: .LCPI13_1: ; CHECK-NEXT: .short 0x54e0 @ half 78 entry: - %0 = bitcast i16 %a to half - %cmp1 = fcmp ogt half %0, 56. - %cond1 = select i1 %cmp1, half 56., half %0 + %cmp1 = fcmp ogt half %a, 56. + %cond1 = select i1 %cmp1, half 56., half %a %cmp2 = fcmp ogt half 78., %cond1 %cond2 = select i1 %cmp2, half %cond1, half 78. ret half %cond2 } -define half @fp16_vminnm_NNNu(i16 signext %b) { +define half @fp16_vminnm_NNNu(half %b) { ; CHECK-LABEL: fp16_vminnm_NNNu: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f16 s0, r1 +; CHECK-NEXT: vmov.f16 s0, r0 ; CHECK-NEXT: vmov.f16 s2, #1.200000e+01 ; CHECK-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NEXT: vldr.16 s2, .LCPI14_0 ; CHECK-NEXT: vcmp.f16 s0, s2 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselge.f16 s0, s2, s0 -; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 1 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI14_0: ; CHECK-NEXT: .short 0x5040 @ half 34 entry: - %0 = bitcast i16 %b to half - %cmp1 = fcmp ult half 12., %0 - %cond1 = select i1 %cmp1, half 12., half %0 + %cmp1 = fcmp ult half 12., %b + %cond1 = select i1 %cmp1, half 12., half %b %cmp2 = fcmp ult half %cond1, 34. %cond2 = select i1 %cmp2, half %cond1, half 34. ret half %cond2 } -define half @fp16_vminnm_NNNule(i16 signext %b) { +define half @fp16_vminnm_NNNule(half %b) { ; CHECK-LABEL: fp16_vminnm_NNNule: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldr.16 s2, .LCPI15_0 -; CHECK-NEXT: vmov.f16 s0, r1 +; CHECK-NEXT: vmov.f16 s0, r0 ; CHECK-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NEXT: vldr.16 s2, .LCPI15_1 ; CHECK-NEXT: vcmp.f16 s0, s2 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselgt.f16 s0, s2, s0 -; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 1 ; CHECK-NEXT: @ %bb.1: @@ -323,25 +290,24 @@ ; CHECK-NEXT: .short 0x5300 @ half 56 entry: - %0 = bitcast i16 %b to half - %cmp1 = fcmp ule half 34., %0 - %cond1 = select i1 %cmp1, half 34., half %0 + %cmp1 = fcmp ule half 34., %b + %cond1 = select i1 %cmp1, half 34., half %b %cmp2 = fcmp ule half %cond1, 56. %cond2 = select i1 %cmp2, half %cond1, half 56. ret half %cond2 } -define half @fp16_vminnm_NNNu_rev(i16 signext %b) { +define half @fp16_vminnm_NNNu_rev(half %b) { ; CHECK-LABEL: fp16_vminnm_NNNu_rev: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldr.16 s2, .LCPI16_0 -; CHECK-NEXT: vmov.f16 s0, r1 +; CHECK-NEXT: vmov.f16 s0, r0 ; CHECK-NEXT: vcmp.f16 s0, s2 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselge.f16 s0, s2, s0 ; CHECK-NEXT: vldr.16 s2, .LCPI16_1 ; CHECK-NEXT: vminnm.f16 s0, s0, s2 -; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 1 ; CHECK-NEXT: @ %bb.1: @@ -352,50 +318,48 @@ entry: - %0 = bitcast i16 %b to half - %cmp1 = fcmp ugt half 56., %0 - %cond1 = select i1 %cmp1, half %0, half 56. + %cmp1 = fcmp ugt half 56., %b + %cond1 = select i1 %cmp1, half %b, half 56. %cmp2 = fcmp ugt half %cond1, 78. %cond2 = select i1 %cmp2, half 78., half %cond1 ret half %cond2 } -define half @fp16_vmaxnm_NNNo(i16 signext %a) { +define half @fp16_vmaxnm_NNNo(half %a) { ; CHECK-LABEL: fp16_vmaxnm_NNNo: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f16 s0, r1 +; CHECK-NEXT: vmov.f16 s0, r0 ; CHECK-NEXT: vmov.f16 s2, #1.200000e+01 ; CHECK-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NEXT: vldr.16 s2, .LCPI17_0 ; CHECK-NEXT: vcmp.f16 s2, s0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselgt.f16 s0, s2, s0 -; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 1 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI17_0: ; CHECK-NEXT: .short 0x5040 @ half 34 entry: - %0 = bitcast i16 %a to half - %cmp1 = fcmp ogt half %0, 12. - %cond1 = select i1 %cmp1, half %0, half 12. + %cmp1 = fcmp ogt half %a, 12. + %cond1 = select i1 %cmp1, half %a, half 12. %cmp2 = fcmp ogt half 34., %cond1 %cond2 = select i1 %cmp2, half 34., half %cond1 ret half %cond2 } -define half @fp16_vmaxnm_NNNoge(i16 signext %a) { +define half @fp16_vmaxnm_NNNoge(half %a) { ; CHECK-LABEL: fp16_vmaxnm_NNNoge: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldr.16 s2, .LCPI18_0 -; CHECK-NEXT: vmov.f16 s0, r1 +; CHECK-NEXT: vmov.f16 s0, r0 ; CHECK-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NEXT: vldr.16 s2, .LCPI18_1 ; CHECK-NEXT: vcmp.f16 s2, s0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselge.f16 s0, s2, s0 -; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 1 ; CHECK-NEXT: @ %bb.1: @@ -404,25 +368,24 @@ ; CHECK-NEXT: .LCPI18_1: ; CHECK-NEXT: .short 0x5300 @ half 56 entry: - %0 = bitcast i16 %a to half - %cmp1 = fcmp oge half %0, 34. - %cond1 = select i1 %cmp1, half %0, half 34. + %cmp1 = fcmp oge half %a, 34. + %cond1 = select i1 %cmp1, half %a, half 34. %cmp2 = fcmp oge half 56., %cond1 %cond2 = select i1 %cmp2, half 56., half %cond1 ret half %cond2 } -define half @fp16_vmaxnm_NNNo_rev(i16 signext %a) { +define half @fp16_vmaxnm_NNNo_rev(half %a) { ; CHECK-LABEL: fp16_vmaxnm_NNNo_rev: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldr.16 s2, .LCPI19_0 -; CHECK-NEXT: vmov.f16 s0, r1 +; CHECK-NEXT: vmov.f16 s0, r0 ; CHECK-NEXT: vcmp.f16 s2, s0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselgt.f16 s0, s2, s0 ; CHECK-NEXT: vldr.16 s2, .LCPI19_1 ; CHECK-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 1 ; CHECK-NEXT: @ %bb.1: @@ -431,25 +394,24 @@ ; CHECK-NEXT: .LCPI19_1: ; CHECK-NEXT: .short 0x54e0 @ half 78 entry: - %0 = bitcast i16 %a to half - %cmp1 = fcmp olt half %0, 56. - %cond1 = select i1 %cmp1, half 56., half %0 + %cmp1 = fcmp olt half %a, 56. + %cond1 = select i1 %cmp1, half 56., half %a %cmp2 = fcmp olt half 78., %cond1 %cond2 = select i1 %cmp2, half %cond1, half 78. ret half %cond2 } -define half @fp16_vmaxnm_NNNole_rev(i16 signext %a) { +define half @fp16_vmaxnm_NNNole_rev(half %a) { ; CHECK-LABEL: fp16_vmaxnm_NNNole_rev: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldr.16 s2, .LCPI20_0 -; CHECK-NEXT: vmov.f16 s0, r1 +; CHECK-NEXT: vmov.f16 s0, r0 ; CHECK-NEXT: vcmp.f16 s2, s0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselge.f16 s0, s2, s0 ; CHECK-NEXT: vldr.16 s2, .LCPI20_1 ; CHECK-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 1 ; CHECK-NEXT: @ %bb.1: @@ -458,50 +420,48 @@ ; CHECK-NEXT: .LCPI20_1: ; CHECK-NEXT: .short 0x55a0 @ half 90 entry: - %0 = bitcast i16 %a to half - %cmp1 = fcmp ole half %0, 78. - %cond1 = select i1 %cmp1, half 78., half %0 + %cmp1 = fcmp ole half %a, 78. + %cond1 = select i1 %cmp1, half 78., half %a %cmp2 = fcmp ole half 90., %cond1 %cond2 = select i1 %cmp2, half %cond1, half 90. ret half %cond2 } -define half @fp16_vmaxnm_NNNu(i16 signext %b) { +define half @fp16_vmaxnm_NNNu(half %b) { ; CHECK-LABEL: fp16_vmaxnm_NNNu: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f16 s0, r1 +; CHECK-NEXT: vmov.f16 s0, r0 ; CHECK-NEXT: vmov.f16 s2, #1.200000e+01 ; CHECK-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NEXT: vldr.16 s2, .LCPI21_0 ; CHECK-NEXT: vcmp.f16 s2, s0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselge.f16 s0, s2, s0 -; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 1 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI21_0: ; CHECK-NEXT: .short 0x5040 @ half 34 entry: - %0 = bitcast i16 %b to half - %cmp1 = fcmp ugt half 12., %0 - %cond1 = select i1 %cmp1, half 12., half %0 + %cmp1 = fcmp ugt half 12., %b + %cond1 = select i1 %cmp1, half 12., half %b %cmp2 = fcmp ugt half %cond1, 34. %cond2 = select i1 %cmp2, half %cond1, half 34. ret half %cond2 } -define half @fp16_vmaxnm_NNNuge(i16 signext %b) { +define half @fp16_vmaxnm_NNNuge(half %b) { ; CHECK-LABEL: fp16_vmaxnm_NNNuge: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldr.16 s2, .LCPI22_0 -; CHECK-NEXT: vmov.f16 s0, r1 +; CHECK-NEXT: vmov.f16 s0, r0 ; CHECK-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NEXT: vldr.16 s2, .LCPI22_1 ; CHECK-NEXT: vcmp.f16 s2, s0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselgt.f16 s0, s2, s0 -; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 1 ; CHECK-NEXT: @ %bb.1: @@ -510,81 +470,77 @@ ; CHECK-NEXT: .LCPI22_1: ; CHECK-NEXT: .short 0x5300 @ half 56 entry: - %0 = bitcast i16 %b to half - %cmp1 = fcmp uge half 34., %0 - %cond1 = select i1 %cmp1, half 34., half %0 + %cmp1 = fcmp uge half 34., %b + %cond1 = select i1 %cmp1, half 34., half %b %cmp2 = fcmp uge half %cond1, 56. %cond2 = select i1 %cmp2, half %cond1, half 56. ret half %cond2 } -define half @fp16_vminmaxnm_neg0(i16 signext %a) { +define half @fp16_vminmaxnm_neg0(half %a) { ; CHECK-LABEL: fp16_vminmaxnm_neg0: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldr.16 s0, .LCPI23_0 -; CHECK-NEXT: vmov.f16 s2, r1 +; CHECK-NEXT: vmov.f16 s2, r0 ; CHECK-NEXT: vminnm.f16 s2, s2, s0 ; CHECK-NEXT: vcmp.f16 s0, s2 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselge.f16 s0, s0, s2 -; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 1 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI23_0: ; CHECK-NEXT: .short 0x8000 @ half -0 entry: - %0 = bitcast i16 %a to half - %cmp1 = fcmp olt half %0, -0. - %cond1 = select i1 %cmp1, half %0, half -0. + %cmp1 = fcmp olt half %a, -0. + %cond1 = select i1 %cmp1, half %a, half -0. %cmp2 = fcmp ugt half %cond1, -0. %cond2 = select i1 %cmp2, half %cond1, half -0. ret half %cond2 } -define half @fp16_vminmaxnm_e_0(i16 signext %a) { +define half @fp16_vminmaxnm_e_0(half %a) { ; CHECK-LABEL: fp16_vminmaxnm_e_0: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f16 s0, r1 +; CHECK-NEXT: vmov.f16 s0, r0 ; CHECK-NEXT: vldr.16 s2, .LCPI24_0 ; CHECK-NEXT: vcmp.f16 s0, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselge.f16 s0, s2, s0 ; CHECK-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 1 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI24_0: ; CHECK-NEXT: .short 0x0000 @ half 0 entry: - %0 = bitcast i16 %a to half - %cmp1 = fcmp nsz ole half 0., %0 - %cond1 = select i1 %cmp1, half 0., half %0 + %cmp1 = fcmp nsz ole half 0., %a + %cond1 = select i1 %cmp1, half 0., half %a %cmp2 = fcmp nsz uge half 0., %cond1 %cond2 = select i1 %cmp2, half 0., half %cond1 ret half %cond2 } -define half @fp16_vminmaxnm_e_neg0(i16 signext %a) { +define half @fp16_vminmaxnm_e_neg0(half %a) { ; CHECK-LABEL: fp16_vminmaxnm_e_neg0: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldr.16 s0, .LCPI25_0 -; CHECK-NEXT: vmov.f16 s2, r1 +; CHECK-NEXT: vmov.f16 s2, r0 ; CHECK-NEXT: vminnm.f16 s2, s2, s0 ; CHECK-NEXT: vcmp.f16 s0, s2 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselge.f16 s0, s0, s2 -; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 1 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI25_0: ; CHECK-NEXT: .short 0x8000 @ half -0 entry: - %0 = bitcast i16 %a to half - %cmp1 = fcmp nsz ule half -0., %0 - %cond1 = select i1 %cmp1, half -0., half %0 + %cmp1 = fcmp nsz ule half -0., %a + %cond1 = select i1 %cmp1, half -0., half %a %cmp2 = fcmp nsz oge half -0., %cond1 %cond2 = select i1 %cmp2, half -0., half %cond1 ret half %cond2 diff --git a/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll b/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll --- a/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll +++ b/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll @@ -21,6 +21,7 @@ ; CHECK-NEXT: vmov s2, r0 ; CHECK-NEXT: vadd.f32 s0, s2, s0 ; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: bl __aeabi_f2h ; CHECK-NEXT: pop {r11, lr} ; CHECK-NEXT: mov pc, lr ; CHECK-NEXT: .p2align 2 diff --git a/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-strict.ll b/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-strict.ll --- a/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-strict.ll +++ b/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-strict.ll @@ -21,6 +21,7 @@ ; CHECK-NEXT: vmov s2, r0 ; CHECK-NEXT: vmul.f32 s0, s2, s0 ; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: bl __aeabi_f2h ; CHECK-NEXT: pop {r11, lr} ; CHECK-NEXT: mov pc, lr ; CHECK-NEXT: .p2align 2 diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll --- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll +++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll @@ -682,7 +682,7 @@ define arm_aapcs_vfpcc half @extract_f16_0(<8 x half> %a) { ; CHECK-LABEL: extract_f16_0: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: @ kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: bx lr entry: %res = extractelement <8 x half> %a, i32 0 @@ -693,7 +693,6 @@ ; CHECK-LABEL: extract_f16_3: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmovx.f16 s0, s1 -; CHECK-NEXT: vstr.16 s0, [r0] ; CHECK-NEXT: bx lr entry: %res = extractelement <8 x half> %a, i32 3 diff --git a/llvm/test/CodeGen/Thumb2/mve-vdup.ll b/llvm/test/CodeGen/Thumb2/mve-vdup.ll --- a/llvm/test/CodeGen/Thumb2/mve-vdup.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vdup.ll @@ -253,10 +253,9 @@ define arm_aapcs_vfpcc half @vdup_f16_extract(half* %src1, half* %src2) { ; CHECK-LABEL: vdup_f16_extract: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldr.16 s0, [r2] -; CHECK-NEXT: vldr.16 s2, [r1] +; CHECK-NEXT: vldr.16 s0, [r1] +; CHECK-NEXT: vldr.16 s2, [r0] ; CHECK-NEXT: vadd.f16 s0, s2, s0 -; CHECK-NEXT: vstr.16 s0, [r0] ; CHECK-NEXT: bx lr entry: %0 = load half, half *%src1, align 2 diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll @@ -78,7 +78,6 @@ ; CHECK-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NEXT: vminnm.f16 s0, s0, s2 -; CHECK-NEXT: vstr.16 s0, [r0] ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 1 ; CHECK-NEXT: @ %bb.1: @@ -103,7 +102,6 @@ ; CHECK-NEXT: vminnm.f16 s4, s4, s6 ; CHECK-NEXT: vminnm.f16 s4, s4, s3 ; CHECK-NEXT: vminnm.f16 s0, s4, s0 -; CHECK-NEXT: vstr.16 s0, [r0] ; CHECK-NEXT: bx lr entry: %z = call fast half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half> %x) @@ -125,7 +123,6 @@ ; CHECK-FP-NEXT: vminnm.f16 s4, s4, s6 ; CHECK-FP-NEXT: vminnm.f16 s4, s4, s3 ; CHECK-FP-NEXT: vminnm.f16 s0, s4, s0 -; CHECK-FP-NEXT: vstr.16 s0, [r0] ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v16f16: @@ -169,7 +166,6 @@ ; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 ; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 ; CHECK-NOFP-NEXT: vminnm.f16 s0, s8, s0 -; CHECK-NOFP-NEXT: vstr.16 s0, [r0] ; CHECK-NOFP-NEXT: bx lr entry: %z = call fast half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half> %x) @@ -309,20 +305,20 @@ define arm_aapcs_vfpcc half @fmin_v4f16_nofast(<4 x half> %x) { ; CHECK-FP-LABEL: fmin_v4f16_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov r1, s1 -; CHECK-FP-NEXT: vdup.32 q1, r1 +; CHECK-FP-NEXT: vmov r0, s1 +; CHECK-FP-NEXT: vdup.32 q1, r0 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r1, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r1 +; CHECK-FP-NEXT: vmov.u16 r0, q0[1] +; CHECK-FP-NEXT: vdup.16 q1, r0 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vstr.16 s0, [r0] +; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v4f16_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmov r1, s1 +; CHECK-NOFP-NEXT: vmov r0, s1 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 -; CHECK-NOFP-NEXT: vdup.32 q1, r1 +; CHECK-NOFP-NEXT: vdup.32 q1, r0 ; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 ; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr @@ -333,7 +329,6 @@ ; CHECK-NOFP-NEXT: vcmp.f16 s8, s0 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 -; CHECK-NOFP-NEXT: vstr.16 s0, [r0] ; CHECK-NOFP-NEXT: bx lr entry: %z = call half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %x) @@ -346,13 +341,13 @@ ; CHECK-FP-NEXT: vmov.f64 d2, d1 ; CHECK-FP-NEXT: vmov.f32 s5, s3 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov r1, s1 -; CHECK-FP-NEXT: vdup.32 q1, r1 +; CHECK-FP-NEXT: vmov r0, s1 +; CHECK-FP-NEXT: vdup.32 q1, r0 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r1, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r1 +; CHECK-FP-NEXT: vmov.u16 r0, q0[1] +; CHECK-FP-NEXT: vdup.16 q1, r0 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vstr.16 s0, [r0] +; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v8f16_nofast: @@ -384,7 +379,6 @@ ; CHECK-NOFP-NEXT: vcmp.f16 s8, s0 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 -; CHECK-NOFP-NEXT: vstr.16 s0, [r0] ; CHECK-NOFP-NEXT: bx lr entry: %z = call half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half> %x) @@ -398,13 +392,13 @@ ; CHECK-FP-NEXT: vmov.f64 d2, d1 ; CHECK-FP-NEXT: vmov.f32 s5, s3 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov r1, s1 -; CHECK-FP-NEXT: vdup.32 q1, r1 +; CHECK-FP-NEXT: vmov r0, s1 +; CHECK-FP-NEXT: vdup.32 q1, r0 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r1, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r1 +; CHECK-FP-NEXT: vmov.u16 r0, q0[1] +; CHECK-FP-NEXT: vdup.16 q1, r0 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vstr.16 s0, [r0] +; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v16f16_nofast: @@ -462,7 +456,6 @@ ; CHECK-NOFP-NEXT: vcmp.f16 s8, s0 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 -; CHECK-NOFP-NEXT: vstr.16 s0, [r0] ; CHECK-NOFP-NEXT: bx lr entry: %z = call half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half> %x) @@ -1195,7 +1188,6 @@ ; CHECK-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NEXT: vstr.16 s0, [r0] ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 1 ; CHECK-NEXT: @ %bb.1: @@ -1220,7 +1212,6 @@ ; CHECK-NEXT: vmaxnm.f16 s4, s4, s6 ; CHECK-NEXT: vmaxnm.f16 s4, s4, s3 ; CHECK-NEXT: vmaxnm.f16 s0, s4, s0 -; CHECK-NEXT: vstr.16 s0, [r0] ; CHECK-NEXT: bx lr entry: %z = call fast half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half> %x) @@ -1242,7 +1233,6 @@ ; CHECK-FP-NEXT: vmaxnm.f16 s4, s4, s6 ; CHECK-FP-NEXT: vmaxnm.f16 s4, s4, s3 ; CHECK-FP-NEXT: vmaxnm.f16 s0, s4, s0 -; CHECK-FP-NEXT: vstr.16 s0, [r0] ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v16f16: @@ -1286,7 +1276,6 @@ ; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 ; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 ; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s8, s0 -; CHECK-NOFP-NEXT: vstr.16 s0, [r0] ; CHECK-NOFP-NEXT: bx lr entry: %z = call fast half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half> %x) @@ -1424,20 +1413,20 @@ define arm_aapcs_vfpcc half @fmax_v4f16_nofast(<4 x half> %x) { ; CHECK-FP-LABEL: fmax_v4f16_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov r1, s1 -; CHECK-FP-NEXT: vdup.32 q1, r1 +; CHECK-FP-NEXT: vmov r0, s1 +; CHECK-FP-NEXT: vdup.32 q1, r0 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r1, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r1 +; CHECK-FP-NEXT: vmov.u16 r0, q0[1] +; CHECK-FP-NEXT: vdup.16 q1, r0 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vstr.16 s0, [r0] +; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v4f16_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmov r1, s1 +; CHECK-NOFP-NEXT: vmov r0, s1 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 -; CHECK-NOFP-NEXT: vdup.32 q1, r1 +; CHECK-NOFP-NEXT: vdup.32 q1, r0 ; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 ; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr @@ -1448,7 +1437,6 @@ ; CHECK-NOFP-NEXT: vcmp.f16 s0, s8 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 -; CHECK-NOFP-NEXT: vstr.16 s0, [r0] ; CHECK-NOFP-NEXT: bx lr entry: %z = call half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %x) @@ -1461,13 +1449,13 @@ ; CHECK-FP-NEXT: vmov.f64 d2, d1 ; CHECK-FP-NEXT: vmov.f32 s5, s3 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov r1, s1 -; CHECK-FP-NEXT: vdup.32 q1, r1 +; CHECK-FP-NEXT: vmov r0, s1 +; CHECK-FP-NEXT: vdup.32 q1, r0 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r1, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r1 +; CHECK-FP-NEXT: vmov.u16 r0, q0[1] +; CHECK-FP-NEXT: vdup.16 q1, r0 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vstr.16 s0, [r0] +; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v8f16_nofast: @@ -1499,7 +1487,6 @@ ; CHECK-NOFP-NEXT: vcmp.f16 s0, s8 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 -; CHECK-NOFP-NEXT: vstr.16 s0, [r0] ; CHECK-NOFP-NEXT: bx lr entry: %z = call half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half> %x) @@ -1513,13 +1500,13 @@ ; CHECK-FP-NEXT: vmov.f64 d2, d1 ; CHECK-FP-NEXT: vmov.f32 s5, s3 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov r1, s1 -; CHECK-FP-NEXT: vdup.32 q1, r1 +; CHECK-FP-NEXT: vmov r0, s1 +; CHECK-FP-NEXT: vdup.32 q1, r0 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r1, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r1 +; CHECK-FP-NEXT: vmov.u16 r0, q0[1] +; CHECK-FP-NEXT: vdup.16 q1, r0 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vstr.16 s0, [r0] +; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v16f16_nofast: @@ -1577,7 +1564,6 @@ ; CHECK-NOFP-NEXT: vcmp.f16 s0, s8 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 -; CHECK-NOFP-NEXT: vstr.16 s0, [r0] ; CHECK-NOFP-NEXT: bx lr entry: %z = call half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half> %x)