Index: lib/CodeGen/GlobalISel/LegalizerHelper.cpp
===================================================================
--- lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -1222,6 +1222,32 @@
     MI.eraseFromParent();
     return Legalized;
   }
+
+  // Unary operations.
+  case TargetOpcode::G_FCEIL: {
+    // FIXME: Should handle vectors.
+    if (NarrowTy.isVector())
+      return UnableToLegalize;
+
+    unsigned NarrowSize = NarrowTy.getSizeInBits();
+    unsigned VectorDst = MI.getOperand(0).getReg();
+    unsigned Size = MRI.getType(VectorDst).getSizeInBits();
+    int NumParts = Size / NarrowSize;
+
+    // FIXME: Should handle differing sizes.
+    if (Size % NarrowSize != 0)
+      return UnableToLegalize;
+    SmallVector<unsigned, 2> SrcRegs, DstRegs;
+    extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);
+    for (int i = 0; i < NumParts; ++i) {
+      unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy);
+      MIRBuilder.buildInstr(TargetOpcode::G_FCEIL, {DstReg}, {SrcRegs[i]});
+      DstRegs.push_back(DstReg);
+    }
+    MIRBuilder.buildBuildVector(VectorDst, DstRegs);
+    MI.eraseFromParent();
+    return Legalized;
+  }
   case TargetOpcode::G_LOAD:
   case TargetOpcode::G_STORE: {
     bool IsLoad = MI.getOpcode() == TargetOpcode::G_LOAD;
Index: lib/Target/AArch64/AArch64InstructionSelector.cpp
===================================================================
--- lib/Target/AArch64/AArch64InstructionSelector.cpp
+++ lib/Target/AArch64/AArch64InstructionSelector.cpp
@@ -74,6 +74,7 @@
                          MachineRegisterInfo &MRI) const;
   bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI) const;
   bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const;
+  bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const;
 
   ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
 
@@ -332,39 +333,71 @@
   return GenericOpc;
 }
 
-static bool selectFP16CopyFromGPR32(MachineInstr &I, const TargetInstrInfo &TII,
-                                    MachineRegisterInfo &MRI, unsigned SrcReg) {
-  // Copies from gpr32 to fpr16 need to use a sub-register copy.
-  unsigned CopyReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
+static bool
+selectSubregisterCopy(MachineInstr &I, const TargetInstrInfo &TII,
+                      MachineRegisterInfo &MRI, const RegisterBankInfo &RBI, unsigned SrcReg,
+                      const TargetRegisterClass *From,
+                      const TargetRegisterClass *To, unsigned SubReg) {
+  unsigned CopyReg = MRI.createVirtualRegister(From);
   BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::COPY))
       .addDef(CopyReg)
       .addUse(SrcReg);
-  unsigned SubRegCopy = MRI.createVirtualRegister(&AArch64::FPR16RegClass);
+  unsigned SubRegCopy = MRI.createVirtualRegister(To);
   BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::COPY))
       .addDef(SubRegCopy)
-      .addUse(CopyReg, 0, AArch64::hsub);
-
+      .addUse(CopyReg, 0, SubReg);
   MachineOperand &RegOp = I.getOperand(1);
   RegOp.setReg(SubRegCopy);
+
+  // It's possible that the destination register won't be constrained. Make
+  // sure that happens.
+ if (!TargetRegisterInfo::isPhysicalRegister(I.getOperand(0).getReg())) + RBI.constrainGenericRegister(I.getOperand(0).getReg(), *To, MRI); + return true; } static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, - const RegisterBankInfo &RBI) { + const RegisterBankInfo &RBI, + const AArch64Subtarget &STI) { unsigned DstReg = I.getOperand(0).getReg(); unsigned SrcReg = I.getOperand(1).getReg(); if (TargetRegisterInfo::isPhysicalRegister(DstReg)) { + const RegisterBank &RegBank = *RBI.getRegBank(SrcReg, MRI, TRI); + const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank( + MRI.getType(SrcReg), RegBank, RBI, /* GetAllRegSet */ true); + if (TRI.getRegClass(AArch64::FPR16RegClassID)->contains(DstReg) && !TargetRegisterInfo::isPhysicalRegister(SrcReg)) { - const RegisterBank &RegBank = *RBI.getRegBank(SrcReg, MRI, TRI); - const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank( - MRI.getType(SrcReg), RegBank, RBI, /* GetAllRegSet */ true); + // Copies from gpr32 to fpr16 need to use a sub-register copy. if (SrcRC == &AArch64::GPR32allRegClass) - return selectFP16CopyFromGPR32(I, TII, MRI, SrcReg); + return selectSubregisterCopy(I, TII, MRI, RBI, SrcReg, + &AArch64::FPR32RegClass, + &AArch64::FPR16RegClass, AArch64::hsub); + } + + // Is this a vector copy? + if (MRI.getType(SrcReg).isVector() && + !TargetRegisterInfo::isPhysicalRegister(SrcReg)) { + const LLT &VecTy = MRI.getType(SrcReg); + // If we don't have full fp 16 support, then we need to perform a + // subregister copy. + if (VecTy.getElementType().getSizeInBits() == 16 && !STI.hasFullFP16()) { + if (VecTy.getNumElements() == 4) { + return selectSubregisterCopy(I, TII, MRI, RBI, SrcReg, + &AArch64::FPR128RegClass, SrcRC, + AArch64::dsub); + } else if (VecTy.getNumElements() == 8) { + return selectSubregisterCopy(I, TII, MRI, RBI, SrcReg, + &AArch64::FPR128RegClass, SrcRC, + AArch64::hsub); + } + } } + assert(I.isCopy() && "Generic operators do not allow physical registers"); return true; } @@ -419,7 +452,9 @@ RegOp.setReg(PromoteReg); } else if (RC == &AArch64::FPR16RegClass && SrcRC == &AArch64::GPR32allRegClass) { - selectFP16CopyFromGPR32(I, TII, MRI, SrcReg); + return selectSubregisterCopy(I, TII, MRI, RBI, SrcReg, + &AArch64::FPR32RegClass, + &AArch64::FPR16RegClass, AArch64::hsub); } } @@ -757,7 +792,7 @@ } if (I.isCopy()) - return selectCopy(I, TII, MRI, TRI, RBI); + return selectCopy(I, TII, MRI, TRI, RBI, STI); return true; } @@ -1274,7 +1309,7 @@ .addImm(AArch64::sub_32); I.getOperand(1).setReg(ExtSrc); } - return selectCopy(I, TII, MRI, TRI, RBI); + return selectCopy(I, TII, MRI, TRI, RBI, STI); } case TargetOpcode::G_ZEXT: @@ -1353,7 +1388,7 @@ case TargetOpcode::G_INTTOPTR: // The importer is currently unable to import pointer types since they // didn't exist in SelectionDAG. - return selectCopy(I, TII, MRI, TRI, RBI); + return selectCopy(I, TII, MRI, TRI, RBI, STI); case TargetOpcode::G_BITCAST: // Imported SelectionDAG rules can handle every bitcast except those that @@ -1361,7 +1396,7 @@ // but we might not run an optimizer that deletes them. 
     if (MRI.getType(I.getOperand(0).getReg()) ==
         MRI.getType(I.getOperand(1).getReg()))
-      return selectCopy(I, TII, MRI, TRI, RBI);
+      return selectCopy(I, TII, MRI, TRI, RBI, STI);
     return false;
 
   case TargetOpcode::G_SELECT: {
@@ -1556,6 +1591,8 @@
     return selectBuildVector(I, MRI);
   case TargetOpcode::G_MERGE_VALUES:
     return selectMergeValues(I, MRI);
+  case TargetOpcode::G_UNMERGE_VALUES:
+    return selectUnmergeValues(I, MRI);
   }
 
   return false;
@@ -1584,6 +1621,8 @@
   };
 
   switch (DstTy.getElementType().getSizeInBits()) {
+  case 16:
+    return BuildFn(AArch64::hsub);
   case 32:
     return BuildFn(AArch64::ssub);
   case 64:
@@ -1639,6 +1678,135 @@
   return true;
 }
 
+bool AArch64InstructionSelector::selectUnmergeValues(
+    MachineInstr &I, MachineRegisterInfo &MRI) const {
+  assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
+         "unexpected opcode");
+
+  // TODO: Handle unmerging into GPRs.
+  if (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() !=
+      AArch64::FPRRegBankID) {
+    LLVM_DEBUG(dbgs() << "Vector-to-GPR unmerges not supported yet.\n");
+    return false;
+  }
+
+  // The last operand is the vector source register, and every other operand is
+  // a register to unpack into.
+  unsigned NumElts = I.getNumOperands() - 1;
+  unsigned SrcReg = I.getOperand(NumElts).getReg();
+  const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
+  const LLT WideTy = MRI.getType(SrcReg);
+  assert(WideTy.isVector() && "can only unmerge from vector types!");
+  assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() &&
+         "source register size too small!");
+
+  // TODO: Handle unmerging into scalars.
+  if (!NarrowTy.isScalar()) {
+    LLVM_DEBUG(dbgs() << "Vector-to-vector unmerges not supported yet.\n");
+    return false;
+  }
+
+  // Choose a lane copy opcode and subregister based off of the size of the
+  // vector's elements.
+  unsigned CopyOpc = 0;
+  unsigned ExtractSubReg = 0;
+  switch (NarrowTy.getSizeInBits()) {
+  case 16:
+    CopyOpc = AArch64::CPYi16;
+    ExtractSubReg = AArch64::hsub;
+    break;
+  case 32:
+    CopyOpc = AArch64::CPYi32;
+    ExtractSubReg = AArch64::ssub;
+    break;
+  case 64:
+    CopyOpc = AArch64::CPYi64;
+    ExtractSubReg = AArch64::dsub;
+    break;
+  default:
+    // Unknown size, bail out.
+    LLVM_DEBUG(dbgs() << "NarrowTy had unsupported size.\n");
+    return false;
+  }
+
+  // Set up for the lane copies.
+  MachineBasicBlock &MBB = *I.getParent();
+
+  // Stores the registers we'll be copying from.
+  std::vector<unsigned> InsertRegs;
+
+  // We'll use the first register twice, so we only need NumElts-1 registers.
+  unsigned NumInsertRegs = NumElts - 1;
+
+  // If our elements fit into exactly 128 bits, then we can copy from the source
+  // directly. Otherwise, we need to do a bit of setup with some subregister
+  // inserts.
+  if (NarrowTy.getSizeInBits() * NumElts == 128) {
+    InsertRegs = std::vector<unsigned>(NumInsertRegs, SrcReg);
+  } else {
+    // No. We have to perform subregister inserts. For each insert, create an
+    // implicit def and a subregister insert, and save the register we create.
+    InsertRegs.reserve(NumInsertRegs);
+    for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) {
+      unsigned ImpDefReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
+      MachineInstr &ImpDefMI =
+          *BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::IMPLICIT_DEF),
+                   ImpDefReg);
+
+      // Now, create the subregister insert from SrcReg.
+ unsigned InsertReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass); + MachineInstr &InsMI = + *BuildMI(MBB, I, I.getDebugLoc(), + TII.get(TargetOpcode::INSERT_SUBREG), InsertReg) + .addUse(ImpDefReg) + .addUse(SrcReg) + .addImm(AArch64::dsub); + + constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI); + constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI); + + // Save the register so that we can copy from it after. + InsertRegs.push_back(InsertReg); + } + } + + // Now that we've created any necessary subregister inserts, we can + // create the copies. + // + // Perform the first copy separately as a subregister copy. + unsigned CopyTo = I.getOperand(0).getReg(); + MachineInstr &FirstCopy = + *BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::COPY), CopyTo) + .addUse(InsertRegs[0], 0, ExtractSubReg); + constrainSelectedInstRegOperands(FirstCopy, TII, TRI, RBI); + + // Now, perform the remaining copies as vector lane copies. + unsigned LaneIdx = 1; + for (unsigned InsReg : InsertRegs) { + unsigned CopyTo = I.getOperand(LaneIdx).getReg(); + MachineInstr &CopyInst = + *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CopyOpc), CopyTo) + .addUse(InsReg) + .addImm(LaneIdx); + constrainSelectedInstRegOperands(CopyInst, TII, TRI, RBI); + ++LaneIdx; + } + + // Separately constrain the first copy's destination. Because of the + // limitation in constrainOperandRegClass, we can't guarantee that this will + // actually be constrained. So, do it ourselves using the second operand. + const TargetRegisterClass *RC = + MRI.getRegClassOrNull(I.getOperand(1).getReg()); + if (!RC) { + LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n"); + return false; + } + + RBI.constrainGenericRegister(CopyTo, *RC, MRI); + I.eraseFromParent(); + return true; +} + bool AArch64InstructionSelector::selectBuildVector( MachineInstr &I, MachineRegisterInfo &MRI) const { assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR); @@ -1647,7 +1815,7 @@ const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); const LLT EltTy = MRI.getType(I.getOperand(1).getReg()); unsigned EltSize = EltTy.getSizeInBits(); - if (EltSize < 32 || EltSize > 64) + if (EltSize < 16 || EltSize > 64) return false; // Don't support all element types yet. const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI); unsigned Opc; @@ -1661,7 +1829,10 @@ SubregIdx = AArch64::dsub; } } else { - if (EltSize == 32) { + if (EltSize == 16) { + Opc = AArch64::INSvi16lane; + SubregIdx = AArch64::hsub; + } else if (EltSize == 32) { Opc = AArch64::INSvi32lane; SubregIdx = AArch64::ssub; } else { @@ -1670,14 +1841,13 @@ } } - if (EltSize * DstTy.getNumElements() != 128) - return false; // Don't handle unpacked vectors yet. - unsigned DstVec = 0; - const TargetRegisterClass *DstRC = getRegClassForTypeOnBank( - DstTy, RBI.getRegBank(AArch64::FPRRegBankID), RBI); - emitScalarToVector(DstVec, DstTy, DstRC, I.getOperand(1).getReg(), - *I.getParent(), I.getIterator(), MRI); + + // We always operate on a 128 bit wide register. + const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass; + if (!emitScalarToVector(DstVec, DstTy, DstRC, I.getOperand(1).getReg(), + *I.getParent(), I.getIterator(), MRI)) + return false; for (unsigned i = 2, e = DstTy.getSizeInBits() / EltSize + 1; i < e; ++i) { unsigned InsDef; // For the last insert re-use the dst reg of the G_BUILD_VECTOR. 
Index: lib/Target/AArch64/AArch64LegalizerInfo.cpp =================================================================== --- lib/Target/AArch64/AArch64LegalizerInfo.cpp +++ lib/Target/AArch64/AArch64LegalizerInfo.cpp @@ -110,6 +110,15 @@ getActionDefinitionsBuilder({G_FREM, G_FPOW}).libcallFor({s32, s64}); getActionDefinitionsBuilder(G_FCEIL) + // If we don't have full FP16 support, then scalarize the elements of + // vectors containing fp16 types. + .fewerElementsIf( + [=, &ST](const LegalityQuery &Query) { + const auto &Ty = Query.Types[0]; + return Ty.isVector() && Ty.getElementType() == s16 && + !ST.hasFullFP16(); + }, + [=](const LegalityQuery &Query) { return std::make_pair(0, s16); }) // If we don't have full FP16 support, then widen s16 to s32 if we // encounter it. .widenScalarIf( @@ -117,7 +126,7 @@ return Query.Types[0] == s16 && !ST.hasFullFP16(); }, [=](const LegalityQuery &Query) { return std::make_pair(0, s32); }) - .legalFor({s16, s32, s64, v2s32, v4s32, v2s64}); + .legalFor({s16, s32, s64, v2s32, v4s32, v2s64, v2s16, v4s16, v8s16}); getActionDefinitionsBuilder(G_INSERT) .unsupportedIf([=](const LegalityQuery &Query) { @@ -421,7 +430,7 @@ }); getActionDefinitionsBuilder(G_BUILD_VECTOR) - .legalFor({{v4s32, s32}, {v2s64, s64}}) + .legalFor({{v4s16, s16}, {v8s16, s16}, {v4s32, s32}, {v2s64, s64}}) .clampNumElements(0, v4s32, v4s32) .clampNumElements(0, v2s64, v2s64) Index: lib/Target/AArch64/AArch64RegisterBankInfo.cpp =================================================================== --- lib/Target/AArch64/AArch64RegisterBankInfo.cpp +++ lib/Target/AArch64/AArch64RegisterBankInfo.cpp @@ -633,6 +633,62 @@ OpRegBankIdx[0] = PMI_FirstFPR; break; } + break; + case TargetOpcode::G_UNMERGE_VALUES: { + // If the first operand belongs to a FPR register bank, then make sure that + // we preserve that. + if (OpRegBankIdx[0] != PMI_FirstGPR) + break; + + // Helper lambda that returns true if MI has floating point constraints. + auto HasFPConstraints = [&TRI, &MRI, this](MachineInstr &MI) { + unsigned Op = MI.getOpcode(); + + // Do we have an explicit floating point instruction? + if (isPreISelGenericFloatingPointOpcode(Op)) + return true; + + // No. Check if we have a copy-like instruction. If we do, then we could + // still be fed by floating point instructions. + if (Op != TargetOpcode::COPY && !MI.isPHI()) + return false; + + // MI is copy-like. Return true if it's using an FPR. + return getRegBank(MI.getOperand(0).getReg(), MRI, TRI) == + &AArch64::FPRRegBank; + }; + + if (any_of(MRI.use_instructions(MI.getOperand(0).getReg()), + [&](MachineInstr &MI) { return HasFPConstraints(MI); })) { + // Set the register bank of every operand to FPR. + for (unsigned Idx = 0, NumOperands = MI.getNumOperands(); + Idx < NumOperands; ++Idx) + OpRegBankIdx[Idx] = PMI_FirstFPR; + } + break; + } + + case TargetOpcode::G_BUILD_VECTOR: + // If the first operand belongs to a FPR register bank, then make sure + // that we preserve that. + if (OpRegBankIdx[1] != PMI_FirstGPR) + break; + unsigned VReg = MI.getOperand(1).getReg(); + if (!VReg) + break; + + // Get the instruction that defined the BUILD_VECTOR, and check if it's + // a floating point operation. + MachineInstr *DefMI = MRI.getVRegDef(VReg); + unsigned DefOpc = DefMI->getOpcode(); + if (isPreISelGenericFloatingPointOpcode(DefOpc)) { + // Have a floating point op. + // Make sure every operand gets mapped to a FPR register class. 
+ unsigned NumOperands = MI.getNumOperands(); + for (unsigned Idx = 0; Idx < NumOperands; ++Idx) + OpRegBankIdx[Idx] = PMI_FirstFPR; + } + break; } // Finally construct the computed mapping. Index: test/CodeGen/AArch64/GlobalISel/legalize-ceil.mir =================================================================== --- /dev/null +++ test/CodeGen/AArch64/GlobalISel/legalize-ceil.mir @@ -0,0 +1,86 @@ +# RUN: llc -mtriple=arm64-unknown-unknown -global-isel -O0 -mattr=-fullfp16 -run-pass=legalizer %s -o - | FileCheck %s + +--- | + define <8 x half> @test_v8f16.ceil(<8 x half> %a) { + ret <8 x half> %a + } + + define <4 x half> @test_v4f16.ceil(<4 x half> %a) { + ret <4 x half> %a + } + +... +--- +name: test_v8f16.ceil +alignment: 2 +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.1 (%ir-block.0): + liveins: $q0 + ; CHECK-LABEL: name: test_v8f16.ceil + %0:_(<8 x s16>) = COPY $q0 + ; CHECK: %{{[0-9]+}}:_(s16), %{{[0-9]+}}:_(s16), %{{[0-9]+}}:_(s16), %{{[0-9]+}}:_(s16), %{{[0-9]+}}:_(s16), %{{[0-9]+}}:_(s16), %{{[0-9]+}}:_(s16), %{{[0-9]+}}:_(s16) = G_UNMERGE_VALUES %{{[0-9]+}}(<8 x s16>) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}} + ; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}} + ; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}} + ; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}} + ; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}} + ; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}} + ; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}} + ; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}} + ; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32) + ; CHECK: %{{[0-9]+}}:_(<8 x s16>) = G_BUILD_VECTOR %{{[0-9]+}}(s16), %{{[0-9]+}}(s16), %{{[0-9]+}}(s16), %{{[0-9]+}}(s16), %{{[0-9]+}}(s16), %{{[0-9]+}}(s16), %{{[0-9]+}}(s16), %{{[0-9]+}}(s16) + %1:_(<8 x s16>) = G_FCEIL %0 + $q0 = COPY %1(<8 x s16>) + RET_ReallyLR implicit $q0 + +... 
+--- +name: test_v4f16.ceil +alignment: 2 +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.1 (%ir-block.0): + liveins: $d0 + ; CHECK-LABEL: name: test_v4f16.ceil + %0:_(<4 x s16>) = COPY $d0 + ; CHECK: %{{[0-9]+}}:_(s16), %{{[0-9]+}}:_(s16), %{{[0-9]+}}:_(s16) = G_UNMERGE_VALUES %{{[0-9]+}}(<4 x s16>) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}} + ; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}} + ; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}} + ; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}} + ; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32) + ; CHECK: %{{[0-9]+}}:_(<4 x s16>) = G_BUILD_VECTOR %{{[0-9]+}}(s16), %{{[0-9]+}}(s16), %{{[0-9]+}}(s16), %{{[0-9]+}}(s16) + %1:_(<4 x s16>) = G_FCEIL %0 + $d0 = COPY %1(<4 x s16>) + RET_ReallyLR implicit $d0 + +... Index: test/CodeGen/AArch64/GlobalISel/select-ceil.mir =================================================================== --- test/CodeGen/AArch64/GlobalISel/select-ceil.mir +++ test/CodeGen/AArch64/GlobalISel/select-ceil.mir @@ -1,5 +1,6 @@ # RUN: llc -verify-machineinstrs -mtriple aarch64--- \ -# RUN: -run-pass=instruction-select -global-isel %s -o - | FileCheck %s +# RUN: -run-pass=instruction-select -mattr=+fullfp16 -global-isel %s -o - \ +# RUN: | FileCheck %s ... --- name: ceil_float @@ -91,3 +92,39 @@ $q0 = COPY %1(<2 x s64>) ... +--- +name: ceil_v4f16 +legalized: true +regBankSelected: true +tracksRegLiveness: true +registers: + - { id: 0, class: fpr } + - { id: 1, class: fpr } +body: | + bb.0: + ; CHECK-LABEL: name: ceil_v4f16 + ; CHECK: %{{[0-9]+}}:fpr64 = FRINTPv4f16 %{{[0-9]+}} + liveins: $d0 + %0:fpr(<4 x s16>) = COPY $d0 + %1:fpr(<4 x s16>) = G_FCEIL %0 + $d0 = COPY %1(<4 x s16>) + +... +--- +name: ceil_v8f16 +legalized: true +regBankSelected: true +tracksRegLiveness: true +registers: + - { id: 0, class: fpr } + - { id: 1, class: fpr } +body: | + bb.0: + ; CHECK-LABEL: name: ceil_v8f16 + ; CHECK: %{{[0-9]+}}:fpr128 = FRINTPv8f16 %{{[0-9]+}} + liveins: $q0 + %0:fpr(<8 x s16>) = COPY $q0 + %1:fpr(<8 x s16>) = G_FCEIL %0 + $q0 = COPY %1(<8 x s16>) + +... Index: test/CodeGen/AArch64/GlobalISel/select-unmerge.mir =================================================================== --- /dev/null +++ test/CodeGen/AArch64/GlobalISel/select-unmerge.mir @@ -0,0 +1,154 @@ + +# RUN: llc -O0 -mattr=-fullfp16 -mtriple=aarch64-- \ +# RUN: -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s + +--- | + define <2 x double> @test_v2s64_unmerge(<2 x double> %a) { + ret <2 x double> %a + } + + define <4 x float> @test_v4s32_unmerge(<4 x float> %a) { + ret <4 x float> %a + } + + define <4 x half> @test_v4s16_unmerge(<4 x half> %a) { + ret <4 x half> %a + } + + define <8 x half> @test_v8s16_unmerge(<8 x half> %a) { + ret <8 x half> %a + } + +... 
+--- +name: test_v2s64_unmerge +alignment: 2 +legalized: true +regBankSelected: true +tracksRegLiveness: true +registers: + - { id: 0, class: fpr } + - { id: 1, class: fpr } + - { id: 2, class: fpr } + - { id: 3, class: fpr } +body: | + bb.1 (%ir-block.0): + liveins: $q0 + ; CHECK-LABEL: name: test_v2s64_unmerge + %0:fpr(<2 x s64>) = COPY $q0 + + ; Since 2 * 64 = 128, we can just directly copy. + ; CHECK: %2:fpr64 = COPY %0.dsub + ; CHECK: %3:fpr64 = CPYi64 %0, 1 + %2:fpr(s64), %3:fpr(s64) = G_UNMERGE_VALUES %0(<2 x s64>) + + %1:fpr(<2 x s64>) = G_BUILD_VECTOR %2(s64), %3(s64) + $q0 = COPY %1(<2 x s64>) + RET_ReallyLR implicit $q0 +... +--- +name: test_v4s32_unmerge +alignment: 2 +legalized: true +regBankSelected: true +tracksRegLiveness: true +registers: + - { id: 0, class: fpr } + - { id: 1, class: fpr } + - { id: 2, class: fpr } + - { id: 3, class: fpr } + - { id: 4, class: fpr } + - { id: 5, class: fpr } +body: | + bb.1 (%ir-block.0): + liveins: $q0 + ; CHECK-LABEL: name: test_v4s32_unmerge + %0:fpr(<4 x s32>) = COPY $q0 + + ; Since 4 * 32 = 128, we can just directly copy. + ; CHECK: %2:fpr32 = COPY %0.ssub + ; CHECK: %3:fpr32 = CPYi32 %0, 1 + ; CHECK: %4:fpr32 = CPYi32 %0, 2 + ; CHECK: %5:fpr32 = CPYi32 %0, 3 + %2:fpr(s32), %3:fpr(s32), %4:fpr(s32), %5:fpr(s32) = G_UNMERGE_VALUES %0(<4 x s32>) + + %1:fpr(<4 x s32>) = G_BUILD_VECTOR %2(s32), %3(s32), %4(s32), %5(s32) + $q0 = COPY %1(<4 x s32>) + RET_ReallyLR implicit $q0 +... +--- +name: test_v4s16_unmerge +alignment: 2 +legalized: true +regBankSelected: true +tracksRegLiveness: true +registers: + - { id: 0, class: fpr } + - { id: 1, class: fpr } + - { id: 2, class: fpr } + - { id: 3, class: fpr } + - { id: 4, class: fpr } + - { id: 5, class: fpr } +body: | + bb.1 (%ir-block.0): + liveins: $d0 + ; CHECK-LABEL: name: test_v4s16_unmerge + %0:fpr(<4 x s16>) = COPY $d0 + + ; Since 4 * 16 != 128, we need to widen using implicit defs. + ; Note that we expect to reuse one of the INSERT_SUBREG results, as CPYi16 + ; expects a lane > 0. + ; CHECK-DAG: [[IMPDEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF + ; CHECK-NEXT: [[INS_SHARED:%[0-9]+]]:fpr128 = INSERT_SUBREG [[IMPDEF1]], %0, %subreg.dsub + ; CHECK: [[IMPDEF2:%[0-9]+]]:fpr128 = IMPLICIT_DEF + ; CHECK-NEXT: [[INS2:%[0-9]+]]:fpr128 = INSERT_SUBREG [[IMPDEF2]], %0, %subreg.dsub + ; CHECK: [[IMPDEF3:%[0-9]+]]:fpr128 = IMPLICIT_DEF + ; CHECK-NEXT: [[INS3:%[0-9]+]]:fpr128 = INSERT_SUBREG [[IMPDEF3]], %0, %subreg.dsub + ; CHECK: %2:fpr16 = COPY [[INS_SHARED]].hsub + ; CHECK: %3:fpr16 = CPYi16 [[INS_SHARED]], 1 + ; CHECK: %4:fpr16 = CPYi16 [[INS2]], 2 + ; CHECK: %5:fpr16 = CPYi16 [[INS3]], 3 + %2:fpr(s16), %3:fpr(s16), %4:fpr(s16), %5:fpr(s16) = G_UNMERGE_VALUES %0(<4 x s16>) + + %1:fpr(<4 x s16>) = G_BUILD_VECTOR %2(s16), %3(s16), %4(s16), %5(s16) + $d0 = COPY %1(<4 x s16>) + RET_ReallyLR implicit $d0 +... +--- +name: test_v8s16_unmerge +alignment: 2 +legalized: true +regBankSelected: true +tracksRegLiveness: true +registers: + - { id: 0, class: fpr } + - { id: 1, class: fpr } + - { id: 2, class: fpr } + - { id: 3, class: fpr } + - { id: 4, class: fpr } + - { id: 5, class: fpr } + - { id: 6, class: fpr } + - { id: 7, class: fpr } + - { id: 8, class: fpr } + - { id: 9, class: fpr } +body: | + bb.1 (%ir-block.0): + liveins: $q0 + ; CHECK-LABEL: name: test_v8s16_unmerge + %0:fpr(<8 x s16>) = COPY $q0 + + ; Since 8 * 16 = 128, we can just directly copy. 
+ ; CHECK: %2:fpr16 = COPY %0.hsub + ; CHECK: %3:fpr16 = CPYi16 %0, 1 + ; CHECK: %4:fpr16 = CPYi16 %0, 2 + ; CHECK: %5:fpr16 = CPYi16 %0, 3 + ; CHECK: %6:fpr16 = CPYi16 %0, 4 + ; CHECK: %7:fpr16 = CPYi16 %0, 5 + ; CHECK: %8:fpr16 = CPYi16 %0, 6 + ; CHECK: %9:fpr16 = CPYi16 %0, 7 + %2:fpr(s16), %3:fpr(s16), %4:fpr(s16), %5:fpr(s16), %6:fpr(s16), %7:fpr(s16), %8:fpr(s16), %9:fpr(s16) = G_UNMERGE_VALUES %0(<8 x s16>) + + %1:fpr(<8 x s16>) = G_BUILD_VECTOR %2:fpr(s16), %3:fpr(s16), %4:fpr(s16), %5:fpr(s16), %6:fpr(s16), %7:fpr(s16), %8:fpr(s16), %9:fpr(s16) + $q0 = COPY %1(<8 x s16>) + RET_ReallyLR implicit $q0 +... Index: test/CodeGen/AArch64/arm64-vfloatintrinsics.ll =================================================================== --- test/CodeGen/AArch64/arm64-vfloatintrinsics.ll +++ test/CodeGen/AArch64/arm64-vfloatintrinsics.ll @@ -3,6 +3,13 @@ ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -mattr=+fullfp16 \ ; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP16 +; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -mattr=-fullfp16 \ +; RUN: -global-isel -global-isel-abort=2 -pass-remarks-missed=gisel* \ +; RUN: 2>&1 | FileCheck %s --check-prefixes=GISEL,GISEL-NOFP16,FALLBACK +; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -mattr=+fullfp16 \ +; RUN: -global-isel -global-isel-abort=2 -pass-remarks-missed=gisel* \ +; RUN: 2>&1 | FileCheck %s --check-prefixes=GISEL,GISEL-FP16,FALLBACK + ;;; Half vectors %v4f16 = type <4 x half> @@ -111,6 +118,12 @@ ; CHECK-FP16-NOT: fcvt ; CHECK-FP16: frintp.4h ; CHECK-FP16-NEXT: ret + ; FALLBACK-NOT: remark{{.*}}test_v4f16.ceil: + ; GISEL-LABEL: test_v4f16.ceil: + ; GISEL-NOFP16-COUNT-4: frintp s{{[0-9]+}}, s{{[0-9]+}} + ; GISEL-FP16-NOT: fcvt + ; GISEL-FP16: frintp.4h + ; GISEL-FP16-NEXT: ret %1 = call %v4f16 @llvm.ceil.v4f16(%v4f16 %a) ret %v4f16 %1 } @@ -268,6 +281,12 @@ ; CHECK-FP16-NOT: fcvt ; CHECK-FP16: frintp.8h ; CHECK-FP16-NEXT: ret + ; FALLBACK-NOT: remark{{.*}}test_v8f16.ceil: + ; GISEL-LABEL: test_v8f16.ceil: + ; GISEL-NOFP16-COUNT-8: frintp s{{[0-9]+}}, s{{[0-9]+}} + ; GISEL-FP16-NOT: fcvt + ; GISEL-FP16: frintp.8h + ; GISEL-FP16-NEXT: ret %1 = call %v8f16 @llvm.ceil.v8f16(%v8f16 %a) ret %v8f16 %1 } @@ -400,8 +419,11 @@ ret %v2f32 %1 } ; CHECK-LABEL: test_v2f32.ceil: +; FALLBACK-NOT: remark{{.*}}test_v2f32.ceil +; GISEL-LABEL: test_v2f32.ceil: define %v2f32 @test_v2f32.ceil(%v2f32 %a) { ; CHECK: frintp.2s + ; GISEL: frintp.2s %1 = call %v2f32 @llvm.ceil.v2f32(%v2f32 %a) ret %v2f32 %1 } @@ -525,8 +547,11 @@ ret %v4f32 %1 } ; CHECK: test_v4f32.ceil: +; FALLBACK-NOT: remark{{.*}}test_v4f32.ceil +; GISEL-LABEL: test_v4f32.ceil: define %v4f32 @test_v4f32.ceil(%v4f32 %a) { ; CHECK: frintp.4s + ; GISEL: frintp.4s %1 = call %v4f32 @llvm.ceil.v4f32(%v4f32 %a) ret %v4f32 %1 } @@ -649,8 +674,11 @@ ret %v2f64 %1 } ; CHECK: test_v2f64.ceil: +; FALLBACK-NOT: remark{{.*}}test_v2f64.ceil +; GISEL-LABEL: test_v2f64.ceil: define %v2f64 @test_v2f64.ceil(%v2f64 %a) { ; CHECK: frintp.2d + ; GISEL: frintp.2d %1 = call %v2f64 @llvm.ceil.v2f64(%v2f64 %a) ret %v2f64 %1 }
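For quick local testing, a minimal standalone IR reproducer is sketched below. It is illustrative only and not part of the patch; the file and function names are made up. It exercises the same llvm.ceil.v4f16 path covered by the arm64-vfloatintrinsics.ll updates above, e.g. via: llc -mtriple=arm64-eabi -global-isel -global-isel-abort=2 -mattr=-fullfp16 < ceil-v4f16.ll

; ceil-v4f16.ll (hypothetical file name, not part of the patch)
; With -mattr=-fullfp16 this should legalize through the scalarized G_FCEIL
; sequence added here instead of falling back to SelectionDAG; with
; -mattr=+fullfp16 it should select frintp.4h directly.
declare <4 x half> @llvm.ceil.v4f16(<4 x half>)

define <4 x half> @ceil_v4f16_example(<4 x half> %a) {
  %r = call <4 x half> @llvm.ceil.v4f16(<4 x half> %a)
  ret <4 x half> %r
}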