Index: lib/CodeGen/GlobalISel/LegalizerHelper.cpp =================================================================== --- lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -1289,7 +1289,8 @@ case TargetOpcode::G_FABS: case TargetOpcode::G_FDIV: case TargetOpcode::G_FREM: - case TargetOpcode::G_FMA: { + case TargetOpcode::G_FMA: + case TargetOpcode::G_FCEIL: { unsigned NarrowSize = NarrowTy.getSizeInBits(); unsigned DstReg = MI.getOperand(0).getReg(); unsigned Flags = MI.getFlags(); Index: lib/Target/AArch64/AArch64InstructionSelector.cpp =================================================================== --- lib/Target/AArch64/AArch64InstructionSelector.cpp +++ lib/Target/AArch64/AArch64InstructionSelector.cpp @@ -73,6 +73,7 @@ MachineRegisterInfo &MRI) const; bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI) const; bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const; ComplexRendererFns selectArithImmed(MachineOperand &Root) const; @@ -176,6 +177,70 @@ return nullptr; } +/// Given a register bank, and size in bits, return the smallest register class +/// that can represent that combination. +const TargetRegisterClass *getMinClassForRegBank(const RegisterBank &RB, + unsigned SizeInBits, + bool GetAllRegSet = false) { + unsigned RegBankID = RB.getID(); + + if (RegBankID == AArch64::GPRRegBankID) { + if (SizeInBits <= 32) + return GetAllRegSet ? &AArch64::GPR32allRegClass + : &AArch64::GPR32RegClass; + if (SizeInBits == 64) + return GetAllRegSet ? &AArch64::GPR64allRegClass + : &AArch64::GPR64RegClass; + } + + if (RegBankID == AArch64::FPRRegBankID) { + switch (SizeInBits) { + default: + return nullptr; + case 8: + return &AArch64::FPR8RegClass; + case 16: + return &AArch64::FPR16RegClass; + case 32: + return &AArch64::FPR32RegClass; + case 64: + return &AArch64::FPR64RegClass; + case 128: + return &AArch64::FPR128RegClass; + } + } + + return nullptr; +} + +/// Returns the correct subregister to use for a given register class. +static bool getSubRegForClass(const TargetRegisterClass *RC, + const TargetRegisterInfo &TRI, unsigned &SubReg) { + switch (TRI.getRegSizeInBits(*RC)) { + case 8: + SubReg = AArch64::bsub; + break; + case 16: + SubReg = AArch64::hsub; + break; + case 32: + if (RC == &AArch64::GPR32RegClass) + SubReg = AArch64::sub_32; + else + SubReg = AArch64::ssub; + break; + case 64: + SubReg = AArch64::dsub; + break; + default: + LLVM_DEBUG( + dbgs() << "Couldn't find appropriate subregister for register class."); + return false; + } + + return true; +} + /// Check whether \p I is a currently unsupported binary operation: /// - it has an unsized type /// - an operand is not a vreg @@ -331,107 +396,178 @@ return GenericOpc; } -static bool selectFP16CopyFromGPR32(MachineInstr &I, const TargetInstrInfo &TII, - MachineRegisterInfo &MRI, unsigned SrcReg) { - // Copies from gpr32 to fpr16 need to use a sub-register copy. 
- unsigned CopyReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass); - BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::COPY)) - .addDef(CopyReg) - .addUse(SrcReg); - unsigned SubRegCopy = MRI.createVirtualRegister(&AArch64::FPR16RegClass); - BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::COPY)) - .addDef(SubRegCopy) - .addUse(CopyReg, 0, AArch64::hsub); - - MachineOperand &RegOp = I.getOperand(1); - RegOp.setReg(SubRegCopy); - return true; -} - -static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, - MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, - const RegisterBankInfo &RBI) { - - unsigned DstReg = I.getOperand(0).getReg(); - unsigned SrcReg = I.getOperand(1).getReg(); - - if (TargetRegisterInfo::isPhysicalRegister(DstReg)) { - if (TRI.getRegClass(AArch64::FPR16RegClassID)->contains(DstReg) && - !TargetRegisterInfo::isPhysicalRegister(SrcReg)) { - const RegisterBank &RegBank = *RBI.getRegBank(SrcReg, MRI, TRI); - const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank( - MRI.getType(SrcReg), RegBank, RBI, /* GetAllRegSet */ true); - if (SrcRC == &AArch64::GPR32allRegClass) - return selectFP16CopyFromGPR32(I, TII, MRI, SrcReg); - } - assert(I.isCopy() && "Generic operators do not allow physical registers"); - return true; - } - - const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI); - const unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); - (void)DstSize; +/// Helper function that verifies that we have a valid copy at the end of +/// selectCopy. Verifies that the source and dest have the expected sizes and +/// then returns true. +static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank, + const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI, + const RegisterBankInfo &RBI) { + const unsigned DstReg = I.getOperand(0).getReg(); + const unsigned SrcReg = I.getOperand(1).getReg(); + const unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI); const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI); - (void)SrcSize; - assert((!TargetRegisterInfo::isPhysicalRegister(SrcReg) || I.isCopy()) && - "No phys reg on generic operators"); + + // Make sure the size of the source and dest line up. assert( (DstSize == SrcSize || // Copies are a mean to setup initial types, the number of // bits may not exactly match. - (TargetRegisterInfo::isPhysicalRegister(SrcReg) && - DstSize <= RBI.getSizeInBits(SrcReg, MRI, TRI)) || + (TargetRegisterInfo::isPhysicalRegister(SrcReg) && DstSize <= SrcSize) || // Copies are a mean to copy bits around, as long as we are // on the same register class, that's fine. Otherwise, that // means we need some SUBREG_TO_REG or AND & co. (((DstSize + 31) / 32 == (SrcSize + 31) / 32) && DstSize > SrcSize)) && "Copy with different width?!"); - assert((DstSize <= 64 || RegBank.getID() == AArch64::FPRRegBankID) && + + // Check the size of the destination. + assert((DstSize <= 64 || DstBank.getID() == AArch64::FPRRegBankID) && "GPRs cannot get more than 64-bit width values"); - const TargetRegisterClass *RC = getRegClassForTypeOnBank( - MRI.getType(DstReg), RegBank, RBI, /* GetAllRegSet */ true); - if (!RC) { - LLVM_DEBUG(dbgs() << "Unexpected bitcast size " << DstSize << '\n'); + return true; +} + +/// Helper function for selectCopy. Inserts a subregister copy from +/// \p *From to \p *To, linking it up to \p I. 
+/// +/// e.g, given I = "Dst = COPY SrcReg", we'll transform that into +/// +/// CopyReg (From class) = COPY SrcReg +/// SubRegCopy (To class) = COPY CopyReg:SubReg +/// Dst = COPY SubRegCopy +static bool selectSubregisterCopy(MachineInstr &I, const TargetInstrInfo &TII, + MachineRegisterInfo &MRI, + const RegisterBankInfo &RBI, unsigned SrcReg, + const TargetRegisterClass *From, + const TargetRegisterClass *To, + unsigned SubReg) { + unsigned CopyReg = MRI.createVirtualRegister(From); + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::COPY), CopyReg) + .addUse(SrcReg); + unsigned SubRegCopy = MRI.createVirtualRegister(To); + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::COPY), + SubRegCopy) + .addUse(CopyReg, 0, SubReg); + MachineOperand &RegOp = I.getOperand(1); + RegOp.setReg(SubRegCopy); + + // It's possible that the destination register won't be constrained. Make + // sure that happens. + if (!TargetRegisterInfo::isPhysicalRegister(I.getOperand(0).getReg())) + RBI.constrainGenericRegister(I.getOperand(0).getReg(), *To, MRI); + + return true; +} + +static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, + MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, + const RegisterBankInfo &RBI) { + + unsigned DstReg = I.getOperand(0).getReg(); + unsigned SrcReg = I.getOperand(1).getReg(); + const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); + const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI); + const TargetRegisterClass *DstRC = getMinClassForRegBank( + DstRegBank, RBI.getSizeInBits(DstReg, MRI, TRI), true); + if (!DstRC) { + LLVM_DEBUG(dbgs() << "Unexpected dest size " + << RBI.getSizeInBits(DstReg, MRI, TRI) << '\n'); return false; } - if (!TargetRegisterInfo::isPhysicalRegister(SrcReg)) { - const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(SrcReg); - const TargetRegisterClass *SrcRC = - RegClassOrBank.dyn_cast(); - const RegisterBank *RB = nullptr; + // A couple helpers below, for making sure that the copy we produce is valid. + + // Set to true if we insert a SUBREG_TO_REG. If we do this, then we don't want + // to verify that the src and dst are the same size, since that's handled by + // the SUBREG_TO_REG. + bool KnownValid = false; + + // Returns true, or asserts if something we don't expect happens. Instead of + // returning true, we return isValidCopy() to ensure that we verify the + // result. + auto CheckCopy = [&I, &DstRegBank, &MRI, &TRI, &RBI, &KnownValid]() { + // If we have a bitcast or something, we can't have physical registers. + assert( + I.isCopy() || + (!TargetRegisterInfo::isPhysicalRegister(I.getOperand(0).getReg()) && + !TargetRegisterInfo::isPhysicalRegister(I.getOperand(1).getReg())) && + "No phys reg on generic operator!"); + assert(KnownValid || isValidCopy(I, DstRegBank, MRI, TRI, RBI)); + return true; + }; + + // Is this a copy? If so, then we may need to insert a subregister copy, or + // a SUBREG_TO_REG. + if (I.isCopy()) { + // Yes. Check if there's anything to fix up. + const TargetRegisterClass *SrcRC = getMinClassForRegBank( + SrcRegBank, RBI.getSizeInBits(SrcReg, MRI, TRI), true); if (!SrcRC) { - RB = RegClassOrBank.get(); - SrcRC = getRegClassForTypeOnBank(MRI.getType(SrcReg), *RB, RBI, true); + LLVM_DEBUG(dbgs() << "Couldn't determine source register class\n"); + return false; } - // Copies from fpr16 to gpr32 need to use SUBREG_TO_REG. 
- if (RC == &AArch64::GPR32allRegClass && SrcRC == &AArch64::FPR16RegClass) { - unsigned PromoteReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass); - BuildMI(*I.getParent(), I, I.getDebugLoc(), - TII.get(AArch64::SUBREG_TO_REG)) - .addDef(PromoteReg) - .addImm(0) - .addUse(SrcReg) - .addImm(AArch64::hsub); - MachineOperand &RegOp = I.getOperand(1); - RegOp.setReg(PromoteReg); - } else if (RC == &AArch64::FPR16RegClass && - SrcRC == &AArch64::GPR32allRegClass) { - selectFP16CopyFromGPR32(I, TII, MRI, SrcReg); + + // Is this a cross-bank copy? + if (DstRegBank.getID() != SrcRegBank.getID()) { + // If we're doing a cross-bank copy on different-sized registers, we need + // to do a bit more work. + unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC); + unsigned DstSize = TRI.getRegSizeInBits(*DstRC); + + if (SrcSize > DstSize) { + // We're doing a cross-bank copy into a smaller register. We need a + // subregister copy. First, get a register class that's on the same bank + // as the destination, but the same size as the source. + const TargetRegisterClass *SubregRC = + getMinClassForRegBank(DstRegBank, SrcSize, true); + + // Get the appropriate subregister for the destination. + unsigned SubReg = 0; + if (!getSubRegForClass(DstRC, TRI, SubReg)) { + LLVM_DEBUG(dbgs() << "Couldn't determine subregister for copy.\n"); + return false; + } + + // Now, insert a subregister copy using the new register class. + selectSubregisterCopy(I, TII, MRI, RBI, SrcReg, SubregRC, DstRC, + SubReg); + return CheckCopy(); + } + + else if (DstRegBank.getID() == AArch64::GPRRegBankID && DstSize == 32 && + SrcSize == 16) { + // Special case for FPR16 to GPR32. + // FIXME: This can probably be generalized like the above case. + unsigned PromoteReg = + MRI.createVirtualRegister(&AArch64::FPR32RegClass); + BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(AArch64::SUBREG_TO_REG), PromoteReg) + .addImm(0) + .addUse(SrcReg) + .addImm(AArch64::hsub); + MachineOperand &RegOp = I.getOperand(1); + RegOp.setReg(PromoteReg); + + // Promise that the copy is implicitly validated by the SUBREG_TO_REG. + KnownValid = true; + } } + + // If the destination is a physical register, then there's nothing to + // change, so we're done. + if (TargetRegisterInfo::isPhysicalRegister(DstReg)) + return CheckCopy(); } - // No need to constrain SrcReg. It will get constrained when - // we hit another of its use or its defs. - // Copies do not have constraints. - if (!RBI.constrainGenericRegister(DstReg, *RC, MRI)) { + // No need to constrain SrcReg. It will get constrained when we hit another + // of its use or its defs. Copies do not have constraints. 
+  if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
     LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
                       << " operand\n");
     return false;
   }
   I.setDesc(TII.get(AArch64::COPY));
-  return true;
+  return CheckCopy();
 }
 
 static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) {
@@ -1555,6 +1691,8 @@
     return selectBuildVector(I, MRI);
   case TargetOpcode::G_MERGE_VALUES:
     return selectMergeValues(I, MRI);
+  case TargetOpcode::G_UNMERGE_VALUES:
+    return selectUnmergeValues(I, MRI);
   }
 
   return false;
@@ -1583,6 +1721,8 @@
   };
 
   switch (DstTy.getElementType().getSizeInBits()) {
+  case 16:
+    return BuildFn(AArch64::hsub);
   case 32:
     return BuildFn(AArch64::ssub);
   case 64:
@@ -1638,6 +1778,135 @@
   return true;
 }
 
+bool AArch64InstructionSelector::selectUnmergeValues(
+    MachineInstr &I, MachineRegisterInfo &MRI) const {
+  assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
+         "unexpected opcode");
+
+  // TODO: Handle unmerging into GPRs.
+  if (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() !=
+      AArch64::FPRRegBankID) {
+    LLVM_DEBUG(dbgs() << "Vector-to-GPR unmerges not supported yet.\n");
+    return false;
+  }
+
+  // The last operand is the vector source register, and every other operand is
+  // a register to unpack into.
+  unsigned NumElts = I.getNumOperands() - 1;
+  unsigned SrcReg = I.getOperand(NumElts).getReg();
+  const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
+  const LLT WideTy = MRI.getType(SrcReg);
+  assert(WideTy.isVector() && "can only unmerge from vector types!");
+  assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() &&
+         "source register size too small!");
+
+  // TODO: Handle unmerging into scalars.
+  if (!NarrowTy.isScalar()) {
+    LLVM_DEBUG(dbgs() << "Vector-to-vector unmerges not supported yet.\n");
+    return false;
+  }
+
+  // Choose a lane copy opcode and subregister based off of the size of the
+  // vector's elements.
+  unsigned CopyOpc = 0;
+  unsigned ExtractSubReg = 0;
+  switch (NarrowTy.getSizeInBits()) {
+  case 16:
+    CopyOpc = AArch64::CPYi16;
+    ExtractSubReg = AArch64::hsub;
+    break;
+  case 32:
+    CopyOpc = AArch64::CPYi32;
+    ExtractSubReg = AArch64::ssub;
+    break;
+  case 64:
+    CopyOpc = AArch64::CPYi64;
+    ExtractSubReg = AArch64::dsub;
+    break;
+  default:
+    // Unknown size, bail out.
+    LLVM_DEBUG(dbgs() << "NarrowTy had unsupported size.\n");
+    return false;
+  }
+
+  // Set up for the lane copies.
+  MachineBasicBlock &MBB = *I.getParent();
+
+  // Stores the registers we'll be copying from.
+  std::vector<unsigned> InsertRegs;
+
+  // We'll use the first register twice, so we only need NumElts-1 registers.
+  unsigned NumInsertRegs = NumElts - 1;
+
+  // If our elements fit into exactly 128 bits, then we can copy from the source
+  // directly. Otherwise, we need to do a bit of setup with some subregister
+  // inserts.
+  if (NarrowTy.getSizeInBits() * NumElts == 128) {
+    InsertRegs = std::vector<unsigned>(NumInsertRegs, SrcReg);
+  } else {
+    // No. We have to perform subregister inserts. For each insert, create an
+    // implicit def and a subregister insert, and save the register we create.
+    InsertRegs.reserve(NumInsertRegs);
+    for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) {
+      unsigned ImpDefReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
+      MachineInstr &ImpDefMI =
+          *BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::IMPLICIT_DEF),
+                   ImpDefReg);
+
+      // Now, create the subregister insert from SrcReg.
+ unsigned InsertReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass); + MachineInstr &InsMI = + *BuildMI(MBB, I, I.getDebugLoc(), + TII.get(TargetOpcode::INSERT_SUBREG), InsertReg) + .addUse(ImpDefReg) + .addUse(SrcReg) + .addImm(AArch64::dsub); + + constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI); + constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI); + + // Save the register so that we can copy from it after. + InsertRegs.push_back(InsertReg); + } + } + + // Now that we've created any necessary subregister inserts, we can + // create the copies. + // + // Perform the first copy separately as a subregister copy. + unsigned CopyTo = I.getOperand(0).getReg(); + MachineInstr &FirstCopy = + *BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::COPY), CopyTo) + .addUse(InsertRegs[0], 0, ExtractSubReg); + constrainSelectedInstRegOperands(FirstCopy, TII, TRI, RBI); + + // Now, perform the remaining copies as vector lane copies. + unsigned LaneIdx = 1; + for (unsigned InsReg : InsertRegs) { + unsigned CopyTo = I.getOperand(LaneIdx).getReg(); + MachineInstr &CopyInst = + *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CopyOpc), CopyTo) + .addUse(InsReg) + .addImm(LaneIdx); + constrainSelectedInstRegOperands(CopyInst, TII, TRI, RBI); + ++LaneIdx; + } + + // Separately constrain the first copy's destination. Because of the + // limitation in constrainOperandRegClass, we can't guarantee that this will + // actually be constrained. So, do it ourselves using the second operand. + const TargetRegisterClass *RC = + MRI.getRegClassOrNull(I.getOperand(1).getReg()); + if (!RC) { + LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n"); + return false; + } + + RBI.constrainGenericRegister(CopyTo, *RC, MRI); + I.eraseFromParent(); + return true; +} + bool AArch64InstructionSelector::selectBuildVector( MachineInstr &I, MachineRegisterInfo &MRI) const { assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR); @@ -1646,7 +1915,7 @@ const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); const LLT EltTy = MRI.getType(I.getOperand(1).getReg()); unsigned EltSize = EltTy.getSizeInBits(); - if (EltSize < 32 || EltSize > 64) + if (EltSize < 16 || EltSize > 64) return false; // Don't support all element types yet. const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI); unsigned Opc; @@ -1660,7 +1929,10 @@ SubregIdx = AArch64::dsub; } } else { - if (EltSize == 32) { + if (EltSize == 16) { + Opc = AArch64::INSvi16lane; + SubregIdx = AArch64::hsub; + } else if (EltSize == 32) { Opc = AArch64::INSvi32lane; SubregIdx = AArch64::ssub; } else { @@ -1669,21 +1941,24 @@ } } - if (EltSize * DstTy.getNumElements() != 128) - return false; // Don't handle unpacked vectors yet. - unsigned DstVec = 0; - const TargetRegisterClass *DstRC = getRegClassForTypeOnBank( - DstTy, RBI.getRegBank(AArch64::FPRRegBankID), RBI); - emitScalarToVector(DstVec, DstTy, DstRC, I.getOperand(1).getReg(), - *I.getParent(), I.getIterator(), MRI); - for (unsigned i = 2, e = DstTy.getSizeInBits() / EltSize + 1; i < e; ++i) { + + const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass; + if (!emitScalarToVector(DstVec, DstTy, DstRC, I.getOperand(1).getReg(), + *I.getParent(), I.getIterator(), MRI)) + return false; + + unsigned DstSize = DstTy.getSizeInBits(); + + // Keep track of the last MI we inserted. Later on, we might be able to save + // a copy using it. 
+ MachineInstr *PrevMI = nullptr; + for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) { unsigned InsDef; - // For the last insert re-use the dst reg of the G_BUILD_VECTOR. - if (i + 1 < e) - InsDef = MRI.createVirtualRegister(DstRC); - else - InsDef = I.getOperand(0).getReg(); + + // Note that if we don't do a subregister copy, we end up making one more + // of these than we need. + InsDef = MRI.createVirtualRegister(DstRC); unsigned LaneIdx = i - 1; if (RB.getID() == AArch64::FPRRegBankID) { unsigned ImpDef = MRI.createVirtualRegister(DstRC); @@ -1708,6 +1983,7 @@ constrainSelectedInstRegOperands(InsSubMI, TII, TRI, RBI); constrainSelectedInstRegOperands(InsEltMI, TII, TRI, RBI); DstVec = InsDef; + PrevMI = &InsEltMI; } else { MachineInstr &InsMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opc)) @@ -1717,8 +1993,53 @@ .addUse(I.getOperand(i).getReg()); constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI); DstVec = InsDef; + PrevMI = &InsMI; } } + + // If DstTy's size in bits is less than 128, then emit a subregister copy + // from DstVec to the last register we've defined. + if (DstSize < 128) { + unsigned SubReg = 0; + + // Helper lambda to decide on a register class and subregister for the + // subregister copy. + auto GetRegInfoForCopy = [&SubReg, + &DstSize]() -> const TargetRegisterClass * { + switch (DstSize) { + default: + LLVM_DEBUG(dbgs() << "Unknown destination size (" << DstSize << ")\n"); + return nullptr; + case 32: + SubReg = AArch64::ssub; + return &AArch64::FPR32RegClass; + case 64: + SubReg = AArch64::dsub; + return &AArch64::FPR64RegClass; + } + }; + + const TargetRegisterClass *RC = GetRegInfoForCopy(); + if (!RC) + return false; + + unsigned Reg = MRI.createVirtualRegister(RC); + unsigned DstReg = I.getOperand(0).getReg(); + + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::COPY), + DstReg) + .addUse(DstVec, 0, SubReg); + MachineOperand &RegOp = I.getOperand(1); + RegOp.setReg(Reg); + RBI.constrainGenericRegister(DstReg, *RC, MRI); + } else { + // We don't need a subregister copy. Save a copy by re-using the + // destination register on the final insert. + assert(PrevMI && "PrevMI was null?"); + PrevMI->getOperand(0).setReg(I.getOperand(0).getReg()); + constrainSelectedInstRegOperands(*PrevMI, TII, TRI, RBI); + } + I.eraseFromParent(); return true; } Index: lib/Target/AArch64/AArch64LegalizerInfo.cpp =================================================================== --- lib/Target/AArch64/AArch64LegalizerInfo.cpp +++ lib/Target/AArch64/AArch64LegalizerInfo.cpp @@ -124,6 +124,15 @@ getActionDefinitionsBuilder({G_FREM, G_FPOW}).libcallFor({s32, s64}); getActionDefinitionsBuilder(G_FCEIL) + // If we don't have full FP16 support, then scalarize the elements of + // vectors containing fp16 types. + .fewerElementsIf( + [=, &ST](const LegalityQuery &Query) { + const auto &Ty = Query.Types[0]; + return Ty.isVector() && Ty.getElementType() == s16 && + !ST.hasFullFP16(); + }, + [=](const LegalityQuery &Query) { return std::make_pair(0, s16); }) // If we don't have full FP16 support, then widen s16 to s32 if we // encounter it. 
.widenScalarIf( @@ -131,7 +140,7 @@ return Query.Types[0] == s16 && !ST.hasFullFP16(); }, [=](const LegalityQuery &Query) { return std::make_pair(0, s32); }) - .legalFor({s16, s32, s64, v2s32, v4s32, v2s64}); + .legalFor({s16, s32, s64, v2s32, v4s32, v2s64, v2s16, v4s16, v8s16}); getActionDefinitionsBuilder(G_INSERT) .unsupportedIf([=](const LegalityQuery &Query) { @@ -435,7 +444,7 @@ }); getActionDefinitionsBuilder(G_BUILD_VECTOR) - .legalFor({{v4s32, s32}, {v2s64, s64}}) + .legalFor({{v4s16, s16}, {v8s16, s16}, {v4s32, s32}, {v2s64, s64}}) .clampNumElements(0, v4s32, v4s32) .clampNumElements(0, v2s64, v2s64) Index: lib/Target/AArch64/AArch64RegisterBankInfo.cpp =================================================================== --- lib/Target/AArch64/AArch64RegisterBankInfo.cpp +++ lib/Target/AArch64/AArch64RegisterBankInfo.cpp @@ -635,6 +635,62 @@ OpRegBankIdx[0] = PMI_FirstFPR; break; } + break; + case TargetOpcode::G_UNMERGE_VALUES: { + // If the first operand belongs to a FPR register bank, then make sure that + // we preserve that. + if (OpRegBankIdx[0] != PMI_FirstGPR) + break; + + // Helper lambda that returns true if MI has floating point constraints. + auto HasFPConstraints = [&TRI, &MRI, this](MachineInstr &MI) { + unsigned Op = MI.getOpcode(); + + // Do we have an explicit floating point instruction? + if (isPreISelGenericFloatingPointOpcode(Op)) + return true; + + // No. Check if we have a copy-like instruction. If we do, then we could + // still be fed by floating point instructions. + if (Op != TargetOpcode::COPY && !MI.isPHI()) + return false; + + // MI is copy-like. Return true if it's using an FPR. + return getRegBank(MI.getOperand(0).getReg(), MRI, TRI) == + &AArch64::FPRRegBank; + }; + + if (any_of(MRI.use_instructions(MI.getOperand(0).getReg()), + [&](MachineInstr &MI) { return HasFPConstraints(MI); })) { + // Set the register bank of every operand to FPR. + for (unsigned Idx = 0, NumOperands = MI.getNumOperands(); + Idx < NumOperands; ++Idx) + OpRegBankIdx[Idx] = PMI_FirstFPR; + } + break; + } + + case TargetOpcode::G_BUILD_VECTOR: + // If the first operand belongs to a FPR register bank, then make sure + // that we preserve that. + if (OpRegBankIdx[1] != PMI_FirstGPR) + break; + unsigned VReg = MI.getOperand(1).getReg(); + if (!VReg) + break; + + // Get the instruction that defined the BUILD_VECTOR, and check if it's + // a floating point operation. + MachineInstr *DefMI = MRI.getVRegDef(VReg); + unsigned DefOpc = DefMI->getOpcode(); + if (isPreISelGenericFloatingPointOpcode(DefOpc)) { + // Have a floating point op. + // Make sure every operand gets mapped to a FPR register class. + unsigned NumOperands = MI.getNumOperands(); + for (unsigned Idx = 0; Idx < NumOperands; ++Idx) + OpRegBankIdx[Idx] = PMI_FirstFPR; + } + break; } // Finally construct the computed mapping. Index: test/CodeGen/AArch64/GlobalISel/legalize-ceil.mir =================================================================== --- /dev/null +++ test/CodeGen/AArch64/GlobalISel/legalize-ceil.mir @@ -0,0 +1,86 @@ +# RUN: llc -mtriple=arm64-unknown-unknown -global-isel -O0 -mattr=-fullfp16 -run-pass=legalizer %s -o - | FileCheck %s + +--- | + define <8 x half> @test_v8f16.ceil(<8 x half> %a) { + ret <8 x half> %a + } + + define <4 x half> @test_v4f16.ceil(<4 x half> %a) { + ret <4 x half> %a + } + +... 
+--- +name: test_v8f16.ceil +alignment: 2 +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.1 (%ir-block.0): + liveins: $q0 + ; CHECK-LABEL: name: test_v8f16.ceil + %0:_(<8 x s16>) = COPY $q0 + ; CHECK: %{{[0-9]+}}:_(s16), %{{[0-9]+}}:_(s16), %{{[0-9]+}}:_(s16), %{{[0-9]+}}:_(s16), %{{[0-9]+}}:_(s16), %{{[0-9]+}}:_(s16), %{{[0-9]+}}:_(s16), %{{[0-9]+}}:_(s16) = G_UNMERGE_VALUES %{{[0-9]+}}(<8 x s16>) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}} + ; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}} + ; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}} + ; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}} + ; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}} + ; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}} + ; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}} + ; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}} + ; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32) + ; CHECK: %{{[0-9]+}}:_(<8 x s16>) = G_BUILD_VECTOR %{{[0-9]+}}(s16), %{{[0-9]+}}(s16), %{{[0-9]+}}(s16), %{{[0-9]+}}(s16), %{{[0-9]+}}(s16), %{{[0-9]+}}(s16), %{{[0-9]+}}(s16), %{{[0-9]+}}(s16) + %1:_(<8 x s16>) = G_FCEIL %0 + $q0 = COPY %1(<8 x s16>) + RET_ReallyLR implicit $q0 + +... +--- +name: test_v4f16.ceil +alignment: 2 +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.1 (%ir-block.0): + liveins: $d0 + ; CHECK-LABEL: name: test_v4f16.ceil + %0:_(<4 x s16>) = COPY $d0 + ; CHECK: %{{[0-9]+}}:_(s16), %{{[0-9]+}}:_(s16), %{{[0-9]+}}:_(s16) = G_UNMERGE_VALUES %{{[0-9]+}}(<4 x s16>) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}} + ; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}} + ; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}} + ; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16) + ; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}} + ; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32) + ; CHECK: %{{[0-9]+}}:_(<4 x s16>) = G_BUILD_VECTOR %{{[0-9]+}}(s16), %{{[0-9]+}}(s16), %{{[0-9]+}}(s16), %{{[0-9]+}}(s16) + %1:_(<4 x s16>) = G_FCEIL %0 + $d0 = COPY %1(<4 x s16>) + RET_ReallyLR implicit $d0 + +... 
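Note: the per-lane G_FPEXT / G_FCEIL / G_FPTRUNC sequences checked above come from the two new G_FCEIL rules composing: without full FP16 the vector is first scalarized by the fewerElementsIf rule, and each resulting s16 ceil is then widened to s32 by the widenScalarIf rule. A rough IR-level reproducer for the same legalization (illustrative only, not part of the patch; the file name and the -stop-after flag are assumptions about how one would drive llc here):

  ; llc -mtriple=aarch64 -mattr=-fullfp16 -global-isel -stop-after=legalizer ceil.ll
  declare <4 x half> @llvm.ceil.v4f16(<4 x half>)

  define <4 x half> @ceil_v4f16(<4 x half> %a) {
    %r = call <4 x half> @llvm.ceil.v4f16(<4 x half> %a)
    ret <4 x half> %r
  }
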
Index: test/CodeGen/AArch64/GlobalISel/select-ceil.mir =================================================================== --- test/CodeGen/AArch64/GlobalISel/select-ceil.mir +++ test/CodeGen/AArch64/GlobalISel/select-ceil.mir @@ -1,5 +1,6 @@ # RUN: llc -verify-machineinstrs -mtriple aarch64--- \ -# RUN: -run-pass=instruction-select -global-isel %s -o - | FileCheck %s +# RUN: -run-pass=instruction-select -mattr=+fullfp16 -global-isel %s -o - \ +# RUN: | FileCheck %s ... --- name: ceil_float @@ -91,3 +92,39 @@ $q0 = COPY %1(<2 x s64>) ... +--- +name: ceil_v4f16 +legalized: true +regBankSelected: true +tracksRegLiveness: true +registers: + - { id: 0, class: fpr } + - { id: 1, class: fpr } +body: | + bb.0: + ; CHECK-LABEL: name: ceil_v4f16 + ; CHECK: %{{[0-9]+}}:fpr64 = FRINTPv4f16 %{{[0-9]+}} + liveins: $d0 + %0:fpr(<4 x s16>) = COPY $d0 + %1:fpr(<4 x s16>) = G_FCEIL %0 + $d0 = COPY %1(<4 x s16>) + +... +--- +name: ceil_v8f16 +legalized: true +regBankSelected: true +tracksRegLiveness: true +registers: + - { id: 0, class: fpr } + - { id: 1, class: fpr } +body: | + bb.0: + ; CHECK-LABEL: name: ceil_v8f16 + ; CHECK: %{{[0-9]+}}:fpr128 = FRINTPv8f16 %{{[0-9]+}} + liveins: $q0 + %0:fpr(<8 x s16>) = COPY $q0 + %1:fpr(<8 x s16>) = G_FCEIL %0 + $q0 = COPY %1(<8 x s16>) + +... Index: test/CodeGen/AArch64/GlobalISel/select-unmerge.mir =================================================================== --- /dev/null +++ test/CodeGen/AArch64/GlobalISel/select-unmerge.mir @@ -0,0 +1,154 @@ + +# RUN: llc -O0 -mattr=-fullfp16 -mtriple=aarch64-- \ +# RUN: -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s + +--- | + define <2 x double> @test_v2s64_unmerge(<2 x double> %a) { + ret <2 x double> %a + } + + define <4 x float> @test_v4s32_unmerge(<4 x float> %a) { + ret <4 x float> %a + } + + define <4 x half> @test_v4s16_unmerge(<4 x half> %a) { + ret <4 x half> %a + } + + define <8 x half> @test_v8s16_unmerge(<8 x half> %a) { + ret <8 x half> %a + } + +... +--- +name: test_v2s64_unmerge +alignment: 2 +legalized: true +regBankSelected: true +tracksRegLiveness: true +registers: + - { id: 0, class: fpr } + - { id: 1, class: fpr } + - { id: 2, class: fpr } + - { id: 3, class: fpr } +body: | + bb.1 (%ir-block.0): + liveins: $q0 + ; CHECK-LABEL: name: test_v2s64_unmerge + %0:fpr(<2 x s64>) = COPY $q0 + + ; Since 2 * 64 = 128, we can just directly copy. + ; CHECK: %2:fpr64 = COPY %0.dsub + ; CHECK: %3:fpr64 = CPYi64 %0, 1 + %2:fpr(s64), %3:fpr(s64) = G_UNMERGE_VALUES %0(<2 x s64>) + + %1:fpr(<2 x s64>) = G_BUILD_VECTOR %2(s64), %3(s64) + $q0 = COPY %1(<2 x s64>) + RET_ReallyLR implicit $q0 +... +--- +name: test_v4s32_unmerge +alignment: 2 +legalized: true +regBankSelected: true +tracksRegLiveness: true +registers: + - { id: 0, class: fpr } + - { id: 1, class: fpr } + - { id: 2, class: fpr } + - { id: 3, class: fpr } + - { id: 4, class: fpr } + - { id: 5, class: fpr } +body: | + bb.1 (%ir-block.0): + liveins: $q0 + ; CHECK-LABEL: name: test_v4s32_unmerge + %0:fpr(<4 x s32>) = COPY $q0 + + ; Since 4 * 32 = 128, we can just directly copy. + ; CHECK: %2:fpr32 = COPY %0.ssub + ; CHECK: %3:fpr32 = CPYi32 %0, 1 + ; CHECK: %4:fpr32 = CPYi32 %0, 2 + ; CHECK: %5:fpr32 = CPYi32 %0, 3 + %2:fpr(s32), %3:fpr(s32), %4:fpr(s32), %5:fpr(s32) = G_UNMERGE_VALUES %0(<4 x s32>) + + %1:fpr(<4 x s32>) = G_BUILD_VECTOR %2(s32), %3(s32), %4(s32), %5(s32) + $q0 = COPY %1(<4 x s32>) + RET_ReallyLR implicit $q0 +... 
+--- +name: test_v4s16_unmerge +alignment: 2 +legalized: true +regBankSelected: true +tracksRegLiveness: true +registers: + - { id: 0, class: fpr } + - { id: 1, class: fpr } + - { id: 2, class: fpr } + - { id: 3, class: fpr } + - { id: 4, class: fpr } + - { id: 5, class: fpr } +body: | + bb.1 (%ir-block.0): + liveins: $d0 + ; CHECK-LABEL: name: test_v4s16_unmerge + %0:fpr(<4 x s16>) = COPY $d0 + + ; Since 4 * 16 != 128, we need to widen using implicit defs. + ; Note that we expect to reuse one of the INSERT_SUBREG results, as CPYi16 + ; expects a lane > 0. + ; CHECK-DAG: [[IMPDEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF + ; CHECK-NEXT: [[INS_SHARED:%[0-9]+]]:fpr128 = INSERT_SUBREG [[IMPDEF1]], %0, %subreg.dsub + ; CHECK: [[IMPDEF2:%[0-9]+]]:fpr128 = IMPLICIT_DEF + ; CHECK-NEXT: [[INS2:%[0-9]+]]:fpr128 = INSERT_SUBREG [[IMPDEF2]], %0, %subreg.dsub + ; CHECK: [[IMPDEF3:%[0-9]+]]:fpr128 = IMPLICIT_DEF + ; CHECK-NEXT: [[INS3:%[0-9]+]]:fpr128 = INSERT_SUBREG [[IMPDEF3]], %0, %subreg.dsub + ; CHECK: %2:fpr16 = COPY [[INS_SHARED]].hsub + ; CHECK: %3:fpr16 = CPYi16 [[INS_SHARED]], 1 + ; CHECK: %4:fpr16 = CPYi16 [[INS2]], 2 + ; CHECK: %5:fpr16 = CPYi16 [[INS3]], 3 + %2:fpr(s16), %3:fpr(s16), %4:fpr(s16), %5:fpr(s16) = G_UNMERGE_VALUES %0(<4 x s16>) + + %1:fpr(<4 x s16>) = G_BUILD_VECTOR %2(s16), %3(s16), %4(s16), %5(s16) + $d0 = COPY %1(<4 x s16>) + RET_ReallyLR implicit $d0 +... +--- +name: test_v8s16_unmerge +alignment: 2 +legalized: true +regBankSelected: true +tracksRegLiveness: true +registers: + - { id: 0, class: fpr } + - { id: 1, class: fpr } + - { id: 2, class: fpr } + - { id: 3, class: fpr } + - { id: 4, class: fpr } + - { id: 5, class: fpr } + - { id: 6, class: fpr } + - { id: 7, class: fpr } + - { id: 8, class: fpr } + - { id: 9, class: fpr } +body: | + bb.1 (%ir-block.0): + liveins: $q0 + ; CHECK-LABEL: name: test_v8s16_unmerge + %0:fpr(<8 x s16>) = COPY $q0 + + ; Since 8 * 16 = 128, we can just directly copy. + ; CHECK: %2:fpr16 = COPY %0.hsub + ; CHECK: %3:fpr16 = CPYi16 %0, 1 + ; CHECK: %4:fpr16 = CPYi16 %0, 2 + ; CHECK: %5:fpr16 = CPYi16 %0, 3 + ; CHECK: %6:fpr16 = CPYi16 %0, 4 + ; CHECK: %7:fpr16 = CPYi16 %0, 5 + ; CHECK: %8:fpr16 = CPYi16 %0, 6 + ; CHECK: %9:fpr16 = CPYi16 %0, 7 + %2:fpr(s16), %3:fpr(s16), %4:fpr(s16), %5:fpr(s16), %6:fpr(s16), %7:fpr(s16), %8:fpr(s16), %9:fpr(s16) = G_UNMERGE_VALUES %0(<8 x s16>) + + %1:fpr(<8 x s16>) = G_BUILD_VECTOR %2:fpr(s16), %3:fpr(s16), %4:fpr(s16), %5:fpr(s16), %6:fpr(s16), %7:fpr(s16), %8:fpr(s16), %9:fpr(s16) + $q0 = COPY %1(<8 x s16>) + RET_ReallyLR implicit $q0 +... 
Index: test/CodeGen/AArch64/arm64-vfloatintrinsics.ll =================================================================== --- test/CodeGen/AArch64/arm64-vfloatintrinsics.ll +++ test/CodeGen/AArch64/arm64-vfloatintrinsics.ll @@ -3,6 +3,13 @@ ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -mattr=+fullfp16 \ ; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP16 +; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -mattr=-fullfp16 \ +; RUN: -global-isel -global-isel-abort=2 -pass-remarks-missed=gisel* \ +; RUN: 2>&1 | FileCheck %s --check-prefixes=GISEL,GISEL-NOFP16,FALLBACK +; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -mattr=+fullfp16 \ +; RUN: -global-isel -global-isel-abort=2 -pass-remarks-missed=gisel* \ +; RUN: 2>&1 | FileCheck %s --check-prefixes=GISEL,GISEL-FP16,FALLBACK + ;;; Half vectors %v4f16 = type <4 x half> @@ -111,6 +118,12 @@ ; CHECK-FP16-NOT: fcvt ; CHECK-FP16: frintp.4h ; CHECK-FP16-NEXT: ret + ; FALLBACK-NOT: remark{{.*}}test_v4f16.ceil: + ; GISEL-LABEL: test_v4f16.ceil: + ; GISEL-NOFP16-COUNT-4: frintp s{{[0-9]+}}, s{{[0-9]+}} + ; GISEL-FP16-NOT: fcvt + ; GISEL-FP16: frintp.4h + ; GISEL-FP16-NEXT: ret %1 = call %v4f16 @llvm.ceil.v4f16(%v4f16 %a) ret %v4f16 %1 } @@ -268,6 +281,12 @@ ; CHECK-FP16-NOT: fcvt ; CHECK-FP16: frintp.8h ; CHECK-FP16-NEXT: ret + ; FALLBACK-NOT: remark{{.*}}test_v8f16.ceil: + ; GISEL-LABEL: test_v8f16.ceil: + ; GISEL-NOFP16-COUNT-8: frintp s{{[0-9]+}}, s{{[0-9]+}} + ; GISEL-FP16-NOT: fcvt + ; GISEL-FP16: frintp.8h + ; GISEL-FP16-NEXT: ret %1 = call %v8f16 @llvm.ceil.v8f16(%v8f16 %a) ret %v8f16 %1 } @@ -400,8 +419,11 @@ ret %v2f32 %1 } ; CHECK-LABEL: test_v2f32.ceil: +; FALLBACK-NOT: remark{{.*}}test_v2f32.ceil +; GISEL-LABEL: test_v2f32.ceil: define %v2f32 @test_v2f32.ceil(%v2f32 %a) { ; CHECK: frintp.2s + ; GISEL: frintp.2s %1 = call %v2f32 @llvm.ceil.v2f32(%v2f32 %a) ret %v2f32 %1 } @@ -525,8 +547,11 @@ ret %v4f32 %1 } ; CHECK: test_v4f32.ceil: +; FALLBACK-NOT: remark{{.*}}test_v4f32.ceil +; GISEL-LABEL: test_v4f32.ceil: define %v4f32 @test_v4f32.ceil(%v4f32 %a) { ; CHECK: frintp.4s + ; GISEL: frintp.4s %1 = call %v4f32 @llvm.ceil.v4f32(%v4f32 %a) ret %v4f32 %1 } @@ -649,8 +674,11 @@ ret %v2f64 %1 } ; CHECK: test_v2f64.ceil: +; FALLBACK-NOT: remark{{.*}}test_v2f64.ceil +; GISEL-LABEL: test_v2f64.ceil: define %v2f64 @test_v2f64.ceil(%v2f64 %a) { ; CHECK: frintp.2d + ; GISEL: frintp.2d %1 = call %v2f64 @llvm.ceil.v2f64(%v2f64 %a) ret %v2f64 %1 }
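
Note: as a quick end-to-end check of the new FP16 vector path, an IR snippet along the lines of the updated intrinsics test can be fed straight to llc; with -mattr=+fullfp16 and GlobalISel enabled this is expected to select a single v8f16 frintp rather than falling back (a sketch, not part of the patch; exact operand syntax depends on the chosen -aarch64-neon-syntax):

  ; llc -mtriple=arm64-eabi -mattr=+fullfp16 -global-isel ceil-v8f16.ll -o -
  declare <8 x half> @llvm.ceil.v8f16(<8 x half>)

  define <8 x half> @ceil_v8f16(<8 x half> %a) {
    %r = call <8 x half> @llvm.ceil.v8f16(<8 x half> %a)
    ret <8 x half> %r
  }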