Index: llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -68,6 +68,8 @@
   bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                     unsigned BitSize);
+  bool expand_DestructiveOp(MachineInstr &MI, MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI);
   bool expandCMP_SWAP(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                       unsigned LdarOp, unsigned StlrOp, unsigned CmpOp,
                       unsigned ExtendImm, unsigned ZeroReg,
@@ -344,6 +346,176 @@
   return true;
 }
 
+/// Expand Pseudos to Instructions with destructive operands.
+///
+/// This mechanism uses MOVPRFX instructions for zeroing the false lanes
+/// or for fixing relaxed register allocation conditions to comply with
+/// the instructions' register constraints. The latter case may be cheaper
+/// than setting the register constraints in the register allocator,
+/// since that would insert regular MOV instructions rather than MOVPRFX.
+///
+/// Example (after register allocation):
+///
+///   FSUB_ZPZZ_ZERO_B Z0, Pg, Z1, Z0
+///
+/// * The Pseudo FSUB_ZPZZ_ZERO_B maps to FSUB_ZPmZ_B.
+/// * We cannot map directly to FSUB_ZPmZ_B because the register
+///   constraints of the instruction are not met.
+/// * The _ZERO suffix additionally specifies that the false lanes need
+///   to be zeroed.
+///
+/// We first check whether the destructive operand is already equal to the
+/// result operand; if not, we try to swap the operands, e.g.
+///
+///   FSUB_ZPmZ_B  Z0, Pg/m, Z0, Z1
+///
+/// But because FSUB_ZPmZ is not commutative, this is semantically
+/// different, so we need a reverse instruction:
+///
+///   FSUBR_ZPmZ_B  Z0, Pg/m, Z0, Z1
+///
+/// Then we implement the zeroing of the false lanes of Z0 by adding
+/// a zeroing MOVPRFX instruction:
+///
+///   MOVPRFX_ZPzZ_B Z0, Pg/z, Z0
+///   FSUBR_ZPmZ_B   Z0, Pg/m, Z0, Z1
+///
+/// Note that this can only be done for the _ZERO or _UNDEF variants, where
+/// we can guarantee that the false lanes are zeroed (by inserting the
+/// zeroing MOVPRFX) or that they are undef (don't care / not used);
+/// otherwise the swapping of operands is illegal because the operation is
+/// not (or cannot be emulated to be) fully commutative.
+bool AArch64ExpandPseudo::expand_DestructiveOp(
+    MachineInstr &MI,
+    MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator MBBI) {
+  unsigned Opcode = AArch64::getSVEPseudoMap(MI.getOpcode());
+  uint64_t DType = TII->get(Opcode).TSFlags & AArch64::DestructiveInstTypeMask;
+  uint64_t FalseLanes = MI.getDesc().TSFlags & AArch64::FalseLanesMask;
+  bool FalseZero = FalseLanes == AArch64::FalseLanesZero;
+
+  unsigned DstReg = MI.getOperand(0).getReg();
+  bool DstIsDead = MI.getOperand(0).isDead();
+
+  if (DType == AArch64::DestructiveBinary)
+    assert(DstReg != MI.getOperand(3).getReg());
+
+  bool UseRev = false;
+  unsigned PredIdx, DOPIdx, SrcIdx;
+  switch (DType) {
+  case AArch64::DestructiveBinaryComm:
+  case AArch64::DestructiveBinaryCommWithRev:
+    if (DstReg == MI.getOperand(3).getReg()) {
+      // FSUB Zd, Pg, Zs1, Zd  ==>  FSUBR Zd, Pg/m, Zd, Zs1
+      std::tie(PredIdx, DOPIdx, SrcIdx) = std::make_tuple(1, 3, 2);
+      UseRev = true;
+      break;
+    }
+    LLVM_FALLTHROUGH;
+  case AArch64::DestructiveBinary:
+    std::tie(PredIdx, DOPIdx, SrcIdx) = std::make_tuple(1, 2, 3);
+    break;
+  default:
+    llvm_unreachable("Unsupported Destructive Operand type");
+  }
+
+#ifndef NDEBUG
+  // MOVPRFX can only be used if the destination operand
+  // is the destructive operand, not as any other operand,
+  // so the Destructive Operand must be unique.
+  bool DOPRegIsUnique = false;
+  switch (DType) {
+  case AArch64::DestructiveBinaryComm:
+  case AArch64::DestructiveBinaryCommWithRev:
+    DOPRegIsUnique =
+        DstReg != MI.getOperand(DOPIdx).getReg() ||
+        MI.getOperand(DOPIdx).getReg() != MI.getOperand(SrcIdx).getReg();
+    break;
+  }
+
+  assert(DOPRegIsUnique && "The destructive operand should be unique");
+#endif
+
+  // Resolve the reverse opcode.
+  if (UseRev) {
+    if (AArch64::getSVERevInstr(Opcode) != -1)
+      Opcode = AArch64::getSVERevInstr(Opcode);
+    else if (AArch64::getSVEOrigInstr(Opcode) != -1)
+      Opcode = AArch64::getSVEOrigInstr(Opcode);
+  }
+
+  // Get the right MOVPRFX.
+  uint64_t ElementSize = TII->getElementSizeForOpcode(Opcode);
+  unsigned MovPrfx, MovPrfxZero;
+  switch (ElementSize) {
+  case AArch64::ElementSizeNone:
+  case AArch64::ElementSizeB:
+    MovPrfx = AArch64::MOVPRFX_ZZ;
+    MovPrfxZero = AArch64::MOVPRFX_ZPzZ_B;
+    break;
+  case AArch64::ElementSizeH:
+    MovPrfx = AArch64::MOVPRFX_ZZ;
+    MovPrfxZero = AArch64::MOVPRFX_ZPzZ_H;
+    break;
+  case AArch64::ElementSizeS:
+    MovPrfx = AArch64::MOVPRFX_ZZ;
+    MovPrfxZero = AArch64::MOVPRFX_ZPzZ_S;
+    break;
+  case AArch64::ElementSizeD:
+    MovPrfx = AArch64::MOVPRFX_ZZ;
+    MovPrfxZero = AArch64::MOVPRFX_ZPzZ_D;
+    break;
+  default:
+    llvm_unreachable("Unsupported ElementSize");
+  }
+
+  //
+  // Create the MOVPRFX (if required)
+  //
+  MachineInstrBuilder PRFX, DOP;
+  if (FalseZero) {
+    assert(ElementSize != AArch64::ElementSizeNone &&
+           "This instruction is unpredicated");
+
+    // Copy the destructive operand into the destination register while
+    // zeroing its false lanes.
+    PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfxZero))
+               .addReg(DstReg, RegState::Define)
+               .addReg(MI.getOperand(PredIdx).getReg())
+               .addReg(MI.getOperand(DOPIdx).getReg());
+
+    // After the movprfx, the destructive operand is the same as Dst.
+    DOPIdx = 0;
+  } else if (DstReg != MI.getOperand(DOPIdx).getReg()) {
+    PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfx))
+               .addReg(DstReg, RegState::Define)
+               .addReg(MI.getOperand(DOPIdx).getReg());
+    DOPIdx = 0;
+  }
+
+  //
+  // Create the destructive operation
+  //
+  DOP = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode))
+            .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead));
+
+  switch (DType) {
+  case AArch64::DestructiveBinaryComm:
+  case AArch64::DestructiveBinaryCommWithRev:
+    DOP.add(MI.getOperand(PredIdx))
+        .addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill)
+        .add(MI.getOperand(SrcIdx));
+    break;
+  }
+
+  if (PRFX) {
+    finalizeBundle(MBB, PRFX->getIterator(), MBBI->getIterator());
+    transferImpOps(MI, PRFX, DOP);
+  } else
+    transferImpOps(MI, DOP, DOP);
+
+  MI.eraseFromParent();
+  return true;
+}
+
 bool AArch64ExpandPseudo::expandSetTagLoop(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
     MachineBasicBlock::iterator &NextMBBI) {
@@ -425,6 +597,17 @@
     MachineBasicBlock::iterator &NextMBBI) {
   MachineInstr &MI = *MBBI;
   unsigned Opcode = MI.getOpcode();
+
+  // Check if we can expand the destructive op
+  int OrigInstr = AArch64::getSVEPseudoMap(MI.getOpcode());
+  if (OrigInstr != -1) {
+    auto &Orig = TII->get(OrigInstr);
+    if ((Orig.TSFlags & AArch64::DestructiveInstTypeMask)
+        != AArch64::NotDestructive) {
+      return expand_DestructiveOp(MI, MBB, MBBI);
+    }
+  }
+
   switch (Opcode) {
   default:
     break;
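For reference, the operand-index decision made above can be summarized in isolation. The sketch below is not part of the patch and does not use the LLVM API; DType, OperandPlan and planDestructiveBinary are illustrative names, and the logic simply mirrors the switch in expand_DestructiveOp for the binary cases this patch handles.

// Simplified mirror of AArch64::DestructiveInstType, for illustration only.
enum class DType { Binary, BinaryComm, BinaryCommWithRev };

struct OperandPlan {
  unsigned PredIdx, DOPIdx, SrcIdx; // operand indices into the pseudo
  bool UseRev;                      // ask expansion to pick the reverse mnemonic
};

// If the destination register already holds the second source, the sources are
// swapped so that the destructive operand coincides with the destination (a
// requirement for MOVPRFX); UseRev then requests FSUBR/FDIVR-style opcodes.
OperandPlan planDestructiveBinary(DType Ty, unsigned DstReg, unsigned Src2Reg) {
  if ((Ty == DType::BinaryComm || Ty == DType::BinaryCommWithRev) &&
      DstReg == Src2Reg)
    return {/*PredIdx=*/1, /*DOPIdx=*/3, /*SrcIdx=*/2, /*UseRev=*/true};
  return {/*PredIdx=*/1, /*DOPIdx=*/2, /*SrcIdx=*/3, /*UseRev=*/false};
}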
Index: llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -162,6 +162,22 @@
     return false;
   }
 
+  bool SelectDupZero(SDValue N) {
+    switch(N->getOpcode()) {
+    case ISD::BUILD_VECTOR: {
+      auto Splat = cast<BuildVectorSDNode>(N)->getSplatValue();
+      if (auto CN = dyn_cast<ConstantFPSDNode>(Splat))
+        if (CN->isZero())
+          return true;
+      break;
+    }
+    default:
+      break;
+    }
+
+    return false;
+  }
+
   template<MVT::SimpleValueType VT>
   bool SelectSVEAddSubImm(SDValue N, SDValue &Imm, SDValue &Shift) {
     return SelectSVEAddSubImm(N, VT, Imm, Shift);
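SelectDupZero lets patterns recognize the "select %pg, %x, zeroinitializer" idiom that the new _ZERO pseudos are selected from. As a point of reference only, a scalar model of the semantics the MOVPRFX (Pg/z) expansion has to reproduce for, say, a predicated FSUB could look like this (the helper name is illustrative and not part of the patch):

#include <cstddef>

// Reference semantics of a zeroing predicated FSUB: active lanes compute
// a - b, inactive lanes are forced to zero instead of being preserved.
void fsubZeroingReference(const bool *Pg, const float *A, const float *B,
                          float *Out, std::size_t N) {
  for (std::size_t I = 0; I != N; ++I)
    Out[I] = Pg[I] ? A[I] - B[I] : 0.0f;
}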
Index: llvm/lib/Target/AArch64/AArch64InstrFormats.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -22,13 +22,27 @@
 
 // Enum describing whether an instruction is
 // destructive in its first source operand.
-class DestructiveInstTypeEnum<bits<1> val> {
-  bits<1> Value = val;
+class DestructiveInstTypeEnum<bits<4> val> {
+  bits<4> Value = val;
 }
-def NotDestructive : DestructiveInstTypeEnum<0>;
+def NotDestructive                : DestructiveInstTypeEnum<0>;
 // Destructive in its first operand and can be MOVPRFX'd, but has no other
 // special properties.
-def DestructiveOther : DestructiveInstTypeEnum<1>;
+def DestructiveOther              : DestructiveInstTypeEnum<1>;
+def DestructiveUnary              : DestructiveInstTypeEnum<2>;
+def DestructiveBinaryImm          : DestructiveInstTypeEnum<3>;
+def DestructiveBinaryShImmUnpred  : DestructiveInstTypeEnum<4>;
+def DestructiveBinary             : DestructiveInstTypeEnum<5>;
+def DestructiveBinaryComm         : DestructiveInstTypeEnum<6>;
+def DestructiveBinaryCommWithRev  : DestructiveInstTypeEnum<7>;
+def DestructiveTernaryCommWithRev : DestructiveInstTypeEnum<8>;
+
+class FalseLanesEnum<bits<2> val> {
+  bits<2> Value = val;
+}
+def FalseLanesNone  : FalseLanesEnum<0>;
+def FalseLanesZero  : FalseLanesEnum<1>;
+def FalseLanesUndef : FalseLanesEnum<2>;
 
 // AArch64 Instruction Format
 class AArch64Inst<Format f, string cstr> : Instruction {
@@ -46,10 +60,12 @@
   bits<2> Form = F.Value;
 
   // Defaults
+  FalseLanesEnum FalseLanes = FalseLanesNone;
   DestructiveInstTypeEnum DestructiveInstType = NotDestructive;
   ElementSizeEnum ElementSize = ElementSizeNone;
-  let TSFlags{3} = DestructiveInstType.Value;
+  let TSFlags{8-7} = FalseLanes.Value;
+  let TSFlags{6-3} = DestructiveInstType.Value;
   let TSFlags{2-0} = ElementSize.Value;
 
   let Pattern = [];
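The TableGen changes above pack three fields into TSFlags; the next file exposes the same bits as C++ masks. As a quick reference, the layout can be sketched with constants that mirror the patch (the helper names here are illustrative):

#include <cstdint>

// TSFlags layout established above: bits [2:0] element size,
// bits [6:3] destructive-instruction type, bits [8:7] false-lanes kind.
constexpr uint64_t ElementSizeMask         = 0x7;       // TSFlags{2-0}
constexpr uint64_t DestructiveInstTypeMask = 0xf << 3;  // TSFlags{6-3}
constexpr uint64_t FalseLanesMask          = 0x3 << 7;  // TSFlags{8-7}

// The enum values in AArch64InstrInfo.h are stored pre-shifted, so a masked
// TSFlags word can be compared against them directly, as expand_DestructiveOp
// does with DestructiveBinaryComm and friends.
constexpr uint64_t destructiveType(uint64_t TSFlags) {
  return TSFlags & DestructiveInstTypeMask;
}
constexpr uint64_t falseLanes(uint64_t TSFlags) {
  return TSFlags & FalseLanesMask;
}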
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -264,6 +264,8 @@
                           MachineBasicBlock::iterator &It, MachineFunction &MF,
                           const outliner::Candidate &C) const override;
   bool shouldOutlineFromFunctionByDefault(MachineFunction &MF) const override;
+  /// Returns the vector element size (B, H, S or D) of an SVE opcode.
+  uint64_t getElementSizeForOpcode(unsigned Opc) const;
   /// Returns true if the instruction has a shift by immediate that can be
   /// executed in one cycle less.
   static bool isFalkorShiftExtFast(const MachineInstr &MI);
@@ -288,6 +290,8 @@
   isCopyInstrImpl(const MachineInstr &MI) const override;
 
 private:
+  unsigned getInstBundleLength(const MachineInstr &MI) const;
+
   /// Sets the offsets on outlined instructions in \p MBB which use SP
   /// so that they will be valid post-outlining.
   ///
@@ -374,7 +378,8 @@
 // struct TSFlags {
 #define TSFLAG_ELEMENT_SIZE_TYPE(X)      (X)        // 3-bits
-#define TSFLAG_DESTRUCTIVE_INST_TYPE(X) ((X) << 3)  // 1-bit
+#define TSFLAG_DESTRUCTIVE_INST_TYPE(X) ((X) << 3)  // 4-bits
+#define TSFLAG_FALSE_LANE_TYPE(X)       ((X) << 7)  // 2-bits
 // }
 
 namespace AArch64 {
@@ -389,13 +394,31 @@
 };
 
 enum DestructiveInstType {
-  DestructiveInstTypeMask = TSFLAG_DESTRUCTIVE_INST_TYPE(0x1),
-  NotDestructive          = TSFLAG_DESTRUCTIVE_INST_TYPE(0x0),
-  DestructiveOther        = TSFLAG_DESTRUCTIVE_INST_TYPE(0x1),
+  DestructiveInstTypeMask       = TSFLAG_DESTRUCTIVE_INST_TYPE(0xf),
+  NotDestructive                = TSFLAG_DESTRUCTIVE_INST_TYPE(0x0),
+  DestructiveOther              = TSFLAG_DESTRUCTIVE_INST_TYPE(0x1),
+  DestructiveUnary              = TSFLAG_DESTRUCTIVE_INST_TYPE(0x2),
+  DestructiveBinaryImm          = TSFLAG_DESTRUCTIVE_INST_TYPE(0x3),
+  DestructiveBinaryShImmUnpred  = TSFLAG_DESTRUCTIVE_INST_TYPE(0x4),
+  DestructiveBinary             = TSFLAG_DESTRUCTIVE_INST_TYPE(0x5),
+  DestructiveBinaryComm         = TSFLAG_DESTRUCTIVE_INST_TYPE(0x6),
+  DestructiveBinaryCommWithRev  = TSFLAG_DESTRUCTIVE_INST_TYPE(0x7),
+  DestructiveTernaryCommWithRev = TSFLAG_DESTRUCTIVE_INST_TYPE(0x8),
+};
+
+enum FalseLaneType {
+  FalseLanesMask  = TSFLAG_FALSE_LANE_TYPE(0x3),
+  FalseLanesZero  = TSFLAG_FALSE_LANE_TYPE(0x1),
+  FalseLanesUndef = TSFLAG_FALSE_LANE_TYPE(0x2),
 };
 
 #undef TSFLAG_ELEMENT_SIZE_TYPE
 #undef TSFLAG_DESTRUCTIVE_INST_TYPE
+#undef TSFLAG_FALSE_LANE_TYPE
+
+int getSVEPseudoMap(uint16_t Opcode);
+int getSVERevInstr(uint16_t Opcode);
+int getSVEOrigInstr(uint16_t Opcode);
 
 }
 } // end namespace llvm
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -119,11 +119,25 @@
   case AArch64::SPACE:
     NumBytes = MI.getOperand(1).getImm();
     break;
+  case TargetOpcode::BUNDLE:
+    NumBytes = getInstBundleLength(MI);
+    break;
   }
 
   return NumBytes;
 }
 
+unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
+  unsigned Size = 0;
+  MachineBasicBlock::const_instr_iterator I = MI.getIterator();
+  MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
+  while (++I != E && I->isInsideBundle()) {
+    assert(!I->isBundle() && "No nested bundle!");
+    Size += getInstSizeInBytes(*I);
+  }
+  return Size;
+}
+
 static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
                             SmallVectorImpl<MachineOperand> &Cond) {
   // Block ends with fall-through condbranch.
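getInstBundleLength makes getInstSizeInBytes meaningful for the BUNDLE headers that the MOVPRFX expansion starts emitting: conceptually it is just a sum over the instructions glued behind the header. A standalone sketch with a simplified instruction record (illustrative types, not the MachineInstr API) is:

#include <cstddef>
#include <vector>

struct InstrModel {
  unsigned SizeInBytes;
  bool InsideBundle; // true for instructions bundled behind a BUNDLE header
};

// Size of the bundle whose header sits at index BundleIdx: the sum of the
// sizes of the following instructions that are still inside the bundle.
unsigned bundleLength(const std::vector<InstrModel> &Stream,
                      std::size_t BundleIdx) {
  unsigned Size = 0;
  for (std::size_t I = BundleIdx + 1;
       I != Stream.size() && Stream[I].InsideBundle; ++I)
    Size += Stream[I].SizeInBytes;
  return Size;
}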
@@ -6667,5 +6681,10 @@ return TargetInstrInfo::describeLoadedValue(MI, Reg); } +uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const { + return get(Opc).TSFlags & AArch64::ElementSizeMask; +} + #define GET_INSTRINFO_HELPERS +#define GET_INSTRMAP_INFO #include "AArch64GenInstrInfo.inc" Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td =================================================================== --- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -220,19 +220,32 @@ defm FMAX_ZPmI : sve_fp_2op_i_p_zds<0b110, "fmax", sve_fpimm_zero_one>; defm FMIN_ZPmI : sve_fp_2op_i_p_zds<0b111, "fmin", sve_fpimm_zero_one>; - defm FADD_ZPmZ : sve_fp_2op_p_zds<0b0000, "fadd", int_aarch64_sve_fadd>; - defm FSUB_ZPmZ : sve_fp_2op_p_zds<0b0001, "fsub", int_aarch64_sve_fsub>; - defm FMUL_ZPmZ : sve_fp_2op_p_zds<0b0010, "fmul", int_aarch64_sve_fmul>; - defm FSUBR_ZPmZ : sve_fp_2op_p_zds<0b0011, "fsubr", int_aarch64_sve_fsubr>; - defm FMAXNM_ZPmZ : sve_fp_2op_p_zds<0b0100, "fmaxnm", int_aarch64_sve_fmaxnm>; - defm FMINNM_ZPmZ : sve_fp_2op_p_zds<0b0101, "fminnm", int_aarch64_sve_fminnm>; - defm FMAX_ZPmZ : sve_fp_2op_p_zds<0b0110, "fmax", int_aarch64_sve_fmax>; - defm FMIN_ZPmZ : sve_fp_2op_p_zds<0b0111, "fmin", int_aarch64_sve_fmin>; - defm FABD_ZPmZ : sve_fp_2op_p_zds<0b1000, "fabd", int_aarch64_sve_fabd>; + defm FADD_ZPmZ : sve_fp_2op_p_zds<0b0000, "fadd", "FADD_ZPZZ", int_aarch64_sve_fadd, DestructiveBinaryComm>; + defm FSUB_ZPmZ : sve_fp_2op_p_zds<0b0001, "fsub", "FSUB_ZPZZ", int_aarch64_sve_fsub, DestructiveBinaryCommWithRev, "FSUBR_ZPmZ", 1>; + defm FMUL_ZPmZ : sve_fp_2op_p_zds<0b0010, "fmul", "FMUL_ZPZZ", int_aarch64_sve_fmul, DestructiveBinaryComm>; + defm FSUBR_ZPmZ : sve_fp_2op_p_zds<0b0011, "fsubr", "FSUBR_ZPZZ", int_aarch64_sve_fsubr, DestructiveBinaryCommWithRev, "FSUB_ZPmZ", 0>; + defm FMAXNM_ZPmZ : sve_fp_2op_p_zds<0b0100, "fmaxnm", "FMAXNM_ZPZZ", int_aarch64_sve_fmaxnm, DestructiveBinaryComm>; + defm FMINNM_ZPmZ : sve_fp_2op_p_zds<0b0101, "fminnm", "FMINNM_ZPZZ", int_aarch64_sve_fminnm, DestructiveBinaryComm>; + defm FMAX_ZPmZ : sve_fp_2op_p_zds<0b0110, "fmax", "FMAX_ZPZZ", int_aarch64_sve_fmax, DestructiveBinaryComm>; + defm FMIN_ZPmZ : sve_fp_2op_p_zds<0b0111, "fmin", "FMIN_ZPZZ", int_aarch64_sve_fmin, DestructiveBinaryComm>; + defm FABD_ZPmZ : sve_fp_2op_p_zds<0b1000, "fabd", "FABD_ZPZZ", int_aarch64_sve_fabd, DestructiveBinaryComm>; defm FSCALE_ZPmZ : sve_fp_2op_p_zds_fscale<0b1001, "fscale", int_aarch64_sve_fscale>; - defm FMULX_ZPmZ : sve_fp_2op_p_zds<0b1010, "fmulx", int_aarch64_sve_fmulx>; - defm FDIVR_ZPmZ : sve_fp_2op_p_zds<0b1100, "fdivr", int_aarch64_sve_fdivr>; - defm FDIV_ZPmZ : sve_fp_2op_p_zds<0b1101, "fdiv", int_aarch64_sve_fdiv>; + defm FMULX_ZPmZ : sve_fp_2op_p_zds<0b1010, "fmulx", "FMULX_ZPZZ", int_aarch64_sve_fmulx, DestructiveBinaryComm>; + defm FDIVR_ZPmZ : sve_fp_2op_p_zds<0b1100, "fdivr", "FDIVR_ZPZZ", int_aarch64_sve_fdivr, DestructiveBinaryCommWithRev, "FDIV_ZPmZ", 0>; + defm FDIV_ZPmZ : sve_fp_2op_p_zds<0b1101, "fdiv", "FDIV_ZPZZ", int_aarch64_sve_fdiv, DestructiveBinaryCommWithRev, "FDIVR_ZPmZ", 1>; + + defm FADD_ZPZZ : sve_fp_2op_p_zds_zx; + defm FSUB_ZPZZ : sve_fp_2op_p_zds_zx; + defm FMUL_ZPZZ : sve_fp_2op_p_zds_zx; + defm FSUBR_ZPZZ : sve_fp_2op_p_zds_zx; + defm FMAXNM_ZPZZ : sve_fp_2op_p_zds_zx; + defm FMINNM_ZPZZ : sve_fp_2op_p_zds_zx; + defm FMAX_ZPZZ : sve_fp_2op_p_zds_zx; + defm FMIN_ZPZZ : sve_fp_2op_p_zds_zx; + defm FABD_ZPZZ : sve_fp_2op_p_zds_zx; + defm FMULX_ZPZZ : 
sve_fp_2op_p_zds_zx;
+  defm FDIVR_ZPZZ  : sve_fp_2op_p_zds_zx;
+  defm FDIV_ZPZZ   : sve_fp_2op_p_zds_zx;
 
   defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd", fadd>;
   defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub", fsub>;
Index: llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -644,4 +644,7 @@
   if (TM->getOptLevel() != CodeGenOpt::None && EnableCollectLOH &&
       TM->getTargetTriple().isOSBinFormatMachO())
     addPass(createAArch64CollectLOHPass());
+
+  // SVE bundles move prefixes with destructive operations.
+  addPass(createUnpackMachineBundles(nullptr));
 }
Index: llvm/lib/Target/AArch64/SVEInstrFormats.td
===================================================================
--- llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -367,8 +367,16 @@
 : Pat<(vtd (op vt1:$Op1, vt2:$Op2, vt3:$Op3, (vt4 ImmTy:$Op4))),
       (inst $Op1, $Op2, $Op3, ImmTy:$Op4)>;
 
+def SVEDup0 : ComplexPattern;
 def SVEDup0Undef : ComplexPattern;
 
+let AddedComplexity = 1 in {
+class SVE_3_Op_Pat_SelZero<ValueType vtd, SDPatternOperator op, ValueType vt1,
+                           ValueType vt2, ValueType vt3, Instruction inst>
+: Pat<(vtd (vtd (op vt1:$Op1, (vselect vt1:$Op1, vt2:$Op2, (SVEDup0)), vt3:$Op3))),
+      (inst $Op1, $Op2, $Op3)>;
+}
+
 //
 // Common but less generic patterns.
 //
@@ -378,6 +386,55 @@
 : Pat<(vtd (op vt1:$Op1)),
       (inst (IMPLICIT_DEF), (ptrue 31), $Op1)>;
 
+//
+// Pseudo -> Instruction mappings
+//
+def getSVEPseudoMap : InstrMapping {
+  let FilterClass = "SVEPseudo2Instr";
+  let RowFields = ["PseudoName"];
+  let ColFields = ["IsInstr"];
+  let KeyCol = ["0"];
+  let ValueCols = [["1"]];
+}
+
+class SVEPseudo2Instr<string name, bit instr> {
+  string PseudoName = name;
+  bit IsInstr = instr;
+}
+
+def getSVERevInstr : InstrMapping {
+  let FilterClass = "SVEInstr2Rev";
+  let RowFields = ["InstrName"];
+  let ColFields = ["IsOrig"];
+  let KeyCol = ["1"];
+  let ValueCols = [["0"]];
+}
+
+def getSVEOrigInstr : InstrMapping {
+  let FilterClass = "SVEInstr2Rev";
+  let RowFields = ["InstrName"];
+  let ColFields = ["IsOrig"];
+  let KeyCol = ["0"];
+  let ValueCols = [["1"]];
+}
+
+class SVEInstr2Rev<string name, string revname, bit nameIsOrig> {
+  string InstrName = !if(nameIsOrig, name, revname);
+  bit IsOrig = nameIsOrig;
+}
+
+//
+// Pseudos for destructive operands
+//
+let hasNoSchedulingInfo = 1 in {
+  class PredTwoOpPseudo<string name, ZPRRegOp zprty,
+                        FalseLanesEnum flags = FalseLanesNone>
+  : SVEPseudo2Instr<name, 0>,
+    Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, zprty:$Zs2), []> {
+    let FalseLanes = flags;
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // SVE Predicate Misc Group
 //===----------------------------------------------------------------------===//
@@ -1427,11 +1484,17 @@
   let ElementSize = zprty.ElementSize;
 }
 
-multiclass sve_fp_2op_p_zds<bits<4> opc, string asm,
-                            SDPatternOperator op> {
-  def _H : sve_fp_2op_p_zds<0b01, opc, asm, ZPR16>;
-  def _S : sve_fp_2op_p_zds<0b10, opc, asm, ZPR32>;
-  def _D : sve_fp_2op_p_zds<0b11, opc, asm, ZPR64>;
+multiclass sve_fp_2op_p_zds<bits<4> opc, string asm, string Ps,
+                            SDPatternOperator op, DestructiveInstTypeEnum flags,
+                            string revname="", bit isOrig=0> {
+  let DestructiveInstType = flags in {
+  def _H : sve_fp_2op_p_zds<0b01, opc, asm, ZPR16>,
+           SVEPseudo2Instr<Ps # _H, 1>, SVEInstr2Rev<NAME # _H, revname # _H, isOrig>;
+  def _S : sve_fp_2op_p_zds<0b10, opc, asm, ZPR32>,
+           SVEPseudo2Instr<Ps # _S, 1>, SVEInstr2Rev<NAME # _S, revname # _S, isOrig>;
+  def _D : sve_fp_2op_p_zds<0b11, opc, asm, ZPR64>,
+           SVEPseudo2Instr<Ps # _D, 1>, SVEInstr2Rev<NAME # _D, revname # _D, isOrig>;
+  }
 
   def : SVE_3_Op_Pat(NAME # _H)>;
   def : SVE_3_Op_Pat(NAME # _S)>;
@@ -1449,6 +1512,16 @@
   def : SVE_3_Op_Pat(NAME # _D)>;
 }
 
+multiclass
sve_fp_2op_p_zds_zx { + def _ZERO_H : PredTwoOpPseudo; + def _ZERO_S : PredTwoOpPseudo; + def _ZERO_D : PredTwoOpPseudo; + + def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_H)>; + def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_S)>; + def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_D)>; +} + class sve_fp_ftmad sz, string asm, ZPRRegOp zprty> : I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm, imm32_0_7:$imm3), asm, "\t$Zdn, $_Zdn, $Zm, $imm3", Index: llvm/test/CodeGen/AArch64/O0-pipeline.ll =================================================================== --- llvm/test/CodeGen/AArch64/O0-pipeline.ll +++ llvm/test/CodeGen/AArch64/O0-pipeline.ll @@ -68,6 +68,7 @@ ; CHECK-NEXT: Implement the 'patchable-function' attribute ; CHECK-NEXT: AArch64 Branch Targets ; CHECK-NEXT: Branch relaxation pass +; CHECK-NEXT: Unpack machine instruction bundles ; CHECK-NEXT: Contiguously Lay Out Funclets ; CHECK-NEXT: StackMap Liveness Analysis ; CHECK-NEXT: Live DEBUG_VALUE analysis Index: llvm/test/CodeGen/AArch64/O3-pipeline.ll =================================================================== --- llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -178,6 +178,7 @@ ; CHECK-NEXT: AArch64 Branch Targets ; CHECK-NEXT: Branch relaxation pass ; CHECK-NEXT: AArch64 Compress Jump Tables +; CHECK-NEXT: Unpack machine instruction bundles ; CHECK-NEXT: Contiguously Lay Out Funclets ; CHECK-NEXT: StackMap Liveness Analysis ; CHECK-NEXT: Live DEBUG_VALUE analysis Index: llvm/test/CodeGen/AArch64/sve-intrinsics-fp-arith-merging.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/sve-intrinsics-fp-arith-merging.ll @@ -0,0 +1,330 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; +; FADD +; + +define @fadd_s( %pg, %a, %b) { +; CHECK-LABEL: fadd_s: +; CHECK: movprfx z0.s, p0/z, z0.s +; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.fadd.nxv4f32( %pg, + %a_z, + %b) + ret %out +} + +define @fadd_d( %pg, %a, %b) { +; CHECK-LABEL: fadd_d: +; CHECK: movprfx z0.d, p0/z, z0.d +; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.fadd.nxv2f64( %pg, + %a_z, + %b) + ret %out +} + +; +; FMAX +; + +define @fmax_s( %pg, %a, %b) { +; CHECK-LABEL: fmax_s: +; CHECK: movprfx z0.s, p0/z, z0.s +; CHECK-NEXT: fmax z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.fmax.nxv4f32( %pg, + %a_z, + %b) + ret %out +} + +define @fmax_d( %pg, %a, %b) { +; CHECK-LABEL: fmax_d: +; CHECK: movprfx z0.d, p0/z, z0.d +; CHECK-NEXT: fmax z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.fmax.nxv2f64( %pg, + %a_z, + %b) + ret %out +} + +; +; FMAXNM +; + +define @fmaxnm_s( %pg, %a, %b) { +; CHECK-LABEL: fmaxnm_s: +; CHECK: movprfx z0.s, p0/z, z0.s +; CHECK-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.fmaxnm.nxv4f32( %pg, + %a_z, + %b) + ret %out +} + +define @fmaxnm_d( %pg, %a, %b) { +; CHECK-LABEL: fmaxnm_d: +; CHECK: movprfx z0.d, p0/z, z0.d +; CHECK-NEXT: fmaxnm z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.fmaxnm.nxv2f64( %pg, + %a_z, + %b) + ret %out +} + +; +; FMIN +; + +define @fmin_s( %pg, %a, %b) { +; 
CHECK-LABEL: fmin_s: +; CHECK: movprfx z0.s, p0/z, z0.s +; CHECK-NEXT: fmin z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.fmin.nxv4f32( %pg, + %a_z, + %b) + ret %out +} + +define @fmin_d( %pg, %a, %b) { +; CHECK-LABEL: fmin_d: +; CHECK: movprfx z0.d, p0/z, z0.d +; CHECK-NEXT: fmin z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.fmin.nxv2f64( %pg, + %a_z, + %b) + ret %out +} + +; +; FMINNM +; + +define @fminnm_s( %pg, %a, %b) { +; CHECK-LABEL: fminnm_s: +; CHECK: movprfx z0.s, p0/z, z0.s +; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.fminnm.nxv4f32( %pg, + %a_z, + %b) + ret %out +} + +define @fminnm_d( %pg, %a, %b) { +; CHECK-LABEL: fminnm_d: +; CHECK: movprfx z0.d, p0/z, z0.d +; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.fminnm.nxv2f64( %pg, + %a_z, + %b) + ret %out +} + +; +; FMUL +; + +define @fmul_s( %pg, %a, %b) { +; CHECK-LABEL: fmul_s: +; CHECK: movprfx z0.s, p0/z, z0.s +; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.fmul.nxv4f32( %pg, + %a_z, + %b) + ret %out +} + +define @fmul_d( %pg, %a, %b) { +; CHECK-LABEL: fmul_d: +; CHECK: movprfx z0.d, p0/z, z0.d +; CHECK-NEXT: fmul z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.fmul.nxv2f64( %pg, + %a_z, + %b) + ret %out +} + +; +; FSUB +; + +define @fsub_s( %pg, %a, %b) { +; CHECK-LABEL: fsub_s: +; CHECK: movprfx z0.s, p0/z, z0.s +; CHECK-NEXT: fsub z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.fsub.nxv4f32( %pg, + %a_z, + %b) + ret %out +} + +define @fsub_d( %pg, %a, %b) { +; CHECK-LABEL: fsub_d: +; CHECK: movprfx z0.d, p0/z, z0.d +; CHECK-NEXT: fsub z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.fsub.nxv2f64( %pg, + %a_z, + %b) + ret %out +} + +; +; FSUBR +; + +define @fsubr_s( %pg, %a, %b) { +; CHECK-LABEL: fsubr_s: +; CHECK: movprfx z0.s, p0/z, z0.s +; CHECK-NEXT: fsubr z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.fsubr.nxv4f32( %pg, + %a_z, + %b) + ret %out +} + +define @fsubr_d( %pg, %a, %b) { +; CHECK-LABEL: fsubr_d: +; CHECK: movprfx z0.d, p0/z, z0.d +; CHECK-NEXT: fsubr z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.fsubr.nxv2f64( %pg, + %a_z, + %b) + ret %out +} + +declare @llvm.aarch64.sve.fabd.nxv4f32(, , ) +declare @llvm.aarch64.sve.fabd.nxv2f64(, , ) + +declare @llvm.aarch64.sve.fabs.nxv4f32(, , ) +declare @llvm.aarch64.sve.fabs.nxv2f64(, , ) + +declare @llvm.aarch64.sve.fadd.nxv4f32(, , ) +declare @llvm.aarch64.sve.fadd.nxv2f64(, , ) + +declare @llvm.aarch64.sve.fcmla.lane.nxv4f32(, , , i32, i32) +declare @llvm.aarch64.sve.fcmla.lane.nxv2f64(, , , i32, i32) + +declare @llvm.aarch64.sve.fdiv.nxv4f32(, , ) +declare @llvm.aarch64.sve.fdiv.nxv2f64(, , ) + +declare @llvm.aarch64.sve.fdivr.nxv4f32(, , ) +declare @llvm.aarch64.sve.fdivr.nxv2f64(, , ) + +declare @llvm.aarch64.sve.fmad.nxv4f32(, , , ) +declare @llvm.aarch64.sve.fmad.nxv2f64(, , , ) + +declare @llvm.aarch64.sve.fmax.nxv4f32(, , ) 
+declare @llvm.aarch64.sve.fmax.nxv2f64(, , ) + +declare @llvm.aarch64.sve.fmaxnm.nxv4f32(, , ) +declare @llvm.aarch64.sve.fmaxnm.nxv2f64(, , ) + +declare @llvm.aarch64.sve.fmin.nxv4f32(, , ) +declare @llvm.aarch64.sve.fmin.nxv2f64(, , ) + +declare @llvm.aarch64.sve.fminnm.nxv4f32(, , ) +declare @llvm.aarch64.sve.fminnm.nxv2f64(, , ) + +declare @llvm.aarch64.sve.fmla.nxv4f32(, , , ) +declare @llvm.aarch64.sve.fmla.nxv2f64(, , , ) + +declare @llvm.aarch64.sve.fmla.lane.nxv4f32(, , , i32) +declare @llvm.aarch64.sve.fmla.lane.nxv2f64(, , , i32) + +declare @llvm.aarch64.sve.fmls.nxv4f32(, , , ) +declare @llvm.aarch64.sve.fmls.nxv2f64(, , , ) + +declare @llvm.aarch64.sve.fmls.lane.nxv4f32(, , , i32) +declare @llvm.aarch64.sve.fmls.lane.nxv2f64(, , , i32) + +declare @llvm.aarch64.sve.fmsb.nxv4f32(, , , ) +declare @llvm.aarch64.sve.fmsb.nxv2f64(, , , ) + +declare @llvm.aarch64.sve.fmul.nxv4f32(, , ) +declare @llvm.aarch64.sve.fmul.nxv2f64(, , ) + +declare @llvm.aarch64.sve.fmulx.nxv4f32(, , ) +declare @llvm.aarch64.sve.fmulx.nxv2f64(, , ) + +declare @llvm.aarch64.sve.fneg.nxv4f32(, , ) +declare @llvm.aarch64.sve.fneg.nxv2f64(, , ) + +declare @llvm.aarch64.sve.fnmad.nxv4f32(, , , ) +declare @llvm.aarch64.sve.fnmad.nxv2f64(, , , ) + +declare @llvm.aarch64.sve.fnmla.nxv4f32(, , , ) +declare @llvm.aarch64.sve.fnmla.nxv2f64(, , , ) + +declare @llvm.aarch64.sve.fnmls.nxv4f32(, , , ) +declare @llvm.aarch64.sve.fnmls.nxv2f64(, , , ) + +declare @llvm.aarch64.sve.fnmsb.nxv4f32(, , , ) +declare @llvm.aarch64.sve.fnmsb.nxv2f64(, , , ) + +declare @llvm.aarch64.sve.frecpx.nxv4f32(, , ) +declare @llvm.aarch64.sve.frecpx.nxv2f64(, , ) + +declare @llvm.aarch64.sve.frinta.nxv4f32(, , ) +declare @llvm.aarch64.sve.frinta.nxv2f64(, , ) + +declare @llvm.aarch64.sve.frinti.nxv4f32(, , ) +declare @llvm.aarch64.sve.frinti.nxv2f64(, , ) + +declare @llvm.aarch64.sve.frintm.nxv4f32(, , ) +declare @llvm.aarch64.sve.frintm.nxv2f64(, , ) + +declare @llvm.aarch64.sve.frintn.nxv4f32(, , ) +declare @llvm.aarch64.sve.frintn.nxv2f64(, , ) + +declare @llvm.aarch64.sve.frintp.nxv4f32(, , ) +declare @llvm.aarch64.sve.frintp.nxv2f64(, , ) + +declare @llvm.aarch64.sve.frintx.nxv4f32(, , ) +declare @llvm.aarch64.sve.frintx.nxv2f64(, , ) + +declare @llvm.aarch64.sve.frintz.nxv4f32(, , ) +declare @llvm.aarch64.sve.frintz.nxv2f64(, , ) + +declare @llvm.aarch64.sve.fscale.nxv4f32(, , ) +declare @llvm.aarch64.sve.fscale.nxv2f64(, , ) + +declare @llvm.aarch64.sve.fsqrt.nxv4f32(, , ) +declare @llvm.aarch64.sve.fsqrt.nxv2f64(, , ) + +declare @llvm.aarch64.sve.fsub.nxv4f32(, , ) +declare @llvm.aarch64.sve.fsub.nxv2f64(, , ) + +declare @llvm.aarch64.sve.fsubr.nxv4f32(, , ) +declare @llvm.aarch64.sve.fsubr.nxv2f64(, , )