Index: llvm/include/llvm/CodeGen/TargetPassConfig.h =================================================================== --- llvm/include/llvm/CodeGen/TargetPassConfig.h +++ llvm/include/llvm/CodeGen/TargetPassConfig.h @@ -402,6 +402,13 @@ return false; } + /// addPostCoalesce - Add passes to the optimized register allocation pipeline + /// after coalescing is complete, but before further scheduling or register + /// allocation. + virtual bool addPostCoalesce() { + return false; + } + /// Add passes to be run immediately after virtual registers are rewritten /// to physical registers. virtual void addPostRewrite() { } Index: llvm/lib/CodeGen/TargetPassConfig.cpp =================================================================== --- llvm/lib/CodeGen/TargetPassConfig.cpp +++ llvm/lib/CodeGen/TargetPassConfig.cpp @@ -1216,6 +1216,9 @@ addPass(&TwoAddressInstructionPassID, false); addPass(&RegisterCoalescerID); + // Allow targets to change the live ranges after coalescing + addPostCoalesce(); + // The machine scheduler may accidentally create disconnected components // when moving subregister definitions around, avoid this by splitting them to // separate vregs before. Splitting can also improve reg. allocation quality. Index: llvm/lib/Target/AArch64/AArch64.h =================================================================== --- llvm/lib/Target/AArch64/AArch64.h +++ llvm/lib/Target/AArch64/AArch64.h @@ -52,6 +52,7 @@ FunctionPass *createAArch64CleanupLocalDynamicTLSPass(); FunctionPass *createAArch64CollectLOHPass(); +FunctionPass *createSVEConditionalEarlyClobberPass(); ModulePass *createSVEIntrinsicOptsPass(); InstructionSelector * createAArch64InstructionSelector(const AArch64TargetMachine &, Index: llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -348,13 +348,25 @@ /// \brief Expand Pseudos to Instructions with destructive operands. /// -/// This mechanism uses MOVPRFX instructions for zeroing the false lanes -/// or for fixing relaxed register allocation conditions to comply with +/// This mechanism uses MOVPRFX instructions for merging/zeroing the false +/// lanes or for fixing relaxed register allocation conditions to comply with /// the instructions register constraints. The latter case may be cheaper /// than setting the register constraints in the register allocator, /// since that will insert regular MOV instructions rather than MOVPRFX. /// -/// Example (after register allocation): +/// Merging example (after register allocation): +/// +/// FADD_ZPZZ_B Z0, Pg, Z0, Z1, Z2 +/// +/// * The Pseudo FADD_ZPZZ_B maps to FADD_ZPmZ_B, where Z2 is the +/// Passthru register. +/// * We cannot map directly to FADD_ZPmZ_B because we need to +/// carry the explicit passthru register. +/// * FIXME: Register constraints when they're determined. +/// * For performance, it's prefered to use the zero/undef merging +/// variants. 
+/// +/// Zeroing example (after register allocation): /// /// FSUB_ZPZZ_ZERO_B Z0, Pg, Z1, Z0 /// @@ -379,9 +391,8 @@ /// MOVPRFX_ZPzZ_B Z0, Pg/z, Z0 /// FSUBR_ZPmZ_B Z0, Pg/m, Z0, Z1 /// -/// Note that this can only be done for _ZERO or _UNDEF variants where -/// we can guarantee the false lanes to be zeroed (by implementing this) -/// or that they are undef (don't care / not used), otherwise the +/// Note that this can only be done for merging variants where +/// we can guarantee the false lanes are specified, otherwise the /// swapping of operands is illegal because the operation is not /// (or cannot be emulated to be) fully commutative. bool AArch64ExpandPseudo::expand_DestructiveOp( @@ -391,7 +402,6 @@ unsigned Opcode = AArch64::getSVEPseudoMap(MI.getOpcode()); uint64_t DType = TII->get(Opcode).TSFlags & AArch64::DestructiveInstTypeMask; uint64_t FalseLanes = MI.getDesc().TSFlags & AArch64::FalseLanesMask; - bool FalseZero = FalseLanes == AArch64::FalseLanesZero; unsigned DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); @@ -400,21 +410,21 @@ assert(DstReg != MI.getOperand(3).getReg()); bool UseRev = false; - unsigned PredIdx, DOPIdx, SrcIdx; + unsigned PredIdx, DOPIdx, SrcIdx, PassthruIdx; switch (DType) { case AArch64::DestructiveBinaryComm: case AArch64::DestructiveBinaryCommWithRev: if (DstReg == MI.getOperand(3).getReg()) { // FSUB Zd, Pg, Zs1, Zd ==> FSUBR Zd, Pg/m, Zd, Zs1 - std::tie(PredIdx, DOPIdx, SrcIdx) = std::make_tuple(1, 3, 2); + std::tie(PredIdx, DOPIdx, SrcIdx, PassthruIdx) = std::make_tuple(1, 3, 2, 4); UseRev = true; break; } LLVM_FALLTHROUGH; case AArch64::DestructiveBinary: case AArch64::DestructiveBinaryImm: - std::tie(PredIdx, DOPIdx, SrcIdx) = std::make_tuple(1, 2, 3); - break; + std::tie(PredIdx, DOPIdx, SrcIdx, PassthruIdx) = std::make_tuple(1, 2, 3, 4); + break; default: llvm_unreachable("Unsupported Destructive Operand type"); } @@ -449,24 +459,28 @@ // Get the right MOVPRFX uint64_t ElementSize = TII->getElementSizeForOpcode(Opcode); - unsigned MovPrfx, MovPrfxZero; + unsigned MovPrfx, MovPrfxZero, MovPrfxMerge; switch (ElementSize) { case AArch64::ElementSizeNone: case AArch64::ElementSizeB: MovPrfx = AArch64::MOVPRFX_ZZ; MovPrfxZero = AArch64::MOVPRFX_ZPzZ_B; + MovPrfxMerge = AArch64::MOVPRFX_ZPmZ_B; break; case AArch64::ElementSizeH: MovPrfx = AArch64::MOVPRFX_ZZ; MovPrfxZero = AArch64::MOVPRFX_ZPzZ_H; + MovPrfxMerge = AArch64::MOVPRFX_ZPmZ_H; break; case AArch64::ElementSizeS: MovPrfx = AArch64::MOVPRFX_ZZ; MovPrfxZero = AArch64::MOVPRFX_ZPzZ_S; + MovPrfxMerge = AArch64::MOVPRFX_ZPmZ_S; break; case AArch64::ElementSizeD: MovPrfx = AArch64::MOVPRFX_ZZ; MovPrfxZero = AArch64::MOVPRFX_ZPzZ_D; + MovPrfxMerge = AArch64::MOVPRFX_ZPmZ_D; break; default: llvm_unreachable("Unsupported ElementSize"); @@ -476,22 +490,56 @@ // Create the destructive operation (if required) // MachineInstrBuilder PRFX, DOP; - if (FalseZero) { + if (FalseLanes == AArch64::FalseLanesZero) { assert(ElementSize != AArch64::ElementSizeNone && "This instruction is unpredicated"); + // If we're replacing the (DUP #0) with a zeroing MOVPRFX, walk + // backwards through the MachineInstrs to see if the DUP can be + // removed. + unsigned PassthruReg = MI.getOperand(PassthruIdx).getReg(); + MachineBasicBlock::reverse_iterator RIt = MI.getReverseIterator(); + for (MachineInstr &PredI : make_range(std::next(RIt), MBB.rend())) { + // If there are any uses of the DUP, don't remove it. 
+ if (PredI.readsRegister(PassthruReg)) + break; + + // If we found the DUP with no other uses, remove it. + if (PredI.definesRegister(PassthruReg)) { + PredI.eraseFromParent(); + break; + } + } + // Merge source operand into destination register PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfxZero)) .addReg(DstReg, RegState::Define) .addReg(MI.getOperand(PredIdx).getReg()) .addReg(MI.getOperand(DOPIdx).getReg()); + // After the movprfx, the destructive operand is same as Dst + DOPIdx = 0; + } else if (FalseLanes == AArch64::FalseLanesMerge) { + unsigned PassthruReg = MI.getOperand(PassthruIdx).getReg(); + unsigned DOPReg = MI.getOperand(DOPIdx).getReg(); + + // Generate a MOVPRFX to merge the false lanes. If the src and + // dst regs are the same, there's nothing to be done. + if (PassthruReg != DOPReg) + PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfxMerge)) + .addReg(PassthruReg, RegState::Define) + .addReg(PassthruReg) + .addReg(MI.getOperand(PredIdx).getReg()) + .addReg(DOPReg); + // After the movprfx, the destructive operand is same as Dst DOPIdx = 0; } else if (DstReg != MI.getOperand(DOPIdx).getReg()) { PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfx)) .addReg(DstReg, RegState::Define) .addReg(MI.getOperand(DOPIdx).getReg()); + + // After the movprfx, the destructive operand is same as Dst DOPIdx = 0; } Index: llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -162,10 +162,11 @@ return false; } - bool SelectDupZero(SDValue N) { + bool SelectDupZero(SDValue N, SDValue &Res) { switch(N->getOpcode()) { case AArch64ISD::DUP: case ISD::SPLAT_VECTOR: { + Res = N; auto Opnd0 = N->getOperand(0); if (auto CN = dyn_cast(Opnd0)) if (CN->isNullValue()) Index: llvm/lib/Target/AArch64/AArch64InstrFormats.td =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -37,12 +37,13 @@ def DestructiveBinaryCommWithRev : DestructiveInstTypeEnum<7>; def DestructiveTernaryCommWithRev : DestructiveInstTypeEnum<8>; -class FalseLanesEnum val> { - bits<2> Value = val; +class FalseLanesEnum val> { + bits<3> Value = val; } def FalseLanesNone : FalseLanesEnum<0>; def FalseLanesZero : FalseLanesEnum<1>; def FalseLanesUndef : FalseLanesEnum<2>; +def FalseLanesMerge : FalseLanesEnum<4>; // AArch64 Instruction Format class AArch64Inst : Instruction { @@ -64,7 +65,7 @@ DestructiveInstTypeEnum DestructiveInstType = NotDestructive; ElementSizeEnum ElementSize = ElementSizeNone; - let TSFlags{8-7} = FalseLanes.Value; + let TSFlags{9-7} = FalseLanes.Value; let TSFlags{6-3} = DestructiveInstType.Value; let TSFlags{2-0} = ElementSize.Value; Index: llvm/lib/Target/AArch64/AArch64InstrInfo.h =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -420,9 +420,10 @@ }; enum FalseLaneType { - FalseLanesMask = TSFLAG_FALSE_LANE_TYPE(0x3), - FalseLanesZero = TSFLAG_FALSE_LANE_TYPE(0x1), + FalseLanesMask = TSFLAG_FALSE_LANE_TYPE(0x7), + FalseLanesZero = TSFLAG_FALSE_LANE_TYPE(0x1), FalseLanesUndef = TSFLAG_FALSE_LANE_TYPE(0x2), + FalseLanesMerge = TSFLAG_FALSE_LANE_TYPE(0x4), }; #undef TSFLAG_ELEMENT_SIZE_TYPE Index: llvm/lib/Target/AArch64/AArch64TargetMachine.cpp 
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -405,6 +405,7 @@
 }
 
 void addIRPasses() override;
+  bool addPostCoalesce() override;
 bool addPreISel() override;
 bool addInstSelector() override;
 bool addIRTranslator() override;
@@ -493,6 +494,14 @@
 }
 
 // Pass Pipeline Configuration
+bool AArch64PassConfig::addPostCoalesce() {
+  // Add a pass that transforms SVE MOVPRFXable Pseudo instructions
+  // to add an 'earlyclobber' under certain conditions
+  addPass(createSVEConditionalEarlyClobberPass());
+
+  return false;
+}
+
 bool AArch64PassConfig::addPreISel() {
   // Run promote constant before global merge, so that the promoted constants
   // get a chance to be merged
Index: llvm/lib/Target/AArch64/CMakeLists.txt
===================================================================
--- llvm/lib/Target/AArch64/CMakeLists.txt
+++ llvm/lib/Target/AArch64/CMakeLists.txt
@@ -65,6 +65,7 @@
   AArch64TargetMachine.cpp
   AArch64TargetObjectFile.cpp
   AArch64TargetTransformInfo.cpp
+  SVEConditionalEarlyClobberPass.cpp
   SVEIntrinsicOpts.cpp
   AArch64SIMDInstrOpt.cpp
Index: llvm/lib/Target/AArch64/SVEConditionalEarlyClobberPass.cpp
===================================================================
--- /dev/null
+++ llvm/lib/Target/AArch64/SVEConditionalEarlyClobberPass.cpp
@@ -0,0 +1,187 @@
+//==-- SVEConditionalEarlyClobberPass.cpp - Conditionally add early clobber ==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass solves an issue with MOVPRFXable instructions that
+// have the restriction that the destination register of a MOVPRFX
+// cannot be used in any operand of the next instruction, except for
+// the destructive operand.
+//
+// We chose to create Pseudo instructions to implement false-lane zeroing,
+// where we specifically tried not to use the '$Zd = $Zs1' restriction
+// so that the register allocator doesn't insert normal
+// MOV instructions. The downside of doing that is that the register
+// allocation of:
+//   vreg1 = OP_ZEROING vreg0, vreg0
+// may result in:
+//   Z8 = OP_ZEROING Z8, Z8
+//
+// At expand time, the OP_ZEROING will either need a scratch register to
+// implement an actual 'MOV(DUP(0))', or will need to use a MOVPRFX Pg/z
+// with a dummy ('nop'-like) MOVPRFXable instruction, like LSL #0.
+//
+// This is better handled by the register allocator creating an allocation
+// that takes the above restriction into account, e.g.
+//   Z3 = OP_ZEROING Z8, Z8
+// which can be correctly expanded into:
+//   Z3 = MOVPRFX Pg/z, Z8
+//   Z3 = OP Z3, Z8
+//
+// After coalescing of virtual registers, we know whether the input operands
+// to the instruction will be in the same register or not.
+// For our example:
+//   vreg1 = OP_ZEROING vreg0, vreg0
+// we know that the two input operands will be equal, but we don't know the
+// register allocation of vreg1. We want to force vreg1 to be different
+// from vreg0, which can be done using an 'earlyclobber'.
+//
+// This pass adds the earlyclobber to the machine operand, and also updates
+// the cache of live ranges so that subsequent passes don't need to
+// recalculate those for the newly added earlyclobber.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+using namespace llvm;
+
+#define PASS_SHORT_NAME "Conditional Early Clobber"
+
+namespace llvm {
+  void initializeSVEConditionalEarlyClobberPassPass(PassRegistry &);
+}
+
+namespace {
+class SVEConditionalEarlyClobberPass : public MachineFunctionPass {
+public:
+  static char ID;
+  SVEConditionalEarlyClobberPass() : MachineFunctionPass(ID) {
+    initializeSVEConditionalEarlyClobberPassPass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &Fn) override;
+
+  StringRef getPassName() const override { return PASS_SHORT_NAME; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<LiveIntervals>();
+    AU.addPreserved<LiveIntervals>();
+    AU.addRequired<SlotIndexes>();
+    AU.addPreserved<SlotIndexes>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+private:
+  const TargetInstrInfo *TII;
+  LiveIntervals *LIS;
+
+  bool addConditionalEC(MachineInstr &MI);
+  bool hasConditionalClobber(const MachineInstr &MI);
+};
+char SVEConditionalEarlyClobberPass::ID = 0;
+}
+
+INITIALIZE_PASS(SVEConditionalEarlyClobberPass,
+                "aarch64-conditional-early-clobber",
+                PASS_SHORT_NAME, false, false)
+
+FunctionPass *llvm::createSVEConditionalEarlyClobberPass() {
+  return new SVEConditionalEarlyClobberPass();
+}
+
+// We could also choose to do this with a new instruction annotation
+// like 'earlyclobberif($Zd=$Zs1)', but because this is so specific to SVE
+// it should be fine to explicitly check the type of SVE operation where
+// we know what the conditions are.
+bool SVEConditionalEarlyClobberPass::hasConditionalClobber(
+    const MachineInstr &MI) {
+  int Instr = AArch64::getSVEPseudoMap(MI.getOpcode());
+  if (Instr == -1)
+    return false;
+
+  uint64_t FalseLanesZero = MI.getDesc().TSFlags & AArch64::FalseLanesZero;
+  if (!FalseLanesZero)
+    return false;
+
+  uint64_t DType =
+      TII->get(Instr).TSFlags & AArch64::DestructiveInstTypeMask;
+  auto mo_equals = [&](const MachineOperand &MO1, const MachineOperand &MO2) {
+    if (MO1.getReg() == MO2.getReg() && MO1.getSubReg() == MO2.getSubReg()) {
+      // This is needed to deal with cases where subreg assignment means that
+      // the earlyclobber isn't necessary.
+      return MI.getOperand(0).getSubReg() == MO1.getSubReg() ||
+             ((MO1.getSubReg() == 0) ^ (MI.getOperand(0).getSubReg() == 0));
+    }
+    return false;
+  };
+  switch (DType) {
+  case AArch64::DestructiveBinary:
+  case AArch64::DestructiveBinaryComm:
+  case AArch64::DestructiveBinaryCommWithRev:
+    return mo_equals(MI.getOperand(2), MI.getOperand(3));
+  case AArch64::DestructiveTernaryCommWithRev:
+    return mo_equals(MI.getOperand(2), MI.getOperand(3)) ||
+           mo_equals(MI.getOperand(2), MI.getOperand(4)) ||
+           mo_equals(MI.getOperand(3), MI.getOperand(4));
+  case AArch64::NotDestructive:
+  case AArch64::DestructiveBinaryImm:
+  case AArch64::DestructiveBinaryShImmUnpred:
+    return false;
+  default:
+    break;
+  }
+
+  llvm_unreachable("Not a known destructive operand type");
+}
+
+bool SVEConditionalEarlyClobberPass::addConditionalEC(MachineInstr &MI) {
+  // If the operand is already 'earlyclobber' or it doesn't require
+  // adding a conditional one (based on instruction), then don't bother.
+ if (!hasConditionalClobber(MI)) + return false; + + if (MI.getOperand(0).isEarlyClobber()) + return false; + + assert(MI.getOperand(0).isDef()); + + // Set the 'EarlyClobber' attribute for when the live ranges need + // to be recalculated. + MI.getOperand(0).setIsEarlyClobber(true); + + SlotIndex Index = LIS->getInstructionIndex(MI); + SlotIndex DefSlot = Index.getRegSlot(0); + + // Update the LiveRange cache by extending the liferange of the + // 'Def' register to be live earlier, so it overlaps with the + // live ranges of the input operands. + unsigned Reg = MI.getOperand(0).getReg(); + auto *Seg = LIS->getInterval(Reg).getSegmentContaining(DefSlot); + assert(Seg && "Expected Def operand to be live with instruction"); + Seg->start = Index.getRegSlot(true); + Seg->valno->def = Seg->start; + + return true; +} + +bool SVEConditionalEarlyClobberPass::runOnMachineFunction(MachineFunction &MF) { + LIS = &getAnalysis(); + TII = MF.getSubtarget().getInstrInfo(); + + bool Modified = false; + for (auto &MBB : MF) + for (auto &MI : MBB) + Modified |= addConditionalEC(MI); + + return Modified; +} Index: llvm/lib/Target/AArch64/SVEInstrFormats.td =================================================================== --- llvm/lib/Target/AArch64/SVEInstrFormats.td +++ llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -373,7 +373,7 @@ : Pat<(vtd (op vt1:$Op1, vt2:$Op2, vt3:$Op3, (vt4 ImmTy:$Op4))), (inst $Op1, $Op2, $Op3, ImmTy:$Op4)>; -def SVEDup0 : ComplexPattern; +def SVEDup0 : ComplexPattern; def SVEDup0Undef : ComplexPattern; let AddedComplexity = 1 in { @@ -382,11 +382,27 @@ : Pat<(vtd (vtd (op vt1:$Op1, (vselect vt1:$Op1, vt2:$Op2, (SVEDup0)), vt3:$Op3))), (inst $Op1, $Op2, $Op3)>; +class SVE_3_Op_Pat_Sel_Passthru +: Pat<(vtd (op vt1:$Op1, (vselect vt1:$Op1, vt2:$Op2, vt2:$Passthru), vt3:$Op3)), + (inst $Op1, $Op2, $Op3, $Passthru)>; + +class SVE_3_Op_Pat_SelZero_Passthru +: Pat<(vtd (vtd (op vt1:$Op1, (vselect vt1:$Op1, vt2:$Op2, (vt2 SVEDup0:$Dup)), vt3:$Op3))), + (inst $Op1, $Op2, $Op3, $Dup)>; + class SVE_3_Op_Pat_Shift_Imm_SelZero : Pat<(vtd (op vt1:$Op1, (vselect vt1:$Op1, vt2:$Op2, (SVEDup0)), (i32 (vt3:$Op3)))), (inst $Op1, $Op2, vt3:$Op3)>; + +class SVE_3_Op_Pat_Shift_Imm_SelZero_Passthru +: Pat<(vtd (op vt1:$Op1, (vselect vt1:$Op1, vt2:$Op2, (vt2 SVEDup0:$Dup)), (i32 (vt3:$Op3)))), + (inst $Op1, $Op2, vt3:$Op3, $Dup)>; } // @@ -457,6 +473,25 @@ Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, immty:$imm), []> { let FalseLanes = flags; } + + class PredTwoOpMergePseudo + : SVEPseudo2Instr, + Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, zprty:$Zs2, zprty:$Zpt), []> { + let FalseLanes = FalseLanesMerge; + let Constraints = "$Zd = $Zpt"; + } + + class PredTwoOpMergeZeroPseudo + : SVEPseudo2Instr, + Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, zprty:$Zs2, zprty:$Zpt), []> { + let FalseLanes = FalseLanesZero; + } + + class PredTwoOpImmMergeZeroPseudo + : SVEPseudo2Instr, + Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, immty:$imm, zprty:$Zpt), []> { + let FalseLanes = FalseLanesZero; + } } //===----------------------------------------------------------------------===// @@ -1597,13 +1632,21 @@ } multiclass sve_fp_2op_p_zds_zx { - def _ZERO_H : PredTwoOpPseudo; - def _ZERO_S : PredTwoOpPseudo; - def _ZERO_D : PredTwoOpPseudo; + def _ZERO_H : PredTwoOpMergeZeroPseudo; + def _ZERO_S : PredTwoOpMergeZeroPseudo; + def _ZERO_D : PredTwoOpMergeZeroPseudo; + + def : SVE_3_Op_Pat_SelZero_Passthru(NAME # _ZERO_H)>; + def : SVE_3_Op_Pat_SelZero_Passthru(NAME # _ZERO_S)>; 
+ def : SVE_3_Op_Pat_SelZero_Passthru(NAME # _ZERO_D)>; + + def _MERGE_H : PredTwoOpMergePseudo; + def _MERGE_S : PredTwoOpMergePseudo; + def _MERGE_D : PredTwoOpMergePseudo; - def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_H)>; - def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_S)>; - def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_D)>; + def : SVE_3_Op_Pat_Sel_Passthru(NAME # _MERGE_H)>; + def : SVE_3_Op_Pat_Sel_Passthru(NAME # _MERGE_S)>; + def : SVE_3_Op_Pat_Sel_Passthru(NAME # _MERGE_D)>; } class sve_fp_ftmad sz, string asm, ZPRRegOp zprty> @@ -4762,15 +4805,15 @@ def _S_Z_UNDEF : PredTwoOpImmPseudo; def _D_Z_UNDEF : PredTwoOpImmPseudo; - def _B_Z_ZERO : PredTwoOpImmPseudo; - def _H_Z_ZERO : PredTwoOpImmPseudo; - def _S_Z_ZERO : PredTwoOpImmPseudo; - def _D_Z_ZERO : PredTwoOpImmPseudo; + def _B_Z_ZERO : PredTwoOpImmMergeZeroPseudo; + def _H_Z_ZERO : PredTwoOpImmMergeZeroPseudo; + def _S_Z_ZERO : PredTwoOpImmMergeZeroPseudo; + def _D_Z_ZERO : PredTwoOpImmMergeZeroPseudo; - def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _B_Z_ZERO)>; - def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _H_Z_ZERO)>; - def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _S_Z_ZERO)>; - def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _D_Z_ZERO)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero_Passthru(NAME # _B_Z_ZERO)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero_Passthru(NAME # _H_Z_ZERO)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero_Passthru(NAME # _S_Z_ZERO)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero_Passthru(NAME # _D_Z_ZERO)>; def : SVE_3_Op_Imm_Pat(NAME # _B)>; def : SVE_3_Op_Imm_Pat(NAME # _H)>; @@ -4803,15 +4846,15 @@ } multiclass sve_int_bin_pred_shift_0_right_zx { - def _ZERO_B : PredTwoOpImmPseudo; - def _ZERO_H : PredTwoOpImmPseudo; - def _ZERO_S : PredTwoOpImmPseudo; - def _ZERO_D : PredTwoOpImmPseudo; + def _ZERO_B : PredTwoOpImmMergeZeroPseudo; + def _ZERO_H : PredTwoOpImmMergeZeroPseudo; + def _ZERO_S : PredTwoOpImmMergeZeroPseudo; + def _ZERO_D : PredTwoOpImmMergeZeroPseudo; - def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_B)>; - def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_H)>; - def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_S)>; - def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_D)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero_Passthru(NAME # _ZERO_B)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero_Passthru(NAME # _ZERO_H)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero_Passthru(NAME # _ZERO_S)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero_Passthru(NAME # _ZERO_D)>; } class sve_int_bin_pred_shift sz8_64, bit wide, bits<3> opc, @@ -4857,15 +4900,25 @@ } multiclass sve_int_bin_pred_zx { - def _ZERO_B : PredTwoOpPseudo; - def _ZERO_H : PredTwoOpPseudo; - def _ZERO_S : PredTwoOpPseudo; - def _ZERO_D : PredTwoOpPseudo; - - def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_B)>; - def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_H)>; - def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_S)>; - def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_D)>; + def _ZERO_B : PredTwoOpMergeZeroPseudo; + def _ZERO_H : PredTwoOpMergeZeroPseudo; + def _ZERO_S : PredTwoOpMergeZeroPseudo; + def _ZERO_D : PredTwoOpMergeZeroPseudo; + + def : SVE_3_Op_Pat_SelZero_Passthru(NAME # _ZERO_B)>; + def : SVE_3_Op_Pat_SelZero_Passthru(NAME # _ZERO_H)>; + def : SVE_3_Op_Pat_SelZero_Passthru(NAME # _ZERO_S)>; + def : SVE_3_Op_Pat_SelZero_Passthru(NAME # _ZERO_D)>; + + def _MERGE_B : PredTwoOpMergePseudo; + def _MERGE_H : PredTwoOpMergePseudo; + def _MERGE_S : PredTwoOpMergePseudo; + def _MERGE_D : PredTwoOpMergePseudo; + + def : SVE_3_Op_Pat_Sel_Passthru(NAME # _MERGE_B)>; + def : SVE_3_Op_Pat_Sel_Passthru(NAME # _MERGE_H)>; + def : 
SVE_3_Op_Pat_Sel_Passthru(NAME # _MERGE_S)>; + def : SVE_3_Op_Pat_Sel_Passthru(NAME # _MERGE_D)>; } multiclass sve_int_bin_pred_shift_wide opc, string asm, Index: llvm/test/CodeGen/AArch64/O3-pipeline.ll =================================================================== --- llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -135,6 +135,7 @@ ; CHECK-NEXT: Slot index numbering ; CHECK-NEXT: Live Interval Analysis ; CHECK-NEXT: Simple Register Coalescing +; CHECK-NEXT: Conditional Early Clobber ; CHECK-NEXT: Rename Disconnected Subregister Components ; CHECK-NEXT: Machine Instruction Scheduler ; CHECK-NEXT: Machine Block Frequency Analysis Index: llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.ll +++ llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.ll @@ -52,6 +52,114 @@ ret %out } +define @add_i8( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: add_i8: +; CHECK: movprfx z2.b, p0/m, z0.b +; CHECK-NEXT: add z2.b, p0/m, z2.b, z1.b +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.add.nxv16i8( %pg, + %a_m, + %b) + ret %out +} + +define @add_i16( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: add_i16 +; CHECK: movprfx z2.h, p0/m, z0.h +; CHECK-NEXT: add z2.h, p0/m, z2.h, z1.h +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.add.nxv8i16( %pg, + %a_m, + %b) + ret %out +} + +define @add_i32( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: add_i32: +; CHECK: movprfx z2.s, p0/m, z0.s +; CHECK-NEXT: add z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.add.nxv4i32( %pg, + %a_m, + %b) + ret %out +} + +define @add_i64( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: add_i64: +; CHECK: movprfx z2.d, p0/m, z0.d +; CHECK-NEXT: add z2.d, p0/m, z2.d, z1.d +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.add.nxv2i64( %pg, + %a_m, + %b) + ret %out +} + +define @add_i8_comm( %pg, %a, + %b) { +; CHECK-LABEL: add_i8_comm: +; CHECK: add z1.b, p0/m, z1.b, z0.b +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %b + %out = call @llvm.aarch64.sve.add.nxv16i8( %pg, + %a_m, + %b) + ret %out +} + +define @add_i16_comm( %pg, %a, + %b) { +; CHECK-LABEL: add_i16 +; CHECK: add z1.h, p0/m, z1.h, z0.h +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %b + %out = call @llvm.aarch64.sve.add.nxv8i16( %pg, + %a_m, + %b) + ret %out +} + +define @add_i32_comm( %pg, %a, + %b) { +; CHECK-LABEL: add_i32_comm: +; CHECK: add z1.s, p0/m, z1.s, z0.s +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %b + %out = call @llvm.aarch64.sve.add.nxv4i32( %pg, + %a_m, + %b) + ret %out +} + +define @add_i64_comm( %pg, %a, + %b) { +; CHECK-LABEL: add_i64_comm: +; CHECK: add z1.d, p0/m, z1.d, z0.d +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %b + %out = call @llvm.aarch64.sve.add.nxv2i64( %pg, + %a_m, + %b) + ret %out +} + ; ; SUB ; @@ -104,6 +212,62 @@ ret %out } +define @sub_i8( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: sub_i8: +; CHECK: movprfx z2.b, p0/m, z0.b +; CHECK-NEXT: sub z2.b, p0/m, z2.b, z1.b +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + 
%out = call @llvm.aarch64.sve.sub.nxv16i8( %pg, + %a_m, + %b) + ret %out +} + +define @sub_i16( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: sub_i16 +; CHECK: movprfx z2.h, p0/m, z0.h +; CHECK-NEXT: sub z2.h, p0/m, z2.h, z1.h +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.sub.nxv8i16( %pg, + %a_m, + %b) + ret %out +} + +define @sub_i32( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: sub_i32: +; CHECK: movprfx z2.s, p0/m, z0.s +; CHECK-NEXT: sub z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.sub.nxv4i32( %pg, + %a_m, + %b) + ret %out +} + +define @sub_i64( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: sub_i64: +; CHECK: movprfx z2.d, p0/m, z0.d +; CHECK-NEXT: sub z2.d, p0/m, z2.d, z1.d +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.sub.nxv2i64( %pg, + %a_m, + %b) + ret %out +} + ; ; SUBR ; @@ -156,6 +320,118 @@ ret %out } +define @subr_i8( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: subr_i8: +; CHECK: movprfx z2.b, p0/m, z0.b +; CHECK-NEXT: subr z2.b, p0/m, z2.b, z1.b +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.subr.nxv16i8( %pg, + %a_m, + %b) + ret %out +} + +define @subr_i16( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: subr_i16 +; CHECK: movprfx z2.h, p0/m, z0.h +; CHECK-NEXT: subr z2.h, p0/m, z2.h, z1.h +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.subr.nxv8i16( %pg, + %a_m, + %b) + ret %out +} + +define @subr_i32( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: subr_i32: +; CHECK: movprfx z2.s, p0/m, z0.s +; CHECK-NEXT: subr z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.subr.nxv4i32( %pg, + %a_m, + %b) + ret %out +} + +define @subr_i64( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: subr_i64: +; CHECK: movprfx z2.d, p0/m, z0.d +; CHECK-NEXT: subr z2.d, p0/m, z2.d, z1.d +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.subr.nxv2i64( %pg, + %a_m, + %b) + ret %out +} + +define @subr_i8_rev( %pg, %a, + %b) { +; CHECK-LABEL: subr_i8_rev: +; CHECK-NOT: movprfx +; CHECK: subr z1.b, p0/m, z1.b, z0.b +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %b + %out = call @llvm.aarch64.sve.sub.nxv16i8( %pg, + %a_m, + %b) + ret %out +} + +define @subr_i16_rev( %pg, %a, + %b) { +; CHECK-LABEL: subr_i16_rev: +; CHECK-NOT: movprfx +; CHECK: subr z1.h, p0/m, z1.h, z0.h +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %b + %out = call @llvm.aarch64.sve.sub.nxv8i16( %pg, + %a_m, + %b) + ret %out +} + +define @subr_i32_rev( %pg, %a, + %b) { +; CHECK-LABEL: subr_i32_rev: +; CHECK-NOT: movprfx +; CHECK: subr z1.s, p0/m, z1.s, z0.s +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %b + %out = call @llvm.aarch64.sve.sub.nxv4i32( %pg, + %a_m, + %b) + ret %out +} + +define @subr_i64_rev( %pg, %a, + %b) { +; CHECK-LABEL: subr_i64_rev: +; CHECK-NOT: movprfx +; CHECK: subr z1.d, p0/m, z1.d, z0.d +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %b + %out = call @llvm.aarch64.sve.sub.nxv2i64( %pg, + %a_m, + %b) + ret %out +} + declare @llvm.aarch64.sve.add.nxv16i8(, , ) declare @llvm.aarch64.sve.add.nxv8i16(, , ) 
declare @llvm.aarch64.sve.add.nxv4i32(, , ) Index: llvm/test/CodeGen/AArch64/sve-intrinsics-shifts-merging.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-intrinsics-shifts-merging.ll +++ llvm/test/CodeGen/AArch64/sve-intrinsics-shifts-merging.ll @@ -85,6 +85,62 @@ ret %out } +define @asr_i8( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: asr_i8: +; CHECK: movprfx z2.b, p0/m, z0.b +; CHECK-NEXT: asr z2.b, p0/m, z2.b, z1.b +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.asr.nxv16i8( %pg, + %a_m, + %b) + ret %out +} + +define @asr_i16( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: asr_i16: +; CHECK: movprfx z2.h, p0/m, z0.h +; CHECK-NEXT: asr z2.h, p0/m, z2.h, z1.h +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.asr.nxv8i16( %pg, + %a_m, + %b) + ret %out +} + +define @asr_i32( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: asr_i32: +; CHECK: movprfx z2.s, p0/m, z0.s +; CHECK-NEXT: asr z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.asr.nxv4i32( %pg, + %a_m, + %b) + ret %out +} + +define @asr_i64( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: asr_i64: +; CHECK: movprfx z2.d, p0/m, z0.d +; CHECK-NEXT: asr z2.d, p0/m, z2.d, z1.d +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.asr.nxv2i64( %pg, + %a_m, + %b) + ret %out +} + ; ; ASRD ; @@ -222,6 +278,62 @@ ret %out } +define @lsl_i8( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: lsl_i8: +; CHECK: movprfx z2.b, p0/m, z0.b +; CHECK-NEXT: lsl z2.b, p0/m, z2.b, z1.b +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.lsl.nxv16i8( %pg, + %a_m, + %b) + ret %out +} + +define @lsl_i16( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: lsl_i16: +; CHECK: movprfx z2.h, p0/m, z0.h +; CHECK-NEXT: lsl z2.h, p0/m, z2.h, z1.h +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.lsl.nxv8i16( %pg, + %a_m, + %b) + ret %out +} + +define @lsl_i32( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: lsl_i32: +; CHECK: movprfx z2.s, p0/m, z0.s +; CHECK-NEXT: lsl z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.lsl.nxv4i32( %pg, + %a_m, + %b) + ret %out +} + +define @lsl_i64( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: lsl_i64: +; CHECK: movprfx z2.d, p0/m, z0.d +; CHECK-NEXT: lsl z2.d, p0/m, z2.d, z1.d +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.lsl.nxv2i64( %pg, + %a_m, + %b) + ret %out +} + ; ; LSR ; @@ -307,6 +419,62 @@ ret %out } +define @lsr_i8( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: lsr_i8: +; CHECK: movprfx z2.b, p0/m, z0.b +; CHECK-NEXT: lsr z2.b, p0/m, z2.b, z1.b +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.lsr.nxv16i8( %pg, + %a_m, + %b) + ret %out +} + +define @lsr_i16( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: lsr_i16: +; CHECK: movprfx z2.h, p0/m, z0.h +; CHECK-NEXT: lsr z2.h, p0/m, z2.h, z1.h +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.lsr.nxv8i16( %pg, + %a_m, + %b) + ret %out +} + +define @lsr_i32( 
%pg, %a, + %b, %passthru) { +; CHECK-LABEL: lsr_i32: +; CHECK: movprfx z2.s, p0/m, z0.s +; CHECK-NEXT: lsr z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.lsr.nxv4i32( %pg, + %a_m, + %b) + ret %out +} + +define @lsr_i64( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: lsr_i64: +; CHECK: movprfx z2.d, p0/m, z0.d +; CHECK-NEXT: lsr z2.d, p0/m, z2.d, z1.d +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.lsr.nxv2i64( %pg, + %a_m, + %b) + ret %out +} + declare @llvm.aarch64.sve.asr.nxv16i8(, , ) declare @llvm.aarch64.sve.asr.nxv8i16(, , ) declare @llvm.aarch64.sve.asr.nxv4i32(, , ) Index: llvm/test/CodeGen/AArch64/sve-movprfx-merging.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/sve-movprfx-merging.ll @@ -0,0 +1,88 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + + +define @fsub_merge_z0_z0_z0( %p, %z0) { +; CHECK-LABEL: fsub_merge_z0_z0_z0 +; CHECK: fsub z0.s, p0/m, z0.s, z0.s +; CHECK-NEXT: ret + %z0_in = select %p, %z0, %z0 + %sub = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %z0_in, + %z0) + ret %sub +} + +define @fsub_merge_z0_z1( %p, %z0, + %z1, %pt) { +; CHECK-LABEL: fsub_merge_z0_z1 +; CHECK: movprfx z2.s, p0/m, z0.s +; CHECK-NEXT: fsub z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %z0_in = select %p, %z0, %pt + %sub = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %z0_in, + %z1) + ret %sub +} + +define @fsub_merge_reuse_z0_z1_pt( %p, %z0, + %z1, %pt) { +; CHECK-LABEL: fsub_merge_reuse_z0_z1_pt +; CHECK: mov z3.d, z2.d +; CHECK: movprfx z3.s, p0/m, z0.s +; CHECK-NEXT: fsub z3.s, p0/m, z3.s, z1.s +; CHECK: movprfx z2.s, p0/m, z0.s +; CHECK-NEXT: fsub z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %z0_in = select %p, %z0, %pt + %sub = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %z0_in, + %z1) + %sub2 = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %z0_in, + %sub) + ret %sub2 +} + +define @fsub_merge_reuse2_z0_z1_pt( %p, %z0, + %z1, %pt) { +; CHECK-LABEL: fsub_merge_reuse2_z0_z1_pt +; CHECK: sel z3.s, p0, z0.s, z2.s +; CHECK: movprfx z2.s, p0/m, z0.s +; CHECK-NEXT: fsub z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: fsub z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %z0_in = select %p, %z0, %pt + %sub = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %z0_in, + %z1) + %sub2 = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %sub, + %z0_in) + ret %sub2 +} + +define @fsub_merge_z0_z1_pt_reuse( %p, %z0, + %z1, %pt) { +; CHECK-LABEL: fsub_merge_z0_z1_pt +; CHECK: mov z3.d, z2.d +; CHECK: movprfx z3.s, p0/m, z0.s +; CHECK-NEXT: fsub z3.s, p0/m, z3.s, z1.s +; CHECK-NEXT: fsub z3.s, p0/m, z3.s, z2.s +; CHECK-NEXT: mov z0.d, z3.d +; CHECK-NEXT: ret + %z0_in = select %p, %z0, %pt + %sub = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %z0_in, + %z1) + %sub2 = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %sub, + %pt) + ret %sub2 +} + +declare @llvm.aarch64.sve.fsub.nxv4f32(, , ) + Index: llvm/test/CodeGen/AArch64/sve-movprfx-zeroing.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/sve-movprfx-zeroing.ll @@ -0,0 +1,120 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +define @fsub_zero_z0_z0( %p, %z0) { +; CHECK-LABEL: fsub_zero_z0_z0 +; CHECK: movprfx z1.s, p0/z, z0.s +; CHECK-NEXT: fsub z1.s, p0/m, z1.s, z0.s +; 
CHECK-NEXT: z0.d, z1.d +; CHECK-NEXT: ret + %z0_in = select %p, %z0, zeroinitializer + %sub = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %z0_in, + %z0) + ret %sub +} + +define @fsub_zero_z0_z1( %p, %z0, + %z1) { +; CHECK-LABEL: fsub_zero_z0_z1 +; CHECK: movprfx z0.s, p0/z, z0.s +; CHECK-NEXT: fsub z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %z0_in = select %p, %z0, zeroinitializer + %sub = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %z0_in, + %z1) + ret %sub +} + +define @fsub_zero_z0_reuse_z01( %p, %z0, %z1) { +; CHECK-LABEL: fsub_zero_z0_reuse_z01 +; CHECK: movprfx z1.s, p0/z, z1.s +; CHECK-NEXT: fsubr z1.s, p0/m, z1.s, z0.s +; CHECK-NEXT: fsub z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %z0_in = select %p, %z0, zeroinitializer + %sub = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %z0_in, + %z1) + %sub2 = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %z0, + %sub) + ret %sub2 +} + +define @fsub_zero_z0_z0_fsub_zero_z0_z0( %p, %z0) { +; CHECK-LABEL: fsub_zero_z0_z0_fsub_zero_z0_z0 +; CHECK: movprfx z1.s, p0/z, z0.s +; CHECK-NEXT: fsub z1.s, p0/m, z1.s, z0.s +; CHECK-NEXT: fsub z1.s, p0/m, z1.s, z1.s +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %z0_in = select %p, %z0, zeroinitializer + %sub = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %z0_in, + %z0) + %sub2 = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %z0_in, + %z0) + %sub3 = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %sub, + %sub2) + ret %sub3 +} + +define @fsub_zero_z0_z1_fsub_zero_z0_z2( %p, %z0, + %z1, %z2) { +; CHECK-LABEL: fsub_zero_z0_z1_fsub_zero_z0_z2 +; CHECK: movprfx z1.s, p0/z, z1.s +; CHECK-NEXT: fsubr z1.s, p0/m, z1.s, z0.s +; CHECK-NEXT: movprfx z0.s, p0/z, z0.s +; CHECK-NEXT: fsub z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: fsub z1.s, p0/m, z1.s, z0.s +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %z0_in = select %p, %z0, zeroinitializer + %sub = call @llvm.aarch64.sve.fsub.nxv4f32( %p, %z0_in, %z1) + %sub2 = call @llvm.aarch64.sve.fsub.nxv4f32( %p, %z0_in, %z2) + %sub3 = call @llvm.aarch64.sve.fsub.nxv4f32( %p, %sub, %sub2) + ret %sub3 +} + +define @fsub_zero_z0_z0_reuse( %p, %z0) { +; CHECK-LABEL: fsub_zero_z0_z0_reuse +; CHECK: movprfx z1.s, p0/z, z0.s +; CHECK-NEXT: fsub z1.s, p0/m, z1.s, z0.s +; CHECK-NEXT: fsub z1.s, p0/m, z1.s, z0.s +; CHECK-NEXT: z0.d, z1.d +; CHECK-NEXT: ret + %z0_in = select %p, %z0, zeroinitializer + %sub = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %z0_in, + %z0) + %sub2 = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %sub, + %z0) + ret %sub2 +} + +define @fsub_zero_reuse_z0_z0( %p, %z0) { +; CHECK-LABEL: fsub_zero_reuse_z0_z0 +; CHECK: mov z2.s, #0 +; CHECK-NEXT: sel z3.s, p0, z0.s, z2.s +; CHECK-NEXT: movprfx z1.s, p0/z, z0.s +; CHECK-NEXT: fsub z1.s, p0/m, z1.s, z0.s +; CHECK-NEXT: fsub z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %z0_in = select %p, %z0, zeroinitializer + %sub = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %z0_in, + %z0) + %sub2 = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %sub, + %z0_in) + ret %sub2 +} + +declare @llvm.aarch64.sve.fsub.nxv8f16(, , ) +declare @llvm.aarch64.sve.fsub.nxv4f32(, , ) +declare @llvm.aarch64.sve.fsub.nxv2f64(, , )
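
The new TargetPassConfig::addPostCoalesce() hook is target-independent, so backends other than AArch64 can wire it up the same way AArch64PassConfig does above. The following is a minimal sketch, assuming a hypothetical MyTargetPassConfig and createMyTargetPostCoalescePass(); neither exists in this patch, only the hook itself and the AArch64 override do:

#include "llvm/CodeGen/TargetPassConfig.h"
using namespace llvm;

// Hypothetical factory for a target-specific pass that adjusts live ranges
// after coalescing; declared here only to keep the sketch self-contained.
FunctionPass *createMyTargetPostCoalescePass();

class MyTargetPassConfig : public TargetPassConfig {
public:
  MyTargetPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
      : TargetPassConfig(TM, PM) {}

  // Called right after Simple Register Coalescing and before the machine
  // scheduler and register allocation (see the TargetPassConfig.cpp change
  // above), so the added pass may still modify live ranges.
  bool addPostCoalesce() override {
    addPass(createMyTargetPostCoalescePass());
    return false; // return value mirrors the AArch64 override above
  }
};

As with the AArch64 pass, anything added from this hook should preserve or update LiveIntervals and SlotIndexes, since the hook runs in the middle of the optimized register-allocation pipeline and later passes reuse those analyses.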