Index: llvm/lib/Target/ARM/ARM.h
===================================================================
--- llvm/lib/Target/ARM/ARM.h
+++ llvm/lib/Target/ARM/ARM.h
@@ -47,6 +47,7 @@
 FunctionPass *createMLxExpansionPass();
 FunctionPass *createThumb2ITBlockPass();
 FunctionPass *createMVEVPTBlockPass();
+FunctionPass *createMVEVPTOptimisationsPass();
 FunctionPass *createARMOptimizeBarriersPass();
 FunctionPass *createThumb2SizeReductionPass(
     std::function<bool(const Function &)> Ftor = nullptr);
@@ -66,6 +67,7 @@
 void initializeThumb2SizeReducePass(PassRegistry &);
 void initializeThumb2ITBlockPass(PassRegistry &);
 void initializeMVEVPTBlockPass(PassRegistry &);
+void initializeMVEVPTOptimisationsPass(PassRegistry &);
 void initializeARMLowOverheadLoopsPass(PassRegistry &);
 void initializeMVETailPredicationPass(PassRegistry &);
 void initializeMVEGatherScatterLoweringPass(PassRegistry &);
Index: llvm/lib/Target/ARM/ARMTargetMachine.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -96,6 +96,7 @@
   initializeARMExpandPseudoPass(Registry);
   initializeThumb2SizeReducePass(Registry);
   initializeMVEVPTBlockPass(Registry);
+  initializeMVEVPTOptimisationsPass(Registry);
   initializeMVETailPredicationPass(Registry);
   initializeARMLowOverheadLoopsPass(Registry);
   initializeMVEGatherScatterLoweringPass(Registry);
@@ -485,6 +486,8 @@
 }
 
 void ARMPassConfig::addPreRegAlloc() {
+  addPass(createMVEVPTOptimisationsPass());
+
   if (getOptLevel() != CodeGenOpt::None) {
     addPass(createMLxExpansionPass());
 
Index: llvm/lib/Target/ARM/CMakeLists.txt
===================================================================
--- llvm/lib/Target/ARM/CMakeLists.txt
+++ llvm/lib/Target/ARM/CMakeLists.txt
@@ -54,6 +54,7 @@
   MVEGatherScatterLowering.cpp
   MVETailPredication.cpp
   MVEVPTBlockPass.cpp
+  MVEVPTOptimisationsPass.cpp
   Thumb1FrameLowering.cpp
   Thumb1InstrInfo.cpp
   ThumbRegisterInfo.cpp
Index: llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp
===================================================================
--- /dev/null
+++ llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp
@@ -0,0 +1,282 @@
+//===-- MVEVPTOptimisationsPass.cpp ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/Support/Debug.h"
+#include <cassert>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-mve-vpt-opts"
+
+namespace {
+class MVEVPTOptimisations : public MachineFunctionPass {
+public:
+  static char ID;
+  const Thumb2InstrInfo *TII;
+  MachineRegisterInfo *MRI;
+
+  MVEVPTOptimisations() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &Fn) override;
+
+  StringRef getPassName() const override { return "MVE VPT Optimisation Pass"; }
+
+private:
+  MachineInstrBuilder BuildVPNOTBefore(MachineBasicBlock &MBB,
+                                       MachineInstr &Instr);
+  MachineInstr &ReplaceUsageOfRegisterByVPNOT(MachineBasicBlock &MBB,
+                                              MachineInstr &Instr,
+                                              unsigned OpIdx, Register Target);
+  bool InsertVPNOTs(MachineBasicBlock &MBB);
+};
+
+char MVEVPTOptimisations::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(MVEVPTOptimisations, DEBUG_TYPE,
+                "ARM MVE VPT Optimisations pass", false, false)
+
+static bool IsVCMP(unsigned Opcode) {
+  switch (Opcode) {
+  case ARM::MVE_VCMPf16:
+  case ARM::MVE_VCMPf16r:
+  case ARM::MVE_VCMPf32:
+  case ARM::MVE_VCMPf32r:
+  case ARM::MVE_VCMPi16:
+  case ARM::MVE_VCMPi16r:
+  case ARM::MVE_VCMPi32:
+  case ARM::MVE_VCMPi32r:
+  case ARM::MVE_VCMPi8:
+  case ARM::MVE_VCMPi8r:
+  case ARM::MVE_VCMPs16:
+  case ARM::MVE_VCMPs16r:
+  case ARM::MVE_VCMPs32:
+  case ARM::MVE_VCMPs32r:
+  case ARM::MVE_VCMPs8:
+  case ARM::MVE_VCMPs8r:
+  case ARM::MVE_VCMPu16:
+  case ARM::MVE_VCMPu16r:
+  case ARM::MVE_VCMPu32:
+  case ARM::MVE_VCMPu32r:
+  case ARM::MVE_VCMPu8:
+  case ARM::MVE_VCMPu8r:
+    return true;
+  default:
+    return false;
+  }
+}
+
+// Returns the CondCode of a VCMP Instruction.
+static ARMCC::CondCodes GetCondCode(MachineInstr &Instr) {
+  assert(IsVCMP(Instr.getOpcode()) && "Inst must be a VCMP");
+  return ARMCC::CondCodes(Instr.getOperand(3).getImm());
+}
+
+// Returns true if Cond is equivalent to a VPNOT instruction on the result of
+// Prev. Cond and Prev must be VCMPs.
+static bool IsVPNOTEquivalent(MachineInstr &Cond, MachineInstr &Prev) {
+  assert(IsVCMP(Cond.getOpcode()) && IsVCMP(Prev.getOpcode()));
+
+  // Opcodes must match.
+  if (Cond.getOpcode() != Prev.getOpcode())
+    return false;
+
+  // The condition code of Cond must be the opposite of Prev's, with
+  // operands swapped.
+  ARMCC::CondCodes ExpectedCode = GetCondCode(Cond);
+  ExpectedCode = ARMCC::getSwappedCondition(ExpectedCode);
+  ExpectedCode = ARMCC::getOppositeCondition(ExpectedCode);
+  if (ExpectedCode != GetCondCode(Prev))
+    return false;
+
+  MachineOperand &CondOP1 = Cond.getOperand(1), &CondOP2 = Cond.getOperand(2);
+  MachineOperand &PrevOP1 = Prev.getOperand(1), &PrevOP2 = Prev.getOperand(2);
+  // If we have == and != (or the opposite), the operands can be identical.
+  if ((GetCondCode(Cond) == ARMCC::NE && GetCondCode(Prev) == ARMCC::EQ) ||
+      (GetCondCode(Cond) == ARMCC::EQ && GetCondCode(Prev) == ARMCC::NE))
+    if (CondOP1.isIdenticalTo(PrevOP1) && CondOP2.isIdenticalTo(PrevOP2))
+      return true;
+  // Else, operands must be swapped.
+  return CondOP1.isIdenticalTo(PrevOP2) && CondOP2.isIdenticalTo(PrevOP1);
+}
+
+// Returns true if Instr writes to VCCR or VPR.
+static bool IsWritingToVCCRorVPR(MachineInstr &Instr) {
+  if (Instr.getNumOperands() == 0)
+    return false;
+  MachineOperand &Dst = Instr.getOperand(0);
+  if (!Dst.isReg())
+    return false;
+  Register DstReg = Dst.getReg();
+  if (!DstReg.isVirtual())
+    return DstReg.id() == ARM::VPR;
+  MachineRegisterInfo &RegInfo = Instr.getMF()->getRegInfo();
+  const TargetRegisterClass *RegClass = RegInfo.getRegClassOrNull(DstReg);
+  return RegClass && (RegClass->getID() == ARM::VCCRRegClassID);
+}
+
+// Creates a VPNOT before Instr.
+MachineInstrBuilder
+MVEVPTOptimisations::BuildVPNOTBefore(MachineBasicBlock &MBB,
+                                      MachineInstr &Instr) {
+  return BuildMI(MBB, &Instr, Instr.getDebugLoc(), TII->get(ARM::MVE_VPNOT));
+}
+
+// Transforms
+//    <an instruction that uses some register at operand index OpIdx>
+// Into
+//    %K = VPNOT %Target
+//    <the same instruction, now using %K at operand index OpIdx>
+// And returns %K.
+// This optimization is done in the hopes of preventing spills/reloads of VPR.
+MachineInstr &MVEVPTOptimisations::ReplaceUsageOfRegisterByVPNOT(
+    MachineBasicBlock &MBB, MachineInstr &Instr, unsigned OpIdx,
+    Register Target) {
+  MachineOperand &InstrOperand = Instr.getOperand(OpIdx);
+
+  Register NewResult = MRI->createVirtualRegister(MRI->getRegClass(Target));
+  MachineInstrBuilder MIBuilder = BuildVPNOTBefore(MBB, Instr);
+  MIBuilder.add(MachineOperand::CreateReg(NewResult, /*isDef*/ true));
+  MIBuilder.add(MachineOperand::CreateReg(Target, /*isDef*/ false));
+  MIBuilder.addImm(0);
+  MIBuilder.addReg({});
+  InstrOperand.setReg(NewResult);
+
+  LLVM_DEBUG(dbgs() << " Inserting VPNOT (for spill prevention): ";
+             MIBuilder.getInstr()->dump());
+
+  return *MIBuilder.getInstr();
+}
+
+// Replaces VCMPs by VPNOTs when possible, and tries to reduce spills by
+// replacing uses of old VPR values with VPNOTs inside predicated instruction
+// blocks.
+bool MVEVPTOptimisations::InsertVPNOTs(MachineBasicBlock &MBB) {
+  // The first instruction is the VCMP that will be replaced by a VPNOT.
+  // The second instruction is the VCMP that defines the register that'll be
+  // the VPNOT's operand.
+  SmallVector<std::pair<MachineInstr *, MachineInstr *>, 4> WorkList;
+
+  // The last VCMP that we have seen and that couldn't be replaced.
+  // This is reset when an instruction that writes to VCCR/VPR is found, or
+  // when an element is added to the WorkList.
+  MachineInstr *PrevVCMP = nullptr;
+
+  // Iterate over all VCMPs to create the worklist.
+  for (MachineInstr &Instr : MBB.instrs()) {
+    if (!IsVCMP(Instr.getOpcode())) {
+      // If it's an unpredicated instruction that writes to VPR (VCCR), forget
+      // about the previous VCMP.
+      if ((getVPTInstrPredicate(Instr) == ARMVCC::None) &&
+          IsWritingToVCCRorVPR(Instr))
+        PrevVCMP = nullptr;
+      continue;
+    }
+
+    // If we have seen a VCMP previously, and this VCMP is equivalent to a
+    // VPNOT, we can replace it, so add it to the worklist.
+    if (PrevVCMP && IsVPNOTEquivalent(Instr, *PrevVCMP)) {
+      LLVM_DEBUG(dbgs() << " Adding VCMP to WorkList:"; Instr.dump());
+      WorkList.push_back({&Instr, PrevVCMP});
+      PrevVCMP = nullptr;
+    } else
+      PrevVCMP = &Instr;
+  }
+
+  LLVM_DEBUG(dbgs() << (WorkList.empty() ? "No Work to do\n"
+                                         : "Processing worklist\n"));
+  for (std::pair<MachineInstr *, MachineInstr *> Item : WorkList) {
+    MachineInstr *SwappedVCMP = Item.first;
+    MachineInstr *OriginalVCMP = Item.second;
+    Register Reg = OriginalVCMP->getOperand(0).getReg();
+
+    MachineInstrBuilder MIBuilder = BuildVPNOTBefore(MBB, *SwappedVCMP);
+    MIBuilder.add(SwappedVCMP->getOperand(0));
+    MIBuilder.addReg(Reg);
+    MIBuilder.add(SwappedVCMP->getOperand(4));
+    MIBuilder.add(SwappedVCMP->getOperand(5));
+    LLVM_DEBUG(dbgs() << " Inserting VPNOT (to replace VCMP): ";
+               MIBuilder.getInstr()->dump());
+
+    // While inside the block of predicated instructions, replace usages of
+    // old VCCR values with VPNOTs. That way, we avoid overlapping lifetimes
+    // of different VPR values (which always result in spills/reloads).
+    // Those VPNOTs can then be removed by the MVE VPT Block Insertion pass,
+    // and we should end up with clean blocks like "TETE", "TEET", etc.
+
+    Register ValueReg = Reg;
+    Register InverseValueReg = SwappedVCMP->getOperand(0).getReg();
+    Register VPNOTOperand = InverseValueReg;
+
+    // On each iteration, we try to replace a usage of "ValueReg" with a VPNOT
+    // on "VPNOTOperand". When this transformation happens, ValueReg and
+    // InverseValueReg are swapped, and VPNOTOperand is set to the result of
+    // the latest VPNOT inserted.
+    for (MachineBasicBlock::instr_iterator Iter = ++SwappedVCMP->getIterator();
+         Iter != MBB.end(); ++Iter) {
+      // Stop as soon as we leave the block of predicated instructions.
+      if (getVPTInstrPredicate(*Iter) == ARMVCC::None)
+        break;
+
+      // Keep going until we find an instruction that uses ValueReg.
+      int Idx = Iter->findRegisterUseOperandIdx(ValueReg.id());
+      if (Idx == -1)
+        continue;
+
+      // Replace the usage of said register by a VPNOT on VPNOTOperand.
+      MachineInstr &VPNOT =
+          ReplaceUsageOfRegisterByVPNOT(MBB, *Iter, Idx, VPNOTOperand);
+
+      // Continue: The result of the VPNOT we just inserted becomes the new
+      // VPNOTOperand, and we swap ValueReg/InverseValueReg.
+      VPNOTOperand = VPNOT.getOperand(0).getReg();
+      std::swap(ValueReg, InverseValueReg);
+    }
+
+    // Finally, remove the old VCMP.
+    SwappedVCMP->removeFromParent();
+  }
+
+  return !WorkList.empty();
+}
+
+bool MVEVPTOptimisations::runOnMachineFunction(MachineFunction &Fn) {
+  const ARMSubtarget &STI =
+      static_cast<const ARMSubtarget &>(Fn.getSubtarget());
+
+  if (!STI.isThumb2() || !STI.hasMVEIntegerOps())
+    return false;
+
+  TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo());
+  MRI = &Fn.getRegInfo();
+
+  LLVM_DEBUG(dbgs() << "********** ARM MVE VPT Optimisations **********\n"
+                    << "********** Function: " << Fn.getName() << '\n');
+
+  bool Modified = false;
+  for (MachineBasicBlock &MBB : Fn)
+    Modified |= InsertVPNOTs(MBB);
+
+  LLVM_DEBUG(dbgs() << "**************************************\n");
+  return Modified;
+}
+
+/// createMVEVPTOptimisations
+FunctionPass *llvm::createMVEVPTOptimisationsPass() {
+  return new MVEVPTOptimisations();
+}
\ No newline at end of file
Index: llvm/test/CodeGen/ARM/O3-pipeline.ll
===================================================================
--- llvm/test/CodeGen/ARM/O3-pipeline.ll
+++ llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -92,6 +92,7 @@
 ; CHECK-NEXT:      Machine code sinking
 ; CHECK-NEXT:      Peephole Optimizations
 ; CHECK-NEXT:      Remove dead machine instructions
+; CHECK-NEXT:      MVE VPT Optimisation Pass
 ; CHECK-NEXT:      ARM MLA / MLS expansion pass
 ; CHECK-NEXT:      ARM pre- register allocation load / store optimization pass
 ; CHECK-NEXT:      ARM A15 S->D optimizer
Index: llvm/test/CodeGen/Thumb2/mve-vcmpf.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-vcmpf.ll
+++ llvm/test/CodeGen/Thumb2/mve-vcmpf.ll
@@ -697,7 +697,7 @@
 ; CHECK-MVEFP-LABEL: vcmp_ord_v4f32:
 ; CHECK-MVEFP:       @ %bb.0: @ %entry
 ; CHECK-MVEFP-NEXT:    vpt.f32 le, q1, q0
-; CHECK-MVEFP-NEXT:    vcmpt.f32 lt, q0, q1
+; CHECK-MVEFP-NEXT:    vpnott
 ; CHECK-MVEFP-NEXT:    vpnot
 ; CHECK-MVEFP-NEXT:    vpsel q0, q2, q3
 ; CHECK-MVEFP-NEXT:    bx lr
@@ -751,7 +751,7 @@
 ; CHECK-MVEFP-LABEL: vcmp_uno_v4f32:
 ; CHECK-MVEFP:       @ %bb.0: @ %entry
 ; CHECK-MVEFP-NEXT:    vpt.f32 le, q1, q0
-; CHECK-MVEFP-NEXT:    vcmpt.f32 lt, q0, q1
+; CHECK-MVEFP-NEXT:    vpnott
 ; CHECK-MVEFP-NEXT:    vpsel q0, q2, q3
 ; CHECK-MVEFP-NEXT:    bx lr
 entry:
@@ -2405,7 +2405,7 @@
 ; CHECK-MVEFP-LABEL: vcmp_ord_v8f16:
 ; CHECK-MVEFP:       @ %bb.0: @ %entry
 ; CHECK-MVEFP-NEXT:    vpt.f16 le, q1, q0
-; CHECK-MVEFP-NEXT:    vcmpt.f16 lt, q0, q1
+; CHECK-MVEFP-NEXT:    vpnott
 ; CHECK-MVEFP-NEXT:    vpnot
 ; CHECK-MVEFP-NEXT:    vpsel q0, q2, q3
 ; CHECK-MVEFP-NEXT:    bx lr
@@ -2531,7 +2531,7 @@
 ; CHECK-MVEFP-LABEL: vcmp_uno_v8f16:
 ; CHECK-MVEFP:       @ %bb.0: @ %entry
 ; CHECK-MVEFP-NEXT:    vpt.f16 le, q1, q0
-; CHECK-MVEFP-NEXT:    vcmpt.f16 lt, q0, q1
+; CHECK-MVEFP-NEXT:    vpnott
 ; CHECK-MVEFP-NEXT:    vpsel q0, q2, q3
 ; CHECK-MVEFP-NEXT:    bx lr
 entry:
Index: llvm/test/CodeGen/Thumb2/mve-vpt-optimisations.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/Thumb2/mve-vpt-optimisations.mir
@@ -0,0 +1,225 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -run-pass arm-mve-vpt-opts %s -o - | FileCheck %s
+
+--- |
+  target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+  target triple = "thumbv8.1m.main-arm-none-eabi"
+
+  define hidden arm_aapcs_vfpcc <4 x float> @vpt_opts(<4 x float> %inactive1, <4 x float> %inactive2, <4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #0 {
+  entry:
+    %conv.i = zext i16 %p to i32
+    %0 = tail call nnan ninf nsz <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float> undef, <4 x float> %a, <4 x float> %b, i32 %conv.i) #2
+    %1 = tail call nnan ninf nsz <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float> undef, <4 x float> %0, <4 x float> %0, i32 %conv.i) #2
+    %2 = tail call nnan ninf nsz <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float> %inactive1, <4 x float> %1, <4 x float> %b, i32 %conv.i) #2
+    %3 = tail call nnan ninf nsz <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float> %inactive2, <4 x float> %2, <4 x float> %b, i32 %conv.i) #2
+    ret <4 x float> %3
+  }
+
+  declare <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float>, <4 x float>, <4 x float>, i32) #1
+
+  attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #1 = { nounwind readnone }
+  attributes #2 = { nounwind }
+
+...
+---
+name: vpt_opts
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+registers: []
+liveins:
+  - { reg: '$q0', virtual-reg: '' }
+  - { reg: '$q1', virtual-reg: '' }
+  - { reg: '$q2', virtual-reg: '' }
+  - { reg: '$q3', virtual-reg: '' }
+  - { reg: '$r0', virtual-reg: '' }
+  - { reg: '$r1', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 0
+  adjustsStack: false
+  hasCalls: false
+  stackProtector: ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+  localFrameSize: 0
+  savePoint: ''
+  restorePoint: ''
+fixedStack: []
+stack: []
+constants: []
+body: |
+  bb.0.entry:
+    liveins: $q0, $q1, $q2, $r0, $r1
+
+    ; CHECK-LABEL: name: vpt_opts
+    ; CHECK: liveins: $q0, $q1, $q2, $r0, $r1
+    ; CHECK: renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 10, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VPNOT $vpr, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VCMPf16 renamable $q0, renamable $q2, 10, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VPNOT $vpr, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VCMPf32 renamable $q0, renamable $q2, 10, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VPNOT $vpr, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VCMPi16 renamable $q0, renamable $q2, 10, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VPNOT $vpr, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VCMPi32 renamable $q0, renamable $q2, 10, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VPNOT $vpr, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VCMPi8 renamable $q0, renamable $q2, 10, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VPNOT $vpr, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VCMPs16 renamable $q0, renamable $q2, 10, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VPNOT $vpr, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 10, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VPNOT $vpr, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VCMPs8 renamable $q0, renamable $q2, 10, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VPNOT $vpr, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VCMPu16 renamable $q0, renamable $q2, 10, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VPNOT $vpr, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VCMPu32 renamable $q0, renamable $q2, 10, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VPNOT $vpr, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VCMPu8 renamable $q0, renamable $q2, 10, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VPNOT $vpr, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 11, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VPNOT $vpr, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 12, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VPNOT $vpr, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 13, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VPNOT $vpr, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 0, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VPNOT $vpr, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 0, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VPNOT $vpr, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 1, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VPNOT $vpr, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 1, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VPNOT $vpr, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 10, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VPNOT $vpr, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VCMPs32 renamable $q2, renamable $q0, 12, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VCMPf16 renamable $q0, renamable $q2, 0, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 1, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 10, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VCMPs32 renamable $q2, renamable $q0, 11, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 10, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 12, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 11, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VCMPs32 renamable $q2, renamable $q0, 13, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 12, 0, $noreg
+    ; CHECK: [[MVE_VPNOT:%[0-9]+]]:vccr = MVE_VPNOT killed renamable $vpr, 0, $noreg
+    ; CHECK: renamable $vpr = MVE_VCMPs32 renamable $q2, renamable $q0, 10, 0, $noreg
+    ; CHECK: [[MVE_VCMPs32_:%[0-9]+]]:vccr = MVE_VCMPs32 renamable $q0, renamable $q2, 10, 0, $noreg
+    ; CHECK: [[MVE_VPNOT:%[0-9]+]]:vccr = MVE_VPNOT [[MVE_VCMPs32_]], 0, $noreg
+    ; CHECK: [[MVE_VORR:%[0-9]+]]:mqpr = MVE_VORR renamable $q2, renamable $q2, 1, [[MVE_VPNOT]], undef [[MVE_VORR]]
+    ; CHECK: [[MVE_VPNOT1:%[0-9]+]]:vccr = MVE_VPNOT [[MVE_VPNOT]], 0, $noreg
+    ; CHECK: [[MVE_VORR1:%[0-9]+]]:mqpr = MVE_VORR [[MVE_VORR]], [[MVE_VORR]], 1, [[MVE_VPNOT1]], undef [[MVE_VORR1]]
+    ; CHECK: [[MVE_VPNOT2:%[0-9]+]]:vccr = MVE_VPNOT [[MVE_VPNOT1]], 0, $noreg
+    ; CHECK: [[MVE_VORR2:%[0-9]+]]:mqpr = MVE_VORR [[MVE_VORR1]], [[MVE_VORR1]], 1, [[MVE_VPNOT2]], undef [[MVE_VORR2]]
+    ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $q0
+    renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 10, 0, $noreg
+    renamable $vpr = MVE_VCMPs32 renamable $q2, renamable $q0, 12, 0, $noreg
+
+    renamable $vpr = MVE_VCMPf16 renamable $q0, renamable $q2, 10, 0, $noreg
+    renamable $vpr = MVE_VCMPf16 renamable $q2, renamable $q0, 12, 0, $noreg
+
+    renamable $vpr = MVE_VCMPf32 renamable $q0, renamable $q2, 10, 0, $noreg
+    renamable $vpr = MVE_VCMPf32 renamable $q2, renamable $q0, 12, 0, $noreg
+
+    renamable $vpr = MVE_VCMPi16 renamable $q0, renamable $q2, 10, 0, $noreg
+    renamable $vpr = MVE_VCMPi16 renamable $q2, renamable $q0, 12, 0, $noreg
+
+    renamable $vpr = MVE_VCMPi32 renamable $q0, renamable $q2, 10, 0, $noreg
+    renamable $vpr = MVE_VCMPi32 renamable $q2, renamable $q0, 12, 0, $noreg
+
+    renamable $vpr = MVE_VCMPi8 renamable $q0, renamable $q2, 10, 0, $noreg
+    renamable $vpr = MVE_VCMPi8 renamable $q2, renamable $q0, 12, 0, $noreg
+
+    renamable $vpr = MVE_VCMPs16 renamable $q0, renamable $q2, 10, 0, $noreg
+    renamable $vpr = MVE_VCMPs16 renamable $q2, renamable $q0, 12, 0, $noreg
+
+    renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 10, 0, $noreg
+    renamable $vpr = MVE_VCMPs32 renamable $q2, renamable $q0, 12, 0, $noreg
+
+    renamable $vpr = MVE_VCMPs8 renamable $q0, renamable $q2, 10, 0, $noreg
+    renamable $vpr = MVE_VCMPs8 renamable $q2, renamable $q0, 12, 0, $noreg
+
+    renamable $vpr = MVE_VCMPu16 renamable $q0, renamable $q2, 10, 0, $noreg
+    renamable $vpr = MVE_VCMPu16 renamable $q2, renamable $q0, 12, 0, $noreg
+
+    renamable $vpr = MVE_VCMPu32 renamable $q0, renamable $q2, 10, 0, $noreg
+    renamable $vpr = MVE_VCMPu32 renamable $q2, renamable $q0, 12, 0, $noreg
+
+    renamable $vpr = MVE_VCMPu8 renamable $q0, renamable $q2, 10, 0, $noreg
+    renamable $vpr = MVE_VCMPu8 renamable $q2, renamable $q0, 12, 0, $noreg
+
+    renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 11, 0, $noreg
+    renamable $vpr = MVE_VCMPs32 renamable $q2, renamable $q0, 13, 0, $noreg
+
+    renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 12, 0, $noreg
+    renamable $vpr = MVE_VCMPs32 renamable $q2, renamable $q0, 10, 0, $noreg
+
+    renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 13, 0, $noreg
+    renamable $vpr = MVE_VCMPs32 renamable $q2, renamable $q0, 11, 0, $noreg
+
+    renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 0, 0, $noreg
+    renamable $vpr = MVE_VCMPs32 renamable $q2, renamable $q0, 1, 0, $noreg
+
+    renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 0, 0, $noreg
+    renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 1, 0, $noreg
+
+    renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 1, 0, $noreg
+    renamable $vpr = MVE_VCMPs32 renamable $q2, renamable $q0, 0, 0, $noreg
+
+    renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 1, 0, $noreg
+    renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 0, 0, $noreg
+
+    ; Shouldn't insert 2 VPNOTs
+    renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 10, 0, $noreg
+    renamable $vpr = MVE_VCMPs32 renamable $q2, renamable $q0, 12, 0, $noreg
+    renamable $vpr = MVE_VCMPs32 renamable $q2, renamable $q0, 12, 0, $noreg
+
+    ; Shouldn't replace by a VPNOT: Opcodes are different
+    renamable $vpr = MVE_VCMPf16 renamable $q0, renamable $q2, 0, 0, $noreg
+    renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 1, 0, $noreg
+
+    ; Shouldn't replace by a VPNOT: Condition code is incorrect for second VCMP.
+    renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 10, 0, $noreg
+    renamable $vpr = MVE_VCMPs32 renamable $q2, renamable $q0, 11, 0, $noreg
+
+    ; Shouldn't replace by a VPNOT: Operands are not swapped.
+    renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 10, 0, $noreg
+    renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 12, 0, $noreg
+
+    ; Shouldn't replace by a VPNOT: Something writes to VPR in-between.
+    renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 11, 0, $noreg
+    renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg
+    renamable $vpr = MVE_VCMPs32 renamable $q2, renamable $q0, 13, 0, $noreg
+
+    ; Shouldn't replace by a VPNOT: Something writes to VCCR (=VPR) in-between.
+    renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 12, 0, $noreg
+    %0:vccr = MVE_VPNOT killed renamable $vpr, 0, $noreg
+    renamable $vpr = MVE_VCMPs32 renamable $q2, renamable $q0, 10, 0, $noreg
+
+    ; Spill-prevention: Prevent a spill/reload by inserting another VPNOT
+    ; instead of reusing %0/%1 after VPR has been written to.
+    %0:vccr = MVE_VCMPs32 renamable $q0, renamable $q2, 10, 0, $noreg
+    %1:vccr = MVE_VCMPs32 renamable $q2, renamable $q0, 12, 0, $noreg
+    %2:mqpr = MVE_VORR renamable $q2, renamable $q2, 1, %1, undef %2
+    %3:mqpr = MVE_VORR %2, %2, 1, %0:vccr, undef %3:mqpr
+    %4:mqpr = MVE_VORR %3, %3, 1, %1:vccr, undef %4:mqpr
+
+    tBX_RET 14, $noreg, implicit $q0
+...
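
For reference, a minimal sketch of the rewrite this pass performs, using the same MIR notation as the test above (the register names here are illustrative, not taken from the patch). Two back-to-back VCMPs whose condition codes are opposite once the operands are swapped, such as

    %0:vccr = MVE_VCMPs32 renamable $q0, renamable $q2, 10, 0, $noreg
    %1:vccr = MVE_VCMPs32 renamable $q2, renamable $q0, 12, 0, $noreg

compute complementary lane masks (condition code 10 is GE and 12 is GT, so the second compare is the lane-wise inverse of the first). The pass rewrites the second compare as

    %1:vccr = MVE_VPNOT %0, 0, $noreg

so only one VPR value stays live. The MVE VPT block insertion pass can later fold such VPNOTs into the block mask, which is where the vpnott lines in the updated mve-vcmpf.ll checks come from.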