Index: llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp
===================================================================
--- llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp
+++ llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp
@@ -9,9 +9,14 @@
 /// \file This pass does a few optimisations related to MVE VPT blocks before
 /// register allocation is performed. The goal is to maximize the sizes of the
 /// blocks that will be created by the MVE VPT Block Insertion pass (which runs
-/// after register allocation). Currently, this pass replaces VCMPs with VPNOTs
-/// when possible, so the Block Insertion pass can delete them later to create
-/// larger VPT blocks.
+/// after register allocation). The first optimisation done by this pass is the
+/// replacement of VCMPs with VPNOTs when possible, so the Block Insertion pass
+/// can delete them later to create larger VPT blocks. The second optimisation
+/// replaces re-uses of old VPR values with VPNOTs inside blocks of predicated
+/// instructions. This is done to avoid spills/reloads of VPR in the middle of
+/// a block, which would prevent the Block Insertion pass from creating large
+/// blocks.
+//
 //===----------------------------------------------------------------------===//
 
 #include "ARM.h"
@@ -143,6 +148,35 @@
   return BuildMI(MBB, &Instr, Instr.getDebugLoc(), TII->get(ARM::MVE_VPNOT));
 }
 
+// Transforms
+//    <Instr that uses %A ('A' at OpIdx)>
+// Into
+//    %K = VPNOT %Target
+//    <Instr that uses %K ('K' at OpIdx)>
+// And returns the inserted VPNOT (whose result is %K).
+// This optimisation is done in the hopes of preventing spills/reloads of VPR.
+MachineInstr &MVEVPTOptimisations::ReplaceUsageOfRegisterByVPNOT(
+    MachineBasicBlock &MBB, MachineInstr &Instr, unsigned OpIdx,
+    Register Target) {
+  MachineOperand &InstrOperand = Instr.getOperand(OpIdx);
+
+  Register NewResult = MRI->createVirtualRegister(MRI->getRegClass(Target));
+  MachineInstrBuilder MIBuilder = BuildVPNOTBefore(MBB, Instr);
+  MIBuilder.add(MachineOperand::CreateReg(NewResult, /*isDef*/ true));
+  MIBuilder.add(MachineOperand::CreateReg(Target, /*isDef*/ false));
+  MIBuilder.addImm(0);
+  MIBuilder.addReg({});
+  InstrOperand.setReg(NewResult);
+
+  LLVM_DEBUG(dbgs() << "  Inserting VPNOT (for spill prevention): ";
+             MIBuilder.getInstr()->dump());
+
+  return *MIBuilder.getInstr();
+}
+
+// Replaces VCMPs by VPNOTs when possible, and tries to reduce spills by
+// replacing uses of old VPR values with VPNOTs inside predicated instruction
+// blocks.
 bool MVEVPTOptimisations::InsertVPNOTs(MachineBasicBlock &MBB) {
   SmallVector<MachineInstr *, 4> DeadInstructions;
 
@@ -184,6 +218,39 @@
     LLVM_DEBUG(dbgs() << "  Inserting VPNOT (to replace VCMP): ";
                MIBuilder.getInstr()->dump());
 
+    // While inside the block of predicated instructions, replace usages of
+    // old VCCR/VPR values with VPNOTs. That way, we avoid overlapping
+    // lifetimes of different VPR values (which always result in spills and
+    // reloads). Those VPNOTs can then be removed by the MVE VPT Block
+    // Insertion pass, leaving clean blocks like "TETE", "TEET", etc.
+
+    Register ValueReg = PrevVCMPResultReg;
+    Register InverseValueReg = Instr.getOperand(0).getReg();
+    Register VPNOTOperand = InverseValueReg;
+
+    // On each iteration, try to replace a usage of "ValueReg" with a VPNOT
+    // on "VPNOTOperand".
+    for (MachineBasicBlock::instr_iterator Iter = ++Instr.getIterator();
+         Iter != MBB.end(); ++Iter) {
+      // Stop as soon as we leave the block of predicated instructions.
+      if (getVPTInstrPredicate(*Iter) == ARMVCC::None)
+        break;
+
+      // Keep going until we find an instruction that uses ValueReg.
+      int Idx = Iter->findRegisterUseOperandIdx(ValueReg.id());
+      if (Idx == -1)
+        continue;
+
+      // Replace the usage of said register with a VPNOT on VPNOTOperand.
+      MachineInstr &VPNOT =
+          ReplaceUsageOfRegisterByVPNOT(MBB, *Iter, Idx, VPNOTOperand);
+
+      // Continue: the result of the VPNOT we just inserted becomes the new
+      // VPNOTOperand, and ValueReg/InverseValueReg are swapped.
+      VPNOTOperand = VPNOT.getOperand(0).getReg();
+      std::swap(ValueReg, InverseValueReg);
+    }
+
     // Finally, mark the old VCMP for removal and reset PrevVCMP.
     DeadInstructions.push_back(&Instr);
     PrevVCMP = nullptr;
Index: llvm/test/CodeGen/Thumb2/mve-vpt-optimisations.mir
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-vpt-optimisations.mir
+++ llvm/test/CodeGen/Thumb2/mve-vpt-optimisations.mir
@@ -42,6 +42,11 @@
     ret <4 x float> %inactive1
   }
 
+  define arm_aapcs_vfpcc <4 x float> @spill_prevention(<4 x float> %inactive1) #0 {
+  entry:
+    ret <4 x float> %inactive1
+  }
+
   attributes #0 = { "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" }
 ...
 ---
@@ -519,3 +524,28 @@
     renamable $vpr = MVE_VCMPs32 renamable $q1, renamable $q0, 10, 0, $noreg
     tBX_RET 14, $noreg, implicit $q0
 ...
+---
+name: spill_prevention
+alignment: 4
+liveins:
+  - { reg: '$q0', virtual-reg: '' }
+  - { reg: '$q1', virtual-reg: '' }
+body: |
+  bb.0:
+    liveins: $q0, $q1
+    ; CHECK-LABEL: name: spill_prevention
+    ; CHECK: [[MVE_VCMPs32_:%[0-9]+]]:vccr = MVE_VCMPs32 renamable $q0, renamable $q1, 10, 0, $noreg
+    ; CHECK: [[MVE_VPNOT:%[0-9]+]]:vccr = MVE_VPNOT [[MVE_VCMPs32_]], 0, $noreg
+    ; CHECK: [[MVE_VORR:%[0-9]+]]:mqpr = MVE_VORR renamable $q1, renamable $q1, 1, [[MVE_VPNOT]], undef [[MVE_VORR]]
+    ; CHECK: [[MVE_VPNOT1:%[0-9]+]]:vccr = MVE_VPNOT [[MVE_VPNOT]], 0, $noreg
+    ; CHECK: [[MVE_VORR1:%[0-9]+]]:mqpr = MVE_VORR [[MVE_VORR]], [[MVE_VORR]], 1, [[MVE_VPNOT1]], undef [[MVE_VORR1]]
+    ; CHECK: [[MVE_VPNOT2:%[0-9]+]]:vccr = MVE_VPNOT [[MVE_VPNOT1]], 0, $noreg
+    ; CHECK: [[MVE_VORR2:%[0-9]+]]:mqpr = MVE_VORR [[MVE_VORR1]], [[MVE_VORR1]], 1, [[MVE_VPNOT2]], undef [[MVE_VORR2]]
+    ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $q0
+    %0:vccr = MVE_VCMPs32 renamable $q0, renamable $q1, 10, 0, $noreg
+    %1:vccr = MVE_VCMPs32 renamable $q1, renamable $q0, 12, 0, $noreg
+    %2:mqpr = MVE_VORR renamable $q1, renamable $q1, 1, %1, undef %2
+    %3:mqpr = MVE_VORR %2, %2, 1, %0:vccr, undef %3:mqpr
+    %4:mqpr = MVE_VORR %3, %3, 1, %1:vccr, undef %4:mqpr
+    tBX_RET 14, $noreg, implicit $q0
+...
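
As an illustration of the spill-prevention rewrite, consider the new
spill_prevention test above (register names below are schematic, not actual
codegen output). Before the pass runs, two complementary VCCR values are live
across the whole block: %0 and its inverse %1 (conditions 10 and 12 with
swapped operands encode inverse comparisons, which is why the existing
VCMP-to-VPNOT replacement fires on %1). Since VPR is a single physical
register, only one of the two values can live in it at a time, forcing a
spill/reload in the middle of the block:

    %p  = MVE_VCMPs32 %a, %b, 10, ...   ; P
    %np = MVE_VCMPs32 %b, %a, 12, ...   ; !P -- both values now live at once

After the pass, the second VCMP becomes a VPNOT, and each later re-use of the
"other" value is rewritten as a VPNOT of the most recently produced value, so
at most one VCCR value is live at any point:

    %p   = MVE_VCMPs32 %a, %b, 10, ...  ; P
    %np  = MVE_VPNOT %p, ...            ; !P
    ...  (instruction predicated on %np)
    %p2  = MVE_VPNOT %np, ...           ; P again
    ...  (instruction predicated on %p2)
    %np2 = MVE_VPNOT %p2, ...           ; !P again
    ...  (instruction predicated on %np2)

The MVE VPT Block Insertion pass can then delete these VPNOTs and fold the
alternation into the block mask, yielding the clean "TETE"/"TEET"-style blocks
mentioned in the file comment.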