Index: llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp
===================================================================
--- llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp
+++ llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp
@@ -65,6 +65,7 @@
                                Register Target);
   bool ReduceOldVCCRValueUses(MachineBasicBlock &MBB);
   bool ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB);
+  bool ReplaceConstByVPNOTs(MachineBasicBlock &MBB, MachineDominatorTree *DT);
   bool ConvertVPSEL(MachineBasicBlock &MBB);
 };
 
@@ -619,6 +620,90 @@
   return !DeadInstructions.empty();
 }
 
+bool MVEVPTOptimisations::ReplaceConstByVPNOTs(MachineBasicBlock &MBB,
+                                               MachineDominatorTree *DT) {
+  // Scan through the block, looking for instructions that use constant moves
+  // into VPR that are the bitwise not of one another. These are expected to be
+  // COPYs to VCCRRegClass, from a t2MOVi or t2MOVi16. The most recently seen
+  // constant mask is tracked in LastVPTImm/LastVPTReg and kept up to date as
+  // we scan through the block.
+  unsigned LastVPTImm = 0;
+  Register LastVPTReg = 0;
+  SmallSet<MachineInstr *, 4> DeadInstructions;
+
+  for (MachineInstr &Instr : MBB.instrs()) {
+    // Look for predicated MVE instructions.
+    int PIdx = llvm::findFirstVPTPredOperandIdx(Instr);
+    if (PIdx == -1)
+      continue;
+    Register VPR = Instr.getOperand(PIdx + 1).getReg();
+    if (!VPR.isVirtual())
+      continue;
+
+    // From that, we are looking for an instruction like %11:vccr = COPY %9:rgpr.
+    MachineInstr *Copy = MRI->getVRegDef(VPR);
+    if (!Copy || Copy->getOpcode() != TargetOpcode::COPY ||
+        !Copy->getOperand(1).getReg().isVirtual() ||
+        MRI->getRegClass(Copy->getOperand(1).getReg()) == &ARM::VCCRRegClass) {
+      LastVPTReg = 0;
+      continue;
+    }
+    Register GPR = Copy->getOperand(1).getReg();
+
+    // Find the immediate used by the copy.
+    auto getImm = [&](Register GPR) -> unsigned {
+      MachineInstr *Def = MRI->getVRegDef(GPR);
+      if (Def && (Def->getOpcode() == ARM::t2MOVi ||
+                  Def->getOpcode() == ARM::t2MOVi16))
+        return Def->getOperand(1).getImm();
+      return -1U;
+    };
+    unsigned Imm = getImm(GPR);
+    if (Imm == -1U) {
+      LastVPTReg = 0;
+      continue;
+    }
+
+    unsigned NotImm = ~Imm & 0xffff;
+    if (LastVPTReg != 0 && LastVPTReg != VPR && LastVPTImm == Imm) {
+      Instr.getOperand(PIdx + 1).setReg(LastVPTReg);
+      if (MRI->use_empty(VPR)) {
+        DeadInstructions.insert(Copy);
+        if (MRI->hasOneUse(GPR))
+          DeadInstructions.insert(MRI->getVRegDef(GPR));
+      }
+      LLVM_DEBUG(dbgs() << "Reusing predicate: in " << Instr);
+    } else if (LastVPTReg != 0 && LastVPTImm == NotImm) {
+      // We have found the not of a previous constant. Create a VPNOT of the
+      // earlier predicate reg and use it instead of the copy.
+      Register NewVPR = MRI->createVirtualRegister(&ARM::VCCRRegClass);
+      auto VPNot = BuildMI(MBB, &Instr, Instr.getDebugLoc(),
+                           TII->get(ARM::MVE_VPNOT), NewVPR)
+                       .addReg(LastVPTReg);
+      addUnpredicatedMveVpredNOp(VPNot);
+
+      // Use the new register and check if the def is now dead.
+      Instr.getOperand(PIdx + 1).setReg(NewVPR);
+      if (MRI->use_empty(VPR)) {
+        DeadInstructions.insert(Copy);
+        if (MRI->hasOneUse(GPR))
+          DeadInstructions.insert(MRI->getVRegDef(GPR));
+      }
+      LLVM_DEBUG(dbgs() << "Adding VPNot: " << *VPNot << " to replace use at "
+                        << Instr);
+      VPR = NewVPR;
+    }
+
+    LastVPTImm = Imm;
+    LastVPTReg = VPR;
+  }
+
+  for (MachineInstr *DI : DeadInstructions)
+    DI->eraseFromParent();
+
+  return !DeadInstructions.empty();
+}
+
 // Replace VPSEL with a predicated VMOV in blocks with a VCTP. This is a
 // somewhat blunt approximation to allow tail predicated with vpsel
 // instructions. We turn a vselect into a VPSEL in ISEL, but they have slightly
@@ -678,6 +763,7 @@
     Modified |= ConvertTailPredLoop(ML, DT);
 
   for (MachineBasicBlock &MBB : Fn) {
+    Modified |= ReplaceConstByVPNOTs(MBB, DT);
     Modified |= ReplaceVCMPsByVPNOTs(MBB);
     Modified |= ReduceOldVCCRValueUses(MBB);
     Modified |= ConvertVPSEL(MBB);
Index: llvm/test/CodeGen/Thumb2/mve-pred-constfold.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-pred-constfold.ll
+++ llvm/test/CodeGen/Thumb2/mve-pred-constfold.ll
@@ -6,27 +6,15 @@
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r6, r7, lr}
 ; CHECK-NEXT:    push {r4, r6, r7, lr}
-; CHECK-NEXT:    .pad #8
-; CHECK-NEXT:    sub sp, #8
 ; CHECK-NEXT:    movw r1, #52428
 ; CHECK-NEXT:    vmsr p0, r1
-; CHECK-NEXT:    movw r1, #13107
-; CHECK-NEXT:    vstr p0, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vpstete
 ; CHECK-NEXT:    vaddvt.s16 r12, q1
-; CHECK-NEXT:    vmsr p0, r1
-; CHECK-NEXT:    vstr p0, [sp] @ 4-byte Spill
-; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vaddvt.s16 r2, q1
-; CHECK-NEXT:    vldr p0, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vaddve.s16 r2, q1
 ; CHECK-NEXT:    vaddvt.s16 r4, q0
-; CHECK-NEXT:    vldr p0, [sp] @ 4-byte Reload
-; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vaddvt.s16 r6, q0
+; CHECK-NEXT:    vaddve.s16 r6, q0
 ; CHECK-NEXT:    strd r6, r4, [r0]
 ; CHECK-NEXT:    strd r2, r12, [r0, #8]
-; CHECK-NEXT:    add sp, #8
 ; CHECK-NEXT:    pop {r4, r6, r7, pc}
 entry:
   %0 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 13107)
@@ -152,14 +140,11 @@
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    movs r1, #1
 ; CHECK-NEXT:    vmsr p0, r1
-; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vaddvat.s32 r0, q0
-; CHECK-NEXT:    vaddvat.s32 r0, q1
-; CHECK-NEXT:    movw r1, #65534
-; CHECK-NEXT:    vmsr p0, r1
-; CHECK-NEXT:    vpstt
+; CHECK-NEXT:    vpsttee
 ; CHECK-NEXT:    vaddvat.s32 r0, q0
 ; CHECK-NEXT:    vaddvat.s32 r0, q1
+; CHECK-NEXT:    vaddvae.s32 r0, q0
+; CHECK-NEXT:    vaddvae.s32 r0, q1
 ; CHECK-NEXT:    bx lr
   %4 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 1)
   %5 = tail call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> %0, i32 0, <4 x i1> %4)
@@ -206,14 +191,11 @@
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    movw r1, #1234
 ; CHECK-NEXT:    vmsr p0, r1
-; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vaddvat.s32 r0, q0
-; CHECK-NEXT:    vaddvat.s32 r0, q1
-; CHECK-NEXT:    movw r1, #64301
-; CHECK-NEXT:    vmsr p0, r1
-; CHECK-NEXT:    vpstt
+; CHECK-NEXT:    vpsttee
 ; CHECK-NEXT:    vaddvat.s32 r0, q0
 ; CHECK-NEXT:    vaddvat.s32 r0, q1
+; CHECK-NEXT:    vaddvae.s32 r0, q0
+; CHECK-NEXT:    vaddvae.s32 r0, q1
 ; CHECK-NEXT:    bx lr
   %4 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 1234)
   %5 = tail call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> %0, i32 0, <4 x i1> %4)
@@ -231,25 +213,13 @@
 define arm_aapcs_vfpcc i32 @const_mask_abab(<4 x i32> %0, <4 x i32> %1, i32 %2) {
 ; CHECK-LABEL: const_mask_abab:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .pad #8
-; CHECK-NEXT:    sub sp, #8
 ; CHECK-NEXT:    movw r1, #1234
 ; CHECK-NEXT:    vmsr p0, r1
-; CHECK-NEXT:    movw r1, #64301
-; CHECK-NEXT:    vstr p0, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vpstete
 ; CHECK-NEXT:    vaddvat.s32 r0, q0
-; CHECK-NEXT:    vmsr p0, r1
-; CHECK-NEXT:    vstr p0, [sp] @ 4-byte Spill
-; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vaddvat.s32 r0, q1
-; CHECK-NEXT:    vldr p0, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vaddvae.s32 r0, q1
 ; CHECK-NEXT:    vaddvat.s32 r0, q1
-; CHECK-NEXT:    vldr p0, [sp] @ 4-byte Reload
-; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vaddvat.s32 r0, q0
-; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    vaddvae.s32 r0, q0
 ; CHECK-NEXT:    bx lr
   %4 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 1234)
   %5 = tail call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> %0, i32 0, <4 x i1> %4)
@@ -267,26 +237,16 @@
 define arm_aapcs_vfpcc i32 @const_mask_abbreakab(<4 x i32> %0, <4 x i32> %1, i32 %2) {
 ; CHECK-LABEL: const_mask_abbreakab:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .pad #8
-; CHECK-NEXT:    sub sp, #8
 ; CHECK-NEXT:    movw r1, #1234
 ; CHECK-NEXT:    vmsr p0, r1
-; CHECK-NEXT:    movw r1, #64301
-; CHECK-NEXT:    vstr p0, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vpste
 ; CHECK-NEXT:    vaddvat.s32 r0, q0
-; CHECK-NEXT:    vmsr p0, r1
-; CHECK-NEXT:    vstr p0, [sp] @ 4-byte Spill
-; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vaddvat.s32 r0, q1
+; CHECK-NEXT:    vaddvae.s32 r0, q1
 ; CHECK-NEXT:    vadd.i32 q1, q0, r0
-; CHECK-NEXT:    vldr p0, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vpnot
+; CHECK-NEXT:    vpste
 ; CHECK-NEXT:    vaddvat.s32 r0, q1
-; CHECK-NEXT:    vldr p0, [sp] @ 4-byte Reload
-; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vaddvat.s32 r0, q0
-; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    vaddvae.s32 r0, q0
 ; CHECK-NEXT:    bx lr
   %4 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 1234)
   %5 = tail call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> %0, i32 0, <4 x i1> %4)
@@ -312,9 +272,8 @@
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vaddvat.s32 r0, q0
 ; CHECK-NEXT:    vaddvat.s32 r0, q1
-; CHECK-NEXT:    movw r1, #64301
 ; CHECK-NEXT:    vadd.i32 q1, q0, r0
-; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpnot
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vaddvat.s32 r0, q1
 ; CHECK-NEXT:    vaddvat.s32 r0, q0
@@ -372,27 +331,24 @@
 define arm_aapcs_vfpcc i32 @const_mask_threepredabab(<4 x i32> %0, <4 x i32> %1, i32 %2) {
 ; CHECK-LABEL: const_mask_threepredabab:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .pad #8
-; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    movw r1, #1234
 ; CHECK-NEXT:    vmsr p0, r1
-; CHECK-NEXT:    movw r1, #64301
-; CHECK-NEXT:    vstr p0, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    vstr p0, [sp] @ 4-byte Spill
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vaddvat.s32 r0, q0
-; CHECK-NEXT:    vmsr p0, r1
-; CHECK-NEXT:    vstr p0, [sp] @ 4-byte Spill
+; CHECK-NEXT:    vldr p0, [sp] @ 4-byte Reload
+; CHECK-NEXT:    vpnot
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vaddvat.s32 r0, q1
 ; CHECK-NEXT:    vpt.s32 gt, q1, q0
 ; CHECK-NEXT:    vaddvat.s32 r0, q1
-; CHECK-NEXT:    vldr p0, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vaddvat.s32 r0, q1
 ; CHECK-NEXT:    vldr p0, [sp] @ 4-byte Reload
-; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vaddvat.s32 r0, q0
-; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    vpste
+; CHECK-NEXT:    vaddvat.s32 r0, q1
+; CHECK-NEXT:    vaddvae.s32 r0, q0
+; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    bx lr
   %4 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 1234)
   %5 = tail call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> %0, i32 0, <4 x i1> %4)
Index: llvm/test/CodeGen/Thumb2/mve-vpt-optimisations.mir
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-vpt-optimisations.mir
+++ llvm/test/CodeGen/Thumb2/mve-vpt-optimisations.mir
@@ -836,11 +836,11 @@
     %4:mqpr = MVE_VORR %0:mqpr, %0:mqpr, 1, %2:vccr, undef %4:mqpr
     %5:mqpr = MVE_VORR %4:mqpr, %4:mqpr, 1, %3:vccr, undef %5:mqpr
   bb.1:
-    %2:vccr = MVE_VCMPs32 %0:mqpr, %1:mqpr, 10, 0, $noreg
-    %3:vccr = MVE_VPNOT %2:vccr, 1, %2:vccr
-    %4:mqpr = MVE_VORR %0:mqpr, %1:mqpr, 1, %3:vccr, undef %4:mqpr
-    %5:mqpr = MVE_VORR %0:mqpr, %0:mqpr, 1, %2:vccr, undef %5:mqpr
-    %6:mqpr = MVE_VORR %1:mqpr, %0:mqpr, 1, %3:vccr, undef %6:mqpr
+    %12:vccr = MVE_VCMPs32 %0:mqpr, %1:mqpr, 10, 0, $noreg
+    %13:vccr = MVE_VPNOT %12:vccr, 1, %12:vccr
+    %14:mqpr = MVE_VORR %0:mqpr, %1:mqpr, 1, %13:vccr, undef %14:mqpr
+    %15:mqpr = MVE_VORR %0:mqpr, %0:mqpr, 1, %12:vccr, undef %15:mqpr
+    %16:mqpr = MVE_VORR %1:mqpr, %0:mqpr, 1, %13:vccr, undef %16:mqpr
     tBX_RET 14, $noreg, implicit %0:mqpr
 ...
 ---
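
For reference, below is a minimal hand-written sketch of the rewrite ReplaceConstByVPNOTs performs. It is not part of the patch or its tests; the virtual register numbers and the choice of MVE_VORR as the predicated consumer are illustrative assumptions, written in the style of the .mir test above (64301 == ~1234 & 0xffff).

# Before: both constant masks are materialised into a GPR and copied into VPR.
#   %1:rgpr = t2MOVi16 1234, 14, $noreg
#   %2:vccr = COPY %1:rgpr
#   %3:rgpr = t2MOVi16 64301, 14, $noreg
#   %4:vccr = COPY %3:rgpr
#   %5:mqpr = MVE_VORR %0:mqpr, %0:mqpr, 1, %2:vccr, undef %5:mqpr
#   %6:mqpr = MVE_VORR %0:mqpr, %0:mqpr, 1, %4:vccr, undef %6:mqpr
#
# After: the second mask is recognised as the bitwise not of the first, so its
# use is rewritten to an unpredicated MVE_VPNOT of the earlier predicate, and
# the now-dead t2MOVi16/COPY pair is erased.
#   %1:rgpr = t2MOVi16 1234, 14, $noreg
#   %2:vccr = COPY %1:rgpr
#   %7:vccr = MVE_VPNOT %2:vccr, 0, $noreg
#   %5:mqpr = MVE_VORR %0:mqpr, %0:mqpr, 1, %2:vccr, undef %5:mqpr
#   %6:mqpr = MVE_VORR %0:mqpr, %0:mqpr, 1, %7:vccr, undef %6:mqpr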