Index: llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp
===================================================================
--- llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp
+++ llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp
@@ -9,9 +9,14 @@
 /// \file This pass does a few optimisations related to MVE VPT blocks before
 /// register allocation is performed. The goal is to maximize the sizes of the
 /// blocks that will be created by the MVE VPT Block Insertion pass (which runs
-/// after register allocation). Currently, this pass replaces VCMPs with VPNOTs
-/// when possible, so the Block Insertion pass can delete them later to create
-/// larger VPT blocks.
+/// after register allocation). The first optimisation done by this pass is the
+/// replacement of "opposite" VCMPs with VPNOTs, so the Block Insertion pass
+/// can delete them later to create larger VPT blocks.
+/// The second optimisation replaces re-uses of old VCCR values with VPNOTs when
+/// inside a block of predicated instructions. This is done to avoid
+/// spill/reloads of VPR in the middle of a block, which prevents the Block
+/// Insertion pass from creating large blocks.
+//
 //===----------------------------------------------------------------------===//
 
 #include "ARM.h"
@@ -46,6 +51,11 @@
   }
 
 private:
+  MachineInstr &ReplaceRegisterUseWithVPNOT(MachineBasicBlock &MBB,
+                                            MachineInstr &Instr,
+                                            MachineOperand &User,
+                                            Register Target);
+  bool ReduceOldVCCRValueUses(MachineBasicBlock &MBB);
   bool ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB);
 };
 
@@ -131,6 +141,249 @@
   return RegClass && (RegClass->getID() == ARM::VCCRRegClassID);
 }
 
+// Inserts an unpredicated VPNOT of Target right before Instr and makes User (an
+// operand expected to belong to Instr) read the VPNOT's result instead.
+// Transforms
+//    (Instr that reads some register through its 'User' operand)
+// Into
+//    %K = VPNOT %Target
+//    (Instr whose 'User' operand now reads %K)
+// And returns the newly inserted VPNOT.
+// This optimization is done in the hopes of preventing spills/reloads of VPR by
+// reducing the number of VCCR values with overlapping lifetimes.
+MachineInstr &MVEVPTOptimisations::ReplaceRegisterUseWithVPNOT(
+    MachineBasicBlock &MBB, MachineInstr &Instr, MachineOperand &User,
+    Register Target) {
+  Register NewResult = MRI->createVirtualRegister(MRI->getRegClass(Target));
+
+  // Target gains a new use below, possibly after an instruction that was
+  // flagged as killing it, so drop any stale kill flag on it first.
+  MRI->clearKillFlags(Target);
+
+  MachineInstrBuilder MIBuilder =
+      BuildMI(MBB, &Instr, Instr.getDebugLoc(), TII->get(ARM::MVE_VPNOT))
+          .addDef(NewResult)
+          .addReg(Target);
+  addUnpredicatedMveVpredNOp(MIBuilder);
+
+  // Make the user use NewResult instead, and clear its kill flag.
+  User.setReg(NewResult);
+  User.setIsKill(false);
+
+  LLVM_DEBUG(dbgs() << " Inserting VPNOT (for spill prevention): ";
+             MIBuilder.getInstr()->dump());
+
+  return *MIBuilder.getInstr();
+}
+
+// Moves a VPNOT before its first user if an instruction that uses Reg is found
+// in-between the VPNOT and its user.
+// Returns true if there is at least one user of the VPNOT in the block.
+// NOTE(review): the scan starts on the VPNOT itself, so when Reg is the VPNOT's
+// own operand (as in the caller visible in this file) MustMove is set on the
+// first iteration and the VPNOT is always spliced right before its first user.
+// Confirm that this is intended.
+static bool MoveVPNOTBeforeFirstUser(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator Iter,
+                                     Register Reg) {
+  assert(Iter->getOpcode() == ARM::MVE_VPNOT && "Not a VPNOT!");
+  assert(getVPTInstrPredicate(*Iter) == ARMVCC::None &&
+         "The VPNOT cannot be predicated");
+
+  MachineInstr &VPNOT = *Iter;
+  Register VPNOTResult = VPNOT.getOperand(0).getReg();
+  Register VPNOTOperand = VPNOT.getOperand(1).getReg();
+
+  // Whether the VPNOT will need to be moved, and whether we found a user of the
+  // VPNOT.
+  bool MustMove = false, HasUser = false;
+  MachineOperand *VPNOTOperandKiller = nullptr;
+  for (; Iter != MBB.end(); ++Iter) {
+    if (MachineOperand *MO =
+            Iter->findRegisterUseOperand(VPNOTOperand, /*isKill*/ true)) {
+      // If we find the operand that kills the VPNOTOperand's result, save it.
+      VPNOTOperandKiller = MO;
+    }
+
+    if (Iter->findRegisterUseOperandIdx(Reg) != -1) {
+      MustMove = true;
+      continue;
+    }
+
+    if (Iter->findRegisterUseOperandIdx(VPNOTResult) == -1)
+      continue;
+
+    HasUser = true;
+    if (!MustMove)
+      break;
+
+    // Move the VPNOT right before Iter
+    LLVM_DEBUG(dbgs() << "Moving: "; VPNOT.dump(); dbgs() << " Before: ";
+               Iter->dump());
+    MBB.splice(Iter, &MBB, VPNOT.getIterator());
+    // If we move the instr, and its operand was killed earlier, remove the kill
+    // flag.
+    if (VPNOTOperandKiller)
+      VPNOTOperandKiller->setIsKill(false);
+
+    break;
+  }
+  return HasUser;
+}
+
+// This optimisation attempts to reduce the number of overlapping lifetimes of
+// VCCR values by replacing uses of old VCCR values with VPNOTs. For example,
+// this replaces
+//    %A:vccr = (something)
+//    %B:vccr = VPNOT %A
+//    %Foo = (some op that uses %B)
+//    %Bar = (some op that uses %A)
+// With
+//    %A:vccr = (something)
+//    %B:vccr = VPNOT %A
+//    %Foo = (some op that uses %B)
+//    %TMP2:vccr = VPNOT %B
+//    %Bar = (some op that uses %TMP2)
+// Returns true if MBB was (or may have been) modified.
+bool MVEVPTOptimisations::ReduceOldVCCRValueUses(MachineBasicBlock &MBB) {
+  MachineBasicBlock::iterator Iter = MBB.begin(), End = MBB.end();
+  bool Modified = false;
+
+  Register VCCRValue, OppositeVCCRValue;
+  // The first loop looks for 2 unpredicated instructions:
+  //    %A:vccr = (instr)     ; A is stored in VCCRValue
+  //    %B:vccr = VPNOT %A    ; B is stored in OppositeVCCRValue
+  for (; Iter != End; ++Iter) {
+    // We're only interested in unpredicated instructions that write to VCCR.
+    if (!IsWritingToVCCR(*Iter) || getVPTInstrPredicate(*Iter) != ARMVCC::None)
+      continue;
+    Register Dst = Iter->getOperand(0).getReg();
+
+    // If we already have a VCCRValue, and this is a VPNOT on VCCRValue, we've
+    // found what we were looking for.
+    if (VCCRValue && Iter->getOpcode() == ARM::MVE_VPNOT &&
+        Iter->findRegisterUseOperand(VCCRValue)) {
+      // Move the VPNOT closer to its first user if needed, and ignore if it has
+      // no users.
+      if (!MoveVPNOTBeforeFirstUser(MBB, Iter, VCCRValue))
+        continue;
+
+      // The helper may have spliced the VPNOT and cleared a kill flag, so the
+      // block has to be reported as modified even if nothing else changes.
+      Modified = true;
+      OppositeVCCRValue = Dst;
+      ++Iter;
+      break;
+    }
+
+    // Else, just set VCCRValue and continue.
+    VCCRValue = Dst;
+  }
+
+  // If the first loop didn't find anything, stop here.
+  if (Iter == End)
+    return Modified;
+
+  assert(VCCRValue && OppositeVCCRValue &&
+         "VCCRValue and OppositeVCCRValue shouldn't be empty if the loop "
+         "stopped before the end of the block!");
+  assert(VCCRValue != OppositeVCCRValue &&
+         "VCCRValue should not be equal to OppositeVCCRValue!");
+
+  SmallVector<MachineInstr *, 4> DeadInstructions;
+
+  // LastVPNOTResult always contains the same value as OppositeVCCRValue.
+  Register LastVPNOTResult = OppositeVCCRValue;
+
+  // Try to optimize the remaining instructions.
+  for (; Iter != End; ++Iter) {
+    // If this instr uses VCCRValue, we can do something about it.
+    if (MachineOperand *MO = Iter->findRegisterUseOperand(VCCRValue)) {
+      // Like the rest of this pass, only fold unpredicated VPNOTs; a predicated
+      // one is handled like any other user of VCCRValue.
+      if (Iter->getOpcode() == ARM::MVE_VPNOT &&
+          getVPTInstrPredicate(*Iter) == ARMVCC::None) {
+        // Instead of inserting a VPNOT before a VPNOT (= doing a double not),
+        // replace the existing VPNOT with COPY LastVPNOTResult.
+        Register CopyResult = Iter->getOperand(0).getReg();
+
+        // LastVPNOTResult gains a new use, so any earlier kill flag on it would
+        // now be stale.
+        MRI->clearKillFlags(LastVPNOTResult);
+        MachineInstrBuilder MIB =
+            BuildMI(MBB, &*Iter, Iter->getDebugLoc(), TII->get(ARM::COPY))
+                // Use same destination, but copy LastVPNOTResult instead.
+                .addDef(CopyResult)
+                .addReg(LastVPNOTResult);
+        DeadInstructions.push_back(&*Iter);
+        Modified = true;
+
+        // Treat the result of the copy as the LastVPNOTResult
+        LastVPNOTResult = CopyResult;
+
+        LLVM_DEBUG(dbgs() << "Replacing: "; Iter->dump(); dbgs() << " With: ";
+                   MIB.getInstr()->dump());
+      } else {
+        // Replace this usage of VCCRValue by the result of a VPNOT on
+        // LastVPNOTResult.
+        MachineInstr &VPNOT =
+            ReplaceRegisterUseWithVPNOT(MBB, *Iter, *MO, LastVPNOTResult);
+        Modified = true;
+
+        // The result of the VPNOT we just inserted becomes the new
+        // LastVPNOTResult, and VCCRValue/OppositeVCCRValue are swapped
+        // (logged before the swap so the message names the replaced register).
+        LastVPNOTResult = VPNOT.getOperand(0).getReg();
+        LLVM_DEBUG(dbgs() << "Replacing usage of '" << printReg(VCCRValue)
+                          << "' with '" << printReg(LastVPNOTResult)
+                          << "' for instr: ";
+                   Iter->dump());
+        std::swap(VCCRValue, OppositeVCCRValue);
+      }
+    } else {
+      // Else, if it uses OppositeVCCRValue, make it use LastVPNOTResult
+      // instead.
+      if (MachineOperand *Use =
+              Iter->findRegisterUseOperand(OppositeVCCRValue)) {
+        // This is pointless if LastVPNOTResult == OppositeVCCRValue.
+        if (LastVPNOTResult != OppositeVCCRValue) {
+          LLVM_DEBUG(dbgs() << "Replacing usage of '"
+                            << printReg(OppositeVCCRValue) << "' with '"
+                            << printReg(LastVPNOTResult) << "' for instr: ";
+                     Iter->dump());
+          // LastVPNOTResult gains a new use: drop any stale kill flag on it.
+          MRI->clearKillFlags(LastVPNOTResult);
+          Use->setReg(LastVPNOTResult);
+          Modified = true;
+        }
+
+        // In both cases, clear the kill flag as we might reuse that value
+        // later.
+        Use->setIsKill(false);
+      }
+
+      // If this instr is an unpredicated VPNOT on LastVPNOTResult or
+      // OppositeVCCRValue, swap VCCRValue/OppositeVCCRValue and set
+      // LastVPNOTResult to the result of this instr.
+      if (Iter->getOpcode() == ARM::MVE_VPNOT &&
+          getVPTInstrPredicate(*Iter) == ARMVCC::None) {
+        Register VPNOTOperand = Iter->getOperand(1).getReg();
+        if (VPNOTOperand == LastVPNOTResult ||
+            VPNOTOperand == OppositeVCCRValue) {
+          std::swap(VCCRValue, OppositeVCCRValue);
+          LastVPNOTResult = Iter->getOperand(0).getReg();
+        }
+      }
+    }
+  }
+
+  // The replaced VPNOTs are dead: unlink and delete them.
+  for (MachineInstr *DeadInstruction : DeadInstructions)
+    DeadInstruction->eraseFromParent();
+
+  return Modified;
+}
+
 // This optimisation replaces VCMPs with VPNOTs when they are equivalent.
 bool MVEVPTOptimisations::ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB) {
   SmallVector<MachineInstr *, 4> DeadInstructions;
@@ -219,8 +472,10 @@
                     << "********** Function: " << Fn.getName() << '\n');
 
   bool Modified = false;
-  for (MachineBasicBlock &MBB : Fn)
+  for (MachineBasicBlock &MBB : Fn) {
     Modified |= ReplaceVCMPsByVPNOTs(MBB);
+    Modified |= ReduceOldVCCRValueUses(MBB);
+  }
 
   LLVM_DEBUG(dbgs() << "**************************************\n");
   return Modified;
Index: llvm/test/CodeGen/Thumb2/mve-pred-not.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-pred-not.ll +++ llvm/test/CodeGen/Thumb2/mve-pred-not.ll @@ -405,21 +405,11 @@ define arm_aapcs_vfpcc <4 x i32> @vpttet_v4i1(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ; CHECK-LABEL: vpttet_v4i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: vcmp.s32 ge, q0, q2 -; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill -; CHECK-NEXT: vpstt +; CHECK-NEXT: vpttet.s32 ge, q0, q2 ; CHECK-NEXT: vmovt q0, q2 ; CHECK-NEXT: vmovt q0, q2 -; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload -; CHECK-NEXT: vpnot -; CHECK-NEXT: vpst +; CHECK-NEXT: vmove q0, q2 ; CHECK-NEXT: vmovt q0, q2 -; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload -; CHECK-NEXT: vpst -; CHECK-NEXT: vmovt q0, q2 -; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: bx lr entry: %0 = icmp sge <4 x i32> %x, %z Index: llvm/test/CodeGen/Thumb2/mve-vpt-blocks.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vpt-blocks.ll +++ llvm/test/CodeGen/Thumb2/mve-vpt-blocks.ll @@ -117,20 +117,10 @@ define arm_aapcs_vfpcc <4 x i32> @vptet_block(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; CHECK-LABEL: vptet_block: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: vcmp.s32 ge, q0, q2 -; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill -; CHECK-NEXT: vpst +; CHECK-NEXT: vptet.s32 ge, q0, q2 ; CHECK-NEXT: vorrt q0, q1, q2 -; CHECK-NEXT: vldr p0, [sp] @ 
4-byte Reload -; CHECK-NEXT: vpnot -; CHECK-NEXT: vpst -; CHECK-NEXT: vmovt q0, q2 -; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload -; CHECK-NEXT: vpst +; CHECK-NEXT: vmove q0, q2 ; CHECK-NEXT: vmovt q0, q2 -; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: bx lr entry: %0 = icmp sge <4 x i32> %a, %c @@ -144,21 +134,11 @@ define arm_aapcs_vfpcc <4 x i32> @vpttet_block(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; CHECK-LABEL: vpttet_block: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: vcmp.s32 ge, q0, q2 -; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill -; CHECK-NEXT: vpstt +; CHECK-NEXT: vpttet.s32 ge, q0, q2 ; CHECK-NEXT: vorrt q0, q1, q2 ; CHECK-NEXT: vmovt q0, q2 -; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload -; CHECK-NEXT: vpnot -; CHECK-NEXT: vpst -; CHECK-NEXT: vmovt q0, q2 -; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload -; CHECK-NEXT: vpst +; CHECK-NEXT: vmove q0, q2 ; CHECK-NEXT: vmovt q0, q2 -; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: bx lr entry: %0 = icmp sge <4 x i32> %a, %c @@ -173,21 +153,11 @@ define arm_aapcs_vfpcc <4 x i32> @vptett_block(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; CHECK-LABEL: vptett_block: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: vcmp.s32 ge, q0, q2 -; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill -; CHECK-NEXT: vpst +; CHECK-NEXT: vptett.s32 ge, q0, q2 ; CHECK-NEXT: vorrt q0, q1, q2 -; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload -; CHECK-NEXT: vpnot -; CHECK-NEXT: vpst -; CHECK-NEXT: vmovt q0, q2 -; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload -; CHECK-NEXT: vpstt +; CHECK-NEXT: vmove q0, q2 ; CHECK-NEXT: vmovt q0, q2 ; CHECK-NEXT: vmovt q0, q2 -; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: bx lr entry: %0 = icmp sge <4 x i32> %a, %c @@ -202,25 +172,11 @@ define arm_aapcs_vfpcc <4 x i32> @vpteet_block(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; CHECK-LABEL: vpteet_block: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #8 -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: vcmp.s32 ge, q0, 
q2 -; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill -; CHECK-NEXT: vpst +; CHECK-NEXT: vpteet.s32 ge, q0, q2 ; CHECK-NEXT: vorrt q0, q1, q2 -; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload -; CHECK-NEXT: vpnot -; CHECK-NEXT: vstr p0, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: vpst -; CHECK-NEXT: vmovt q0, q2 -; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: vpst -; CHECK-NEXT: vmovt q0, q2 -; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload -; CHECK-NEXT: vpst +; CHECK-NEXT: vmove q0, q2 +; CHECK-NEXT: vmove q0, q2 ; CHECK-NEXT: vmovt q0, q2 -; CHECK-NEXT: add sp, #8 ; CHECK-NEXT: bx lr entry: %0 = icmp sge <4 x i32> %a, %c @@ -254,25 +210,11 @@ define arm_aapcs_vfpcc <4 x i32> @vptete_block(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; CHECK-LABEL: vptete_block: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #8 -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: vcmp.s32 ge, q0, q2 -; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill -; CHECK-NEXT: vpst +; CHECK-NEXT: vptete.s32 ge, q0, q2 ; CHECK-NEXT: vorrt q0, q1, q2 -; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload -; CHECK-NEXT: vpnot -; CHECK-NEXT: vstr p0, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: vpst -; CHECK-NEXT: vmovt q0, q2 -; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload -; CHECK-NEXT: vpst -; CHECK-NEXT: vmovt q0, q2 -; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: vpst +; CHECK-NEXT: vmove q0, q2 ; CHECK-NEXT: vmovt q0, q2 -; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: vmove q0, q2 ; CHECK-NEXT: bx lr entry: %0 = icmp sge <4 x i32> %a, %c Index: llvm/test/CodeGen/Thumb2/mve-vpt-optimisations.mir =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vpt-optimisations.mir +++ llvm/test/CodeGen/Thumb2/mve-vpt-optimisations.mir @@ -52,6 +52,26 @@ ret <4 x float> %inactive1 } + define arm_aapcs_vfpcc <4 x float> @spill_prevention(<4 x float> %inactive1) #0 { + entry: + ret <4 x float> 
%inactive1 + } + + define arm_aapcs_vfpcc <4 x float> @spill_prevention_predicated_vpnots(<4 x float> %inactive1) #0 { + entry: + ret <4 x float> %inactive1 + } + + define arm_aapcs_vfpcc <4 x float> @spill_prevention_copies(<4 x float> %inactive1) #0 { + entry: + ret <4 x float> %inactive1 + } + + define arm_aapcs_vfpcc <4 x float> @spill_prevention_vpnot_reordering(<4 x float> %inactive1) #0 { + entry: + ret <4 x float> %inactive1 + } + attributes #0 = { "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" } ... --- @@ -356,20 +376,72 @@ name: killed_vccr_values alignment: 4 body: | + ; CHECK-LABEL: name: killed_vccr_values + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: [[MVE_VCMPf16_:%[0-9]+]]:vccr = MVE_VCMPf16 %1:mqpr, %2:mqpr, 10, 0, $noreg + ; CHECK: [[MVE_VORR:%[0-9]+]]:mqpr = MVE_VORR %1:mqpr, %2:mqpr, 1, [[MVE_VCMPf16_]], undef [[MVE_VORR]] + ; CHECK: [[MVE_VPNOT:%[0-9]+]]:vccr = MVE_VPNOT [[MVE_VCMPf16_]], 0, $noreg + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: [[MVE_VCMPs32_:%[0-9]+]]:vccr = MVE_VCMPs32 %1:mqpr, %2:mqpr, 10, 0, $noreg + ; CHECK: [[MVE_VPNOT1:%[0-9]+]]:vccr = MVE_VPNOT [[MVE_VCMPs32_]], 0, $noreg + ; CHECK: [[MVE_VORR1:%[0-9]+]]:mqpr = MVE_VORR %1:mqpr, %1:mqpr, 1, [[MVE_VPNOT1]], undef [[MVE_VORR1]] + ; CHECK: [[MVE_VPNOT2:%[0-9]+]]:vccr = MVE_VPNOT [[MVE_VPNOT1]], 0, $noreg + ; CHECK: [[MVE_VORR2:%[0-9]+]]:mqpr = MVE_VORR [[MVE_VORR1]], [[MVE_VORR1]], 1, [[MVE_VPNOT2]], undef [[MVE_VORR2]] + ; CHECK: bb.2: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: [[MVE_VCMPs32_1:%[0-9]+]]:vccr = MVE_VCMPs32 %1:mqpr, %2:mqpr, 10, 0, $noreg + ; CHECK: [[MVE_VPNOT3:%[0-9]+]]:vccr = MVE_VPNOT [[MVE_VCMPs32_1]], 0, $noreg + ; CHECK: [[MVE_VORR3:%[0-9]+]]:mqpr = MVE_VORR %1:mqpr, %1:mqpr, 1, [[MVE_VPNOT3]], undef [[MVE_VORR3]] + ; CHECK: [[MVE_VPNOT4:%[0-9]+]]:vccr = MVE_VPNOT [[MVE_VPNOT3]], 0, $noreg + ; CHECK: [[MVE_VORR4:%[0-9]+]]:mqpr = MVE_VORR [[MVE_VORR3]], [[MVE_VORR3]], 1, 
[[MVE_VPNOT4]], undef [[MVE_VORR4]] + ; CHECK: bb.3: + ; CHECK: [[MVE_VCMPs32_2:%[0-9]+]]:vccr = MVE_VCMPs32 %1:mqpr, %2:mqpr, 10, 0, $noreg + ; CHECK: [[MVE_VPNOT5:%[0-9]+]]:vccr = MVE_VPNOT [[MVE_VCMPs32_2]], 0, $noreg + ; CHECK: [[MVE_VORR5:%[0-9]+]]:mqpr = MVE_VORR %1:mqpr, %1:mqpr, 1, [[MVE_VPNOT5]], undef [[MVE_VORR5]] + ; CHECK: [[MVE_VPNOT6:%[0-9]+]]:vccr = MVE_VPNOT [[MVE_VPNOT5]], 0, $noreg + ; CHECK: [[MVE_VORR6:%[0-9]+]]:mqpr = MVE_VORR [[MVE_VORR5]], [[MVE_VORR5]], 1, [[MVE_VPNOT6]], undef [[MVE_VORR6]] + ; CHECK: [[MVE_VORR7:%[0-9]+]]:mqpr = MVE_VORR [[MVE_VORR6]], [[MVE_VORR6]], 1, [[MVE_VPNOT6]], undef [[MVE_VORR7]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit %1:mqpr bb.0: ; ; Tests that, if the result of the VCMP is killed before the ; second VCMP (that will be converted into a VPNOT) is found, ; the kill flag is removed. ; - ; CHECK-LABEL: name: killed_vccr_values - ; CHECK: [[MVE_VCMPf16_:%[0-9]+]]:vccr = MVE_VCMPf16 %1:mqpr, %2:mqpr, 10, 0, $noreg - ; CHECK: [[MVE_VORR:%[0-9]+]]:mqpr = MVE_VORR %1:mqpr, %2:mqpr, 1, [[MVE_VCMPf16_]], undef [[MVE_VORR]] - ; CHECK: [[MVE_VPNOT:%[0-9]+]]:vccr = MVE_VPNOT [[MVE_VCMPf16_]], 0, $noreg - ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit %1:mqpr %2:vccr = MVE_VCMPf16 %0:mqpr, %1:mqpr, 10, 0, $noreg %3:mqpr = MVE_VORR %0:mqpr, %1:mqpr, 1, killed %2:vccr, undef %3:mqpr %4:vccr = MVE_VCMPf16 %0:mqpr, %1:mqpr, 11, 0, $noreg + bb.1: + ; + ; Tests that, if the result of the VCMP that has been replaced with a + ; VPNOT is killed (before the insertion of the second VPNOT), + ; the kill flag is removed. + ; + %5:vccr = MVE_VCMPs32 %0:mqpr, %1:mqpr, 10, 0, $noreg + %6:vccr = MVE_VCMPs32 %1:mqpr, %0:mqpr, 12, 0, $noreg + %7:mqpr = MVE_VORR %0:mqpr, %0:mqpr, 1, killed %6:vccr, undef %7:mqpr + %8:mqpr = MVE_VORR %7:mqpr, %7:mqpr, 1, %5:vccr, undef %8:mqpr + bb.2: + ; + ; Tests that the kill flag is removed when inserting a VPNOT for + ; an instruction. 
+ ; + %9:vccr = MVE_VCMPs32 %0:mqpr, %1:mqpr, 10, 0, $noreg + %10:vccr = MVE_VCMPs32 %1:mqpr, %0:mqpr, 12, 0, $noreg + %11:mqpr = MVE_VORR %0:mqpr, %0:mqpr, 1, %10:vccr, undef %11:mqpr + %12:mqpr = MVE_VORR %11:mqpr, %11:mqpr, 1, killed %9:vccr, undef %12:mqpr + bb.3: + ; + ; Tests that the kill flag is correctly removed when replacing a use + ; of the opposite VCCR value with the last VPNOT's result + ; + %13:vccr = MVE_VCMPs32 %0:mqpr, %1:mqpr, 10, 0, $noreg + %14:vccr = MVE_VCMPs32 %1:mqpr, %0:mqpr, 12, 0, $noreg + %15:mqpr = MVE_VORR %0:mqpr, %0:mqpr, 1, %14:vccr, undef %15:mqpr + %16:mqpr = MVE_VORR %15:mqpr, %15:mqpr, 1, %13:vccr, undef %16:mqpr + %17:mqpr = MVE_VORR %16:mqpr, %16:mqpr, 1, killed %13:vccr, undef %17:mqpr tBX_RET 14, $noreg, implicit %0:mqpr ... --- @@ -545,3 +617,204 @@ %4:vccr = MVE_VCMPs32 %1:mqpr, %0:mqpr, 10, 0, $noreg tBX_RET 14, $noreg, implicit %0:mqpr ... +--- +name: spill_prevention +alignment: 4 +body: | + ; CHECK-LABEL: name: spill_prevention + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: [[MVE_VCMPs32_:%[0-9]+]]:vccr = MVE_VCMPs32 %1:mqpr, %2:mqpr, 10, 0, $noreg + ; CHECK: [[MVE_VPNOT:%[0-9]+]]:vccr = MVE_VPNOT [[MVE_VCMPs32_]], 0, $noreg + ; CHECK: [[MVE_VORR:%[0-9]+]]:mqpr = MVE_VORR %1:mqpr, %1:mqpr, 1, [[MVE_VPNOT]], undef [[MVE_VORR]] + ; CHECK: [[MVE_VPNOT1:%[0-9]+]]:vccr = MVE_VPNOT [[MVE_VPNOT]], 0, $noreg + ; CHECK: [[MVE_VORR1:%[0-9]+]]:mqpr = MVE_VORR [[MVE_VORR]], [[MVE_VORR]], 1, [[MVE_VPNOT1]], undef [[MVE_VORR1]] + ; CHECK: [[MVE_VPNOT2:%[0-9]+]]:vccr = MVE_VPNOT [[MVE_VPNOT1]], 0, $noreg + ; CHECK: [[MVE_VORR2:%[0-9]+]]:mqpr = MVE_VORR [[MVE_VORR1]], [[MVE_VORR1]], 1, [[MVE_VPNOT2]], undef [[MVE_VORR2]] + ; CHECK: [[MVE_VPNOT3:%[0-9]+]]:vccr = MVE_VPNOT [[MVE_VPNOT2]], 0, $noreg + ; CHECK: [[MVE_VORR3:%[0-9]+]]:mqpr = MVE_VORR [[MVE_VORR2]], [[MVE_VORR2]], 1, [[MVE_VPNOT3]], undef [[MVE_VORR3]] + ; CHECK: [[MVE_VPNOT4:%[0-9]+]]:vccr = MVE_VPNOT [[MVE_VPNOT3]], 0, $noreg + ; CHECK: 
[[MVE_VORR4:%[0-9]+]]:mqpr = MVE_VORR [[MVE_VORR3]], [[MVE_VORR3]], 1, [[MVE_VPNOT4]], undef [[MVE_VORR4]] + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: [[MVE_VCMPs32_1:%[0-9]+]]:vccr = MVE_VCMPs32 %1:mqpr, %2:mqpr, 10, 0, $noreg + ; CHECK: [[MVE_VPNOT5:%[0-9]+]]:vccr = MVE_VPNOT [[MVE_VCMPs32_1]], 0, $noreg + ; CHECK: [[MVE_VORR5:%[0-9]+]]:mqpr = MVE_VORR %1:mqpr, %1:mqpr, 1, [[MVE_VPNOT5]], undef [[MVE_VORR5]] + ; CHECK: [[MVE_VORR6:%[0-9]+]]:mqpr = MVE_VORR [[MVE_VORR5]], [[MVE_VORR5]], 0, $noreg, undef [[MVE_VORR6]] + ; CHECK: [[MVE_VPNOT6:%[0-9]+]]:vccr = MVE_VPNOT [[MVE_VPNOT5]], 0, $noreg + ; CHECK: [[MVE_VORR7:%[0-9]+]]:mqpr = MVE_VORR [[MVE_VORR6]], [[MVE_VORR6]], 1, [[MVE_VPNOT6]], undef [[MVE_VORR7]] + ; CHECK: [[MVE_VORR8:%[0-9]+]]:mqpr = MVE_VORR [[MVE_VORR7]], [[MVE_VORR7]], 0, $noreg, undef [[MVE_VORR8]] + ; CHECK: [[MVE_VPNOT7:%[0-9]+]]:vccr = MVE_VPNOT [[MVE_VPNOT6]], 0, $noreg + ; CHECK: [[MVE_VORR9:%[0-9]+]]:mqpr = MVE_VORR [[MVE_VORR8]], [[MVE_VORR8]], 1, [[MVE_VPNOT7]], undef [[MVE_VORR9]] + ; CHECK: bb.2: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: [[MVE_VCMPs32_2:%[0-9]+]]:vccr = MVE_VCMPs32 %1:mqpr, %2:mqpr, 10, 0, $noreg + ; CHECK: [[MVE_VPNOT8:%[0-9]+]]:vccr = MVE_VPNOT [[MVE_VCMPs32_2]], 0, $noreg + ; CHECK: [[MVE_VORR10:%[0-9]+]]:mqpr = MVE_VORR %1:mqpr, %1:mqpr, 1, [[MVE_VPNOT8]], undef [[MVE_VORR10]] + ; CHECK: [[MVE_VORR11:%[0-9]+]]:mqpr = MVE_VORR [[MVE_VORR10]], [[MVE_VORR10]], 1, [[MVE_VPNOT8]], undef [[MVE_VORR11]] + ; CHECK: [[MVE_VPNOT9:%[0-9]+]]:vccr = MVE_VPNOT [[MVE_VPNOT8]], 0, $noreg + ; CHECK: [[MVE_VORR12:%[0-9]+]]:mqpr = MVE_VORR [[MVE_VORR11]], [[MVE_VORR11]], 1, [[MVE_VPNOT9]], undef [[MVE_VORR12]] + ; CHECK: [[MVE_VORR13:%[0-9]+]]:mqpr = MVE_VORR [[MVE_VORR12]], [[MVE_VORR12]], 1, [[MVE_VPNOT9]], undef [[MVE_VORR13]] + ; CHECK: [[MVE_VPNOT10:%[0-9]+]]:vccr = MVE_VPNOT [[MVE_VPNOT9]], 0, $noreg + ; CHECK: [[MVE_VORR14:%[0-9]+]]:mqpr = MVE_VORR [[MVE_VORR13]], [[MVE_VORR13]], 1, 
[[MVE_VPNOT10]], undef [[MVE_VORR14]] + ; CHECK: [[MVE_VORR15:%[0-9]+]]:mqpr = MVE_VORR [[MVE_VORR14]], [[MVE_VORR14]], 1, [[MVE_VPNOT10]], undef [[MVE_VORR15]] + ; CHECK: [[MVE_VPNOT11:%[0-9]+]]:vccr = MVE_VPNOT [[MVE_VPNOT10]], 0, $noreg + ; CHECK: [[MVE_VORR16:%[0-9]+]]:mqpr = MVE_VORR [[MVE_VORR15]], [[MVE_VORR15]], 1, [[MVE_VPNOT11]], undef [[MVE_VORR16]] + ; CHECK: [[MVE_VORR17:%[0-9]+]]:mqpr = MVE_VORR [[MVE_VORR16]], [[MVE_VORR16]], 1, [[MVE_VPNOT11]], undef [[MVE_VORR17]] + ; CHECK: bb.3: + ; CHECK: [[MVE_VCMPs32_3:%[0-9]+]]:vccr = MVE_VCMPs32 %1:mqpr, %2:mqpr, 10, 0, $noreg + ; CHECK: [[MVE_VPNOT12:%[0-9]+]]:vccr = MVE_VPNOT [[MVE_VCMPs32_3]], 0, $noreg + ; CHECK: [[MVE_VORR18:%[0-9]+]]:mqpr = MVE_VORR %1:mqpr, %1:mqpr, 1, [[MVE_VPNOT12]], undef [[MVE_VORR11]] + ; CHECK: [[MVE_VPNOT13:%[0-9]+]]:vccr = MVE_VPNOT [[MVE_VPNOT12]], 0, $noreg + ; CHECK: [[MVE_VORR19:%[0-9]+]]:mqpr = MVE_VORR %1:mqpr, %1:mqpr, 1, [[MVE_VPNOT13]], undef [[MVE_VORR19]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit %1:mqpr + bb.0: + ; + ; Basic test case + ; + %2:vccr = MVE_VCMPs32 %0:mqpr, %1:mqpr, 10, 0, $noreg + %3:vccr = MVE_VPNOT %2:vccr, 0, $noreg + %4:mqpr = MVE_VORR %0:mqpr, %0:mqpr, 1, %3:vccr, undef %4:mqpr + %5:mqpr = MVE_VORR %4:mqpr, %4:mqpr, 1, %2:vccr, undef %5:mqpr + %6:mqpr = MVE_VORR %5:mqpr, %5:mqpr, 1, %3:vccr, undef %6:mqpr + %7:mqpr = MVE_VORR %6:mqpr, %6:mqpr, 1, %2:vccr, undef %7:mqpr + %8:mqpr = MVE_VORR %7:mqpr, %7:mqpr, 1, %3:vccr, undef %8:mqpr + bb.1: + ; + ; Tests that unpredicated instructions in the middle of the block + ; don't interfere with the replacement. 
+ ; + %9:vccr = MVE_VCMPs32 %0:mqpr, %1:mqpr, 10, 0, $noreg + %10:vccr = MVE_VPNOT %9:vccr, 0, $noreg + %11:mqpr = MVE_VORR %0:mqpr, %0:mqpr, 1, %10:vccr, undef %11:mqpr + %12:mqpr = MVE_VORR %11:mqpr, %11:mqpr, 0, $noreg, undef %12:mqpr + %13:mqpr = MVE_VORR %12:mqpr, %12:mqpr, 1, %9:vccr, undef %13:mqpr + %14:mqpr = MVE_VORR %13:mqpr, %13:mqpr, 0, $noreg, undef %14:mqpr + %15:mqpr = MVE_VORR %14:mqpr, %14:mqpr, 1, %10:vccr, undef %15:mqpr + bb.2: + ; + ; Tests that all uses of the register are replaced, even when it's used + ; multiple times in a row. + ; + %16:vccr = MVE_VCMPs32 %0:mqpr, %1:mqpr, 10, 0, $noreg + %17:vccr = MVE_VPNOT %16:vccr, 0, $noreg + %18:mqpr = MVE_VORR %0:mqpr, %0:mqpr, 1, %17:vccr, undef %18:mqpr + %19:mqpr = MVE_VORR %18:mqpr, %18:mqpr, 1, %17:vccr, undef %19:mqpr + %20:mqpr = MVE_VORR %19:mqpr, %19:mqpr, 1, %16:vccr, undef %20:mqpr + %21:mqpr = MVE_VORR %20:mqpr, %20:mqpr, 1, %16:vccr, undef %21:mqpr + %22:mqpr = MVE_VORR %21:mqpr, %21:mqpr, 1, %17:vccr, undef %22:mqpr + %23:mqpr = MVE_VORR %22:mqpr, %22:mqpr, 1, %17:vccr, undef %23:mqpr + %24:mqpr = MVE_VORR %23:mqpr, %23:mqpr, 1, %16:vccr, undef %24:mqpr + %25:mqpr = MVE_VORR %24:mqpr, %24:mqpr, 1, %16:vccr, undef %25:mqpr + bb.3: + ; + ; Tests that already present VPNOTs are "registered" by the pass so + ; it does not insert a useless VPNOT. + ; + %26:vccr = MVE_VCMPs32 %0:mqpr, %1:mqpr, 10, 0, $noreg + %27:vccr = MVE_VPNOT %26:vccr, 0, $noreg + %28:mqpr = MVE_VORR %0:mqpr, %0:mqpr, 1, %27:vccr, undef %19:mqpr + %29:vccr = MVE_VPNOT %27:vccr, 0, $noreg + %30:mqpr = MVE_VORR %0:mqpr, %0:mqpr, 1, %26:vccr, undef %30:mqpr + tBX_RET 14, $noreg, implicit %0:mqpr +... 
+--- +name: spill_prevention_predicated_vpnots +alignment: 4 +body: | + ; CHECK-LABEL: name: spill_prevention_predicated_vpnots + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: [[MVE_VCMPs32_:%[0-9]+]]:vccr = MVE_VCMPs32 %1:mqpr, %2:mqpr, 10, 0, $noreg + ; CHECK: [[MVE_VPNOT:%[0-9]+]]:vccr = MVE_VPNOT [[MVE_VCMPs32_]], 1, [[MVE_VCMPs32_]] + ; CHECK: [[MVE_VORR:%[0-9]+]]:mqpr = MVE_VORR %1:mqpr, %1:mqpr, 1, [[MVE_VCMPs32_]], undef [[MVE_VORR]] + ; CHECK: [[MVE_VORR1:%[0-9]+]]:mqpr = MVE_VORR [[MVE_VORR]], [[MVE_VORR]], 1, [[MVE_VPNOT]], undef [[MVE_VORR1]] + ; CHECK: bb.1: + ; CHECK: [[MVE_VCMPs32_1:%[0-9]+]]:vccr = MVE_VCMPs32 %1:mqpr, %2:mqpr, 10, 0, $noreg + ; CHECK: [[MVE_VPNOT1:%[0-9]+]]:vccr = MVE_VPNOT [[MVE_VCMPs32_1]], 1, [[MVE_VCMPs32_1]] + ; CHECK: [[MVE_VORR2:%[0-9]+]]:mqpr = MVE_VORR %1:mqpr, %2:mqpr, 1, [[MVE_VPNOT1]], undef [[MVE_VORR2]] + ; CHECK: [[MVE_VORR2:%[0-9]+]]:mqpr = MVE_VORR %1:mqpr, %1:mqpr, 1, [[MVE_VCMPs32_1]], undef [[MVE_VORR2]] + ; CHECK: [[MVE_VORR2:%[0-9]+]]:mqpr = MVE_VORR %2:mqpr, %1:mqpr, 1, [[MVE_VPNOT1]], undef [[MVE_VORR2]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit %1:mqpr + ; + ; Tests that predicated VPNOTs are not considered by this pass + ; (This means that these examples should not be optimized.) + ; + bb.0: + %2:vccr = MVE_VCMPs32 %0:mqpr, %1:mqpr, 10, 0, $noreg + %3:vccr = MVE_VPNOT %2:vccr, 1, %2:vccr + %4:mqpr = MVE_VORR %0:mqpr, %0:mqpr, 1, %2:vccr, undef %4:mqpr + %5:mqpr = MVE_VORR %4:mqpr, %4:mqpr, 1, %3:vccr, undef %5:mqpr + bb.1: + %2:vccr = MVE_VCMPs32 %0:mqpr, %1:mqpr, 10, 0, $noreg + %3:vccr = MVE_VPNOT %2:vccr, 1, %2:vccr + %4:mqpr = MVE_VORR %0:mqpr, %1:mqpr, 1, %3:vccr, undef %4:mqpr + %5:mqpr = MVE_VORR %0:mqpr, %0:mqpr, 1, %2:vccr, undef %5:mqpr + %6:mqpr = MVE_VORR %1:mqpr, %0:mqpr, 1, %3:vccr, undef %6:mqpr + tBX_RET 14, $noreg, implicit %0:mqpr +... 
+--- +name: spill_prevention_copies +alignment: 4 +body: | + ; + ; Tests that VPNOTs are replaced by a COPY instead of inserting a VPNOT + ; (which would result in a double VPNOT). + ; + bb.0: + ; CHECK-LABEL: name: spill_prevention_copies + ; CHECK: [[MVE_VCMPs32_:%[0-9]+]]:vccr = MVE_VCMPs32 %1:mqpr, %2:mqpr, 10, 0, $noreg + ; CHECK: [[MVE_VPNOT:%[0-9]+]]:vccr = MVE_VPNOT [[MVE_VCMPs32_]], 0, $noreg + ; CHECK: [[MVE_VORR:%[0-9]+]]:mqpr = MVE_VORR %1:mqpr, %1:mqpr, 1, [[MVE_VPNOT]], undef [[MVE_VORR]] + ; CHECK: [[COPY:%[0-9]+]]:vccr = COPY [[MVE_VPNOT]] + ; CHECK: [[MVE_VORR1:%[0-9]+]]:mqpr = MVE_VORR %1:mqpr, %1:mqpr, 1, [[COPY]], undef [[MVE_VORR1]] + ; CHECK: [[COPY1:%[0-9]+]]:vccr = COPY [[COPY]] + ; CHECK: [[MVE_VORR2:%[0-9]+]]:mqpr = MVE_VORR %1:mqpr, %1:mqpr, 1, [[COPY1]], undef [[MVE_VORR2]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit %1:mqpr + %2:vccr = MVE_VCMPs32 %0:mqpr, %1:mqpr, 10, 0, $noreg + %3:vccr = MVE_VPNOT %2:vccr, 0, $noreg + %4:mqpr = MVE_VORR %0:mqpr, %0:mqpr, 1, %3:vccr, undef %4:mqpr + %5:vccr = MVE_VPNOT %2:vccr, 0, $noreg + %6:mqpr = MVE_VORR %0:mqpr, %0:mqpr, 1, %5:vccr, undef %6:mqpr + %7:vccr = MVE_VPNOT %2:vccr, 0, $noreg + %8:mqpr = MVE_VORR %0:mqpr, %0:mqpr, 1, %7:vccr, undef %8:mqpr + tBX_RET 14, $noreg, implicit %0:mqpr +... 
+--- +name: spill_prevention_vpnot_reordering +alignment: 4 +body: | + ; CHECK-LABEL: name: spill_prevention_vpnot_reordering + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: [[MVE_VCMPs32_:%[0-9]+]]:vccr = MVE_VCMPs32 %1:mqpr, %2:mqpr, 10, 0, $noreg + ; CHECK: [[MVE_VORR:%[0-9]+]]:mqpr = MVE_VORR %1:mqpr, %2:mqpr, 1, [[MVE_VCMPs32_]], undef [[MVE_VORR]] + ; CHECK: [[MVE_VORR1:%[0-9]+]]:mqpr = MVE_VORR %2:mqpr, %1:mqpr, 1, [[MVE_VCMPs32_]], undef [[MVE_VORR1]] + ; CHECK: [[MVE_VPNOT:%[0-9]+]]:vccr = MVE_VPNOT [[MVE_VCMPs32_]], 0, $noreg + ; CHECK: [[MVE_VORR2:%[0-9]+]]:mqpr = MVE_VORR [[MVE_VORR]], [[MVE_VORR1]], 1, [[MVE_VPNOT]], undef [[MVE_VORR2]] + ; CHECK: bb.1: + ; CHECK: [[MVE_VCMPs32_1:%[0-9]+]]:vccr = MVE_VCMPs32 %1:mqpr, %2:mqpr, 10, 0, $noreg + ; CHECK: [[MVE_VORR3:%[0-9]+]]:mqpr = MVE_VORR %1:mqpr, %2:mqpr, 1, [[MVE_VCMPs32_1]], undef [[MVE_VORR3]] + ; CHECK: [[MVE_VORR4:%[0-9]+]]:mqpr = MVE_VORR %2:mqpr, %1:mqpr, 1, [[MVE_VCMPs32_1]], undef [[MVE_VORR4]] + ; CHECK: [[MVE_VPNOT1:%[0-9]+]]:vccr = MVE_VPNOT [[MVE_VCMPs32_1]], 0, $noreg + ; CHECK: [[MVE_VORR5:%[0-9]+]]:mqpr = MVE_VORR [[MVE_VORR3]], [[MVE_VORR4]], 1, [[MVE_VPNOT1]], undef [[MVE_VORR5]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit %1:mqpr + ; + ; Tests that the first VPNOT is moved down when the result of the VCMP is used + ; before the first usage of the VPNOT's result. + ; + bb.0: + %2:vccr = MVE_VCMPs32 %0:mqpr, %1:mqpr, 10, 0, $noreg + %3:vccr = MVE_VPNOT %2:vccr, 0, $noreg + %4:mqpr = MVE_VORR %0:mqpr, %1:mqpr, 1, %2:vccr, undef %4:mqpr + %5:mqpr = MVE_VORR %1:mqpr, %0:mqpr, 1, %2:vccr, undef %5:mqpr + %6:mqpr = MVE_VORR %4:mqpr, %5:mqpr, 1, %3:vccr, undef %6:mqpr + bb.1: + ; Test again with a "killed" flag to check if it's properly removed. 
+ %7:vccr = MVE_VCMPs32 %0:mqpr, %1:mqpr, 10, 0, $noreg + %8:vccr = MVE_VPNOT %7:vccr, 0, $noreg + %9:mqpr = MVE_VORR %0:mqpr, %1:mqpr, 1, %7:vccr, undef %9:mqpr + %10:mqpr = MVE_VORR %1:mqpr, %0:mqpr, 1, killed %7:vccr, undef %10:mqpr + %11:mqpr = MVE_VORR %9:mqpr, %10:mqpr, 1, %8:vccr, undef %11:mqpr + tBX_RET 14, $noreg, implicit %0:mqpr +...