diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -916,7 +916,7 @@ else if (ARM::DPRRegClass.contains(DestReg, SrcReg) && Subtarget.hasFP64()) Opc = ARM::VMOVD; else if (ARM::QPRRegClass.contains(DestReg, SrcReg)) - Opc = Subtarget.hasNEON() ? ARM::VORRq : ARM::MVE_VORR; + Opc = Subtarget.hasNEON() ? ARM::VORRq : ARM::MQPRCopy; if (Opc) { MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opc), DestReg); @@ -925,7 +925,7 @@ MIB.addReg(SrcReg, getKillRegState(KillSrc)); if (Opc == ARM::MVE_VORR) addUnpredicatedMveVpredROp(MIB, DestReg); - else + else if (Opc != ARM::MQPRCopy) MIB.add(predOps(ARMCC::AL)); return; } diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -6975,6 +6975,17 @@ } } +// Pseudo for lowering MVE Q register COPYs. These will usually get converted +// to a "MVE_VORR dst, src, src", but may behave differently in tail predicated +// loops to ensure the whole register is copied, not a subset from a +// tail-predicated MVE_VORR. In the event we cannot prove a MVE_VORR is valid, +// it will become a pair of VMOVD instructions for each half of the Q register. 
+let Predicates = [HasMVEInt], hasSideEffects = 0, isMoveReg = 1, + D = MVEDomain in { + def MQPRCopy : t2PseudoInst<(outs MQPR:$dst), (ins MQPR:$src), + 8, NoItinerary, []>; +} + //===----------------------------------------------------------------------===// // Patterns diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -376,10 +376,11 @@ MachineInstr *Dec = nullptr; MachineInstr *End = nullptr; MachineOperand TPNumElements; - SmallVector<MachineInstr *, 4> VCTPs; - SmallPtrSet<MachineInstr *, 4> ToRemove; - SmallPtrSet<MachineInstr *, 4> BlockMasksToRecompute; - SmallPtrSet<MachineInstr *, 4> DoubleWidthResultInstrs; + SmallVector<MachineInstr *, 4> VCTPs; + SmallPtrSet<MachineInstr *, 4> ToRemove; + SmallPtrSet<MachineInstr *, 4> BlockMasksToRecompute; + SmallPtrSet<MachineInstr *, 4> DoubleWidthResultInstrs; + SmallPtrSet<MachineInstr *, 4> VMOVCopies; bool Revert = false; bool CannotTailPredicate = false; @@ -976,8 +977,7 @@ else if (!isPredicated && retainsOrReduces) { LLVM_DEBUG(dbgs() << " Unpredicated instruction that retainsOrReduces: " << MI); return false; - } - else if (!isPredicated) + } else if (!isPredicated && MI.getOpcode() != ARM::MQPRCopy) FalseLanesUnknown.insert(&MI); } @@ -1052,10 +1052,20 @@ // any VPT predicated instruction is predicated upon VCTP. Any live-out // instruction needs to be predicated, so check this here. The instructions // in NonPredicated have been found to be a reduction that we can ensure its - // legality. - for (auto *MI : LiveOutMIs) { - if (NonPredicated.count(MI) && FalseLanesUnknown.contains(MI)) { - LLVM_DEBUG(dbgs() << " Unable to handle live out: " << *MI); + // legality. Any MQPRCopy found will need to validate its input as if it was + // live out. 
+ SmallVector<MachineInstr *, 4> Worklist(LiveOutMIs.begin(), LiveOutMIs.end()); + while (!Worklist.empty()) { + MachineInstr *MI = Worklist.pop_back_val(); + if (MI->getOpcode() == ARM::MQPRCopy) { + VMOVCopies.insert(MI); + MachineInstr *CopySrc = + RDA.getUniqueReachingMIDef(MI, MI->getOperand(1).getReg()); + if (CopySrc) + Worklist.push_back(CopySrc); + } else if (NonPredicated.count(MI) && FalseLanesUnknown.contains(MI)) { + LLVM_DEBUG(dbgs() << " Unable to handle live out: " << *MI); + VMOVCopies.clear(); + return false; + } + } @@ -1256,6 +1266,8 @@ bool RequiresExplicitPredication = (MCID.TSFlags & ARMII::ValidForTailPredication) == 0; if (isDomainMVE(MI) && RequiresExplicitPredication) { + if (MI->getOpcode() == ARM::MQPRCopy) + return true; if (!IsUse && producesDoubleWidthResult(*MI)) { DoubleWidthResultInstrs.insert(MI); return true; @@ -1739,6 +1751,29 @@ } }; + // And VMOVCopies need to become 2xVMOVD for tail predication to be valid. + // Any other MQPRCopy can be converted to MVE_VORR later on. 
+ auto ExpandVMOVCopies = [this](SmallPtrSet<MachineInstr *, 4> &VMOVCopies) { + for (auto *MI : VMOVCopies) { + LLVM_DEBUG(dbgs() << "Converting copy to VMOVD: " << *MI); + assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!"); + MachineBasicBlock *MBB = MI->getParent(); + Register Dst = MI->getOperand(0).getReg(); + Register Src = MI->getOperand(1).getReg(); + auto MIB1 = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::VMOVD), + ARM::D0 + (Dst - ARM::Q0) * 2) + .addReg(ARM::D0 + (Src - ARM::Q0) * 2) + .add(predOps(ARMCC::AL)); + LLVM_DEBUG(dbgs() << " into " << *MIB1); + auto MIB2 = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::VMOVD), + ARM::D0 + (Dst - ARM::Q0) * 2 + 1) + .addReg(ARM::D0 + (Src - ARM::Q0) * 2 + 1) + .add(predOps(ARMCC::AL)); + LLVM_DEBUG(dbgs() << " and " << *MIB2); + MI->eraseFromParent(); + } + }; + if (LoLoop.Revert) { if (isWhileLoopStart(*LoLoop.Start)) RevertWhile(LoLoop.Start); @@ -1749,6 +1784,7 @@ else RevertLoopEnd(LoLoop.End, RevertLoopDec(LoLoop.Dec)); } else { + ExpandVMOVCopies(LoLoop.VMOVCopies); LoLoop.Start = ExpandLoopStart(LoLoop); if (LoLoop.Start) RemoveDeadBranch(LoLoop.Start); @@ -1793,6 +1829,7 @@ SmallVector<MachineInstr *, 4> Decs; SmallVector<MachineInstr *, 4> Ends; SmallVector<MachineInstr *, 4> EndDecs; + SmallVector<MachineInstr *, 4> MQPRCopies; for (auto &I : MBB) { if (isLoopStart(I)) @@ -1803,9 +1840,12 @@ Ends.push_back(&I); else if (I.getOpcode() == ARM::t2LoopEndDec) EndDecs.push_back(&I); + else if (I.getOpcode() == ARM::MQPRCopy) + MQPRCopies.push_back(&I); } - if (Starts.empty() && Decs.empty() && Ends.empty() && EndDecs.empty()) + if (Starts.empty() && Decs.empty() && Ends.empty() && EndDecs.empty() && + MQPRCopies.empty()) continue; Changed = true; @@ -1823,6 +1863,17 @@ RevertLoopEnd(End); for (auto *End : EndDecs) RevertLoopEndDec(End); + for (auto *MI : MQPRCopies) { + LLVM_DEBUG(dbgs() << "Converting copy to VORR: " << *MI); + assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!"); + MachineBasicBlock *MBB = MI->getParent(); + auto MIB = BuildMI(*MBB, 
MI, MI->getDebugLoc(), TII->get(ARM::MVE_VORR), + MI->getOperand(0).getReg()) + .add(MI->getOperand(1)) + .add(MI->getOperand(1)); + addUnpredicatedMveVpredROp(MIB, MI->getOperand(0).getReg()); + MI->eraseFromParent(); + } } return Changed; } diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll @@ -207,21 +207,16 @@ ; CHECK-NEXT: uxtb r6, r6 ; CHECK-NEXT: movs r5, #120 ; CHECK-NEXT: mul lr, r4, r7 -; CHECK-NEXT: adds r4, r2, #7 ; CHECK-NEXT: and.w r5, r5, r3, lsr #9 ; CHECK-NEXT: muls r6, r7, r6 -; CHECK-NEXT: bic r4, r4, #7 ; CHECK-NEXT: vmov.i16 q0, #0x78 ; CHECK-NEXT: rsb.w r3, r7, #256 ; CHECK-NEXT: muls r5, r7, r5 ; CHECK-NEXT: lsls r7, r1, #1 -; CHECK-NEXT: sub.w r1, r4, #8 -; CHECK-NEXT: movs r4, #1 ; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill ; CHECK-NEXT: vdup.16 q4, r6 ; CHECK-NEXT: mov.w r6, #2016 ; CHECK-NEXT: vdup.16 q0, lr -; CHECK-NEXT: add.w r1, r4, r1, lsr #3 ; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: vmov.i16 q2, #0xf8 ; CHECK-NEXT: vmov.i16 q5, #0xfc @@ -236,29 +231,28 @@ ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB1_4 Depth 2 ; CHECK-NEXT: mov r5, r0 -; CHECK-NEXT: mov r6, r2 -; CHECK-NEXT: dls lr, r1 +; CHECK-NEXT: dlstp.16 lr, r2 ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: .LBB1_4: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB1_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vctp.16 r6 -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrht.u16 q0, [r5] +; CHECK-NEXT: vldrh.u16 q0, [r5] ; CHECK-NEXT: vshl.i16 q1, q0, #3 ; CHECK-NEXT: vand q1, q1, q2 -; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmov.f64 d6, d4 +; CHECK-NEXT: vmov.f64 d7, d5 ; CHECK-NEXT: vmov q2, q4 ; CHECK-NEXT: vmla.u16 q2, q1, r3 ; CHECK-NEXT: vshr.u16 q1, q0, #3 ; CHECK-NEXT: vand q1, q1, q5 -; CHECK-NEXT: vmov q7, q5 -; CHECK-NEXT: 
vmov q5, q4 +; CHECK-NEXT: vmov.f64 d14, d10 +; CHECK-NEXT: vmov.f64 d15, d11 +; CHECK-NEXT: vmov.f64 d10, d8 +; CHECK-NEXT: vmov.f64 d11, d9 ; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vshr.u16 q0, q0, #9 ; CHECK-NEXT: vmla.u16 q4, q1, r3 ; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: subs r6, #8 ; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vmla.u16 q1, q0, r3 @@ -266,15 +260,17 @@ ; CHECK-NEXT: vshr.u16 q2, q4, #5 ; CHECK-NEXT: vand q2, q2, q6 ; CHECK-NEXT: vorr q0, q2, q0 -; CHECK-NEXT: vmov q2, q3 +; CHECK-NEXT: vmov.f64 d4, d6 +; CHECK-NEXT: vmov.f64 d5, d7 ; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov q4, q5 -; CHECK-NEXT: vmov q5, q7 +; CHECK-NEXT: vmov.f64 d8, d10 +; CHECK-NEXT: vmov.f64 d9, d11 +; CHECK-NEXT: vmov.f64 d10, d14 +; CHECK-NEXT: vmov.f64 d11, d15 ; CHECK-NEXT: vand q1, q1, q3 ; CHECK-NEXT: vorr q0, q0, q1 -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrht.16 q0, [r5], #16 -; CHECK-NEXT: le lr, .LBB1_4 +; CHECK-NEXT: vstrh.16 q0, [r5], #16 +; CHECK-NEXT: letp lr, .LBB1_4 ; CHECK-NEXT: @ %bb.5: @ %for.cond3.for.cond.cleanup7_crit_edge.us ; CHECK-NEXT: @ in Loop: Header=BB1_3 Depth=1 ; CHECK-NEXT: adds r4, #1 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.mir @@ -193,7 +193,7 @@ ; CHECK-NEXT: liveins: $lr, $q0, $r0, $r1, $r2, $r3, $r4, $r6, $r12 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $q1 = MVE_VLDRHU16 renamable $r4, 0, 0, $noreg, renamable $lr - ; CHECK-NEXT: $q2 = MVE_VORR $q0, $q0, 0, $noreg, renamable $lr, undef renamable $q2 + ; CHECK-NEXT: $q2 = MVE_VORR $q0, $q0, 0, $noreg, $noreg, undef $q2 ; CHECK-NEXT: renamable $q1 = MVE_VAND killed renamable $q1, killed renamable $q2, 0, $noreg, renamable $lr, undef 
renamable $q1 ; CHECK-NEXT: renamable $r4 = MVE_VSTRHU16_post killed renamable $q1, killed renamable $r4, 16, 0, killed $noreg, renamable $lr ; CHECK-NEXT: $lr = MVE_LETP killed renamable $lr, %bb.3 @@ -257,7 +257,7 @@ renamable $r5, dead $cpsr = tSUBi8 killed renamable $r5, 8, 14 /* CC::al */, $noreg MVE_VPST 8, implicit $vpr renamable $q1 = MVE_VLDRHU16 renamable $r4, 0, 1, renamable $vpr, renamable $lr - $q2 = MVE_VORR $q0, $q0, 0, $noreg, renamable $lr, undef renamable $q2 + $q2 = MQPRCopy $q0 renamable $q1 = MVE_VAND killed renamable $q1, renamable $q2, 0, $noreg, renamable $lr, undef renamable $q1 MVE_VPST 8, implicit $vpr renamable $r4 = MVE_VSTRHU16_post killed renamable $q1, killed renamable $r4, 16, 1, killed renamable $vpr, renamable $lr @@ -309,38 +309,30 @@ ; CHECK-NEXT: liveins: $r0, $r1, $r2, $r12 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $r3 = t2LDRHi12 $sp, 16, 14 /* CC::al */, $noreg - ; CHECK-NEXT: renamable $r6, dead $cpsr = nsw tADDi3 renamable $r2, 7, 14 /* CC::al */, $noreg - ; CHECK-NEXT: renamable $r5, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg ; CHECK-NEXT: renamable $r1, dead $cpsr = nsw tLSLri killed renamable $r1, 1, 14 /* CC::al */, $noreg ; CHECK-NEXT: renamable $r3 = t2RSBri killed renamable $r3, 256, 14 /* CC::al */, $noreg, $noreg ; CHECK-NEXT: renamable $q0 = MVE_VDUP16 killed renamable $r3, 0, $noreg, $noreg, undef renamable $q0 - ; CHECK-NEXT: renamable $r3 = t2BICri killed renamable $r6, 7, 14 /* CC::al */, $noreg, $noreg - ; CHECK-NEXT: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 8, 14 /* CC::al */, $noreg - ; CHECK-NEXT: renamable $r6 = nuw nsw t2ADDrs killed renamable $r5, killed renamable $r3, 27, 14 /* CC::al */, $noreg, $noreg ; CHECK-NEXT: renamable $r3, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) - ; CHECK-NEXT: liveins: $q0, $r0, $r1, $r2, $r3, $r6, $r12 + ; CHECK-NEXT: liveins: $d0, $d1, $r0, $r1, $r2, $r3, $r6, 
$r12 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $r4 = tMOVr $r0, 14 /* CC::al */, $noreg - ; CHECK-NEXT: $r5 = tMOVr $r2, 14 /* CC::al */, $noreg - ; CHECK-NEXT: $lr = t2DLS renamable $r6 + ; CHECK-NEXT: $lr = MVE_DLSTP_16 renamable $r2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.3(0x7c000000), %bb.4(0x04000000) - ; CHECK-NEXT: liveins: $lr, $q0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r12 + ; CHECK-NEXT: liveins: $lr, $d0, $d1, $r0, $r1, $r2, $r3, $r4, $r6, $r12 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $vpr = MVE_VCTP16 renamable $r5, 0, $noreg, $noreg - ; CHECK-NEXT: renamable $r5, dead $cpsr = tSUBi8 killed renamable $r5, 8, 14 /* CC::al */, $noreg - ; CHECK-NEXT: MVE_VPST 8, implicit $vpr - ; CHECK-NEXT: renamable $q1 = MVE_VLDRHU16 renamable $r4, 0, 1, renamable $vpr, renamable $lr - ; CHECK-NEXT: $q2 = MVE_VORR killed $q0, killed $q0, 0, $noreg, renamable $lr, undef renamable $q2 + ; CHECK-NEXT: renamable $q1 = MVE_VLDRHU16 renamable $r4, 0, 0, $noreg, renamable $lr + ; CHECK-NEXT: $d4 = VMOVD killed $d0, 14 /* CC::al */, $noreg + ; CHECK-NEXT: $d5 = VMOVD killed $d1, 14 /* CC::al */, $noreg ; CHECK-NEXT: renamable $q1 = MVE_VAND killed renamable $q1, renamable $q2, 0, $noreg, renamable $lr, undef renamable $q1 - ; CHECK-NEXT: $q0 = MVE_VORR killed $q2, killed $q2, 0, $noreg, renamable $lr, undef renamable $q0 - ; CHECK-NEXT: MVE_VPST 8, implicit $vpr - ; CHECK-NEXT: renamable $r4 = MVE_VSTRHU16_post killed renamable $q1, killed renamable $r4, 16, 1, killed renamable $vpr, renamable $lr - ; CHECK-NEXT: $lr = t2LEUpdate killed renamable $lr, %bb.3 + ; CHECK-NEXT: $d0 = VMOVD killed $d4, 14 /* CC::al */, $noreg + ; CHECK-NEXT: $d1 = VMOVD killed $d5, 14 /* CC::al */, $noreg + ; CHECK-NEXT: renamable $r4 = MVE_VSTRHU16_post killed renamable $q1, killed renamable $r4, 16, 0, killed $noreg, renamable $lr + ; CHECK-NEXT: $lr = MVE_LETP killed renamable $lr, %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: 
%bb.5(0x04000000), %bb.2(0x7c000000) @@ -401,9 +393,9 @@ renamable $r5, dead $cpsr = tSUBi8 killed renamable $r5, 8, 14 /* CC::al */, $noreg MVE_VPST 8, implicit $vpr renamable $q1 = MVE_VLDRHU16 renamable $r4, 0, 1, renamable $vpr, renamable $lr - $q2 = MVE_VORR $q0, $q0, 0, $noreg, renamable $lr, undef renamable $q2 + $q2 = MQPRCopy $q0 renamable $q1 = MVE_VAND killed renamable $q1, renamable $q2, 0, $noreg, renamable $lr, undef renamable $q1 - $q0 = MVE_VORR $q2, $q2, 0, $noreg, renamable $lr, undef renamable $q0 + $q0 = MQPRCopy $q2 MVE_VPST 8, implicit $vpr renamable $r4 = MVE_VSTRHU16_post killed renamable $q1, killed renamable $r4, 16, 1, killed renamable $vpr, renamable $lr renamable $lr = t2LoopEndDec killed renamable $lr, %bb.3, implicit-def dead $cpsr diff --git a/llvm/unittests/Target/ARM/MachineInstrTest.cpp b/llvm/unittests/Target/ARM/MachineInstrTest.cpp --- a/llvm/unittests/Target/ARM/MachineInstrTest.cpp +++ b/llvm/unittests/Target/ARM/MachineInstrTest.cpp @@ -1417,6 +1417,7 @@ case MVE_VPNOT: case MVE_VPSEL: case MVE_VPST: + case MQPRCopy: return 0; case MVE_VABAVs16: case MVE_VABAVu16: @@ -2098,4 +2099,4 @@ << MII->getName(i) << ": mismatched expectation for MVE vec size\n"; } -} \ No newline at end of file +}