diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -61,6 +61,8 @@ namespace { + using InstSet = SmallPtrSetImpl; + class PostOrderLoopTraversal { MachineLoop &ML; MachineLoopInfo &MLI; @@ -518,6 +520,59 @@ return MO.isReg() && MO.getReg() && Class->contains(MO.getReg()); } +// Can this instruction generate a non-zero result when given only zeroed +// operands? This allows us to know that, given operands with false bytes +// zeroed by masked loads, the result will also contain zeros in those +// bytes. +static bool canGenerateNonZeros(const MachineInstr &MI) { + switch (MI.getOpcode()) { + default: + break; + // FIXME: FP minus 0? + //case ARM::MVE_VNEGf16: + //case ARM::MVE_VNEGf32: + case ARM::MVE_VMVN: + case ARM::MVE_VORN: + case ARM::MVE_VCLZs8: + case ARM::MVE_VCLZs16: + case ARM::MVE_VCLZs32: + return true; + } + return false; +} + +// MVE 'narrowing' instructions operate on half a lane, reading from half and writing +// to half, which are referred to as the top and bottom half. The other +// half retains its previous value. +static bool retainsPreviousHalfElement(const MachineInstr &MI) { + const MCInstrDesc &MCID = MI.getDesc(); + uint64_t Flags = MCID.TSFlags; + return (Flags & ARMII::RetainsPreviousHalfElement) != 0; +} + +// Look at its register uses to see if it can only receive zeros +// into its false lanes which would then produce zeros. Also check that +// the output register is also defined by a FalseLaneZeros instruction +// so that if tail-predication happens, the lanes that aren't updated will +// still be zeros. 
+static bool producesFalseLaneZeros(MachineInstr &MI, + const TargetRegisterClass *QPRs, + const ReachingDefAnalysis &RDA, + InstSet &FalseLaneZeros) { + if (canGenerateNonZeros(MI)) + return false; + for (auto &MO : MI.operands()) { + if (!MO.isReg() || !MO.getReg()) + continue; + if (auto *OpDef = RDA.getMIOperand(&MI, MO)) + if (FalseLaneZeros.count(OpDef)) + continue; + return false; + } + LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI); + return true; +} + bool LowOverheadLoop::ValidateLiveOuts() const { // We want to find out if the tail-predicated version of this loop will // produce the same values as the loop in its original form. For this to @@ -538,12 +593,14 @@ // operands, or stored results are equivalent already. Other explicitly // predicated instructions will perform the same operation in the original // loop and the tail-predicated form too. Because of this, we can insert - // loads, stores and other predicated instructions into our KnownFalseZeros + // loads, stores and other predicated instructions into our Predicated // set and build from there. const TargetRegisterClass *QPRs = TRI.getRegClass(ARM::MQPRRegClassID); - SetVector UnknownFalseLanes; - SmallPtrSet KnownFalseZeros; + SetVector Unknown; + SmallPtrSet FalseLaneZeros; + SmallPtrSet Predicated; MachineBasicBlock *MBB = ML.getHeader(); + for (auto &MI : *MBB) { const MCInstrDesc &MCID = MI.getDesc(); uint64_t Flags = MCID.TSFlags; @@ -551,63 +608,49 @@ continue; if (isVectorPredicated(&MI)) { - KnownFalseZeros.insert(&MI); + if (MI.mayLoad()) + FalseLaneZeros.insert(&MI); + Predicated.insert(&MI); continue; } if (MI.getNumDefs() == 0) continue; - // Only evaluate instructions which produce a single value. 
- assert((MI.getNumDefs() == 1 && MI.defs().begin()->isReg()) && - "Expected no more than one register def"); - - Register DefReg = MI.defs().begin()->getReg(); - for (auto &MO : MI.operands()) { - if (!isRegInClass(MO, QPRs) || !MO.isUse() || MO.getReg() != DefReg) - continue; - - // If this instruction overwrites one of its operands, and that register - // has known lanes, then this instruction also has known predicated false - // lanes. - if (auto *OpDef = RDA.getMIOperand(&MI, MO)) { - if (KnownFalseZeros.count(OpDef)) { - KnownFalseZeros.insert(&MI); - break; - } - } - } - if (!KnownFalseZeros.count(&MI)) - UnknownFalseLanes.insert(&MI); + if (producesFalseLaneZeros(MI, QPRs, RDA, FalseLaneZeros)) + FalseLaneZeros.insert(&MI); + else if (retainsPreviousHalfElement(MI)) + return false; + else + Unknown.insert(&MI); } - auto HasKnownUsers = [this](MachineInstr *MI, const MachineOperand &MO, - SmallPtrSetImpl &Knowns) { + auto HasPredicatedUsers = [this](MachineInstr *MI, const MachineOperand &MO, + SmallPtrSetImpl &Predicated) { SmallPtrSet Uses; RDA.getGlobalUses(MI, MO.getReg(), Uses); for (auto *Use : Uses) { - if (Use != MI && !Knowns.count(Use)) + if (Use != MI && !Predicated.count(Use)) return false; } return true; }; - // Now for all the unknown values, see if they're only consumed by known - // instructions. Visit in reverse so that we can start at the values being + // Visit the unknowns in reverse so that we can start at the values being // stored and then we can work towards the leaves, hopefully adding more - // instructions to KnownFalseZeros. - for (auto *MI : reverse(UnknownFalseLanes)) { + // instructions to Predicated. 
+ for (auto *MI : reverse(Unknown)) { for (auto &MO : MI->operands()) { if (!isRegInClass(MO, QPRs) || !MO.isDef()) continue; - if (!HasKnownUsers(MI, MO, KnownFalseZeros)) { + if (!HasPredicatedUsers(MI, MO, Predicated)) { LLVM_DEBUG(dbgs() << "ARM Loops: Found an unknown def of : " << TRI.getRegAsmName(MO.getReg()) << " at " << *MI); return false; } } // Any unknown false lanes have been masked away by the user(s). - KnownFalseZeros.insert(MI); + Predicated.insert(MI); } // Collect Q-regs that are live in the exit blocks. We don't collect scalars