Index: llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp =================================================================== --- llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -73,6 +73,15 @@ #define DEBUG_TYPE "arm-low-overhead-loops" #define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass" +static bool isVectorPredicated(MachineInstr *MI) { + int PIdx = llvm::findFirstVPTPredOperandIdx(*MI); + return PIdx != -1 && MI->getOperand(PIdx + 1).getReg() == ARM::VPR; +} + +static bool isVectorPredicate(MachineInstr *MI) { + return MI->findRegisterDefOperandIdx(ARM::VPR) != -1; +} + namespace { using InstSet = SmallPtrSetImpl; @@ -148,57 +157,127 @@ // the block are predicated upon the vpr and we allow instructions to define // the vpr within in the block too. class VPTBlock { - // The predicate then instruction, which is either a VPT, or a VPST - // instruction. - std::unique_ptr PredicateThen; - PredicatedMI *Divergent = nullptr; - SmallVector Insts; + friend struct LowOverheadLoop; - public: - VPTBlock(MachineInstr *MI, SetVector &Preds) { - PredicateThen = std::make_unique(MI, Preds); + SmallVector Insts; + + static SmallVector Blocks; + static SetVector CurrentPredicate; + static std::map> PredicatedInsts; + + static void Create(MachineInstr *MI) { + assert(CurrentPredicate.size() && "Can't begin VPT without predicate"); + Blocks.emplace_back(MI); + PredicatedInsts.emplace( + MI, std::make_unique(MI, CurrentPredicate)); } - void addInst(MachineInstr *MI, SetVector &Preds) { - LLVM_DEBUG(dbgs() << "ARM Loops: Adding predicated MI: " << *MI); - if (!Divergent && !set_difference(Preds, PredicateThen->Predicates).empty()) { - Divergent = &Insts.back(); - LLVM_DEBUG(dbgs() << " - has divergent predicate: " << *Divergent->MI); - } - Insts.emplace_back(MI, Preds); - assert(Insts.size() <= 4 && "Too many instructions in VPT block!"); + static void reset() { + Blocks.clear(); + PredicatedInsts.clear(); + CurrentPredicate.clear(); + } + + static void addInst(MachineInstr *MI) { + Blocks.back().insert(MI); + PredicatedInsts.emplace( + MI, std::make_unique(MI, CurrentPredicate)); } + static void addPredicate(MachineInstr *MI) { + LLVM_DEBUG(dbgs() << "ARM Loops: Adding VPT Predicate: " << *MI); + CurrentPredicate.insert(MI); + } + + static void resetPredicate(MachineInstr *MI) { + LLVM_DEBUG(dbgs() << "ARM Loops: Resetting VPT Predicate: " << *MI); + CurrentPredicate.clear(); + CurrentPredicate.insert(MI); + } + + public: // Have we found an instruction within the block which defines the vpr? If // so, not all the instructions in the block will have the same predicate. - bool HasNonUniformPredicate() const { - return Divergent != nullptr; + static bool hasUniformPredicate(VPTBlock &Block) { + return getDivergent(Block) == nullptr; } - // Is the given instruction part of the predicate set controlling the entry - // to the block. - bool IsPredicatedOn(MachineInstr *MI) const { - return PredicateThen->Predicates.count(MI); + // If it exists, return the first internal instruction which modifies the + // VPR. + static MachineInstr *getDivergent(VPTBlock &Block) { + SmallVectorImpl &Insts = Block.getInsts(); + for (unsigned i = 1; i < Insts.size(); ++i) { + MachineInstr *Next = Insts[i]; + if (isVectorPredicate(Next)) + return Next; // Found an instruction altering the vpr. + } + return nullptr; } - // Returns true if this is a VPT instruction. - bool isVPT() const { return !isVPST(); } + // Return whether the given instruction is predicated upon a VCTP. + static bool isPredicatedOnVCTP(MachineInstr *MI, bool Exclusive = false) { + SetVector &Predicates = PredicatedInsts[MI]->Predicates; + if (Exclusive && Predicates.size() != 1) + return false; + for (auto *PredMI : Predicates) + if (isVCTP(PredMI)) + return true; + return false; + } - // Returns true if this is a VPST instruction. - bool isVPST() const { - return PredicateThen->MI->getOpcode() == ARM::MVE_VPST; + // Is the VPST, controlling the block entry, predicated upon a VCTP. + static bool isEntryPredicatedOnVCTP(VPTBlock &Block, + bool Exclusive = false) { + SmallVectorImpl &Insts = Block.getInsts(); + return isPredicatedOnVCTP(Insts.front(), Exclusive); } - // Is the given instruction the only predicate which controls the entry to - // the block. - bool IsOnlyPredicatedOn(MachineInstr *MI) const { - return IsPredicatedOn(MI) && PredicateThen->Predicates.size() == 1; + static bool isValid() { + // All predication within the loop should be based on vctp. If the block + // isn't predicated on entry, check whether the vctp is within the block + // and that all other instructions are then predicated on it. + for (auto &Block : Blocks) { + if (isEntryPredicatedOnVCTP(Block)) + continue; + + SmallVectorImpl &Insts = Block.getInsts(); + for (auto *MI : Insts) { + // Check that any internal VCTPs are 'Then' predicated. + if (isVCTP(MI) && getVPTInstrPredicate(*MI) != ARMVCC::Then) + return false; + // Skip other instructions that build up the predicate. + if (MI->getOpcode() == ARM::MVE_VPST || isVectorPredicate(MI)) + continue; + // Check that any other instructions are predicated upon a vctp. + // TODO: We could infer when VPTs are implicitly predicated on the + // vctp (when the operands are predicated). + if (!isPredicatedOnVCTP(MI)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *MI); + return false; + } + } + } + return true; + } + + VPTBlock(MachineInstr *MI) { Insts.push_back(MI); } + + void insert(MachineInstr *MI) { + Insts.push_back(MI); + // VPT/VPST + 4 predicated instructions. + assert(Insts.size() <= 5 && "Too many instructions in VPT block!"); + } + + bool containsVCTP() const { + for (auto *MI : Insts) + if (isVCTP(MI)) + return true; + return false; } unsigned size() const { return Insts.size(); } - SmallVectorImpl &getInsts() { return Insts; } - MachineInstr *getPredicateThen() const { return PredicateThen->MI; } - PredicatedMI *getDivergent() const { return Divergent; } + SmallVectorImpl &getInsts() { return Insts; } }; struct LowOverheadLoop { @@ -214,12 +293,8 @@ MachineInstr *Start = nullptr; MachineInstr *Dec = nullptr; MachineInstr *End = nullptr; - MachineInstr *VCTP = nullptr; MachineOperand TPNumElements; - SmallPtrSet SecondaryVCTPs; - VPTBlock *CurrentBlock = nullptr; - SetVector CurrentPredicate; - SmallVector VPTBlocks; + SmallVector VCTPs; SmallPtrSet ToRemove; SmallPtrSet BlockMasksToRecompute; bool Revert = false; @@ -235,6 +310,7 @@ Preheader = MBB; else if (auto *MBB = MLI.findLoopPreheader(&ML, true)) Preheader = MBB; + VPTBlock::reset(); } // If this is an MVE instruction, check that we know how to use tail @@ -249,10 +325,14 @@ bool IsTailPredicationLegal() const { // For now, let's keep things really simple and only support a single // block for tail predication. - return !Revert && FoundAllComponents() && VCTP && + return !Revert && FoundAllComponents() && !VCTPs.empty() && !CannotTailPredicate && ML.getNumBlocks() == 1; } + // Given that MI is a VCTP, check that is equivalent to any other VCTPs + // found. + bool AddVCTP(MachineInstr *MI); + // Check that the predication in the loop will be equivalent once we // perform the conversion. Also ensure that we can provide the number // of elements to the loop start instruction. @@ -275,7 +355,9 @@ return Start && Dec && End; } - SmallVectorImpl &getVPTBlocks() { return VPTBlocks; } + SmallVectorImpl &getVPTBlocks() { + return VPTBlock::Blocks; + } // Return the operand for the loop start instruction. This will be the loop // iteration count, or the number of elements if we're tail predicating. @@ -288,14 +370,18 @@ if (!IsTailPredicationLegal()) return IsDo ? ARM::t2DLS : ARM::t2WLS; - return VCTPOpcodeToLSTP(VCTP->getOpcode(), IsDo); + return VCTPOpcodeToLSTP(VCTPs.back()->getOpcode(), IsDo); } void dump() const { if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start; if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec; if (End) dbgs() << "ARM Loops: Found Loop End: " << *End; - if (VCTP) dbgs() << "ARM Loops: Found VCTP: " << *VCTP; + if (!VCTPs.empty()) { + dbgs() << "ARM Loops: Found VCTP(s):\n"; + for (auto *MI : VCTPs) + dbgs() << " - " << *MI; + } if (!FoundAllComponents()) dbgs() << "ARM Loops: Not a low-overhead loop.\n"; else if (!(Start && Dec && End)) @@ -359,6 +445,11 @@ char ARMLowOverheadLoops::ID = 0; +SmallVector VPTBlock::Blocks; +SetVector VPTBlock::CurrentPredicate; +std::map> VPTBlock::PredicatedInsts; + INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME, false, false) @@ -396,38 +487,10 @@ } bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) { - assert(VCTP && "VCTP instruction expected but is not set"); - // All predication within the loop should be based on vctp. If the block - // isn't predicated on entry, check whether the vctp is within the block - // and that all other instructions are then predicated on it. - for (auto &Block : VPTBlocks) { - if (Block.IsPredicatedOn(VCTP)) - continue; - if (Block.HasNonUniformPredicate() && !isVCTP(Block.getDivergent()->MI)) { - LLVM_DEBUG(dbgs() << "ARM Loops: Found unsupported diverging predicate: " - << *Block.getDivergent()->MI); - return false; - } - SmallVectorImpl &Insts = Block.getInsts(); - for (auto &PredMI : Insts) { - // Check the instructions in the block and only allow: - // - VCTPs - // - Instructions predicated on the main VCTP - // - Any VCMP - // - VCMPs just "and" their result with VPR.P0. Whether they are - // located before/after the VCTP is irrelevant - the end result will - // be the same in both cases, so there's no point in requiring them - // to be located after the VCTP! - if (PredMI.Predicates.count(VCTP) || isVCTP(PredMI.MI) || - VCMPOpcodeToVPT(PredMI.MI->getOpcode()) != 0) - continue; - LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *PredMI.MI - << " - which is predicated on:\n"; - for (auto *MI : PredMI.Predicates) - dbgs() << " - " << *MI); - return false; - } - } + assert(!VCTPs.empty() && "VCTP instruction expected but is not set"); + + if (!VPTBlock::isValid()) + return false; if (!ValidateLiveOuts()) { LLVM_DEBUG(dbgs() << "ARM Loops: Invalid live outs.\n"); @@ -438,6 +501,7 @@ // of the iteration count, to the loop start instruction. The number of // elements is provided to the vctp instruction, so we need to check that // we can use this register at InsertPt. + MachineInstr *VCTP = VCTPs.back(); TPNumElements = VCTP->getOperand(1); Register NumElements = TPNumElements.getReg(); @@ -534,10 +598,10 @@ if (auto *Def = RDA.getUniqueReachingMIDef(&MBB->back(), VCTP->getOperand(1).getReg())) { SmallPtrSet ElementChain; - SmallPtrSet Ignore = { VCTP }; + SmallPtrSet Ignore; unsigned ExpectedVectorWidth = getTailPredVectorWidth(VCTP->getOpcode()); - Ignore.insert(SecondaryVCTPs.begin(), SecondaryVCTPs.end()); + Ignore.insert(VCTPs.begin(), VCTPs.end()); if (RDA.isSafeToRemove(Def, ElementChain, Ignore)) { bool FoundSub = false; @@ -563,11 +627,6 @@ return true; } -static bool isVectorPredicated(MachineInstr *MI) { - int PIdx = llvm::findFirstVPTPredOperandIdx(*MI); - return PIdx != -1 && MI->getOperand(PIdx + 1).getReg() == ARM::VPR; -} - static bool isRegInClass(const MachineOperand &MO, const TargetRegisterClass *Class) { return MO.isReg() && MO.getReg() && Class->contains(MO.getReg()); @@ -837,7 +896,7 @@ LLVM_DEBUG(dbgs() << "ARM Loops: Start insertion point: " << *InsertPt); if (!IsTailPredicationLegal()) { - LLVM_DEBUG(if (!VCTP) + LLVM_DEBUG(if (VCTPs.empty()) dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n"; dbgs() << "ARM Loops: Tail-predication is not valid.\n"); return; @@ -850,6 +909,26 @@ dbgs() << "ARM Loops: Couldn't validate tail predicate.\n"); } +bool LowOverheadLoop::AddVCTP(MachineInstr *MI) { + LLVM_DEBUG(dbgs() << "ARM Loops: Adding VCTP: " << *MI); + if (VCTPs.empty()) { + VCTPs.push_back(MI); + return true; + } + + // If we find another VCTP, check whether it uses the same value as the main VCTP. + // If it does, store it in the VCTPs set, else refuse it. + MachineInstr *Prev = VCTPs.back(); + if (!Prev->getOperand(1).isIdenticalTo(MI->getOperand(1)) || + !RDA.hasSameReachingDef(Prev, MI, MI->getOperand(1).getReg())) { + LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching " + "definition from the main VCTP"); + return false; + } + VCTPs.push_back(MI); + return true; +} + bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) { if (CannotTailPredicate) return false; @@ -872,83 +951,58 @@ return false; } - if (isVCTP(MI)) { - // If we find another VCTP, check whether it uses the same value as the main VCTP. - // If it does, store it in the SecondaryVCTPs set, else refuse it. - if (VCTP) { - if (!VCTP->getOperand(1).isIdenticalTo(MI->getOperand(1)) || - !RDA.hasSameReachingDef(VCTP, MI, MI->getOperand(1).getReg())) { - LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching " - "definition from the main VCTP"); - return false; - } - LLVM_DEBUG(dbgs() << "ARM Loops: Found secondary VCTP: " << *MI); - SecondaryVCTPs.insert(MI); - } else { - LLVM_DEBUG(dbgs() << "ARM Loops: Found 'main' VCTP: " << *MI); - VCTP = MI; - } - } else if (isVPTOpcode(MI->getOpcode())) { - if (MI->getOpcode() != ARM::MVE_VPST) { - assert(MI->findRegisterDefOperandIdx(ARM::VPR) != -1 && - "VPT does not implicitly define VPR?!"); - CurrentPredicate.clear(); - CurrentPredicate.insert(MI); - } - - VPTBlocks.emplace_back(MI, CurrentPredicate); - CurrentBlock = &VPTBlocks.back(); - return true; - } + // Record all VCTPs and check that they're equivalent to one another. + if (isVCTP(MI) && !AddVCTP(MI)) + return false; + // Inspect uses first so that any instructions that alter the VPR don't + // alter the predicate upon themselves. bool IsUse = false; - bool IsDef = false; for (int i = MI->getNumOperands() - 1; i >= 0; --i) { const MachineOperand &MO = MI->getOperand(i); - if (!MO.isReg() || MO.getReg() != ARM::VPR) + if (!MO.isReg() || !MO.isUse() || MO.getReg() != ARM::VPR) continue; - if (MO.isDef()) { - CurrentPredicate.insert(MI); - IsDef = true; - } else if (ARM::isVpred(MCID.OpInfo[i].OperandType)) { - CurrentBlock->addInst(MI, CurrentPredicate); + if (ARM::isVpred(MCID.OpInfo[i].OperandType)) { + VPTBlock::addInst(MI); IsUse = true; - } else { + } else if (MI->getOpcode() != ARM::MVE_VPST) { LLVM_DEBUG(dbgs() << "ARM Loops: Found instruction using vpr: " << *MI); return false; } } - // If this instruction defines the VPR, update the predicate for the - // proceeding instructions. - if (IsDef) { - // Clear the existing predicate when we're not in VPT Active state. - if (!isVectorPredicated(MI)) - CurrentPredicate.clear(); - CurrentPredicate.insert(MI); - LLVM_DEBUG(dbgs() << "ARM Loops: Adding Predicate: " << *MI); - } - - // If we find a vpr def that is not already predicated on the vctp, we've - // got disjoint predicates that may not be equivalent when we do the - // conversion. - if (IsDef && !IsUse && VCTP && !isVCTP(MI)) { - LLVM_DEBUG(dbgs() << "ARM Loops: Found disjoint vpr def: " << *MI); - return false; - } - // If we find an instruction that has been marked as not valid for tail // predication, only allow the instruction if it's contained within a valid // VPT block. if ((Flags & ARMII::ValidForTailPredication) == 0) { - LLVM_DEBUG(dbgs() << "ARM Loops: Can't tail predicate: " << *MI); + LLVM_DEBUG(if (!IsUse) + dbgs() << "ARM Loops: Can't tail predicate: " << *MI); return IsUse; } // If the instruction is already explicitly predicated, then the conversion // will be fine, but ensure that all store operations are predicated. - return !IsUse && MI->mayStore() ? false : true; + if (MI->mayStore()) + return IsUse; + + // If this instruction defines the VPR, update the predicate for the + // proceeding instructions. + if (isVectorPredicate(MI)) { + // Clear the existing predicate when we're not in VPT Active state, + // otherwise we add to it. + if (!isVectorPredicated(MI)) + VPTBlock::resetPredicate(MI); + else + VPTBlock::addPredicate(MI); + } + + // Finally once the predicate has been modified, we can start a new VPT + // block if necessary. + if (isVPTOpcode(MI->getOpcode())) + VPTBlock::Create(MI); + + return true; } bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) { @@ -1281,23 +1335,20 @@ llvm_unreachable("trying to unpredicate a non-predicated instruction"); }; - // There are a few scenarios which we have to fix up: - // 1. VPT Blocks with non-uniform predicates: - // - a. When the divergent instruction is a vctp - // - b. When the block uses a vpst, and is only predicated on the vctp - // - c. When the block uses a vpt and (optionally) contains one or more - // vctp. - // 2. VPT Blocks with uniform predicates: - // - a. The block uses a vpst, and is only predicated on the vctp for (auto &Block : LoLoop.getVPTBlocks()) { - SmallVectorImpl &Insts = Block.getInsts(); - if (Block.HasNonUniformPredicate()) { - PredicatedMI *Divergent = Block.getDivergent(); - if (isVCTP(Divergent->MI)) { - // The vctp will be removed, so the block mask of the vp(s)t will need - // to be recomputed. - LoLoop.BlockMasksToRecompute.insert(Block.getPredicateThen()); - } else if (Block.isVPST() && Block.IsOnlyPredicatedOn(LoLoop.VCTP)) { + SmallVectorImpl &Insts = Block.getInsts(); + + if (VPTBlock::isEntryPredicatedOnVCTP(Block, /*exclusive*/true)) { + if (VPTBlock::hasUniformPredicate(Block)) { + // A vpt block starting with VPST, is only predicated upon vctp and has no + // internal vpr defs: + // - Remove vpst. + // - Unpredicate the remaining instructions. + LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Insts.front()); + LoLoop.ToRemove.insert(Insts.front()); + for (unsigned i = 1; i < Insts.size(); ++i) + RemovePredicate(Insts[i]); + } else { // The VPT block has a non-uniform predicate but it uses a vpst and its // entry is guarded only by a vctp, which means we: // - Need to remove the original vpst. @@ -1307,28 +1358,28 @@ // the divergent vpr def. // TODO: We could be producing more VPT blocks than necessary and could // fold the newly created one into a proceeding one. - for (auto I = ++MachineBasicBlock::iterator(Block.getPredicateThen()), - E = ++MachineBasicBlock::iterator(Divergent->MI); I != E; ++I) + MachineInstr *Divergent = VPTBlock::getDivergent(Block); + for (auto I = ++MachineBasicBlock::iterator(Insts.front()), + E = ++MachineBasicBlock::iterator(Divergent); I != E; ++I) RemovePredicate(&*I); // Check if the instruction defining vpr is a vcmp so it can be combined // with the VPST This should be the divergent instruction - MachineInstr *VCMP = VCMPOpcodeToVPT(Divergent->MI->getOpcode()) != 0 - ? Divergent->MI - : nullptr; + MachineInstr *VCMP = VCMPOpcodeToVPT(Divergent->getOpcode()) != 0 + ? Divergent + : nullptr; unsigned Size = 0; - auto E = MachineBasicBlock::reverse_iterator(Divergent->MI); - auto I = MachineBasicBlock::reverse_iterator(Insts.back().MI); + auto E = MachineBasicBlock::reverse_iterator(Divergent); + auto I = MachineBasicBlock::reverse_iterator(Insts.back()); MachineInstr *InsertAt = nullptr; while (I != E) { InsertAt = &*I; ++Size; ++I; } + MachineInstrBuilder MIB; - LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " - << *Block.getPredicateThen()); if (VCMP) { // Combine the VPST and VCMP into a VPT MIB = @@ -1352,51 +1403,18 @@ MIB.addImm(0); LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB); } - LoLoop.ToRemove.insert(Block.getPredicateThen()); + LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Insts.front()); + LoLoop.ToRemove.insert(Insts.front()); LoLoop.BlockMasksToRecompute.insert(MIB.getInstr()); } - // Else, if the block uses a vpt, iterate over the block, removing the - // extra VCTPs it may contain. - else if (Block.isVPT()) { - bool RemovedVCTP = false; - for (PredicatedMI &Elt : Block.getInsts()) { - MachineInstr *MI = Elt.MI; - if (isVCTP(MI)) { - LLVM_DEBUG(dbgs() << "ARM Loops: Removing VCTP: " << *MI); - LoLoop.ToRemove.insert(MI); - RemovedVCTP = true; - continue; - } - } - if (RemovedVCTP) - LoLoop.BlockMasksToRecompute.insert(Block.getPredicateThen()); - } - } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP) && Block.isVPST()) { - // A vpt block starting with VPST, is only predicated upon vctp and has no - // internal vpr defs: - // - Remove vpst. - // - Unpredicate the remaining instructions. - LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getPredicateThen()); - LoLoop.ToRemove.insert(Block.getPredicateThen()); - for (auto &PredMI : Insts) - RemovePredicate(PredMI.MI); - } - } - LLVM_DEBUG(dbgs() << "ARM Loops: Removing remaining VCTPs...\n"); - // Remove the "main" VCTP - LoLoop.ToRemove.insert(LoLoop.VCTP); - LLVM_DEBUG(dbgs() << " " << *LoLoop.VCTP); - // Remove remaining secondary VCTPs - for (MachineInstr *VCTP : LoLoop.SecondaryVCTPs) { - // All VCTPs that aren't marked for removal yet should be unpredicated ones. - // The predicated ones should have already been marked for removal when - // visiting the VPT blocks. - if (LoLoop.ToRemove.insert(VCTP).second) { - assert(getVPTInstrPredicate(*VCTP) == ARMVCC::None && - "Removing Predicated VCTP without updating the block mask!"); - LLVM_DEBUG(dbgs() << " " << *VCTP); + } else if (Block.containsVCTP()) { + // The vctp will be removed, so the block mask of the vp(s)t will need + // to be recomputed. + LoLoop.BlockMasksToRecompute.insert(Insts.front()); } } + + LoLoop.ToRemove.insert(LoLoop.VCTPs.begin(), LoLoop.VCTPs.end()); } void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) { Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll @@ -464,28 +464,19 @@ ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} ; CHECK-NEXT: .LBB5_1: @ %bb4 -; CHECK-NEXT: add.w r12, r3, #3 -; CHECK-NEXT: mov.w lr, #1 -; CHECK-NEXT: bic r12, r12, #3 -; CHECK-NEXT: sub.w r12, r12, #4 -; CHECK-NEXT: add.w lr, lr, r12, lsr #2 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB5_2: @ %bb12 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.32 r3 -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q0, [r0] -; CHECK-NEXT: vpttt.i32 ne, q0, zr +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vptt.i32 ne, q0, zr ; CHECK-NEXT: vcmpt.s32 le, q0, r2 -; CHECK-NEXT: vctpt.32 r3 ; CHECK-NEXT: vldrwt.u32 q1, [r1], #16 ; CHECK-NEXT: add.w r12, r12, #4 -; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vmul.i32 q0, q1, q0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrwt.32 q0, [r0], #16 -; CHECK-NEXT: le lr, .LBB5_2 +; CHECK-NEXT: letp lr, .LBB5_2 ; CHECK-NEXT: @ %bb.3: @ %bb32 ; CHECK-NEXT: pop {r7, pc} bb: Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll @@ -0,0 +1,87 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -tail-predication=enabled %s -o - | FileCheck %s + +define dso_local arm_aapcs_vfpcc i32 @minmaxval4(i32* nocapture readonly %x, i32* nocapture %minp) { +; CHECK-LABEL: minmaxval4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: adr r3, .LCPI0_0 +; CHECK-NEXT: mov.w lr, #3 +; CHECK-NEXT: vldrw.u32 q2, [r3] +; CHECK-NEXT: vmov.i32 q0, #0x80000000 +; CHECK-NEXT: vmvn.i32 q1, #0x80000000 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: vmov.i32 q3, #0xa +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vadd.i32 q4, q2, r2 +; CHECK-NEXT: vdup.32 q5, r2 +; CHECK-NEXT: vcmp.u32 hi, q5, q4 +; CHECK-NEXT: adds r2, #4 +; CHECK-NEXT: vpnot +; CHECK-NEXT: vpst +; CHECK-NEXT: vcmpt.u32 hi, q3, q4 +; CHECK-NEXT: vstr p0, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q4, [r0], #16 +; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: vpst +; CHECK-NEXT: vcmpt.s32 gt, q4, q0 +; CHECK-NEXT: vpsel q0, q4, q0 +; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: vpst +; CHECK-NEXT: vcmpt.s32 gt, q1, q4 +; CHECK-NEXT: vpsel q1, q4, q1 +; CHECK-NEXT: le lr, .LBB0_1 +; CHECK-NEXT: @ %bb.2: @ %middle.block +; CHECK-NEXT: mvn r0, #-2147483648 +; CHECK-NEXT: vminv.s32 r0, q1 +; CHECK-NEXT: str r0, [r1] +; CHECK-NEXT: mov.w r0, #-2147483648 +; CHECK-NEXT: vmaxv.s32 r0, q0 +; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI0_0: +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 2 @ 0x2 +; CHECK-NEXT: .long 3 @ 0x3 +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.phi = phi <4 x i32> [ , %entry ], [ %5, %vector.body ] + %vec.phi29 = phi <4 x i32> [ , %entry ], [ %7, %vector.body ] + %0 = getelementptr inbounds i32, i32* %x, i32 %index + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 10) + %1 = bitcast i32* %0 to <4 x i32>* + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) + %2 = icmp sgt <4 x i32> %wide.masked.load, %vec.phi29 + %3 = icmp slt <4 x i32> %wide.masked.load, %vec.phi + %4 = and <4 x i1> %active.lane.mask, %3 + %5 = select <4 x i1> %4, <4 x i32> %wide.masked.load, <4 x i32> %vec.phi + %6 = and <4 x i1> %active.lane.mask, %2 + %7 = select <4 x i1> %6, <4 x i32> %wide.masked.load, <4 x i32> %vec.phi29 + %index.next = add i32 %index, 4 + %8 = icmp eq i32 %index.next, 12 + br i1 %8, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %9 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %7) + %10 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %5) + store i32 %10, i32* %minp, align 4 + ret i32 %9 +} + +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) #1 +declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #2 +declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>) #3 +declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>) #3 + Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir @@ -118,32 +118,23 @@ ; CHECK: bb.1.bb3: ; CHECK: successors: %bb.2(0x80000000) ; CHECK: liveins: $r0, $r1, $r2, $r3 - ; CHECK: renamable $r12 = t2ADDri renamable $r2, 3, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $lr = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = t2BICri killed renamable $r12, 3, 14 /* CC::al */, $noreg, $noreg ; CHECK: $vpr = VMSR_P0 killed $r3, 14 /* CC::al */, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14 /* CC::al */, $noreg, $noreg ; CHECK: VSTR_P0_off killed renamable $vpr, $sp, 0, 14 /* CC::al */, $noreg :: (store 4 into %stack.0) ; CHECK: $r3 = tMOVr $r0, 14 /* CC::al */, $noreg - ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $lr = t2DLS killed renamable $lr + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 ; CHECK: bb.2.bb9: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 + ; CHECK: liveins: $lr, $r0, $r1, $r3 ; CHECK: renamable $vpr = VLDR_P0_off $sp, 0, 14 /* CC::al */, $noreg :: (load 4 from %stack.0) - ; CHECK: MVE_VPST 4, implicit $vpr - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 1, killed renamable $vpr + ; CHECK: MVE_VPST 8, implicit $vpr ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4) - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg - ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg - ; CHECK: MVE_VPST 4, implicit $vpr - ; CHECK: renamable $vpr = MVE_VCMPi32r renamable $q0, $zr, 1, 1, killed renamable $vpr + ; CHECK: MVE_VPTv4i32r 8, renamable $q0, $zr, 1, implicit-def $vpr ; CHECK: renamable $r3, renamable $q1 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4) ; CHECK: renamable $q0 = nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 ; CHECK: MVE_VPST 8, implicit $vpr ; CHECK: MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4) ; CHECK: $r0 = tMOVr $r3, 14 /* CC::al */, $noreg - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 ; CHECK: bb.3.bb27: ; CHECK: $sp = tADDspi $sp, 1, 14 /* CC::al */, $noreg ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir @@ -215,26 +215,17 @@ ; CHECK: bb.1.vector.ph: ; CHECK: successors: %bb.2(0x80000000) ; CHECK: liveins: $r0, $r1, $r2 - ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 - ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg - ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg - ; CHECK: $lr = t2DLS killed renamable $lr + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r1 ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r3 - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg - ; CHECK: MVE_VPST 8, implicit $vpr - ; CHECK: renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 1, killed renamable $vpr - ; CHECK: MVE_VPTv4s32r 2, renamable $q1, renamable $r2, 11, implicit-def $vpr + ; CHECK: liveins: $lr, $q0, $r0, $r2, $r3 + ; CHECK: renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 0, killed $noreg + ; CHECK: MVE_VPTv4s32r 4, renamable $q1, renamable $r2, 11, implicit-def $vpr ; CHECK: renamable $vpr = MVE_VCMPs32r killed renamable $q1, renamable $r3, 12, 1, killed renamable $vpr - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r1, 1, killed renamable $vpr ; CHECK: renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr - ; CHECK: renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 ; CHECK: bb.3.for.cond.cleanup: ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc bb.0.entry: @@ -731,26 +722,17 @@ ; CHECK: bb.1.vector.ph: ; CHECK: successors: %bb.2(0x80000000) ; CHECK: liveins: $r0, $r1, $r2 - ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 - ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg - ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg - ; CHECK: $lr = t2DLS killed renamable $lr + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r1 ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r3 - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg - ; CHECK: MVE_VPST 8, implicit $vpr - ; CHECK: renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 1, killed renamable $vpr - ; CHECK: MVE_VPTv4s32r 2, renamable $q0, renamable $r2, 11, implicit-def $vpr + ; CHECK: liveins: $lr, $q0, $r0, $r2, $r3 + ; CHECK: renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 0, killed $noreg + ; CHECK: MVE_VPTv4s32r 4, renamable $q0, renamable $r2, 11, implicit-def $vpr ; CHECK: renamable $vpr = MVE_VCMPs32r killed renamable $q1, renamable $r3, 12, 1, killed renamable $vpr - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r1, 1, killed renamable $vpr ; CHECK: renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr - ; CHECK: renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 ; CHECK: bb.3.for.cond.cleanup: ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc bb.0.entry: