diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
--- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -166,62 +166,136 @@
     }
   };
 
-  // Represent a VPT block, a list of instructions that begins with a VPT/VPST
-  // and has a maximum of four proceeding instructions. All instructions within
-  // the block are predicated upon the vpr and we allow instructions to define
-  // the vpr within in the block too.
-  class VPTBlock {
-    // The predicate then instruction, which is either a VPT, or a VPST
-    // instruction.
-    std::unique_ptr<PredicatedMI> PredicateThen;
-    PredicatedMI *Divergent = nullptr;
-    SmallVector<PredicatedMI, 4> Insts;
+  // Represent the current state of the VPR and hold all instances which
+  // represent a VPT block, which is a list of instructions that begins with a
+  // VPT/VPST and has a maximum of four following instructions. All
+  // instructions within the block are predicated upon the vpr and we allow
+  // instructions to define the vpr within the block too.
+  class VPTState {
+    friend struct LowOverheadLoop;
+
+    SmallVector<MachineInstr *, 4> Insts;
+
+    static SmallVector<VPTState, 4> Blocks;
+    static SetVector<MachineInstr *> CurrentPredicates;
+    static std::map<MachineInstr *,
+                    std::unique_ptr<PredicatedMI>> PredicatedInsts;
+
+    static void CreateVPTBlock(MachineInstr *MI) {
+      assert(CurrentPredicates.size() && "Can't begin VPT without predicate");
+      Blocks.emplace_back(MI);
+      // The execution of MI is predicated upon the current set of instructions
+      // that are AND'ed together to form the VPR predicate value. In the case
+      // that MI is a VPT, CurrentPredicates will also just be MI.
+      PredicatedInsts.emplace(
+          MI, std::make_unique<PredicatedMI>(MI, CurrentPredicates));
+    }
 
-  public:
-    VPTBlock(MachineInstr *MI, SetVector<MachineInstr *> &Preds) {
-      PredicateThen = std::make_unique<PredicatedMI>(MI, Preds);
+    static void reset() {
+      Blocks.clear();
+      PredicatedInsts.clear();
+      CurrentPredicates.clear();
     }
 
-    void addInst(MachineInstr *MI, SetVector<MachineInstr *> &Preds) {
-      LLVM_DEBUG(dbgs() << "ARM Loops: Adding predicated MI: " << *MI);
-      if (!Divergent && !set_difference(Preds, PredicateThen->Predicates).empty()) {
-        Divergent = &Insts.back();
-        LLVM_DEBUG(dbgs() << " - has divergent predicate: " << *Divergent->MI);
-      }
-      Insts.emplace_back(MI, Preds);
-      assert(Insts.size() <= 4 && "Too many instructions in VPT block!");
+    static void addInst(MachineInstr *MI) {
+      Blocks.back().insert(MI);
+      PredicatedInsts.emplace(
+          MI, std::make_unique<PredicatedMI>(MI, CurrentPredicates));
     }
 
+    static void addPredicate(MachineInstr *MI) {
+      LLVM_DEBUG(dbgs() << "ARM Loops: Adding VPT Predicate: " << *MI);
+      CurrentPredicates.insert(MI);
+    }
+
+    static void resetPredicate(MachineInstr *MI) {
+      LLVM_DEBUG(dbgs() << "ARM Loops: Resetting VPT Predicate: " << *MI);
+      CurrentPredicates.clear();
+      CurrentPredicates.insert(MI);
+    }
+
+  public:
    // Have we found an instruction within the block which defines the vpr? If
    // so, not all the instructions in the block will have the same predicate.
-    bool HasNonUniformPredicate() const {
-      return Divergent != nullptr;
+    static bool hasUniformPredicate(VPTState &Block) {
+      return getDivergent(Block) == nullptr;
+    }
+
+    // If it exists, return the first internal instruction which modifies the
+    // VPR.
+    static MachineInstr *getDivergent(VPTState &Block) {
+      SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
+      for (unsigned i = 1; i < Insts.size(); ++i) {
+        MachineInstr *Next = Insts[i];
+        if (isVectorPredicate(Next))
+          return Next; // Found an instruction altering the vpr.
+      }
+      return nullptr;
     }
 
-    // Is the given instruction part of the predicate set controlling the entry
-    // to the block.
-    bool IsPredicatedOn(MachineInstr *MI) const {
-      return PredicateThen->Predicates.count(MI);
+    // Return whether the given instruction is predicated upon a VCTP.
+    static bool isPredicatedOnVCTP(MachineInstr *MI, bool Exclusive = false) {
+      SetVector<MachineInstr *> &Predicates = PredicatedInsts[MI]->Predicates;
+      if (Exclusive && Predicates.size() != 1)
+        return false;
+      for (auto *PredMI : Predicates)
+        if (isVCTP(PredMI))
+          return true;
+      return false;
+    }
+
+    // Is the VPST, controlling the block entry, predicated upon a VCTP?
+    static bool isEntryPredicatedOnVCTP(VPTState &Block,
+                                        bool Exclusive = false) {
+      SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
+      return isPredicatedOnVCTP(Insts.front(), Exclusive);
+    }
+
+    static bool isValid() {
+      // All predication within the loop should be based on vctp. If the block
+      // isn't predicated on entry, check whether the vctp is within the block
+      // and that all other instructions are then predicated on it.
+      for (auto &Block : Blocks) {
+        if (isEntryPredicatedOnVCTP(Block))
+          continue;
+
+        SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
+        for (auto *MI : Insts) {
+          // Check that any internal VCTPs are 'Then' predicated.
+          if (isVCTP(MI) && getVPTInstrPredicate(*MI) != ARMVCC::Then)
+            return false;
+          // Skip other instructions that build up the predicate.
+          if (MI->getOpcode() == ARM::MVE_VPST || isVectorPredicate(MI))
+            continue;
+          // Check that any other instructions are predicated upon a vctp.
+          // TODO: We could infer when VPTs are implicitly predicated on the
+          // vctp (when the operands are predicated).
+          if (!isPredicatedOnVCTP(MI)) {
+            LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *MI);
+            return false;
+          }
+        }
+      }
+      return true;
     }
 
-    // Returns true if this is a VPT instruction.
-    bool isVPT() const { return !isVPST(); }
+    VPTState(MachineInstr *MI) { Insts.push_back(MI); }
 
-    // Returns true if this is a VPST instruction.
-    bool isVPST() const {
-      return PredicateThen->MI->getOpcode() == ARM::MVE_VPST;
+    void insert(MachineInstr *MI) {
+      Insts.push_back(MI);
+      // VPT/VPST + 4 predicated instructions.
+      assert(Insts.size() <= 5 && "Too many instructions in VPT block!");
     }
 
-    // Is the given instruction the only predicate which controls the entry to
-    // the block.
-    bool IsOnlyPredicatedOn(MachineInstr *MI) const {
-      return IsPredicatedOn(MI) && PredicateThen->Predicates.size() == 1;
+    bool containsVCTP() const {
+      for (auto *MI : Insts)
+        if (isVCTP(MI))
+          return true;
+      return false;
     }
 
     unsigned size() const { return Insts.size(); }
-    SmallVectorImpl<PredicatedMI> &getInsts() { return Insts; }
-    MachineInstr *getPredicateThen() const { return PredicateThen->MI; }
-    PredicatedMI *getDivergent() const { return Divergent; }
+    SmallVectorImpl<MachineInstr *> &getInsts() { return Insts; }
   };
 
   struct LowOverheadLoop {
@@ -237,12 +311,8 @@
     MachineInstr *Start = nullptr;
     MachineInstr *Dec = nullptr;
     MachineInstr *End = nullptr;
-    MachineInstr *VCTP = nullptr;
     MachineOperand TPNumElements;
-    SmallPtrSet<MachineInstr*, 4> SecondaryVCTPs;
-    VPTBlock *CurrentBlock = nullptr;
-    SetVector<MachineInstr*> CurrentPredicate;
-    SmallVector<VPTBlock, 4> VPTBlocks;
+    SmallVector<MachineInstr*, 4> VCTPs;
     SmallPtrSet<MachineInstr*, 4> ToRemove;
     SmallPtrSet<MachineInstr*, 4> BlockMasksToRecompute;
     bool Revert = false;
@@ -258,6 +328,7 @@
         Preheader = MBB;
       else if (auto *MBB = MLI.findLoopPreheader(&ML, true))
         Preheader = MBB;
+      VPTState::reset();
     }
 
     // If this is an MVE instruction, check that we know how to use tail
@@ -272,10 +343,14 @@
     bool IsTailPredicationLegal() const {
       // For now, let's keep things really simple and only support a single
       // block for tail predication.
-      return !Revert && FoundAllComponents() && VCTP &&
+      return !Revert && FoundAllComponents() && !VCTPs.empty() &&
              !CannotTailPredicate && ML.getNumBlocks() == 1;
     }
 
+    // Given that MI is a VCTP, check that it is equivalent to any other VCTPs
+    // found.
+    bool AddVCTP(MachineInstr *MI);
+
     // Check that the predication in the loop will be equivalent once we
     // perform the conversion. Also ensure that we can provide the number
     // of elements to the loop start instruction.
@@ -298,7 +373,9 @@
       return Start && Dec && End;
     }
 
-    SmallVectorImpl<VPTBlock> &getVPTBlocks() { return VPTBlocks; }
+    SmallVectorImpl<VPTState> &getVPTBlocks() {
+      return VPTState::Blocks;
+    }
 
     // Return the operand for the loop start instruction. This will be the loop
     // iteration count, or the number of elements if we're tail predicating.
@@ -311,14 +388,18 @@
       if (!IsTailPredicationLegal())
         return IsDo ? ARM::t2DLS : ARM::t2WLS;
 
-      return VCTPOpcodeToLSTP(VCTP->getOpcode(), IsDo);
+      return VCTPOpcodeToLSTP(VCTPs.back()->getOpcode(), IsDo);
     }
 
     void dump() const {
       if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start;
       if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec;
       if (End) dbgs() << "ARM Loops: Found Loop End: " << *End;
-      if (VCTP) dbgs() << "ARM Loops: Found VCTP: " << *VCTP;
+      if (!VCTPs.empty()) {
+        dbgs() << "ARM Loops: Found VCTP(s):\n";
+        for (auto *MI : VCTPs)
+          dbgs() << " - " << *MI;
+      }
       if (!FoundAllComponents())
         dbgs() << "ARM Loops: Not a low-overhead loop.\n";
       else if (!(Start && Dec && End))
@@ -382,6 +463,11 @@
 
 char ARMLowOverheadLoops::ID = 0;
 
+SmallVector<VPTState, 4> VPTState::Blocks;
+SetVector<MachineInstr *> VPTState::CurrentPredicates;
+std::map<MachineInstr *,
+         std::unique_ptr<PredicatedMI>> VPTState::PredicatedInsts;
+
 INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME,
                 false, false)
 
@@ -419,38 +505,10 @@
 }
 
 bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) {
-  assert(VCTP && "VCTP instruction expected but is not set");
-  // All predication within the loop should be based on vctp. If the block
-  // isn't predicated on entry, check whether the vctp is within the block
-  // and that all other instructions are then predicated on it.
-  for (auto &Block : VPTBlocks) {
-    if (Block.IsPredicatedOn(VCTP))
-      continue;
-    if (Block.HasNonUniformPredicate() && !isVCTP(Block.getDivergent()->MI)) {
-      LLVM_DEBUG(dbgs() << "ARM Loops: Found unsupported diverging predicate: "
-                        << *Block.getDivergent()->MI);
-      return false;
-    }
-    SmallVectorImpl<PredicatedMI> &Insts = Block.getInsts();
-    for (auto &PredMI : Insts) {
-      // Check the instructions in the block and only allow:
-      //   - VCTPs
-      //   - Instructions predicated on the main VCTP
-      //   - Any VCMP
-      //      - VCMPs just "and" their result with VPR.P0. Whether they are
-      //      located before/after the VCTP is irrelevant - the end result will
-      //      be the same in both cases, so there's no point in requiring them
-      //      to be located after the VCTP!
-      if (PredMI.Predicates.count(VCTP) || isVCTP(PredMI.MI) ||
-          VCMPOpcodeToVPT(PredMI.MI->getOpcode()) != 0)
-        continue;
-      LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *PredMI.MI
-                 << " - which is predicated on:\n";
-                 for (auto *MI : PredMI.Predicates)
-                   dbgs() << " - " << *MI);
-      return false;
-    }
-  }
+  assert(!VCTPs.empty() && "VCTP instruction expected but is not set");
+
+  if (!VPTState::isValid())
+    return false;
 
   if (!ValidateLiveOuts()) {
     LLVM_DEBUG(dbgs() << "ARM Loops: Invalid live outs.\n");
@@ -461,6 +519,7 @@
   // of the iteration count, to the loop start instruction. The number of
   // elements is provided to the vctp instruction, so we need to check that
   // we can use this register at InsertPt.
+  MachineInstr *VCTP = VCTPs.back();
   TPNumElements = VCTP->getOperand(1);
   Register NumElements = TPNumElements.getReg();
 
@@ -557,10 +616,10 @@
     if (auto *Def = RDA.getUniqueReachingMIDef(&MBB->back(),
                                                VCTP->getOperand(1).getReg())) {
      SmallPtrSet<MachineInstr*, 2> ElementChain;
-      SmallPtrSet<MachineInstr*, 2> Ignore = { VCTP };
+      SmallPtrSet<MachineInstr*, 2> Ignore;
       unsigned ExpectedVectorWidth = getTailPredVectorWidth(VCTP->getOpcode());
 
-      Ignore.insert(SecondaryVCTPs.begin(), SecondaryVCTPs.end());
+      Ignore.insert(VCTPs.begin(), VCTPs.end());
 
       if (RDA.isSafeToRemove(Def, ElementChain, Ignore)) {
         bool FoundSub = false;
@@ -853,7 +912,7 @@
   LLVM_DEBUG(dbgs() << "ARM Loops: Start insertion point: " << *InsertPt);
 
   if (!IsTailPredicationLegal()) {
-    LLVM_DEBUG(if (!VCTP)
+    LLVM_DEBUG(if (VCTPs.empty())
                  dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n";
                dbgs() << "ARM Loops: Tail-predication is not valid.\n");
     return;
   }
@@ -866,6 +925,26 @@
                dbgs() << "ARM Loops: Couldn't validate tail predicate.\n");
 }
 
+bool LowOverheadLoop::AddVCTP(MachineInstr *MI) {
+  LLVM_DEBUG(dbgs() << "ARM Loops: Adding VCTP: " << *MI);
+  if (VCTPs.empty()) {
+    VCTPs.push_back(MI);
+    return true;
+  }
+
+  // If we find another VCTP, check whether it uses the same value as the main VCTP.
+  // If it does, store it in the VCTPs set, else refuse it.
+  MachineInstr *Prev = VCTPs.back();
+  if (!Prev->getOperand(1).isIdenticalTo(MI->getOperand(1)) ||
+      !RDA.hasSameReachingDef(Prev, MI, MI->getOperand(1).getReg())) {
+    LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching "
+                         "definition from the main VCTP");
+    return false;
+  }
+  VCTPs.push_back(MI);
+  return true;
+}
+
 bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) {
   if (CannotTailPredicate)
     return false;
@@ -886,75 +965,28 @@
     return false;
   }
 
-  if (isVCTP(MI)) {
-    // If we find another VCTP, check whether it uses the same value as the main VCTP.
-    // If it does, store it in the SecondaryVCTPs set, else refuse it.
-    if (VCTP) {
-      if (!VCTP->getOperand(1).isIdenticalTo(MI->getOperand(1)) ||
-          !RDA.hasSameReachingDef(VCTP, MI, MI->getOperand(1).getReg())) {
-        LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching "
-                             "definition from the main VCTP");
-        return false;
-      }
-      LLVM_DEBUG(dbgs() << "ARM Loops: Found secondary VCTP: " << *MI);
-      SecondaryVCTPs.insert(MI);
-    } else {
-      LLVM_DEBUG(dbgs() << "ARM Loops: Found 'main' VCTP: " << *MI);
-      VCTP = MI;
-    }
-  } else if (isVPTOpcode(MI->getOpcode())) {
-    if (MI->getOpcode() != ARM::MVE_VPST) {
-      assert(MI->findRegisterDefOperandIdx(ARM::VPR) != -1 &&
-             "VPT does not implicitly define VPR?!");
-      CurrentPredicate.clear();
-      CurrentPredicate.insert(MI);
-    }
-
-    VPTBlocks.emplace_back(MI, CurrentPredicate);
-    CurrentBlock = &VPTBlocks.back();
-    return true;
-  }
+  // Record all VCTPs and check that they're equivalent to one another.
+  if (isVCTP(MI) && !AddVCTP(MI))
+    return false;
 
   // Inspect uses first so that any instructions that alter the VPR don't
   // alter the predicate upon themselves.
   const MCInstrDesc &MCID = MI->getDesc();
   bool IsUse = false;
-  bool IsDef = false;
   for (int i = MI->getNumOperands() - 1; i >= 0; --i) {
     const MachineOperand &MO = MI->getOperand(i);
-    if (!MO.isReg() || MO.getReg() != ARM::VPR)
+    if (!MO.isReg() || !MO.isUse() || MO.getReg() != ARM::VPR)
       continue;
 
-    if (MO.isDef()) {
-      CurrentPredicate.insert(MI);
-      IsDef = true;
-    } else if (ARM::isVpred(MCID.OpInfo[i].OperandType)) {
-      CurrentBlock->addInst(MI, CurrentPredicate);
+    if (ARM::isVpred(MCID.OpInfo[i].OperandType)) {
+      VPTState::addInst(MI);
       IsUse = true;
-    } else {
+    } else if (MI->getOpcode() != ARM::MVE_VPST) {
       LLVM_DEBUG(dbgs() << "ARM Loops: Found instruction using vpr: " << *MI);
       return false;
     }
   }
 
-  // If this instruction defines the VPR, update the predicate for the
-  // proceeding instructions.
-  if (IsDef) {
-    // Clear the existing predicate when we're not in VPT Active state.
-    if (!isVectorPredicated(MI))
-      CurrentPredicate.clear();
-    CurrentPredicate.insert(MI);
-    LLVM_DEBUG(dbgs() << "ARM Loops: Adding Predicate: " << *MI);
-  }
-
-  // If we find a vpr def that is not already predicated on the vctp, we've
-  // got disjoint predicates that may not be equivalent when we do the
-  // conversion.
-  if (IsDef && !IsUse && VCTP && !isVCTP(MI)) {
-    LLVM_DEBUG(dbgs() << "ARM Loops: Found disjoint vpr def: " << *MI);
-    return false;
-  }
-
   // If we find an instruction that has been marked as not valid for tail
   // predication, only allow the instruction if it's contained within a valid
   // VPT block.
@@ -968,7 +1000,26 @@
 
   // If the instruction is already explicitly predicated, then the conversion
   // will be fine, but ensure that all store operations are predicated.
-  return !IsUse && MI->mayStore() ? false : true;
+  if (MI->mayStore())
+    return IsUse;
+
+  // If this instruction defines the VPR, update the predicate for the
+  // following instructions.
+  if (isVectorPredicate(MI)) {
+    // Clear the existing predicate when we're not in VPT Active state,
+    // otherwise we add to it.
+    if (!isVectorPredicated(MI))
+      VPTState::resetPredicate(MI);
+    else
+      VPTState::addPredicate(MI);
+  }
+
+  // Finally, once the predicate has been modified, we can start a new VPT
+  // block if necessary.
+  if (isVPTOpcode(MI->getOpcode()))
+    VPTState::CreateVPTBlock(MI);
+
+  return true;
 }
 
 bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
@@ -1301,23 +1352,20 @@
     llvm_unreachable("trying to unpredicate a non-predicated instruction");
   };
 
-  // There are a few scenarios which we have to fix up:
-  // 1. VPT Blocks with non-uniform predicates:
-  //    - a. When the divergent instruction is a vctp
-  //    - b. When the block uses a vpst, and is only predicated on the vctp
-  //    - c. When the block uses a vpt and (optionally) contains one or more
-  //         vctp.
-  // 2. VPT Blocks with uniform predicates:
-  //    - a. The block uses a vpst, and is only predicated on the vctp
   for (auto &Block : LoLoop.getVPTBlocks()) {
-    SmallVectorImpl<PredicatedMI> &Insts = Block.getInsts();
-    if (Block.HasNonUniformPredicate()) {
-      PredicatedMI *Divergent = Block.getDivergent();
-      if (isVCTP(Divergent->MI)) {
-        // The vctp will be removed, so the block mask of the vp(s)t will need
-        // to be recomputed.
-        LoLoop.BlockMasksToRecompute.insert(Block.getPredicateThen());
-      } else if (Block.isVPST() && Block.IsOnlyPredicatedOn(LoLoop.VCTP)) {
+    SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
+
+    if (VPTState::isEntryPredicatedOnVCTP(Block, /*exclusive*/true)) {
+      if (VPTState::hasUniformPredicate(Block)) {
+        // A vpt block starting with VPST is only predicated upon vctp and has
+        // no internal vpr defs:
+        // - Remove vpst.
+        // - Unpredicate the remaining instructions.
+        LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Insts.front());
+        LoLoop.ToRemove.insert(Insts.front());
+        for (unsigned i = 1; i < Insts.size(); ++i)
+          RemovePredicate(Insts[i]);
+      } else {
         // The VPT block has a non-uniform predicate but it uses a vpst and its
         // entry is guarded only by a vctp, which means we:
         // - Need to remove the original vpst.
         // - Then need to unpredicate any following instructions, until
         //   we come across the divergent vpr def.
         // - Insert a new vpst to predicate the instruction(s) that following
@@ -1327,28 +1375,28 @@
         //   the divergent vpr def.
         // TODO: We could be producing more VPT blocks than necessary and could
         //   fold the newly created one into a proceeding one.
-        for (auto I = ++MachineBasicBlock::iterator(Block.getPredicateThen()),
-                  E = ++MachineBasicBlock::iterator(Divergent->MI); I != E; ++I)
+        MachineInstr *Divergent = VPTState::getDivergent(Block);
+        for (auto I = ++MachineBasicBlock::iterator(Insts.front()),
+                  E = ++MachineBasicBlock::iterator(Divergent); I != E; ++I)
           RemovePredicate(&*I);
 
         // Check if the instruction defining vpr is a vcmp so it can be combined
         // with the VPST This should be the divergent instruction
-        MachineInstr *VCMP = VCMPOpcodeToVPT(Divergent->MI->getOpcode()) != 0
-                                 ? Divergent->MI
-                                 : nullptr;
+        MachineInstr *VCMP = VCMPOpcodeToVPT(Divergent->getOpcode()) != 0
                                 ? Divergent
                                 : nullptr;
 
         unsigned Size = 0;
-        auto E = MachineBasicBlock::reverse_iterator(Divergent->MI);
-        auto I = MachineBasicBlock::reverse_iterator(Insts.back().MI);
+        auto E = MachineBasicBlock::reverse_iterator(Divergent);
+        auto I = MachineBasicBlock::reverse_iterator(Insts.back());
         MachineInstr *InsertAt = nullptr;
         while (I != E) {
          InsertAt = &*I;
          ++Size;
          ++I;
         }
+
         MachineInstrBuilder MIB;
-        LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: "
-                          << *Block.getPredicateThen());
         if (VCMP) {
           // Combine the VPST and VCMP into a VPT
           MIB =
@@ -1372,51 +1420,18 @@
           MIB.addImm(0);
           LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
         }
-        LoLoop.ToRemove.insert(Block.getPredicateThen());
+        LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Insts.front());
+        LoLoop.ToRemove.insert(Insts.front());
         LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
       }
-      // Else, if the block uses a vpt, iterate over the block, removing the
-      // extra VCTPs it may contain.
-      else if (Block.isVPT()) {
-        bool RemovedVCTP = false;
-        for (PredicatedMI &Elt : Block.getInsts()) {
-          MachineInstr *MI = Elt.MI;
-          if (isVCTP(MI)) {
-            LLVM_DEBUG(dbgs() << "ARM Loops: Removing VCTP: " << *MI);
-            LoLoop.ToRemove.insert(MI);
-            RemovedVCTP = true;
-            continue;
-          }
-        }
-        if (RemovedVCTP)
-          LoLoop.BlockMasksToRecompute.insert(Block.getPredicateThen());
-      }
-    } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP) && Block.isVPST()) {
-      // A vpt block starting with VPST, is only predicated upon vctp and has no
-      // internal vpr defs:
-      // - Remove vpst.
-      // - Unpredicate the remaining instructions.
-      LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getPredicateThen());
-      LoLoop.ToRemove.insert(Block.getPredicateThen());
-      for (auto &PredMI : Insts)
-        RemovePredicate(PredMI.MI);
-    }
-  }
-  LLVM_DEBUG(dbgs() << "ARM Loops: Removing remaining VCTPs...\n");
-  // Remove the "main" VCTP
-  LoLoop.ToRemove.insert(LoLoop.VCTP);
-  LLVM_DEBUG(dbgs() << " " << *LoLoop.VCTP);
-  // Remove remaining secondary VCTPs
-  for (MachineInstr *VCTP : LoLoop.SecondaryVCTPs) {
-    // All VCTPs that aren't marked for removal yet should be unpredicated ones.
-    // The predicated ones should have already been marked for removal when
-    // visiting the VPT blocks.
-    if (LoLoop.ToRemove.insert(VCTP).second) {
-      assert(getVPTInstrPredicate(*VCTP) == ARMVCC::None &&
-             "Removing Predicated VCTP without updating the block mask!");
-      LLVM_DEBUG(dbgs() << " " << *VCTP);
+    } else if (Block.containsVCTP()) {
+      // The vctp will be removed, so the block mask of the vp(s)t will need
+      // to be recomputed.
+      LoLoop.BlockMasksToRecompute.insert(Insts.front());
     }
   }
+
+  LoLoop.ToRemove.insert(LoLoop.VCTPs.begin(), LoLoop.VCTPs.end());
 }
 
 void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
@@ -464,28 +464,19 @@
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB5_1: @ %bb4
-; CHECK-NEXT:    add.w r12, r3, #3
-; CHECK-NEXT:    mov.w lr, #1
-; CHECK-NEXT:    bic r12, r12, #3
-; CHECK-NEXT:    sub.w r12, r12, #4
-; CHECK-NEXT:    add.w lr, lr, r12, lsr #2
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB5_2: @ %bb12
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vctp.32 r3
-; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vldrwt.u32 q0, [r0]
-; CHECK-NEXT:    vpttt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vptt.i32 ne, q0, zr
 ; CHECK-NEXT:    vcmpt.s32 le, q0, r2
-; CHECK-NEXT:    vctpt.32 r3
 ; CHECK-NEXT:    vldrwt.u32 q1, [r1], #16
 ; CHECK-NEXT:    add.w r12, r12, #4
-; CHECK-NEXT:    subs r3, #4
 ; CHECK-NEXT:    vmul.i32 q0, q1, q0
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vstrwt.32 q0, [r0], #16
-; CHECK-NEXT:    le lr, .LBB5_2
+; CHECK-NEXT:    letp lr, .LBB5_2
 ; CHECK-NEXT:  @ %bb.3: @ %bb32
 ; CHECK-NEXT:    pop {r7, pc}
 bb:
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -tail-predication=enabled %s -o - | FileCheck %s
+
+define dso_local arm_aapcs_vfpcc i32 @minmaxval4(i32* nocapture readonly %x, i32* nocapture %minp) {
+; CHECK-LABEL: minmaxval4:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    adr r3, .LCPI0_0
+; CHECK-NEXT:    mov.w lr, #3
+; CHECK-NEXT:    vldrw.u32 q2, [r3]
+; CHECK-NEXT:    vmov.i32 q0, #0x80000000
+; CHECK-NEXT:    vmvn.i32 q1, #0x80000000
+; CHECK-NEXT:    movs r2, #0
+; CHECK-NEXT:    vmov.i32 q3, #0xa
+; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:  .LBB0_1: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vadd.i32 q4, q2, r2
+; CHECK-NEXT:    vdup.32 q5, r2
+; CHECK-NEXT:    vcmp.u32 hi, q5, q4
+; CHECK-NEXT:    adds r2, #4
+; CHECK-NEXT:    vpnot
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vcmpt.u32 hi, q3, q4
+; CHECK-NEXT:    vstr p0, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrwt.u32 q4, [r0], #16
+; CHECK-NEXT:    vldr p0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vcmpt.s32 gt, q4, q0
+; CHECK-NEXT:    vpsel q0, q4, q0
+; CHECK-NEXT:    vldr p0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vcmpt.s32 gt, q1, q4
+; CHECK-NEXT:    vpsel q1, q4, q1
+; CHECK-NEXT:    le lr, .LBB0_1
+; CHECK-NEXT:  @ %bb.2: @ %middle.block
+; CHECK-NEXT:    mvn r0, #-2147483648
+; CHECK-NEXT:    vminv.s32 r0, q1
+; CHECK-NEXT:    str r0, [r1]
+; CHECK-NEXT:    mov.w r0, #-2147483648
+; CHECK-NEXT:    vmaxv.s32 r0, q0
+; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  @ %bb.3:
+; CHECK-NEXT:  .LCPI0_0:
+; CHECK-NEXT:    .long 0 @ 0x0
+; CHECK-NEXT:    .long 1 @ 0x1
+; CHECK-NEXT:    .long 2 @ 0x2
+; CHECK-NEXT:    .long 3 @ 0x3
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
+  %vec.phi = phi <4 x i32> [ <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, %entry ], [ %5, %vector.body ]
+  %vec.phi29 = phi <4 x i32> [ <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, %entry ], [ %7, %vector.body ]
+  %0 = getelementptr inbounds i32, i32* %x, i32 %index
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 10)
+  %1 = bitcast i32* %0 to <4 x i32>*
+  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
+  %2 = icmp sgt <4 x i32> %wide.masked.load, %vec.phi29
+  %3 = icmp slt <4 x i32> %wide.masked.load, %vec.phi
+  %4 = and <4 x i1> %active.lane.mask, %3
+  %5 = select <4 x i1> %4, <4 x i32> %wide.masked.load, <4 x i32> %vec.phi
+  %6 = and <4 x i1> %active.lane.mask, %2
+  %7 = select <4 x i1> %6, <4 x i32> %wide.masked.load, <4 x i32> %vec.phi29
+  %index.next = add i32 %index, 4
+  %8 = icmp eq i32 %index.next, 12
+  br i1 %8, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %9 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %7)
+  %10 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %5)
+  store i32 %10, i32* %minp, align 4
+  ret i32 %9
+}
+
+declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) #1
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #2
+declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>) #3
+declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>) #3
+
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir
@@ -118,32 +118,23 @@
   ; CHECK: bb.1.bb3:
   ; CHECK:   successors: %bb.2(0x80000000)
   ; CHECK:   liveins: $r0, $r1, $r2, $r3
-  ; CHECK:   renamable $r12 = t2ADDri renamable $r2, 3, 14 /* CC::al */, $noreg, $noreg
-  ; CHECK:   renamable $lr = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg
-  ; CHECK:   renamable $r12 = t2BICri killed renamable $r12, 3, 14 /* CC::al */, $noreg, $noreg
   ; CHECK:   $vpr = VMSR_P0 killed $r3, 14 /* CC::al */, $noreg
-  ; CHECK:   renamable $r12 = t2SUBri killed renamable $r12, 4, 14 /* CC::al */, $noreg, $noreg
   ; CHECK:   VSTR_P0_off killed renamable $vpr, $sp, 0, 14 /* CC::al */, $noreg :: (store 4 into %stack.0)
   ; CHECK:   $r3 = tMOVr $r0, 14 /* CC::al */, $noreg
-  ; CHECK:   renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
-  ; CHECK:   $lr = t2DLS killed renamable $lr
+  ; CHECK:   $lr = MVE_DLSTP_32 killed renamable $r2
   ; CHECK: bb.2.bb9:
   ; CHECK:   successors: %bb.2(0x7c000000), %bb.3(0x04000000)
-  ; CHECK:   liveins: $lr, $r0, $r1, $r2, $r3
+  ; CHECK:   liveins: $lr, $r0, $r1, $r3
   ; CHECK:   renamable $vpr = VLDR_P0_off $sp, 0, 14 /* CC::al */, $noreg :: (load 4 from %stack.0)
-  ; CHECK:   MVE_VPST 4, implicit $vpr
-  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r2, 1, killed renamable $vpr
+  ; CHECK:   MVE_VPST 8, implicit $vpr
   ; CHECK:   renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4)
-  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg
-  ; CHECK:   renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg
-  ; CHECK:   MVE_VPST 4, implicit $vpr
-  ; CHECK:   renamable $vpr = MVE_VCMPi32r renamable $q0, $zr, 1, 1, killed renamable $vpr
+  ; CHECK:   MVE_VPTv4i32r 8, renamable $q0, $zr, 1, implicit-def $vpr
   ; CHECK:   renamable $r3, renamable $q1 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4)
   ; CHECK:   renamable $q0 = nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
   ; CHECK:   MVE_VPST 8, implicit $vpr
   ; CHECK:   MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4)
   ; CHECK:   $r0 = tMOVr $r3, 14 /* CC::al */, $noreg
-  ; CHECK:   $lr = t2LEUpdate killed renamable $lr, %bb.2
+  ; CHECK:   $lr = MVE_LETP killed renamable $lr, %bb.2
   ; CHECK: bb.3.bb27:
   ; CHECK:   $sp = tADDspi $sp, 1, 14 /* CC::al */, $noreg
   ; CHECK:   tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir
@@ -215,26 +215,17 @@
   ; CHECK: bb.1.vector.ph:
   ; CHECK:   successors: %bb.2(0x80000000)
   ; CHECK:   liveins: $r0, $r1, $r2
-  ; CHECK:   renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg
   ; CHECK:   renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
-  ; CHECK:   renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg
-  ; CHECK:   renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg
-  ; CHECK:   renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
-  ; CHECK:   renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
   ; CHECK:   renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg
-  ; CHECK:   $lr = t2DLS killed renamable $lr
+  ; CHECK:   $lr = MVE_DLSTP_32 killed renamable $r1
   ; CHECK: bb.2.vector.body:
   ; CHECK:   successors: %bb.2(0x7c000000), %bb.3(0x04000000)
-  ; CHECK:   liveins: $lr, $q0, $r0, $r1, $r2, $r3
-  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg
-  ; CHECK:   MVE_VPST 8, implicit $vpr
-  ; CHECK:   renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 1, killed renamable $vpr
-  ; CHECK:   MVE_VPTv4s32r 2, renamable $q1, renamable $r2, 11, implicit-def $vpr
+  ; CHECK:   liveins: $lr, $q0, $r0, $r2, $r3
+  ; CHECK:   renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 0, killed $noreg
+  ; CHECK:   MVE_VPTv4s32r 4, renamable $q1, renamable $r2, 11, implicit-def $vpr
   ; CHECK:   renamable $vpr = MVE_VCMPs32r killed renamable $q1, renamable $r3, 12, 1, killed renamable $vpr
-  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r1, 1, killed renamable $vpr
   ; CHECK:   renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr
-  ; CHECK:   renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg
-  ; CHECK:   $lr = t2LEUpdate killed renamable $lr, %bb.2
+  ; CHECK:   $lr = MVE_LETP killed renamable $lr, %bb.2
   ; CHECK: bb.3.for.cond.cleanup:
   ; CHECK:   frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc
   bb.0.entry:
@@ -731,26 +722,17 @@
   ; CHECK: bb.1.vector.ph:
   ; CHECK:   successors: %bb.2(0x80000000)
   ; CHECK:   liveins: $r0, $r1, $r2
-  ; CHECK:   renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg
   ; CHECK:   renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
- ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg - ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg - ; CHECK: $lr = t2DLS killed renamable $lr + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r1 ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r3 - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg - ; CHECK: MVE_VPST 8, implicit $vpr - ; CHECK: renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 1, killed renamable $vpr - ; CHECK: MVE_VPTv4s32r 2, renamable $q0, renamable $r2, 11, implicit-def $vpr + ; CHECK: liveins: $lr, $q0, $r0, $r2, $r3 + ; CHECK: renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 0, killed $noreg + ; CHECK: MVE_VPTv4s32r 4, renamable $q0, renamable $r2, 11, implicit-def $vpr ; CHECK: renamable $vpr = MVE_VCMPs32r killed renamable $q1, renamable $r3, 12, 1, killed renamable $vpr - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r1, 1, killed renamable $vpr ; CHECK: renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr - ; CHECK: renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 ; CHECK: bb.3.for.cond.cleanup: ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc bb.0.entry:
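---
Reviewer note: to make the predicate bookkeeping in the new VPTState concrete, below is a minimal, self-contained C++ sketch of the idea. It is not LLVM code: the Inst struct, its flags, the instruction names and the driver loop are all invented for illustration; the real pass walks MachineInstr operands and MCInstrDescs, and uses SetVector/SmallVector rather than std::set/std::vector.

// Toy model of VPTState: track the running VPR predicate set and, per
// VPT block, which instructions the block entry is predicated upon.
// Compile with -std=c++14 or later.
#include <cassert>
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

struct Inst {
  std::string Name;
  bool IsVCTP = false;      // computes an element-count predicate
  bool DefinesVPR = false;  // writes the predicate register
  bool StartsBlock = false; // a VPT/VPST: opens a block of <= 4 insts
};

int main() {
  // Straight-line stand-in for a single-block loop body:
  // vctp -> vpst -> two predicated instructions.
  std::vector<Inst> Body = {
      {"vctp.32", /*IsVCTP=*/true, /*DefinesVPR=*/true, false},
      {"vpst", false, false, /*StartsBlock=*/true},
      {"vldrwt.u32", false, false, false},
      {"vstrwt.32", false, false, false},
  };

  std::set<const Inst *> CurrentPredicates; // like VPTState::CurrentPredicates
  std::map<const Inst *, std::set<const Inst *>> PredicatedInsts;
  std::vector<std::vector<const Inst *>> Blocks; // like VPTState::Blocks

  for (const Inst &I : Body) {
    if (I.DefinesVPR) {
      // An unpredicated VPR def resets the running predicate set
      // (resetPredicate); a predicated def would be added to it instead.
      CurrentPredicates.clear();
      CurrentPredicates.insert(&I);
    }
    if (I.StartsBlock) {
      // Mirrors CreateVPTBlock: the block entry is predicated upon the
      // current set of VPR-defining instructions.
      assert(!CurrentPredicates.empty() && "Can't begin VPT without predicate");
      Blocks.push_back({&I});
      PredicatedInsts[&I] = CurrentPredicates;
    } else if (!Blocks.empty() && !I.DefinesVPR) {
      // Mirrors addInst: record the predicate each block member executes under.
      Blocks.back().push_back(&I);
      PredicatedInsts[&I] = CurrentPredicates;
    }
  }

  // A block whose entry is predicated exclusively on a vctp is a
  // tail-predication candidate.
  for (const auto &Block : Blocks) {
    const auto &Preds = PredicatedInsts[Block.front()];
    bool OnVCTP = Preds.size() == 1 && (*Preds.begin())->IsVCTP;
    std::cout << Block.front()->Name << " entry-predicated-on-vctp: "
              << (OnVCTP ? "yes" : "no") << '\n';
  }
  return 0;
}

Run, this prints "vpst entry-predicated-on-vctp: yes", mirroring the isEntryPredicatedOnVCTP(Block, /*exclusive*/true) case that ConvertVPTBlocks handles above by deleting the VPST and unpredicating the block, exactly what the dlstp/letp test updates in this patch show.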