diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp
--- a/llvm/lib/Target/ARM/MVETailPredication.cpp
+++ b/llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -79,14 +79,9 @@
 class MVETailPredication : public LoopPass {
   SmallVector<IntrinsicInst*, 4> MaskedInsts;
   Loop *L = nullptr;
-  LoopInfo *LI = nullptr;
-  const DataLayout *DL;
-  DominatorTree *DT = nullptr;
   ScalarEvolution *SE = nullptr;
   TargetTransformInfo *TTI = nullptr;
   const ARMSubtarget *ST = nullptr;
-  TargetLibraryInfo *TLI = nullptr;
-  bool ClonedVCTPInExitBlock = false;
 
 public:
   static char ID;
@@ -98,8 +93,6 @@
     AU.addRequired<LoopInfoWrapperPass>();
     AU.addRequired<TargetPassConfig>();
     AU.addRequired<TargetTransformInfoWrapperPass>();
-    AU.addRequired<DominatorTreeWrapperPass>();
-    AU.addRequired<TargetLibraryInfoWrapperPass>();
     AU.addPreserved<LoopInfoWrapperPass>();
     AU.setPreservesCFG();
   }
@@ -123,8 +116,7 @@
 
   /// Insert the intrinsic to represent the effect of tail predication.
   void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount,
-                           FixedVectorType *VecTy,
-                           DenseMap<Instruction *, Instruction *> &NewPredicates);
+                           FixedVectorType *VecTy);
 
   /// Rematerialize the iteration count in exit blocks, which enables
   /// ARMLowOverheadLoops to better optimise away loop update statements inside
@@ -153,16 +145,6 @@
   return ID == Intrinsic::masked_store || ID == Intrinsic::masked_load;
 }
 
-void MVETailPredication::RematerializeIterCount() {
-  SmallVector<WeakTrackingVH, 16> DeadInsts;
-  SCEVExpander Rewriter(*SE, *DL, "mvetp");
-  ReplaceExitVal ReplaceExitValue = AlwaysRepl;
-
-  formLCSSARecursively(*L, *DT, LI, SE);
-  rewriteLoopExitValues(L, LI, TLI, SE, TTI, Rewriter, DT, ReplaceExitValue,
-                        DeadInsts);
-}
-
 bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
   if (skipLoop(L) || DisableTailPredication)
     return false;
@@ -172,13 +154,8 @@
   auto &TPC = getAnalysis<TargetPassConfig>();
   auto &TM = TPC.getTM<TargetMachine>();
   ST = &TM.getSubtarget<ARMSubtarget>(F);
-  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
   SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
-  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
-  TLI = TLIP ? &TLIP->getTLI(*L->getHeader()->getParent()) : nullptr;
-  DL = &L->getHeader()->getModule()->getDataLayout();
   this->L = L;
 
   // The MVE and LOB extensions are combined to enable tail-predication, but
@@ -232,7 +209,6 @@
   if (!Decrement)
     return false;
 
-  ClonedVCTPInExitBlock = false;
   LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n"
              << *Decrement << "\n");
 
@@ -241,8 +217,6 @@
     return false;
   }
 
-  if (ClonedVCTPInExitBlock)
-    RematerializeIterCount();
   return true;
 }
 
@@ -319,32 +293,11 @@
 // in the block. This means that the VPR doesn't have to be live into the
 // exit block which should make it easier to convert this loop into a proper
 // tail predicated loop.
-static bool Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
-                    SetVector<Instruction*> &MaybeDead, Loop *L) {
+static void Cleanup(SetVector<Instruction*> &MaybeDead, Loop *L) {
   BasicBlock *Exit = L->getUniqueExitBlock();
   if (!Exit) {
     LLVM_DEBUG(dbgs() << "ARM TP: can't find loop exit block\n");
-    return false;
-  }
-
-  bool ClonedVCTPInExitBlock = false;
-
-  for (auto &Pair : NewPredicates) {
-    Instruction *OldPred = Pair.first;
-    Instruction *NewPred = Pair.second;
-
-    for (auto &I : *Exit) {
-      if (I.isSameOperationAs(OldPred)) {
-        Instruction *PredClone = NewPred->clone();
-        PredClone->insertBefore(&I);
-        I.replaceAllUsesWith(PredClone);
-        MaybeDead.insert(&I);
-        ClonedVCTPInExitBlock = true;
-        LLVM_DEBUG(dbgs() << "ARM TP: replacing: "; I.dump();
-                   dbgs() << "ARM TP: with:      "; PredClone->dump());
-        break;
-      }
-    }
+    return;
   }
 
   // Drop references and add operands to check for dead.
@@ -369,8 +322,6 @@
 
   for (auto I : L->blocks())
     DeleteDeadPHIs(I);
-
-  return ClonedVCTPInExitBlock;
 }
 
 // The active lane intrinsic has this form:
@@ -549,8 +500,7 @@
 }
 
 void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask,
-    Value *TripCount, FixedVectorType *VecTy,
-    DenseMap<Instruction*, Instruction*> &NewPredicates) {
+    Value *TripCount, FixedVectorType *VecTy) {
   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
   Module *M = L->getHeader()->getModule();
   Type *Ty = IntegerType::get(M->getContext(), 32);
@@ -591,7 +541,6 @@
   Function *VCTP = Intrinsic::getDeclaration(M, VCTPID);
   Value *VCTPCall = Builder.CreateCall(VCTP, Processed);
   ActiveLaneMask->replaceAllUsesWith(VCTPCall);
-  NewPredicates[ActiveLaneMask] = cast<Instruction>(VCTPCall);
 
   // Add the incoming value to the new phi.
   // TODO: This add likely already exists in the loop.
@@ -609,9 +558,7 @@
   }
 
   LLVM_DEBUG(dbgs() << "ARM TP: Found predicated vector loop.\n");
-
   SetVector<Instruction*> Predicates;
-  DenseMap<Instruction*, Instruction*> NewPredicates;
 
   // Walk through the masked intrinsics and try to find whether the predicate
   // operand is generated by intrinsic @llvm.get.active.lane.mask().
@@ -636,11 +583,10 @@
       return false;
     }
     LLVM_DEBUG(dbgs() << "ARM TP: Safe to insert VCTP.\n");
-    InsertVCTPIntrinsic(ActiveLaneMask, TripCount, VecTy, NewPredicates);
+    InsertVCTPIntrinsic(ActiveLaneMask, TripCount, VecTy);
   }
 
-  // Now clean up.
-  ClonedVCTPInExitBlock = Cleanup(NewPredicates, Predicates, L);
+  Cleanup(Predicates, L);
   return true;
 }
 
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
@@ -15,8 +15,7 @@
 ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]],
 
 ; CHECK: middle.block:
-; CHECK: [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[REMAT_ITER:%.*]])
-; CHECK: [[VPSEL:%[^ ]+]] = select <4 x i1> [[VCTP_CLONE]],
+; CHECK: [[VPSEL:%[^ ]+]] = select <4 x i1> [[VCTP]],
 ; CHECK: call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VPSEL]])
 
 define i32 @vec_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
@@ -64,16 +63,12 @@
   br i1 %11, label %vector.body, label %middle.block
 
 middle.block:                                     ; preds = %vector.body
-; TODO: check that the intrinsic is also emitted here by the loop vectoriser
-;  %12 = icmp ule <4 x i32> %induction, %broadcast.splat12
-  %12 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
-
-  %13 = select <4 x i1> %12, <4 x i32> %9, <4 x i32> %vec.phi
-  %14 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %13)
+  %12 = select <4 x i1> %7, <4 x i32> %9, <4 x i32> %vec.phi
+  %13 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %12)
   br label %for.cond.cleanup
 
 for.cond.cleanup:                                 ; preds = %middle.block, %entry
-  %res.0.lcssa = phi i32 [ 0, %entry ], [ %14, %middle.block ]
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %13, %middle.block ]
   ret i32 %res.0.lcssa
 }