Index: llvm/lib/Target/ARM/MVETailPredication.cpp
===================================================================
--- llvm/lib/Target/ARM/MVETailPredication.cpp
+++ llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -122,6 +122,10 @@
   bool runOnLoop(Loop *L, LPPassManager&) override;
 
 private:
+  /// Finds and fixes a pattern generated by the LSR pass which interferes
+  /// with tail predication.
+  void FixLSRPattern(Loop *L, ScalarEvolution &SE);
+
   /// Perform the relevant checks on the loop and convert if possible.
   bool TryConvert(Value *TripCount);
 
@@ -150,6 +154,126 @@
 
 } // end namespace
 
+static bool IsVCTPIntrinsic(const Value *V) {
+  const IntrinsicInst *Call = dyn_cast<IntrinsicInst>(V);
+  if (!Call)
+    return false;
+
+  Intrinsic::ID ID = Call->getIntrinsicID();
+  return ID == Intrinsic::arm_mve_vctp8 || ID == Intrinsic::arm_mve_vctp16 ||
+         ID == Intrinsic::arm_mve_vctp32 || ID == Intrinsic::arm_mve_vctp64;
+}
+
+// This function attempts to find and rewrite a pattern that the LSR
+// pass generates in some situations and which can't be tail-predicated.
+//
+// Here is an example of IR that LSR can generate:
+//
+// loopbody:
+//   %lsr.iv = phi i32 [ %lsr.iv.next, %loopbody ], [ %42, %pred ]
+//   %44 = add i32 %lsr.iv, -4
+//   %45 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %44) #5
+//   ; ... etc
+//   %lsr.iv.next = add nsw i32 %lsr.iv, -4
+//
+// That can't be tail-predicated because the VCTP's operand is defined in the
+// loop, so this function will transform it into:
+//
+// pred:
+//   %44 = add i32 %42, -4
+// loopbody:
+//   %lsr.iv = phi i32 [ %lsr.iv.next, %loopbody ], [ %42, %pred ]
+//   %lsr.fixed = phi i32 [ %lsr.iv.next, %loopbody ], [ %44, %pred ]
+//   %45 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %lsr.fixed)
+//   ; ... etc
+//   %lsr.iv.next = add nsw i32 %lsr.iv, -4
+//
+// Which is equivalent to what LSR generated, but can be tail-predicated just
+// fine.
+void MVETailPredication::FixLSRPattern(Loop *L, ScalarEvolution &SE) {
+  // This pattern has only been found in single-block loops for now, so let's
+  // keep this simple.
+  if (L->getBlocks().size() != 1)
+    return;
+  BasicBlock *BodyBB = L->getHeader();
+  BasicBlock *LoopPredBB = L->getLoopPredecessor();
+  if (!LoopPredBB)
+    return;
+
+  // VCTPAdd should be an add of a PHI value and a constant, used only by a
+  // VCTP intrinsic.
+  Instruction *VCTPAdd = nullptr;
+  // The VCTP that uses VCTPAdd.
+  Instruction *VCTP = nullptr;
+  // PHIAdd should be an add with the same type/operands as VCTPAdd, used only
+  // by a PHI instruction in the loop's body.
+  Instruction *PHIAdd = nullptr;
+
+  for (Instruction &I : BodyBB->instructionsWithoutDebug()) {
+    // We're only interested in single-use adds.
+    if (!I.hasOneUse() || I.getOpcode() != Instruction::Add)
+      continue;
+    Value *User = *I.user_begin();
+    Value *Op0 = I.getOperand(0);
+    Value *Op1 = I.getOperand(1);
+
+    if (!isa<PHINode>(Op0) || !isa<Constant>(Op1))
+      continue;
+
+    if (!VCTPAdd) {
+      if (IsVCTPIntrinsic(User)) {
+        VCTPAdd = &I;
+        VCTP = cast<Instruction>(User);
+      }
+      continue;
+    }
+
+    PHINode *PHIUser = dyn_cast<PHINode>(User);
+    if (!PHIUser || PHIUser->getParent() != BodyBB)
+      continue;
+
+    // Check if this add is equivalent to VCTPAdd.
+    if (I.getType() != VCTPAdd->getType() || Op0 != VCTPAdd->getOperand(0) ||
+        Op1 != VCTPAdd->getOperand(1))
+      continue;
+
+    PHIAdd = &I;
+    break;
+  }
+
+  if (!VCTPAdd || !PHIAdd)
+    return;
+
+  // Move VCTPAdd into the loop's predecessor.
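+  // Note that, right after the move, VCTPAdd still uses the PHI defined in
+  // the loop's body, so the IR is temporarily invalid; the setOperand call
+  // below rewrites the operand to the predecessor's incoming value.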
+  VCTPAdd->moveBefore(LoopPredBB->getTerminator());
+  LLVM_DEBUG(dbgs() << "FixLSRPattern: Moving" << *VCTPAdd
+                    << "\n into the loop's predecessor, before"
+                    << *LoopPredBB->getTerminator() << "\n");
+
+  // Replace VCTPAdd's operand with the PHI's incoming value from the loop's
+  // predecessor.
+  LLVM_DEBUG(dbgs() << "FixLSRPattern: Fixing:" << *VCTPAdd << "\n");
+  PHINode *PHIOperand = cast<PHINode>(VCTPAdd->getOperand(0));
+  VCTPAdd->setOperand(0, PHIOperand->getIncomingValueForBlock(LoopPredBB));
+  LLVM_DEBUG(dbgs() << " ... done:" << *VCTPAdd << "\n");
+
+  // Insert a new PHI for PHIAdd/VCTPAdd, and replace the uses of VCTPAdd
+  // within the loop body with that new PHI.
+  PHINode *NewPHI = PHINode::Create(VCTPAdd->getType(), 2, "lsr.fixed",
+                                    BodyBB->getFirstNonPHI());
+  NewPHI->addIncoming(VCTPAdd, LoopPredBB);
+  NewPHI->addIncoming(PHIAdd, BodyBB);
+  LLVM_DEBUG(dbgs() << "FixLSRPattern: Added PHI:" << *NewPHI << "\n");
+
+  // Fix the VCTP that uses VCTPAdd so it uses the new PHI instead.
+  LLVM_DEBUG(dbgs() << "FixLSRPattern: Fixing:" << *VCTP << "\n");
+  VCTP->replaceUsesOfWith(VCTPAdd, NewPHI);
+  LLVM_DEBUG(dbgs() << " ... done:" << *VCTP << "\n");
+}
+
 static bool IsDecrement(Instruction &I) {
   auto *Call = dyn_cast<IntrinsicInst>(&I);
   if (!Call)
@@ -204,6 +328,8 @@
     return false;
   }
 
+  FixLSRPattern(L, *SE);
+
   BasicBlock *Preheader = L->getLoopPreheader();
   if (!Preheader)
     return false;
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tp-lsr-patterns.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tp-lsr-patterns.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mve-tail-predication -disable-mve-tail-predication=false %s -S -o - | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m.main-none-none-eabi"
+
+define dso_local arm_aapcs_vfpcc void @arm_cmplx_abs_sum_real_f32(float* %0, i32 %1) local_unnamed_addr #0 {
+; CHECK-LABEL: @arm_cmplx_abs_sum_real_f32(
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint float* [[TMP0:%.*]] to i32
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> undef, i32 [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i32> [[TMP6]],
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP4]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[TMP1:%.*]], 3
+; CHECK-NEXT:    [[TMP11:%.*]] = add i32 [[TMP1]], 2
+; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[TMP1]], -1
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp slt i32 [[TMP12]], 4
+; CHECK-NEXT:    [[SMIN:%.*]] = select i1 [[TMP13]], i32 [[TMP12]], i32 4
+; CHECK-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP11]], [[SMIN]]
+; CHECK-NEXT:    [[TMP15:%.*]] = lshr i32 [[TMP14]], 2
+; CHECK-NEXT:    [[TMP16:%.*]] = add nuw nsw i32 [[TMP15]], 1
+; CHECK-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[TMP16]])
+; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP10]], -4
+; CHECK-NEXT:    br label [[TMP18:%.*]]
+; CHECK:       18:
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[TMP18]] ], [ [[TMP10]], [[TMP2:%.*]] ]
+; CHECK-NEXT:    [[TMP19:%.*]] = phi <4 x float> [ zeroinitializer, [[TMP2]] ], [ [[TMP26:%.*]], [[TMP18]] ]
+; CHECK-NEXT:    [[TMP20:%.*]] = phi <4 x i32> [ [[TMP9]], [[TMP2]] ], [ [[TMP24:%.*]], [[TMP18]] ]
+; CHECK-NEXT:    [[TMP21:%.*]] = phi i32 [ [[TMP16]], [[TMP2]] ], [ [[TMP27:%.*]], [[TMP18]] ]
+; CHECK-NEXT:    [[LSR_FIXED:%.*]] = phi i32 [ [[TMP17]], [[TMP2]] ], [ [[LSR_IV_NEXT]], [[TMP18]] ]
+; CHECK-NEXT:    [[TMP22:%.*]] = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 [[LSR_FIXED]])
+; CHECK-NEXT:    [[TMP23:%.*]] = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP20]], i32 32, <4 x i1> [[TMP22]])
+; CHECK-NEXT:    [[TMP24]] = extractvalue { <4 x float>, <4 x i32> } [[TMP23]], 1
+; CHECK-NEXT:    [[TMP25:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP23]], 0
+; CHECK-NEXT:    [[TMP26]] = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[TMP19]], <4 x float> [[TMP25]], <4 x i1> [[TMP22]], <4 x float> [[TMP19]])
+; CHECK-NEXT:    [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], -4
+; CHECK-NEXT:    [[TMP27]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[TMP21]], i32 1)
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0
+; CHECK-NEXT:    br i1 [[TMP28]], label [[TMP18]], label [[TMP29:%.*]]
+; CHECK:       29:
+; CHECK-NEXT:    ret void
+;
+  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+  %4 = extractvalue { <4 x i32>, i32 } %3, 0
+  %5 = ptrtoint float* %0 to i32
+  %6 = insertelement <4 x i32> undef, i32 %5, i32 0
+  %7 = add <4 x i32> %6,
+  %8 = shufflevector <4 x i32> %7, <4 x i32> undef, <4 x i32> zeroinitializer
+  %9 = add <4 x i32> %4, %8
+  %10 = add i32 %1, 3
+  %11 = add i32 %1, 2
+  %12 = add i32 %1, -1
+  %13 = icmp slt i32 %12, 4
+  %smin = select i1 %13, i32 %12, i32 4
+  %14 = sub i32 %11, %smin
+  %15 = lshr i32 %14, 2
+  %16 = add nuw nsw i32 %15, 1
+  call void @llvm.set.loop.iterations.i32(i32 %16)
+  br label %17
+
+17:                                               ; preds = %17, %2
+  %lsr.iv = phi i32 [ %lsr.iv.next, %17 ], [ %10, %2 ]
+  %18 = phi <4 x float> [ zeroinitializer, %2 ], [ %26, %17 ]
+  %19 = phi <4 x i32> [ %9, %2 ], [ %24, %17 ]
+  %20 = phi i32 [ %16, %2 ], [ %27, %17 ]
+  %21 = add i32 %lsr.iv, -4
+  %22 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %21)
+  %23 = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %19, i32 32, <4 x i1> %22)
+  %24 = extractvalue { <4 x float>, <4 x i32> } %23, 1
+  %25 = extractvalue { <4 x float>, <4 x i32> } %23, 0
+  %26 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %18, <4 x float> %25, <4 x i1> %22, <4 x float> %18)
+  %lsr.iv.next = add i32 %lsr.iv, -4
+  %27 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %20, i32 1)
+  %28 = icmp ne i32 %27, 0
+  br i1 %28, label %17, label %29
+
+29:                                               ; preds = %17
+  ret void
+}
+
+declare { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32, i32) #1
+declare <4 x i1> @llvm.arm.mve.vctp32(i32) #1
+declare { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32>, i32, <4 x i1>) #2
+declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #1
+declare void @llvm.set.loop.iterations.i32(i32) #3
+declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-dotprod,-fp16fml,-hwdiv-arm,-sb,-sha2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
+attributes #3 = { noduplicate nounwind }