Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -513,6 +513,9 @@
   bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
                      TargetTransformInfo::LSRCost &C2) const;
 
+  /// Return true if LSR is allowed to change \p I's operands.
+  bool canLSRFixupInstruction(Instruction *I) const;
+
   /// Return true if the target can fuse a compare and branch.
   /// Loop-strength-reduction (LSR) uses that knowledge to adjust its cost
   /// calculation for the instructions in a loop.
@@ -1210,6 +1213,7 @@
                              Instruction *I) = 0;
   virtual bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
                              TargetTransformInfo::LSRCost &C2) = 0;
+  virtual bool canLSRFixupInstruction(Instruction *I) = 0;
   virtual bool canMacroFuseCmp() = 0;
   virtual bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE,
                           LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC,
@@ -1495,6 +1499,9 @@
                      TargetTransformInfo::LSRCost &C2) override {
     return Impl.isLSRCostLess(C1, C2);
   }
+  bool canLSRFixupInstruction(Instruction *I) override {
+    return Impl.canLSRFixupInstruction(I);
+  }
   bool canMacroFuseCmp() override { return Impl.canMacroFuseCmp(); }
   bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI,
                   DominatorTree *DT, AssumptionCache *AC,
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -165,6 +165,8 @@
             C2.ScaleCost, C2.ImmCost, C2.SetupCost);
   }
 
+  bool canLSRFixupInstruction(Instruction *I) { return true; }
+
   bool canMacroFuseCmp() { return false; }
 
   bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI,
Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -262,6 +262,10 @@
     return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
   }
 
+  bool canLSRFixupInstruction(Instruction *I) {
+    return TargetTransformInfoImplBase::canLSRFixupInstruction(I);
+  }
+
   int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                            bool HasBaseReg, int64_t Scale, unsigned AddrSpace) {
     TargetLoweringBase::AddrMode AM;
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -259,6 +259,10 @@
   return TTIImpl->isLSRCostLess(C1, C2);
 }
 
+bool TargetTransformInfo::canLSRFixupInstruction(Instruction *I) const {
+  return TTIImpl->canLSRFixupInstruction(I);
+}
+
 bool TargetTransformInfo::canMacroFuseCmp() const {
   return TTIImpl->canMacroFuseCmp();
 }
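For context on the hook's contract, a minimal sketch of the kind of rewrite it gates (hypothetical IR, not taken from this patch): LSR's fixup step may replace an instruction's use of the original induction variable with an equivalent expression derived from an IV that LSR itself introduces.

  ; Before LSR: the intrinsic consumes the element-count IV directly.
  %elts = phi i32 [ %N, %entry ], [ %elts.next, %loop ]
  %p = call <4 x i1> @llvm.arm.mve.vctp32(i32 %elts)
  %elts.next = add nsw i32 %elts, -4

  ; After a fixup: the operand is re-derived from an unrelated LSR IV.
  %lsr.iv = phi i32 [ %n.adjusted, %entry ], [ %lsr.iv.next, %loop ]
  %elts.derived = add i32 %lsr.iv, -4
  %p = call <4 x i1> @llvm.arm.mve.vctp32(i32 %elts.derived)

A target whose later passes pattern-match the first form can return false from canLSRFixupInstruction to keep such instructions out of the rewrite, as the ARM implementation below does for the MVE vctp intrinsics.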
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -150,6 +150,8 @@
     return ST->getMaxInterleaveFactor();
   }
 
+  bool canLSRFixupInstruction(Instruction *I);
+
   bool isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment);
 
   bool isLegalMaskedStore(Type *DataTy, MaybeAlign Alignment) {
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -21,6 +21,7 @@
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsARM.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Type.h"
 #include "llvm/MC/SubtargetFeature.h"
@@ -509,6 +510,23 @@
   return BaseT::getAddressComputationCost(Ty, SE, Ptr);
 }
 
+bool ARMTTIImpl::canLSRFixupInstruction(Instruction *I) {
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+    // Do not allow LSR to fix up a vctp intrinsic's operands; rewriting its
+    // element count could prevent tail predication.
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::arm_mve_vctp8:
+    case Intrinsic::arm_mve_vctp16:
+    case Intrinsic::arm_mve_vctp32:
+    case Intrinsic::arm_mve_vctp64:
+      return false;
+    default:
+      break;
+    }
+  }
+  return true;
+}
+
 bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) {
   if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
     return false;
Index: llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -2001,6 +2001,7 @@
   void GenerateCrossUseConstantOffsets();
   void GenerateAllReuseFormulae();
 
+  void FilterOutUndesirableUses();
   void FilterOutUndesirableDedicatedRegisters();
 
   size_t EstimateSearchSpaceComplexity() const;
@@ -4293,6 +4294,29 @@
       print_uses(dbgs()));
 }
 
+/// If TTI.canLSRFixupInstruction returns false for the UserInst of any of an
+/// LSRUse's Fixups, remove the LSRUse entirely.
+void LSRInstance::FilterOutUndesirableUses() {
+  auto Iter = Uses.begin();
+  while (Iter != Uses.end()) {
+    bool IsDead = false;
+    for (LSRFixup &Fixup : Iter->Fixups) {
+      if (!TTI.canLSRFixupInstruction(Fixup.UserInst)) {
+        IsDead = true;
+        break;
+      }
+    }
+
+    if (IsDead) {
+      LLVM_DEBUG(
+          dbgs() << "Discarding use that contains a fixup forbidden by TTI: ";
+          Iter->dump());
+      Iter = Uses.erase(Iter);
+    } else
+      ++Iter;
+  }
+}
+
 /// If there are multiple formulae with the same set of registers used
 /// by other uses, pick the best one and delete the others.
 void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
@@ -5571,6 +5595,7 @@
   // to formulate the values needed for the uses.
   GenerateAllReuseFormulae();
 
+  FilterOutUndesirableUses();
   FilterOutUndesirableDedicatedRegisters();
 
   NarrowSearchSpaceUsingHeuristics();
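The effect on a protected loop, sketched as hypothetical IR under the assumption that the target rejects the fixup (the autogenerated checks in the new test below show the real output): LSR still creates its own induction variable for the uses it may rewrite, while the use feeding the vctp keeps the original element-count PHI, so both IVs stay live in the loop.

  loop:
    ; IV created by LSR for the uses it did rewrite.
    %lsr.iv = phi i32 [ %lsr.iv.next, %loop ], [ %n.plus.3, %entry ]
    ; Original element-count IV, preserved for the vctp.
    %elts = phi i32 [ %elts.next, %loop ], [ %n.minus.1, %entry ]
    %p = call <4 x i1> @llvm.arm.mve.vctp32(i32 %elts)
    ...
    %elts.next = add nsw i32 %elts, -4
    %lsr.iv.next = add i32 %lsr.iv, -4
    %again = icmp sgt i32 %lsr.iv.next, 4
    br i1 %again, label %loop, label %exit

Erasing the whole LSRUse, rather than filtering individual formulae, means LSR never considers alternative expressions for these fixups at all; that is the conservative choice for instructions that later target passes pattern-match.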
Index: llvm/test/Transforms/LoopStrengthReduce/ARM/forbidden-fixups.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopStrengthReduce/ARM/forbidden-fixups.ll
@@ -0,0 +1,274 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=thumbv8.1m.main -mattr=+mve %s -S -loop-reduce -o - | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m-arm-none-eabi"
+
+define dso_local arm_aapcs_vfpcc float @vctp8(float* %0, i32 %1) local_unnamed_addr #0 {
+; CHECK-LABEL: @vctp8(
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP1:%.*]], -1
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint float* [[TMP0:%.*]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]],
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP4]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add i32 [[TMP1]], 3
+; CHECK-NEXT:    br label [[TMP12:%.*]]
+; CHECK:       12:
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[TMP12]] ], [ [[TMP11]], [[TMP2:%.*]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = phi i32 [ [[TMP5]], [[TMP2]] ], [ [[TMP21:%.*]], [[TMP12]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = phi <4 x float> [ zeroinitializer, [[TMP2]] ], [ [[TMP20:%.*]], [[TMP12]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = phi <4 x i32> [ [[TMP10]], [[TMP2]] ], [ [[TMP18:%.*]], [[TMP12]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = tail call <16 x i1> @llvm.arm.mve.vctp8(i32 [[TMP13]])
+; CHECK-NEXT:    [[MASK:%.*]] = tail call <4 x i1> @v16i1_to_v4i1(<16 x i1> [[TMP16]])
+; CHECK-NEXT:    [[TMP17:%.*]] = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP15]], i32 32, <4 x i1> [[MASK]])
+; CHECK-NEXT:    [[TMP18]] = extractvalue { <4 x float>, <4 x i32> } [[TMP17]], 1
+; CHECK-NEXT:    [[TMP19:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP17]], 0
+; CHECK-NEXT:    [[TMP20]] = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[TMP14]], <4 x float> [[TMP19]], <4 x i1> [[MASK]], <4 x float> [[TMP14]])
+; CHECK-NEXT:    [[TMP21]] = add nsw i32 [[TMP13]], -4
+; CHECK-NEXT:    [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], -4
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp sgt i32 [[LSR_IV_NEXT]], 4
+; CHECK-NEXT:    br i1 [[TMP22]], label [[TMP12]], label [[TMP23:%.*]]
+; CHECK:       23:
+; CHECK-NEXT:    [[TMP24:%.*]] = tail call arm_aapcs_vfpcc i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> [[TMP20]]) #4
+; CHECK-NEXT:    [[TMP25:%.*]] = sitofp i32 [[TMP24]] to float
+; CHECK-NEXT:    [[TMP26:%.*]] = tail call float @llvm.fabs.f32(float [[TMP25]])
+; CHECK-NEXT:    ret float [[TMP26]]
+;
+  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+  %4 = extractvalue { <4 x i32>, i32 } %3, 0
+  %5 = add nsw i32 %1, -1
+  %6 = ptrtoint float* %0 to i32
+  %7 = insertelement <4 x i32> undef, i32 %6, i32 0
+  %8 = add <4 x i32> %7,
+  %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer
+  %10 = add <4 x i32> %4, %9
+  br label %11
+
+11:                                               ; preds = %11, %2
+  %12 = phi i32 [ %5, %2 ], [ %20, %11 ]
+  %13 = phi <4 x float> [ zeroinitializer, %2 ], [ %19, %11 ]
+  %14 = phi <4 x i32> [ %10, %2 ], [ %17, %11 ]
+  %15 = tail call <16 x i1> @llvm.arm.mve.vctp8(i32 %12)
+  %mask = tail call <4 x i1> @v16i1_to_v4i1(<16 x i1> %15)
+  %16 = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %14, i32 32, <4 x i1> %mask)
+  %17 = extractvalue { <4 x float>, <4 x i32> } %16, 1
+  %18 = extractvalue { <4 x float>, <4 x i32> } %16, 0
+  %19 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %13, <4 x float> %18, <4 x i1> %mask, <4 x float> %13)
+  %20 = add nsw i32 %12, -4
+  %21 = icmp sgt i32 %12, 4
+  br i1 %21, label %11, label %22
+
+22:                                               ; preds = %11
+  %23 = tail call arm_aapcs_vfpcc i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> %19) #0
+  %24 = sitofp i32 %23 to float
+  %25 = tail call float @llvm.fabs.f32(float %24)
+  ret float %25
+}
+
+define dso_local arm_aapcs_vfpcc float @vctp16(float* %0, i32 %1) local_unnamed_addr #0 {
+; CHECK-LABEL: @vctp16(
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP1:%.*]], -1
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint float* [[TMP0:%.*]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]],
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP4]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add i32 [[TMP1]], 3
+; CHECK-NEXT:    br label [[TMP12:%.*]]
+; CHECK:       12:
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[TMP12]] ], [ [[TMP11]], [[TMP2:%.*]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = phi i32 [ [[TMP5]], [[TMP2]] ], [ [[TMP21:%.*]], [[TMP12]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = phi <4 x float> [ zeroinitializer, [[TMP2]] ], [ [[TMP20:%.*]], [[TMP12]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = phi <4 x i32> [ [[TMP10]], [[TMP2]] ], [ [[TMP18:%.*]], [[TMP12]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 [[TMP13]])
+; CHECK-NEXT:    [[MASK:%.*]] = tail call <4 x i1> @v8i1_to_v4i1(<8 x i1> [[TMP16]])
+; CHECK-NEXT:    [[TMP17:%.*]] = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP15]], i32 32, <4 x i1> [[MASK]])
+; CHECK-NEXT:    [[TMP18]] = extractvalue { <4 x float>, <4 x i32> } [[TMP17]], 1
+; CHECK-NEXT:    [[TMP19:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP17]], 0
+; CHECK-NEXT:    [[TMP20]] = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[TMP14]], <4 x float> [[TMP19]], <4 x i1> [[MASK]], <4 x float> [[TMP14]])
+; CHECK-NEXT:    [[TMP21]] = add nsw i32 [[TMP13]], -4
+; CHECK-NEXT:    [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], -4
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp sgt i32 [[LSR_IV_NEXT]], 4
+; CHECK-NEXT:    br i1 [[TMP22]], label [[TMP12]], label [[TMP23:%.*]]
+; CHECK:       23:
+; CHECK-NEXT:    [[TMP24:%.*]] = tail call arm_aapcs_vfpcc i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> [[TMP20]]) #4
+; CHECK-NEXT:    [[TMP25:%.*]] = sitofp i32 [[TMP24]] to float
+; CHECK-NEXT:    [[TMP26:%.*]] = tail call float @llvm.fabs.f32(float [[TMP25]])
+; CHECK-NEXT:    ret float [[TMP26]]
+;
+  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+  %4 = extractvalue { <4 x i32>, i32 } %3, 0
+  %5 = add nsw i32 %1, -1
+  %6 = ptrtoint float* %0 to i32
+  %7 = insertelement <4 x i32> undef, i32 %6, i32 0
+  %8 = add <4 x i32> %7,
+  %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer
+  %10 = add <4 x i32> %4, %9
+  br label %11
+
+11:                                               ; preds = %11, %2
+  %12 = phi i32 [ %5, %2 ], [ %20, %11 ]
+  %13 = phi <4 x float> [ zeroinitializer, %2 ], [ %19, %11 ]
+  %14 = phi <4 x i32> [ %10, %2 ], [ %17, %11 ]
+  %15 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %12)
+  %mask = tail call <4 x i1> @v8i1_to_v4i1(<8 x i1> %15)
+  %16 = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %14, i32 32, <4 x i1> %mask)
+  %17 = extractvalue { <4 x float>, <4 x i32> } %16, 1
+  %18 = extractvalue { <4 x float>, <4 x i32> } %16, 0
+  %19 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %13, <4 x float> %18, <4 x i1> %mask, <4 x float> %13)
+  %20 = add nsw i32 %12, -4
+  %21 = icmp sgt i32 %12, 4
+  br i1 %21, label %11, label %22
+
+22:                                               ; preds = %11
+  %23 = tail call arm_aapcs_vfpcc i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> %19) #0
+  %24 = sitofp i32 %23 to float
+  %25 = tail call float @llvm.fabs.f32(float %24)
+  ret float %25
+}
+
+define dso_local arm_aapcs_vfpcc float @vctpi32(float* %0, i32 %1) local_unnamed_addr #0 {
+; CHECK-LABEL: @vctpi32(
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP1:%.*]], -1
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint float* [[TMP0:%.*]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]],
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP4]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add i32 [[TMP1]], 3
+; CHECK-NEXT:    br label [[TMP12:%.*]]
+; CHECK:       12:
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[TMP12]] ], [ [[TMP11]], [[TMP2:%.*]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = phi i32 [ [[TMP5]], [[TMP2]] ], [ [[TMP21:%.*]], [[TMP12]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = phi <4 x float> [ zeroinitializer, [[TMP2]] ], [ [[TMP20:%.*]], [[TMP12]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = phi <4 x i32> [ [[TMP10]], [[TMP2]] ], [ [[TMP18:%.*]], [[TMP12]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP13]])
+; CHECK-NEXT:    [[TMP17:%.*]] = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP15]], i32 32, <4 x i1> [[TMP16]])
+; CHECK-NEXT:    [[TMP18]] = extractvalue { <4 x float>, <4 x i32> } [[TMP17]], 1
+; CHECK-NEXT:    [[TMP19:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP17]], 0
+; CHECK-NEXT:    [[TMP20]] = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[TMP14]], <4 x float> [[TMP19]], <4 x i1> [[TMP16]], <4 x float> [[TMP14]])
+; CHECK-NEXT:    [[TMP21]] = add nsw i32 [[TMP13]], -4
+; CHECK-NEXT:    [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], -4
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp sgt i32 [[LSR_IV_NEXT]], 4
+; CHECK-NEXT:    br i1 [[TMP22]], label [[TMP12]], label [[TMP23:%.*]]
+; CHECK:       23:
+; CHECK-NEXT:    [[TMP24:%.*]] = tail call arm_aapcs_vfpcc i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> [[TMP20]])
+; CHECK-NEXT:    [[TMP25:%.*]] = sitofp i32 [[TMP24]] to float
+; CHECK-NEXT:    [[TMP26:%.*]] = tail call float @llvm.fabs.f32(float [[TMP25]])
+; CHECK-NEXT:    ret float [[TMP26]]
+;
+  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+  %4 = extractvalue { <4 x i32>, i32 } %3, 0
+  %5 = add nsw i32 %1, -1
+  %6 = ptrtoint float* %0 to i32
+  %7 = insertelement <4 x i32> undef, i32 %6, i32 0
+  %8 = add <4 x i32> %7,
+  %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer
+  %10 = add <4 x i32> %4, %9
+  br label %11
+
+11:                                               ; preds = %11, %2
+  %12 = phi i32 [ %5, %2 ], [ %20, %11 ]
+  %13 = phi <4 x float> [ zeroinitializer, %2 ], [ %19, %11 ]
+  %14 = phi <4 x i32> [ %10, %2 ], [ %17, %11 ]
+  %15 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %12)
+  %16 = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %14, i32 32, <4 x i1> %15)
+  %17 = extractvalue { <4 x float>, <4 x i32> } %16, 1
+  %18 = extractvalue { <4 x float>, <4 x i32> } %16, 0
+  %19 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %13, <4 x float> %18, <4 x i1> %15, <4 x float> %13)
+  %20 = add nsw i32 %12, -4
+  %21 = icmp sgt i32 %12, 4
+  br i1 %21, label %11, label %22
+
+22:                                               ; preds = %11
+  %23 = tail call arm_aapcs_vfpcc i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> %19) #0
+  %24 = sitofp i32 %23 to float
+  %25 = tail call float @llvm.fabs.f32(float %24)
+  ret float %25
+}
+
+define dso_local arm_aapcs_vfpcc float @vctpi64(float* %0, i32 %1) local_unnamed_addr #0 {
+; CHECK-LABEL: @vctpi64(
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP1:%.*]], -1
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint float* [[TMP0:%.*]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]],
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP4]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add i32 [[TMP1]], 3
+; CHECK-NEXT:    br label [[TMP12:%.*]]
+; CHECK:       12:
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[TMP12]] ], [ [[TMP11]], [[TMP2:%.*]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = phi i32 [ [[TMP5]], [[TMP2]] ], [ [[TMP21:%.*]], [[TMP12]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = phi <4 x float> [ zeroinitializer, [[TMP2]] ], [ [[TMP20:%.*]], [[TMP12]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = phi <4 x i32> [ [[TMP10]], [[TMP2]] ], [ [[TMP18:%.*]], [[TMP12]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = tail call <4 x i1> @llvm.arm.mve.vctp64(i32 [[TMP13]])
+; CHECK-NEXT:    [[TMP17:%.*]] = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP15]], i32 32, <4 x i1> [[TMP16]])
+; CHECK-NEXT:    [[TMP18]] = extractvalue { <4 x float>, <4 x i32> } [[TMP17]], 1
+; CHECK-NEXT:    [[TMP19:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP17]], 0
+; CHECK-NEXT:    [[TMP20]] = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[TMP14]], <4 x float> [[TMP19]], <4 x i1> [[TMP16]], <4 x float> [[TMP14]])
+; CHECK-NEXT:    [[TMP21]] = add nsw i32 [[TMP13]], -4
+; CHECK-NEXT:    [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], -4
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp sgt i32 [[LSR_IV_NEXT]], 4
+; CHECK-NEXT:    br i1 [[TMP22]], label [[TMP12]], label [[TMP23:%.*]]
+; CHECK:       23:
+; CHECK-NEXT:    [[TMP24:%.*]] = tail call arm_aapcs_vfpcc i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> [[TMP20]]) #4
+; CHECK-NEXT:    [[TMP25:%.*]] = sitofp i32 [[TMP24]] to float
+; CHECK-NEXT:    [[TMP26:%.*]] = tail call float @llvm.fabs.f32(float [[TMP25]])
+; CHECK-NEXT:    ret float [[TMP26]]
+;
+  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+  %4 = extractvalue { <4 x i32>, i32 } %3, 0
+  %5 = add nsw i32 %1, -1
+  %6 = ptrtoint float* %0 to i32
+  %7 = insertelement <4 x i32> undef, i32 %6, i32 0
+  %8 = add <4 x i32> %7,
+  %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer
+  %10 = add <4 x i32> %4, %9
+  br label %11
+
+11:                                               ; preds = %11, %2
+  %12 = phi i32 [ %5, %2 ], [ %20, %11 ]
+  %13 = phi <4 x float> [ zeroinitializer, %2 ], [ %19, %11 ]
+  %14 = phi <4 x i32> [ %10, %2 ], [ %17, %11 ]
+  %15 = tail call <4 x i1> @llvm.arm.mve.vctp64(i32 %12)
+  %16 = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %14, i32 32, <4 x i1> %15)
+  %17 = extractvalue { <4 x float>, <4 x i32> } %16, 1
+  %18 = extractvalue { <4 x float>, <4 x i32> } %16, 0
+  %19 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %13, <4 x float> %18, <4 x i1> %15, <4 x float> %13)
+  %20 = add nsw i32 %12, -4
+  %21 = icmp sgt i32 %12, 4
+  br i1 %21, label %11, label %22
+
+22:                                               ; preds = %11
+  %23 = tail call arm_aapcs_vfpcc i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> %19) #0
+  %24 = sitofp i32 %23 to float
+  %25 = tail call float @llvm.fabs.f32(float %24)
+  ret float %25
+}
+
+declare { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32, i32) #1
+declare <16 x i1> @llvm.arm.mve.vctp8(i32) #1
+declare <8 x i1> @llvm.arm.mve.vctp16(i32) #1
+declare <4 x i1> @llvm.arm.mve.vctp32(i32) #1
+declare <4 x i1> @llvm.arm.mve.vctp64(i32) #1
+declare { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32>, i32, <4 x i1>) #2
+declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #1
+declare dso_local arm_aapcs_vfpcc i32 @vecAddAcrossF32Mve(...) local_unnamed_addr #0
+declare dso_local arm_aapcs_vfpcc <4 x i1> @v8i1_to_v4i1(<8 x i1>) local_unnamed_addr #0
+declare dso_local arm_aapcs_vfpcc <4 x i1> @v16i1_to_v4i1(<16 x i1>) local_unnamed_addr #0
+declare float @llvm.fabs.f32(float) #4
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
+attributes #4 = { nounwind readnone speculatable willreturn }