Index: llvm/lib/Target/ARM/MVETailPredication.cpp
===================================================================
--- llvm/lib/Target/ARM/MVETailPredication.cpp
+++ llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -441,6 +441,9 @@
   if (!match(BECount, m_Add(m_Value(TripCount), m_AllOnes())))
     return false;
 
+  LLVM_DEBUG(dbgs() << "Comparing SCEV info with IR pattern match\n";
+             dbgs() << "SCEV: "; NumElements->dump();
+             dbgs() << "IR: "; TripCount->dump(); );
   if (TripCount != NumElements || !L->isLoopInvariant(BECount))
     return false;
 
@@ -492,7 +495,13 @@
     } else
       return nullptr;
 
-    if (auto *RoundUp = dyn_cast<SCEVAddExpr>(S->getLHS())) {
+    const SCEV *AddExpr;
+    if (auto *RoundUp = dyn_cast<SCEVAddRecExpr>(S->getLHS()))
+      AddExpr = RoundUp->getOperand(0);
+    else
+      AddExpr = S->getLHS();
+
+    if (auto *RoundUp = dyn_cast<SCEVAddExpr>(AddExpr)) {
       if (auto *Const = dyn_cast<SCEVConstant>(RoundUp->getOperand(0))) {
         if (Const->getAPInt() != (VF->getValue() - 1))
           return nullptr;
@@ -512,6 +521,10 @@
 
   // Search for Elems in the following SCEV:
   // (1 + ((-VF + (VF * (((VF - 1) + %Elems) /u VF))<nuw>) /u VF))<nuw><nsw>
+
+  LLVM_DEBUG(dbgs() << "Searching for scalar trip count in:\n";
+             TripCountSE->dump());
+
   const SCEV *Elems = nullptr;
   if (auto *TC = dyn_cast<SCEVAddExpr>(TripCountSE))
     if (auto *Div = dyn_cast<SCEVUDivExpr>(TC->getOperand(1)))
@@ -532,9 +545,100 @@
   SCEVExpander Expander(*SE, DL, "elements");
   TCP.NumElements = Expander.expandCodeFor(Elems, Elems->getType(), InsertPt);
 
-  if (!MatchElemCountLoopSetup(L, TCP.Shuffle, TCP.NumElements))
+  // After expansion NumElements can be an instruction or a value. It is the
+  // starting point for the pattern matching and the traversal of its
+  // uses, to see where this value is used to define a value that can
+  // correspond to a statement that calculates the iteration count. After
+  // finding it, we cross check and match this with the IR, i.e. the vector
+  // body and the masked load/store instruction, to see that these masked/loads
+  // stores indeed use the iteration count.
+  //
+  // Here's an example, where inner loop j has an upper bound calculated
+  // by S - i and i is the outer loop iterator:
+  //
+  //   void foo (..., int N, int M, int S) {
+  //     for (i = 0; i < N; i++) {
+  //       M = S - i;
+  //       for (j = 0; j < M; j++) {
+  //
+  // And now the SCEV expression looks like this:
+  //
+  //   (1 + ((-4 + (4 * ({(3 + %S),+,-1}<nw> /u 4))<nuw>) /u 4))<nuw><nsw>
+  //
+  // The challenge here is that we have extracted %S as NumElements from the
+  // SCEV expression, which is a scAddRecExpr type, but this does not yet
+  // correspond to the iteration count of the loop. In simpler cases, when we
+  // have a simpler scAddExpr, the NumElements directly corresponds to the
+  // iteration count. To cover this more complicated case, we traverse the uses
+  // of %S, until we find a use that is:
+  // 1) loop-invariant,
+  // 2) not a compare (compares are not used to define a value),
+  // 3) contained in a parent loop (we want to find the last use).
+  //
+  // Here's a heavily reduced example that corresponds to the outer loop
+  // preheader and body blocks of the pseudo-code example above:
+  //
+  // outer.for.body.lr.ph:
+  //    [[CONV2]] = sext i16 [[S]] to i32
+  //    [[TMP0]] = add i32 [[CONV2]], 3
+  //    br label [[FOR_BODY]]
+  //  outer.for.body:
+  //    [[TMP2]] = add i32 [[CONV2]], [[TMP1]]
+  //    [[TMP17]] = sub i32 [[CONV2]], [[I_037]]
+  //    [[CMP433]] = icmp slt i32 [[I_037]], [[CONV2]]
+  //    br i1 [[CMP433]], label [[VECTOR_PH]], label [[FOR_END]]
+  //
+  // Variable [[S]] corresponds to %S in the SCEV expression, and is used to
+  // define CONV2. Starting in the parent loop, outer.for.body, we look for
+  // uses of CONV2. Ignoring icmps, we find it is used to define [[TMP17]],
+  // which calculates [[CONV2]] - [[I_037]], the iteration count, and
+  // corresponds to M = S - i from the example.
+
+  LLVM_DEBUG(dbgs() << "ARM TP: Matching scalar TC: "; TCP.NumElements->dump());
+  Value *ScalarTC = nullptr;
+  Loop *ParentLoop = L->getParentLoop();
+  while (ParentLoop && !ScalarTC) {
+    for (auto *U : TCP.NumElements->users()) {
+      LLVM_DEBUG(dbgs() << "ARM TP: Analysing user: "; U->dump(););
+      // 1) If the user is not loop invariant, something is happening in the loop
+      // that we don't understand.
+      if (!L->isLoopInvariant(U)) {
+        LLVM_DEBUG(dbgs() << "ARM TP: user not loop invariant\n");
+        return false;
+      }
+
+      // 2) A use can feed a compare and branch, and this is fine, so just
+      // ignore compares.
+      if (dyn_cast<CmpInst>(U))
+        continue;
+
+      // 3) While there can be several uses in the loop hierarchy, we expect the
+      // instruction that sets the trip count and is a user to be in the parent
+      // loop.
+      if (ParentLoop->contains(dyn_cast<Instruction>(U))) {
+        LLVM_DEBUG(dbgs() << "ARM TP: Set as scalar TC: "; U->dump());
+        ScalarTC = U;
+        break;
+      }
+    }
+    ParentLoop = ParentLoop->getParentLoop();
+  }
+
+  // Now we choose NumElements. This depends on whether the search for a
+  // definition of the trip count was successful. If not, and if NumElements is
+  // an instruction (not a value), we bail and can't handle this case. But if a
+  // ScalarTC is found, we will use that.
+  if (!ScalarTC && dyn_cast<Instruction>(TCP.NumElements))
     return false;
+  else if (ScalarTC)
+    TCP.NumElements = ScalarTC;
+  // Else, if we haven't found ScalarTC, we use NumElements as it was, just
+  // as it was expanded from Elems.
 
+  LLVM_DEBUG(dbgs() << "ARM TP: Found NumElements: "; TCP.NumElements->dump());
+
+  if (!MatchElemCountLoopSetup(L, TCP.Shuffle, TCP.NumElements))
+    return false;
   return true;
 }
 
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested-loop.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested-loop.ll
@@ -0,0 +1,728 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -disable-mve-tail-predication=false -mattr=+mve %s -S -o - | FileCheck %s
+
+; This IR corresponds to a 2d loop, where the inner loop upper bound is
+; determined by the outer loop:
+;
+;   for (i = 0; i < N; i++)
+;     M = Size - i;
+;     for (j = 0; j < M; j++)
+;       // reduction
+;
+; This results in a SCEVAddRecExpr expression type. The value in this SCEV expression
+; does not match the scalar trip count, and requires traversal of the def-use chain.
+; That is, we start at [[CONV]], to find that [[TMP17]] sets the iteration count
+; of the inner loop.
+;
+define dso_local void @SCEVAddRecExpr_2d_i16(i16* nocapture readonly %Input, i16* nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr #0 {
+; CHECK-LABEL: @SCEVAddRecExpr_2d_i16(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[N:%.*]] to i32
+; CHECK-NEXT:    [[CMP36:%.*]] = icmp sgt i16 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP36]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END17:%.*]]
+; CHECK:       for.body.lr.ph:
+; CHECK-NEXT:    [[CONV2:%.*]] = sext i16 [[SIZE:%.*]] to i32
+; CHECK-NEXT:    [[CONV1032:%.*]] = zext i16 [[SCALE:%.*]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[CONV2]], 3
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[LSR_IV53:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[FOR_END:%.*]] ], [ [[TMP0]], [[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[LSR_IV48:%.*]] = phi i16* [ [[SCEVGEP49:%.*]], [[FOR_END]] ], [ [[INPUT:%.*]], [[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[I_037:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC16:%.*]], [[FOR_END]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[I_037]], -1
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[CONV2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw i32 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], -4
+; CHECK-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 2
+; CHECK-NEXT:    [[TMP8:%.*]] = shl i32 [[TMP7]], 2
+; CHECK-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP2]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nsw i32 [[I_037]], -1
+; CHECK-NEXT:    [[TMP11:%.*]] = add i32 [[TMP0]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = lshr i32 [[TMP11]], 2
+; CHECK-NEXT:    [[TMP13:%.*]] = shl nuw i32 [[TMP12]], 2
+; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP13]], -4
+; CHECK-NEXT:    [[TMP15:%.*]] = lshr i32 [[TMP14]], 2
+; CHECK-NEXT:    [[TMP16:%.*]] = add nuw nsw i32 [[TMP15]], 1
+; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[CONV2]], [[I_037]]
+; CHECK-NEXT:    [[CMP433:%.*]] = icmp slt i32 [[I_037]], [[CONV2]]
+; CHECK-NEXT:    br i1 [[CMP433]], label [[VECTOR_PH:%.*]], label [[FOR_END]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[TMP16]])
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[LSR_IV50:%.*]] = phi i16* [ [[SCEVGEP51:%.*]], [[VECTOR_BODY]] ], [ [[LSR_IV48]], [[VECTOR_PH]] ]
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i16* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[INPUT]], [[VECTOR_PH]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP18:%.*]] = phi i32 [ [[TMP16]], [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP19:%.*]] = phi i32 [ [[TMP17]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[LSR_IV5052:%.*]] = bitcast i16* [[LSR_IV50]] to <4 x i16>*
+; CHECK-NEXT:    [[LSR_IV47:%.*]] = bitcast i16* [[LSR_IV]] to <4 x i16>*
+; CHECK-NEXT:    [[TMP20:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP19]])
+; CHECK-NEXT:    [[TMP21]] = sub i32 [[TMP19]], 4
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[LSR_IV47]], i32 2, <4 x i1> [[TMP20]], <4 x i16> undef)
+; CHECK-NEXT:    [[TMP22:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32>
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD42:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[LSR_IV5052]], i32 2, <4 x i1> [[TMP20]], <4 x i16> undef)
+; CHECK-NEXT:    [[TMP23:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD42]] to <4 x i32>
+; CHECK-NEXT:    [[TMP24:%.*]] = mul nsw <4 x i32> [[TMP23]], [[TMP22]]
+; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <4 x i32> undef, i32 [[CONV1032]], i32 0
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <4 x i32> [[TMP25]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP27:%.*]] = ashr <4 x i32> [[TMP24]], [[TMP26]]
+; CHECK-NEXT:    [[TMP28]] = add <4 x i32> [[TMP27]], [[VEC_PHI]]
+; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i16, i16* [[LSR_IV]], i32 4
+; CHECK-NEXT:    [[SCEVGEP51]] = getelementptr i16, i16* [[LSR_IV50]], i32 4
+; CHECK-NEXT:    [[TMP29]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[TMP18]], i32 1)
+; CHECK-NEXT:    [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0
+; CHECK-NEXT:    br i1 [[TMP30]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[VEC_PHI_LCSSA:%.*]] = phi <4 x i32> [ [[VEC_PHI]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi <4 x i32> [ [[TMP28]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP31:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP9]])
+; CHECK-NEXT:    [[TMP32:%.*]] = select <4 x i1> [[TMP31]], <4 x i32> [[DOTLCSSA]], <4 x i32> [[VEC_PHI_LCSSA]]
+; CHECK-NEXT:    [[TMP33:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP32]])
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[TMP34:%.*]] = lshr i32 [[SUM_0_LCSSA]], 16
+; CHECK-NEXT:    [[CONV13:%.*]] = trunc i32 [[TMP34]] to i16
+; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i16, i16* [[OUTPUT:%.*]], i32 [[I_037]]
+; CHECK-NEXT:    store i16 [[CONV13]], i16* [[ARRAYIDX14]], align 2
+; CHECK-NEXT:    [[INC16]] = add nuw nsw i32 [[I_037]], 1
+; CHECK-NEXT:    [[SCEVGEP49]] = getelementptr i16, i16* [[LSR_IV48]], i32 1
+; CHECK-NEXT:    [[LSR_IV_NEXT]] = add i32 [[LSR_IV53]], -1
+; CHECK-NEXT:    [[EXITCOND39:%.*]] = icmp eq i32 [[INC16]], [[CONV]]
+; CHECK-NEXT:    br i1 [[EXITCOND39]], label [[FOR_END17]], label [[FOR_BODY]]
+; CHECK:       for.end17:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %conv = sext i16 %N to i32
+  %cmp36 = icmp sgt i16 %N, 0
+  br i1 %cmp36, label %for.body.lr.ph, label %for.end17
+
+for.body.lr.ph:                                   ; preds = %entry
+  %conv2 = sext i16 %Size to i32
+  %conv1032 = zext i16 %Scale to i32
+  %0 = add i32 %conv2, 3
+  br label %for.body
+
+for.body:                                         ; preds = %for.end, %for.body.lr.ph
+  %lsr.iv53 = phi i32 [ %lsr.iv.next, %for.end ], [ %0, %for.body.lr.ph ]
+  %lsr.iv48 = phi i16* [ %scevgep49, %for.end ], [ %Input, %for.body.lr.ph ]
+  %i.037 = phi i32 [ 0, %for.body.lr.ph ], [ %inc16, %for.end ]
+  %1 = mul nsw i32 %i.037, -1
+  %2 = add i32 %0, %1
+  %3 = lshr i32 %2, 2
+  %4 = shl nuw i32 %3, 2
+  %5 = add i32 %4, -4
+  %6 = lshr i32 %5, 2
+  %7 = add nuw nsw i32 %6, 1
+  %8 = sub i32 %conv2, %i.037
+  %cmp433 = icmp slt i32 %i.037, %conv2
+  br i1 %cmp433, label %vector.ph, label %for.end
+
+vector.ph:                                        ; preds = %for.body
+  %trip.count.minus.1 = add i32 %8, -1
+  call void @llvm.set.loop.iterations.i32(i32 %7)
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %lsr.iv50 = phi i16* [ %scevgep51, %vector.body ], [ %lsr.iv48, %vector.ph ]
+  %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %Input, %vector.ph ]
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %19, %vector.body ]
+  %9 = phi i32 [ %7, %vector.ph ], [ %20, %vector.body ]
+  %lsr.iv5052 = bitcast i16* %lsr.iv50 to <4 x i16>*
+  %lsr.iv47 = bitcast i16* %lsr.iv to <4 x i16>*
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
+  %10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %11 = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> zeroinitializer
+  %12 = icmp ule <4 x i32> %induction, %11
+  %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv47, i32 2, <4 x i1> %12, <4 x i16> undef)
+  %13 = sext <4 x i16> %wide.masked.load to <4 x i32>
+  %wide.masked.load42 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv5052, i32 2, <4 x i1> %12, <4 x i16> undef)
+  %14 = sext <4 x i16> %wide.masked.load42 to <4 x i32>
+  %15 = mul nsw <4 x i32> %14, %13
+  %16 = insertelement <4 x i32> undef, i32 %conv1032, i32 0
+  %17 = shufflevector <4 x i32> %16, <4 x i32> undef, <4 x i32> zeroinitializer
+  %18 = ashr <4 x i32> %15, %17
+  %19 = add <4 x i32> %18, %vec.phi
+  %index.next = add i32 %index, 4
+  %scevgep = getelementptr i16, i16* %lsr.iv, i32 4
+  %scevgep51 = getelementptr i16, i16* %lsr.iv50, i32 4
+  %20 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %9, i32 1)
+  %21 = icmp ne i32 %20, 0
+  br i1 %21, label %vector.body, label %middle.block
+
+middle.block:                                     ; preds = %vector.body
+  %22 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %23 = shufflevector <4 x i32> %22, <4 x i32> undef, <4 x i32> zeroinitializer
+  %24 = icmp ule <4 x i32> %induction, %23
+  %25 = select <4 x i1> %24, <4 x i32> %19, <4 x i32> %vec.phi
+  %26 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %25)
+  br label %for.end
+
+for.end:                                          ; preds = %middle.block, %for.body
+  %Sum.0.lcssa = phi i32 [ 0, %for.body ], [ %26, %middle.block ]
+  %27 = lshr i32 %Sum.0.lcssa, 16
+  %conv13 = trunc i32 %27 to i16
+  %arrayidx14 = getelementptr inbounds i16, i16* %Output, i32 %i.037
+  store i16 %conv13, i16* %arrayidx14, align 2
+  %inc16 = add nuw nsw i32 %i.037, 1
+  %scevgep49 = getelementptr i16, i16* %lsr.iv48, i32 1
+  %lsr.iv.next = add i32 %lsr.iv53, -1
+  %exitcond39 = icmp eq i32 %inc16, %conv
+  br i1 %exitcond39, label %for.end17, label %for.body
+
+for.end17:                                        ; preds = %for.end, %entry
+  ret void
+}
+
+; Slightly different case than @SCEVAddRecExpr_2d_i16, where there is a sext using
+; and defining the scalar in the entry block from where we start searching. This
+; is absent here, and so our def-use chain traversal is slightly different.
+;
+define dso_local void @SCEVAddRecExpr_2d_i32(i32* nocapture readonly %Input, i32* nocapture %Output, i32 %Size, i32 %N, i32 %Scale) local_unnamed_addr #0 {
+; CHECK-LABEL: @SCEVAddRecExpr_2d_i32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP29:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP29]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END11:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[SIZE:%.*]], 3
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[LSR_IV46:%.*]] = phi i32 [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[LSR_IV_NEXT:%.*]], [[FOR_END:%.*]] ]
+; CHECK-NEXT:    [[LSR_IV41:%.*]] = phi i32* [ [[INPUT:%.*]], [[FOR_BODY_PREHEADER]] ], [ [[SCEVGEP42:%.*]], [[FOR_END]] ]
+; CHECK-NEXT:    [[I_030:%.*]] = phi i32 [ [[INC10:%.*]], [[FOR_END]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[I_030]], -1
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[SIZE]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw i32 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], -4
+; CHECK-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 2
+; CHECK-NEXT:    [[TMP8:%.*]] = shl i32 [[TMP7]], 2
+; CHECK-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP2]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nsw i32 [[I_030]], -1
+; CHECK-NEXT:    [[TMP11:%.*]] = add i32 [[TMP0]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = lshr i32 [[TMP11]], 2
+; CHECK-NEXT:    [[TMP13:%.*]] = shl nuw i32 [[TMP12]], 2
+; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP13]], -4
+; CHECK-NEXT:    [[TMP15:%.*]] = lshr i32 [[TMP14]], 2
+; CHECK-NEXT:    [[TMP16:%.*]] = add nuw nsw i32 [[TMP15]], 1
+; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[SIZE]], [[I_030]]
+; CHECK-NEXT:    [[CMP226:%.*]] = icmp slt i32 [[I_030]], [[SIZE]]
+; CHECK-NEXT:    br i1 [[CMP226]], label [[VECTOR_PH:%.*]], label [[FOR_END]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[TMP16]])
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[LSR_IV43:%.*]] = phi i32* [ [[SCEVGEP44:%.*]], [[VECTOR_BODY]] ], [ [[LSR_IV41]], [[VECTOR_PH]] ]
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[INPUT]], [[VECTOR_PH]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP18:%.*]] = phi i32 [ [[TMP16]], [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP19:%.*]] = phi i32 [ [[TMP17]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[LSR_IV4345:%.*]] = bitcast i32* [[LSR_IV43]] to <4 x i32>*
+; CHECK-NEXT:    [[LSR_IV40:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP20:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP19]])
+; CHECK-NEXT:    [[TMP21]] = sub i32 [[TMP19]], 4
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV40]], i32 4, <4 x i1> [[TMP20]], <4 x i32> undef)
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD35:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV4345]], i32 4, <4 x i1> [[TMP20]], <4 x i32> undef)
+; CHECK-NEXT:    [[TMP22:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD35]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <4 x i32> undef, i32 [[SCALE:%.*]], i32 0
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP25:%.*]] = ashr <4 x i32> [[TMP22]], [[TMP24]]
+; CHECK-NEXT:    [[TMP26]] = add <4 x i32> [[TMP25]], [[VEC_PHI]]
+; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4
+; CHECK-NEXT:    [[SCEVGEP44]] = getelementptr i32, i32* [[LSR_IV43]], i32 4
+; CHECK-NEXT:    [[TMP27]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[TMP18]], i32 1)
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0
+; CHECK-NEXT:    br i1 [[TMP28]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[VEC_PHI_LCSSA:%.*]] = phi <4 x i32> [ [[VEC_PHI]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi <4 x i32> [ [[TMP26]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP29:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP9]])
+; CHECK-NEXT:    [[TMP30:%.*]] = select <4 x i1> [[TMP29]], <4 x i32> [[DOTLCSSA]], <4 x i32> [[VEC_PHI_LCSSA]]
+; CHECK-NEXT:    [[TMP31:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP30]])
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[CONV7:%.*]] = ashr i32 [[SUM_0_LCSSA]], 16
+; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[OUTPUT:%.*]], i32 [[I_030]]
+; CHECK-NEXT:    store i32 [[CONV7]], i32* [[ARRAYIDX8]], align 4
+; CHECK-NEXT:    [[INC10]] = add nuw nsw i32 [[I_030]], 1
+; CHECK-NEXT:    [[SCEVGEP42]] = getelementptr i32, i32* [[LSR_IV41]], i32 1
+; CHECK-NEXT:    [[LSR_IV_NEXT]] = add i32 [[LSR_IV46]], -1
+; CHECK-NEXT:    [[EXITCOND32:%.*]] = icmp eq i32 [[INC10]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND32]], label [[FOR_END11]], label [[FOR_BODY]]
+; CHECK:       for.end11:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp29 = icmp sgt i32 %N, 0
+  br i1 %cmp29, label %for.body.preheader, label %for.end11
+
+for.body.preheader:                               ; preds = %entry
+  %0 = add i32 %Size, 3
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.end
+  %lsr.iv46 = phi i32 [ %0, %for.body.preheader ], [ %lsr.iv.next, %for.end ]
+  %lsr.iv41 = phi i32* [ %Input, %for.body.preheader ], [ %scevgep42, %for.end ]
+  %i.030 = phi i32 [ %inc10, %for.end ], [ 0, %for.body.preheader ]
+  %1 = mul nsw i32 %i.030, -1
+  %2 = add i32 %0, %1
+  %3 = lshr i32 %2, 2
+  %4 = shl nuw i32 %3, 2
+  %5 = add i32 %4, -4
+  %6 = lshr i32 %5, 2
+  %7 = add nuw nsw i32 %6, 1
+  %8 = sub i32 %Size, %i.030
+  %cmp226 = icmp slt i32 %i.030, %Size
+  br i1 %cmp226, label %vector.ph, label %for.end
+
+vector.ph:                                        ; preds = %for.body
+  %trip.count.minus.1 = add i32 %8, -1
+  call void @llvm.set.loop.iterations.i32(i32 %7)
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %lsr.iv43 = phi i32* [ %scevgep44, %vector.body ], [ %lsr.iv41, %vector.ph ]
+  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %Input, %vector.ph ]
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %17, %vector.body ]
+  %9 = phi i32 [ %7, %vector.ph ], [ %18, %vector.body ]
+  %lsr.iv4345 = bitcast i32* %lsr.iv43 to <4 x i32>*
+  %lsr.iv40 = bitcast i32* %lsr.iv to <4 x i32>*
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
+  %10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %11 = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> zeroinitializer
+  %12 = icmp ule <4 x i32> %induction, %11
+  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv40, i32 4, <4 x i1> %12, <4 x i32> undef)
+  %wide.masked.load35 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv4345, i32 4, <4 x i1> %12, <4 x i32> undef)
+  %13 = mul nsw <4 x i32> %wide.masked.load35, %wide.masked.load
+  %14 = insertelement <4 x i32> undef, i32 %Scale, i32 0
+  %15 = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> zeroinitializer
+  %16 = ashr <4 x i32> %13, %15
+  %17 = add <4 x i32> %16, %vec.phi
+  %index.next = add i32 %index, 4
+  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
+  %scevgep44 = getelementptr i32, i32* %lsr.iv43, i32 4
+  %18 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %9, i32 1)
+  %19 = icmp ne i32 %18, 0
+  br i1 %19, label %vector.body, label %middle.block
+
+middle.block:                                     ; preds = %vector.body
+  %20 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %21 = shufflevector <4 x i32> %20, <4 x i32> undef, <4 x i32> zeroinitializer
+  %22 = icmp ule <4 x i32> %induction, %21
+  %23 = select <4 x i1> %22, <4 x i32> %17, <4 x i32> %vec.phi
+  %24 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %23)
+  br label %for.end
+
+for.end:                                          ; preds = %middle.block, %for.body
+  %Sum.0.lcssa = phi i32 [ 0, %for.body ], [ %24, %middle.block ]
+  %conv7 = ashr i32 %Sum.0.lcssa, 16
+  %arrayidx8 = getelementptr inbounds i32, i32* %Output, i32 %i.030
+  store i32 %conv7, i32* %arrayidx8, align 4
+  %inc10 = add nuw nsw i32 %i.030, 1
+  %scevgep42 = getelementptr i32, i32* %lsr.iv41, i32 1
+  %lsr.iv.next = add i32 %lsr.iv46, -1
+  %exitcond32 = icmp eq i32 %inc10, %N
+  br i1 %exitcond32, label %for.end11, label %for.body
+
+for.end11:                                        ; preds = %for.end, %entry
+  ret void
+}
+
+; This is almost the same as @SCEVAddRecExpr_2d_i16, except that a loop-invariant
+; statement has been added to vector.body, so we can't tail-predicate this loop.
+;
+define dso_local void @SCEVAddRecExpr_2d_not_invariant(i16* nocapture readonly %Input, i16* nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr #0 {
+; CHECK-LABEL: @SCEVAddRecExpr_2d_not_invariant(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[N:%.*]] to i32
+; CHECK-NEXT:    [[CMP36:%.*]] = icmp sgt i16 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP36]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END17:%.*]]
+; CHECK:       for.body.lr.ph:
+; CHECK-NEXT:    [[CONV2:%.*]] = sext i16 [[SIZE:%.*]] to i32
+; CHECK-NEXT:    [[CONV1032:%.*]] = zext i16 [[SCALE:%.*]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[CONV2]], 3
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[LSR_IV53:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[FOR_END:%.*]] ], [ [[TMP0]], [[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[LSR_IV48:%.*]] = phi i16* [ [[SCEVGEP49:%.*]], [[FOR_END]] ], [ [[INPUT:%.*]], [[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[I_037:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC16:%.*]], [[FOR_END]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nsw i32 [[I_037]], -1
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = lshr i32 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw i32 [[TMP3]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], -4
+; CHECK-NEXT:    [[TMP6:%.*]] = lshr i32 [[TMP5]], 2
+; CHECK-NEXT:    [[TMP7:%.*]] = add nuw nsw i32 [[TMP6]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = sub i32 [[CONV2]], [[I_037]]
+; CHECK-NEXT:    [[CMP433:%.*]] = icmp slt i32 [[I_037]], [[CONV2]]
+; CHECK-NEXT:    br i1 [[CMP433]], label [[VECTOR_PH:%.*]], label [[FOR_END]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[TMP8]], -1
+; CHECK-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[TMP7]])
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[LSR_IV50:%.*]] = phi i16* [ [[SCEVGEP51:%.*]], [[VECTOR_BODY]] ], [ [[LSR_IV48]], [[VECTOR_PH]] ]
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i16* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[INPUT]], [[VECTOR_PH]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP9:%.*]] = phi i32 [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[CONV2_PHI:%.*]] = phi i32 [ [[CONV2]], [[VECTOR_PH]] ], [ [[CONV2_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[CONV2_NEXT]] = add i32 [[CONV2_PHI]], 4
+; CHECK-NEXT:    [[LSR_IV5052:%.*]] = bitcast i16* [[LSR_IV50]] to <4 x i16>*
+; CHECK-NEXT:    [[LSR_IV47:%.*]] = bitcast i16* [[LSR_IV]] to <4 x i16>*
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = or <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ule <4 x i32> [[INDUCTION]], [[TMP11]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[LSR_IV47]], i32 2, <4 x i1> [[TMP12]], <4 x i16> undef)
+; CHECK-NEXT:    [[TMP13:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32>
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD42:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[LSR_IV5052]], i32 2, <4 x i1> [[TMP12]], <4 x i16> undef)
+; CHECK-NEXT:    [[TMP14:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD42]] to <4 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = mul nsw <4 x i32> [[TMP14]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> undef, i32 [[CONV1032]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP16]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP18:%.*]] = ashr <4 x i32> [[TMP15]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19]] = add <4 x i32> [[TMP18]], [[VEC_PHI]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i16, i16* [[LSR_IV]], i32 4
+; CHECK-NEXT:    [[SCEVGEP51]] = getelementptr i16, i16* [[LSR_IV50]], i32 4
+; CHECK-NEXT:    [[TMP20]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[TMP9]], i32 1)
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0
+; CHECK-NEXT:    br i1 [[TMP21]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <4 x i32> [[TMP22]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP24:%.*]] = icmp ule <4 x i32> [[INDUCTION]], [[TMP23]]
+; CHECK-NEXT:    [[TMP25:%.*]] = select <4 x i1> [[TMP24]], <4 x i32> [[TMP19]], <4 x i32> [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP26:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP25]])
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[TMP27:%.*]] = lshr i32 [[SUM_0_LCSSA]], 16
+; CHECK-NEXT:    [[CONV13:%.*]] = trunc i32 [[TMP27]] to i16
+; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i16, i16* [[OUTPUT:%.*]], i32 [[I_037]]
+; CHECK-NEXT:    store i16 [[CONV13]], i16* [[ARRAYIDX14]], align 2
+; CHECK-NEXT:    [[INC16]] = add nuw nsw i32 [[I_037]], 1
+; CHECK-NEXT:    [[SCEVGEP49]] = getelementptr i16, i16* [[LSR_IV48]], i32 1
+; CHECK-NEXT:    [[LSR_IV_NEXT]] = add i32 [[LSR_IV53]], -1
+; CHECK-NEXT:    [[EXITCOND39:%.*]] = icmp eq i32 [[INC16]], [[CONV]]
+; CHECK-NEXT:    br i1 [[EXITCOND39]], label [[FOR_END17]], label [[FOR_BODY]]
+; CHECK:       for.end17:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %conv = sext i16 %N to i32
+  %cmp36 = icmp sgt i16 %N, 0
+  br i1 %cmp36, label %for.body.lr.ph, label %for.end17
+
+for.body.lr.ph:                                   ; preds = %entry
+  %conv2 = sext i16 %Size to i32
+  %conv1032 = zext i16 %Scale to i32
+  %0 = add i32 %conv2, 3
+  br label %for.body
+
+for.body:                                         ; preds = %for.end, %for.body.lr.ph
+  %lsr.iv53 = phi i32 [ %lsr.iv.next, %for.end ], [ %0, %for.body.lr.ph ]
+  %lsr.iv48 = phi i16* [ %scevgep49, %for.end ], [ %Input, %for.body.lr.ph ]
+  %i.037 = phi i32 [ 0, %for.body.lr.ph ], [ %inc16, %for.end ]
+  %1 = mul nsw i32 %i.037, -1
+  %2 = add i32 %0, %1
+  %3 = lshr i32 %2, 2
+  %4 = shl nuw i32 %3, 2
+  %5 = add i32 %4, -4
+  %6 = lshr i32 %5, 2
+  %7 = add nuw nsw i32 %6, 1
+  %8 = sub i32 %conv2, %i.037
+  %cmp433 = icmp slt i32 %i.037, %conv2
+  br i1 %cmp433, label %vector.ph, label %for.end
+
+vector.ph:                                        ; preds = %for.body
+  %trip.count.minus.1 = add i32 %8, -1
+  call void @llvm.set.loop.iterations.i32(i32 %7)
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %lsr.iv50 = phi i16* [ %scevgep51, %vector.body ], [ %lsr.iv48, %vector.ph ]
+  %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %Input, %vector.ph ]
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %19, %vector.body ]
+  %9 = phi i32 [ %7, %vector.ph ], [ %20, %vector.body ]
+
+; Loop invariant statement added here:
+
+  %conv2.phi = phi i32 [ %conv2, %vector.ph ], [ %conv2.next, %vector.body ]
+  %conv2.next = add i32 %conv2.phi, 4
+
+  %lsr.iv5052 = bitcast i16* %lsr.iv50 to <4 x i16>*
+  %lsr.iv47 = bitcast i16* %lsr.iv to <4 x i16>*
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
+  %10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %11 = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> zeroinitializer
+  %12 = icmp ule <4 x i32> %induction, %11
+  %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv47, i32 2, <4 x i1> %12, <4 x i16> undef)
+  %13 = sext <4 x i16> %wide.masked.load to <4 x i32>
+  %wide.masked.load42 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv5052, i32 2, <4 x i1> %12, <4 x i16> undef)
+  %14 = sext <4 x i16> %wide.masked.load42 to <4 x i32>
+  %15 = mul nsw <4 x i32> %14, %13
+  %16 = insertelement <4 x i32> undef, i32 %conv1032, i32 0
+  %17 = shufflevector <4 x i32> %16, <4 x i32> undef, <4 x i32> zeroinitializer
+  %18 = ashr <4 x i32> %15, %17
+  %19 = add <4 x i32> %18, %vec.phi
+  %index.next = add i32 %index, 4
+  %scevgep = getelementptr i16, i16* %lsr.iv, i32 4
+  %scevgep51 = getelementptr i16, i16* %lsr.iv50, i32 4
+  %20 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %9, i32 1)
+  %21 = icmp ne i32 %20, 0
+  br i1 %21, label %vector.body, label %middle.block
+
+middle.block:                                     ; preds = %vector.body
+  %22 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %23 = shufflevector <4 x i32> %22, <4 x i32> undef, <4 x i32> zeroinitializer
+  %24 = icmp ule <4 x i32> %induction, %23
+  %25 = select <4 x i1> %24, <4 x i32> %19, <4 x i32> %vec.phi
+  %26 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %25)
+  br label %for.end
+
+for.end:                                          ; preds = %middle.block, %for.body
+  %Sum.0.lcssa = phi i32 [ 0, %for.body ], [ %26, %middle.block ]
+  %27 = lshr i32 %Sum.0.lcssa, 16
+  %conv13 = trunc i32 %27 to i16
+  %arrayidx14 = getelementptr inbounds i16, i16* %Output, i32 %i.037
+  store i16 %conv13, i16* %arrayidx14, align 2
+  %inc16 = add nuw nsw i32 %i.037, 1
+  %scevgep49 = getelementptr i16, i16* %lsr.iv48, i32 1
+  %lsr.iv.next = add i32 %lsr.iv53, -1
+  %exitcond39 = icmp eq i32 %inc16, %conv
+  br i1 %exitcond39, label %for.end17, label %for.body
+
+for.end17:                                        ; preds = %for.end, %entry
+  ret void
+}
+
+; This IR corresponds to this 3d loop:
+;
+;   for (k = 0; k < N; k++)
+;     for (i = 0; i < N; i++)
+;       M = Size - i;
+;       for (j = 0; j < M; j++)
+;         // reduction
+;
+; The inner loop j depends on its outer loop i, but not on the outermost
+; loop k. Thus, the SCEV expression is also a SCEVAddRecExpr, and we should
+; tail-predicate this.
+;
+define dso_local void @SCEVAddRecExpr_3d(i16* nocapture readonly %Input, i16* nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr #0 {
+; CHECK-LABEL: @SCEVAddRecExpr_3d(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[N:%.*]] to i32
+; CHECK-NEXT:    [[CMP52:%.*]] = icmp sgt i16 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP52]], label [[FOR_COND2_PREHEADER_LR_PH:%.*]], label [[FOR_END26:%.*]]
+; CHECK:       for.cond2.preheader.lr.ph:
+; CHECK-NEXT:    [[CONV7:%.*]] = sext i16 [[SIZE:%.*]] to i32
+; CHECK-NEXT:    [[CONV1645:%.*]] = zext i16 [[SCALE:%.*]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[CONV7]], 3
+; CHECK-NEXT:    br label [[FOR_COND2_PREHEADER_US:%.*]]
+; CHECK:       for.cond2.preheader.us:
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i16* [ [[SCEVGEP:%.*]], [[FOR_COND2_FOR_INC24_CRIT_EDGE_US:%.*]] ], [ [[INPUT:%.*]], [[FOR_COND2_PREHEADER_LR_PH]] ]
+; CHECK-NEXT:    [[K_053_US:%.*]] = phi i32 [ 0, [[FOR_COND2_PREHEADER_LR_PH]] ], [ [[INC25_US:%.*]], [[FOR_COND2_FOR_INC24_CRIT_EDGE_US]] ]
+; CHECK-NEXT:    br label [[FOR_BODY6_US:%.*]]
+; CHECK:       for.body6.us:
+; CHECK-NEXT:    [[LSR_IV72:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[FOR_END_US:%.*]] ], [ [[TMP0]], [[FOR_COND2_PREHEADER_US]] ]
+; CHECK-NEXT:    [[LSR_IV67:%.*]] = phi i16* [ [[SCEVGEP68:%.*]], [[FOR_END_US]] ], [ [[INPUT]], [[FOR_COND2_PREHEADER_US]] ]
+; CHECK-NEXT:    [[I_050_US:%.*]] = phi i32 [ 0, [[FOR_COND2_PREHEADER_US]] ], [ [[INC22_US:%.*]], [[FOR_END_US]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[I_050_US]], -1
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[CONV7]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw i32 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], -4
+; CHECK-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 2
+; CHECK-NEXT:    [[TMP8:%.*]] = shl i32 [[TMP7]], 2
+; CHECK-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP2]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nsw i32 [[I_050_US]], -1
+; CHECK-NEXT:    [[TMP11:%.*]] = add i32 [[TMP0]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = lshr i32 [[TMP11]], 2
+; CHECK-NEXT:    [[TMP13:%.*]] = shl nuw i32 [[TMP12]], 2
+; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP13]], -4
+; CHECK-NEXT:    [[TMP15:%.*]] = lshr i32 [[TMP14]], 2
+; CHECK-NEXT:    [[TMP16:%.*]] = add nuw nsw i32 [[TMP15]], 1
+; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[CONV7]], [[I_050_US]]
+; CHECK-NEXT:    [[CMP946_US:%.*]] = icmp slt i32 [[I_050_US]], [[CONV7]]
+; CHECK-NEXT:    br i1 [[CMP946_US]], label [[VECTOR_PH:%.*]], label [[FOR_END_US]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[TMP16]])
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[LSR_IV69:%.*]] = phi i16* [ [[SCEVGEP70:%.*]], [[VECTOR_BODY]] ], [ [[LSR_IV67]], [[VECTOR_PH]] ]
+; CHECK-NEXT:    [[LSR_IV64:%.*]] = phi i16* [ [[SCEVGEP65:%.*]], [[VECTOR_BODY]] ], [ [[LSR_IV]], [[VECTOR_PH]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP18:%.*]] = phi i32 [ [[TMP16]], [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP19:%.*]] = phi i32 [ [[TMP17]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[LSR_IV6971:%.*]] = bitcast i16* [[LSR_IV69]] to <4 x i16>*
+; CHECK-NEXT:    [[LSR_IV6466:%.*]] = bitcast i16* [[LSR_IV64]] to <4 x i16>*
+; CHECK-NEXT:    [[TMP20:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP19]])
+; CHECK-NEXT:    [[TMP21]] = sub i32 [[TMP19]], 4
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[LSR_IV6466]], i32 2, <4 x i1> [[TMP20]], <4 x i16> undef)
+; CHECK-NEXT:    [[TMP22:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32>
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD59:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[LSR_IV6971]], i32 2, <4 x i1> [[TMP20]], <4 x i16> undef)
+; CHECK-NEXT:    [[TMP23:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD59]] to <4 x i32>
+; CHECK-NEXT:    [[TMP24:%.*]] = mul nsw <4 x i32> [[TMP23]], [[TMP22]]
+; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <4 x i32> undef, i32 [[CONV1645]], i32 0
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <4 x i32> [[TMP25]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP27:%.*]] = ashr <4 x i32> [[TMP24]], [[TMP26]]
+; CHECK-NEXT:    [[TMP28]] = add <4 x i32> [[TMP27]], [[VEC_PHI]]
+; CHECK-NEXT:    [[SCEVGEP65]] = getelementptr i16, i16* [[LSR_IV64]], i32 4
+; CHECK-NEXT:    [[SCEVGEP70]] = getelementptr i16, i16* [[LSR_IV69]], i32 4
+; CHECK-NEXT:    [[TMP29]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[TMP18]], i32 1)
+; CHECK-NEXT:    [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0
+; CHECK-NEXT:    br i1 [[TMP30]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[VEC_PHI_LCSSA:%.*]] = phi <4 x i32> [ [[VEC_PHI]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi <4 x i32> [ [[TMP28]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP31:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP9]])
+; CHECK-NEXT:    [[TMP32:%.*]] = select <4 x i1> [[TMP31]], <4 x i32> [[DOTLCSSA]], <4 x i32> [[VEC_PHI_LCSSA]]
+; CHECK-NEXT:    [[TMP33:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP32]])
+; CHECK-NEXT:    br label [[FOR_END_US]]
+; CHECK:       for.end.us:
+; CHECK-NEXT:    [[SUM_0_LCSSA_US:%.*]] = phi i32 [ 0, [[FOR_BODY6_US]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[TMP34:%.*]] = lshr i32 [[SUM_0_LCSSA_US]], 16
+; CHECK-NEXT:    [[CONV19_US:%.*]] = trunc i32 [[TMP34]] to i16
+; CHECK-NEXT:    [[ARRAYIDX20_US:%.*]] = getelementptr inbounds i16, i16* [[OUTPUT:%.*]], i32 [[I_050_US]]
+; CHECK-NEXT:    store i16 [[CONV19_US]], i16* [[ARRAYIDX20_US]], align 2
+; CHECK-NEXT:    [[INC22_US]] = add nuw nsw i32 [[I_050_US]], 1
+; CHECK-NEXT:    [[SCEVGEP68]] = getelementptr i16, i16* [[LSR_IV67]], i32 1
+; CHECK-NEXT:    [[LSR_IV_NEXT]] = add i32 [[LSR_IV72]], -1
+; CHECK-NEXT:    [[EXITCOND55:%.*]] = icmp eq i32 [[INC22_US]], [[CONV]]
+; CHECK-NEXT:    br i1 [[EXITCOND55]], label [[FOR_COND2_FOR_INC24_CRIT_EDGE_US]], label [[FOR_BODY6_US]]
+; CHECK:       for.cond2.for.inc24_crit_edge.us:
+; CHECK-NEXT:    [[INC25_US]] = add nuw nsw i32 [[K_053_US]], 1
+; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i16, i16* [[LSR_IV]], i32 1
+; CHECK-NEXT:    [[EXITCOND56:%.*]] = icmp eq i32 [[INC25_US]], [[CONV]]
+; CHECK-NEXT:    br i1 [[EXITCOND56]], label [[FOR_END26]], label [[FOR_COND2_PREHEADER_US]]
+; CHECK:       for.end26:
+; CHECK-NEXT:    ret void
+;
+; NOTE: the CHECK lines above expect the 'icmp ule %induction, ...' lane mask
+; of the inner vector loop below to be rewritten into @llvm.arm.mve.vctp32
+; calls (in both vector.body and middle.block), i.e. the loop gets
+; tail-predicated.
+;
+entry:
+  %conv = sext i16 %N to i32
+  %cmp52 = icmp sgt i16 %N, 0
+  br i1 %cmp52, label %for.cond2.preheader.lr.ph, label %for.end26
+
+for.cond2.preheader.lr.ph:                        ; preds = %entry
+  %conv7 = sext i16 %Size to i32
+  %conv1645 = zext i16 %Scale to i32
+  %0 = add i32 %conv7, 3
+  br label %for.cond2.preheader.us
+
+for.cond2.preheader.us:                           ; preds = %for.cond2.for.inc24_crit_edge.us, %for.cond2.preheader.lr.ph
+  %lsr.iv = phi i16* [ %scevgep, %for.cond2.for.inc24_crit_edge.us ], [ %Input, %for.cond2.preheader.lr.ph ]
+  %k.053.us = phi i32 [ 0, %for.cond2.preheader.lr.ph ], [ %inc25.us, %for.cond2.for.inc24_crit_edge.us ]
+  br label %for.body6.us
+
+for.body6.us:                                     ; preds = %for.end.us, %for.cond2.preheader.us
+  %lsr.iv72 = phi i32 [ %lsr.iv.next, %for.end.us ], [ %0, %for.cond2.preheader.us ]
+  %lsr.iv67 = phi i16* [ %scevgep68, %for.end.us ], [ %Input, %for.cond2.preheader.us ]
+  %i.050.us = phi i32 [ 0, %for.cond2.preheader.us ], [ %inc22.us, %for.end.us ]
+  %1 = mul nsw i32 %i.050.us, -1
+  %2 = add i32 %0, %1
+  %3 = lshr i32 %2, 2
+  %4 = shl nuw i32 %3, 2
+  %5 = add i32 %4, -4
+  %6 = lshr i32 %5, 2
+  %7 = add nuw nsw i32 %6, 1
+  %8 = sub i32 %conv7, %i.050.us
+  %cmp946.us = icmp slt i32 %i.050.us, %conv7
+  br i1 %cmp946.us, label %vector.ph, label %for.end.us
+
+vector.ph:                                        ; preds = %for.body6.us
+  %trip.count.minus.1 = add i32 %8, -1
+  call void @llvm.set.loop.iterations.i32(i32 %7)
+  br label %vector.body
+
+; Inner vectorized loop: both masked loads are predicated on %12, the
+; 'icmp ule' lane mask derived from %trip.count.minus.1.
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %lsr.iv69 = phi i16* [ %scevgep70, %vector.body ], [ %lsr.iv67, %vector.ph ]
+  %lsr.iv64 = phi i16* [ %scevgep65, %vector.body ], [ %lsr.iv, %vector.ph ]
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %19, %vector.body ]
+  %9 = phi i32 [ %7, %vector.ph ], [ %20, %vector.body ]
+  %lsr.iv6971 = bitcast i16* %lsr.iv69 to <4 x i16>*
+  %lsr.iv6466 = bitcast i16* %lsr.iv64 to <4 x i16>*
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
+  %10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %11 = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> zeroinitializer
+  %12 = icmp ule <4 x i32> %induction, %11
+  %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv6466, i32 2, <4 x i1> %12, <4 x i16> undef)
+  %13 = sext <4 x i16> %wide.masked.load to <4 x i32>
+  %wide.masked.load59 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv6971, i32 2, <4 x i1> %12, <4 x i16> undef)
+  %14 = sext <4 x i16> %wide.masked.load59 to <4 x i32>
+  %15 = mul nsw <4 x i32> %14, %13
+  %16 = insertelement <4 x i32> undef, i32 %conv1645, i32 0
+  %17 = shufflevector <4 x i32> %16, <4 x i32> undef, <4 x i32> zeroinitializer
+  %18 = ashr <4 x i32> %15, %17
+  %19 = add <4 x i32> %18, %vec.phi
+  %index.next = add i32 %index, 4
+  %scevgep65 = getelementptr i16, i16* %lsr.iv64, i32 4
+  %scevgep70 = getelementptr i16, i16* %lsr.iv69, i32 4
+  %20 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %9, i32 1)
+  %21 = icmp ne i32 %20, 0
+  br i1 %21, label %vector.body, label %middle.block
+
+middle.block:                                     ; preds = %vector.body
+  %22 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %23 = shufflevector <4 x i32> %22, <4 x i32> undef, <4 x i32> zeroinitializer
+  %24 = icmp ule <4 x i32> %induction, %23
+  %25 = select <4 x i1> %24, <4 x i32> %19, <4 x i32> %vec.phi
+  %26 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %25)
+  br label %for.end.us
+
+for.end.us:                                       ; preds = %middle.block, %for.body6.us
+  %Sum.0.lcssa.us = phi i32 [ 0, %for.body6.us ], [ %26, %middle.block ]
+  %27 = lshr i32 %Sum.0.lcssa.us, 16
+  %conv19.us = trunc i32 %27 to i16
+  %arrayidx20.us = getelementptr inbounds i16, i16* %Output, i32 %i.050.us
+  store i16 %conv19.us, i16* %arrayidx20.us, align 2
+  %inc22.us = add nuw nsw i32 %i.050.us, 1
+  %scevgep68 = getelementptr i16, i16* %lsr.iv67, i32 1
+  %lsr.iv.next = add i32 %lsr.iv72, -1
+  %exitcond55 = icmp eq i32 %inc22.us, %conv
+  br i1 %exitcond55, label %for.cond2.for.inc24_crit_edge.us, label %for.body6.us
+
+for.cond2.for.inc24_crit_edge.us:                 ; preds = %for.end.us
+  %inc25.us = add nuw nsw i32 %k.053.us, 1
+  %scevgep = getelementptr i16, i16* %lsr.iv, i32 1
+  %exitcond56 = icmp eq i32 %inc25.us, %conv
+  br i1 %exitcond56, label %for.end26, label %for.cond2.preheader.us
+
+for.end26:                                        ; preds = %for.cond2.for.inc24_crit_edge.us, %entry
+  ret void
+}
+
+declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
+declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)
+declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
+declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>)
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
+declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32 immarg, <2 x i1>)
+declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32 immarg, <2 x i1>, <2 x i64>)
+declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
+declare void @llvm.set.loop.iterations.i32(i32)
+declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
+declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
+declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)