Index: llvm/lib/Target/ARM/MVETailPredication.cpp
===================================================================
--- llvm/lib/Target/ARM/MVETailPredication.cpp
+++ llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -441,6 +441,9 @@
   if (!match(BECount, m_Add(m_Value(TripCount), m_AllOnes())))
     return false;
 
+  LLVM_DEBUG(dbgs() << "Comparing SCEV info with IR pattern match\n";
+             dbgs() << "SCEV: "; NumElements->dump();
+             dbgs() << "IR: "; TripCount->dump(); );
   if (TripCount != NumElements || !L->isLoopInvariant(BECount))
     return false;
 
@@ -492,7 +495,13 @@
     } else
       return nullptr;
 
-    if (auto *RoundUp = dyn_cast<SCEVAddExpr>(S->getLHS())) {
+    const SCEV *AddExpr;
+    if (auto *RoundUp = dyn_cast<SCEVAddRecExpr>(S->getLHS()))
+      AddExpr = RoundUp->getOperand(0);
+    else
+      AddExpr = S->getLHS();
+
+    if (auto *RoundUp = dyn_cast<SCEVAddExpr>(AddExpr)) {
       if (auto *Const = dyn_cast<SCEVConstant>(RoundUp->getOperand(0))) {
         if (Const->getAPInt() != (VF->getValue() - 1))
           return nullptr;
@@ -512,6 +521,10 @@
 
   // Search for Elems in the following SCEV:
   // (1 + ((-VF + (VF * (((VF - 1) + %Elems) /u VF))<nuw>) /u VF))<nuw><nsw>
+
+  LLVM_DEBUG(dbgs() << "Searching for scalar trip count in:\n";
+             TripCountSE->dump());
+
   const SCEV *Elems = nullptr;
   if (auto *TC = dyn_cast<SCEVAddExpr>(TripCountSE))
     if (auto *Div = dyn_cast<SCEVUDivExpr>(TC->getOperand(1)))
@@ -532,9 +545,68 @@
   SCEVExpander Expander(*SE, DL, "elements");
   TCP.NumElements = Expander.expandCodeFor(Elems, Elems->getType(), InsertPt);
 
-  if (!MatchElemCountLoopSetup(L, TCP.Shuffle, TCP.NumElements))
+  // If the expanded NumElements expression found in the SCEV expression is a
+  // plain value (not an instruction), we can directly match it with the IR, the
+  // vector body and the masked load/store. This is the case for single loops,
+  // or nested loops with loop iterators that are independent of each other.
+  if (!isa<Instruction>(TCP.NumElements)) {
+    LLVM_DEBUG(dbgs() << "it's a value: done!\n");
+    if (!MatchElemCountLoopSetup(L, TCP.Shuffle, TCP.NumElements))
+      return false;
+    return true;
+  }
+
+  // Otherwise, if the expanded NumElements expression is not a value, we are
+  // dealing with nested-loops with dependent loop iterators, and here we
+  // recognise only inner loops which receive their start/stop value from the
+  // outer loop. For such SCEVAddRecExpr, we have to work a lot harder to match
+  // the scalar trip count (TC). I.e., for an inner loop, SCEV will return a
+  // scalar evolution expression/statement with respect to its outer loop, and
+  // this outer-loop value is what we extract from the SCEV expression above.
+  // Since this does not correspond to the trip count of the inner loop, we
+  // traverse this def-use chain. I.e. the use of that outerloop value, will
+  // be used by the instruction that sets the trip count of the inner loop.
+  // Thus, for a nested SCEV expression, we look one loop-nest level up for a
+  // define. Then, we match this value with IR patterns used by the masked
+  // loads/stores to check if we indeed found the scalar loop trip count. This
+  // will be safe because if these values don't match, we bail and don't perform
+  // tail-predication.
+  LLVM_DEBUG(dbgs() << "ARM TP: Matching scalar TC: "; TCP.NumElements->dump());
+  Value *IC = nullptr;
+  Loop *ParentLoop = L->getParentLoop();
+  while (ParentLoop && !IC) {
+    for (auto *U : TCP.NumElements->users()) {
+      LLVM_DEBUG(dbgs() << "ARM TP: Analysing user: "; U->dump(););
+      // If the user is not loop invariant, something is happening in the loop
+      // that we don't understand.
+      if (!L->isLoopInvariant(U)) {
+        LLVM_DEBUG(dbgs() << "ARM TP: user not loop invariant\n");
+        return false;
+      }
+      // A user that is a compare (feeding a branch) is fine: just ignore it.
+      if (isa<CmpInst>(U))
+        continue;
+
+      // While there can be several uses in the loop hierarchy, we expect the
+      // instruction that sets the trip count and is a user to be in the parent
+      // loop.
+      auto *UserInst = dyn_cast<Instruction>(U);
+      if (UserInst && ParentLoop->contains(UserInst)) {
+        LLVM_DEBUG(dbgs() << "ARM TP: Set as scalar TC: "; U->dump());
+        IC = U;
+        break;
+      }
+    }
+    ParentLoop = ParentLoop->getParentLoop();
+  }
+
+  if (!IC)
     return false;
 
+  LLVM_DEBUG(dbgs() << "ARM TP: FoundNumElements: "; IC->dump());
+  TCP.NumElements = IC;
+  if (!MatchElemCountLoopSetup(L, TCP.Shuffle, TCP.NumElements))
+    return false;
   return true;
 }
 
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested-loop.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested-loop.ll
@@ -0,0 +1,564 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -disable-mve-tail-predication=false -mattr=+mve %s -S -o - | FileCheck %s
+
+
+; This IR corresponds to a 2d loop, where the inner loop receives its stop
+; value from the outer loop and thus depends on the outer loop:
+;
+;   for (i = 0; i < N; i++)
+;     M = Size - i;
+;     for (j = 0; j < M; j++)
+;       // reduction
+;
+define dso_local void @SCEVAddRecExpr_2d(i16* nocapture readonly %Input, i16* nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr #0 {
+; CHECK-LABEL: @SCEVAddRecExpr_2d(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[N:%.*]] to i32
+; CHECK-NEXT:    [[CMP36:%.*]] = icmp sgt i16 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP36]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END17:%.*]]
+; CHECK:       for.body.lr.ph:
+; CHECK-NEXT:    [[CONV2:%.*]] = sext i16 [[SIZE:%.*]] to i32
+; CHECK-NEXT:    [[CONV1032:%.*]] = zext i16 [[SCALE:%.*]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[CONV2]], 3
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[LSR_IV53:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[FOR_END:%.*]] ], [ [[TMP0]], [[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[LSR_IV48:%.*]] = phi i16* [ [[SCEVGEP49:%.*]], [[FOR_END]] ], [ [[INPUT:%.*]], [[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[I_037:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC16:%.*]], [[FOR_END]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[I_037]], -1
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[CONV2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw i32 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], -4
+; CHECK-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 2
+; CHECK-NEXT:    [[TMP8:%.*]] = shl i32 [[TMP7]], 2
+; CHECK-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP2]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nsw i32 [[I_037]], -1
+; CHECK-NEXT:    [[TMP11:%.*]] = add i32 [[TMP0]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = lshr i32 [[TMP11]], 2
+; CHECK-NEXT:    [[TMP13:%.*]] = shl nuw i32 [[TMP12]], 2
+; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP13]], -4
+; CHECK-NEXT:    [[TMP15:%.*]] = lshr i32 [[TMP14]], 2
+; CHECK-NEXT:    [[TMP16:%.*]] = add nuw nsw i32 [[TMP15]], 1
+; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[CONV2]], [[I_037]]
+; CHECK-NEXT:    [[CMP433:%.*]] = icmp slt i32 [[I_037]], [[CONV2]]
+; CHECK-NEXT:    br i1 [[CMP433]], label [[VECTOR_PH:%.*]], label [[FOR_END]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[TMP16]])
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[LSR_IV50:%.*]] = phi i16* [ [[SCEVGEP51:%.*]], [[VECTOR_BODY]] ], [ [[LSR_IV48]], [[VECTOR_PH]] ]
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i16* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[INPUT]], [[VECTOR_PH]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP18:%.*]] = phi i32 [ [[TMP16]], [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP19:%.*]] = phi i32 [ [[TMP17]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[LSR_IV5052:%.*]] = bitcast i16* [[LSR_IV50]] to <4 x i16>*
+; CHECK-NEXT:    [[LSR_IV47:%.*]] = bitcast i16* [[LSR_IV]] to <4 x i16>*
+; CHECK-NEXT:    [[TMP20:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP19]])
+; CHECK-NEXT:    [[TMP21]] = sub i32 [[TMP19]], 4
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[LSR_IV47]], i32 2, <4 x i1> [[TMP20]], <4 x i16> undef)
+; CHECK-NEXT:    [[TMP22:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32>
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD42:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[LSR_IV5052]], i32 2, <4 x i1> [[TMP20]], <4 x i16> undef)
+; CHECK-NEXT:    [[TMP23:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD42]] to <4 x i32>
+; CHECK-NEXT:    [[TMP24:%.*]] = mul nsw <4 x i32> [[TMP23]], [[TMP22]]
+; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <4 x i32> undef, i32 [[CONV1032]], i32 0
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <4 x i32> [[TMP25]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP27:%.*]] = ashr <4 x i32> [[TMP24]], [[TMP26]]
+; CHECK-NEXT:    [[TMP28]] = add <4 x i32> [[TMP27]], [[VEC_PHI]]
+; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i16, i16* [[LSR_IV]], i32 4
+; CHECK-NEXT:    [[SCEVGEP51]] = getelementptr i16, i16* [[LSR_IV50]], i32 4
+; CHECK-NEXT:    [[TMP29]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[TMP18]], i32 1)
+; CHECK-NEXT:    [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0
+; CHECK-NEXT:    br i1 [[TMP30]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[VEC_PHI_LCSSA:%.*]] = phi <4 x i32> [ [[VEC_PHI]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi <4 x i32> [ [[TMP28]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP31:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP9]])
+; CHECK-NEXT:    [[TMP32:%.*]] = select <4 x i1> [[TMP31]], <4 x i32> [[DOTLCSSA]], <4 x i32> [[VEC_PHI_LCSSA]]
+; CHECK-NEXT:    [[TMP33:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP32]])
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[TMP34:%.*]] = lshr i32 [[SUM_0_LCSSA]], 16
+; CHECK-NEXT:    [[CONV13:%.*]] = trunc i32 [[TMP34]] to i16
+; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i16, i16* [[OUTPUT:%.*]], i32 [[I_037]]
+; CHECK-NEXT:    store i16 [[CONV13]], i16* [[ARRAYIDX14]], align 2
+; CHECK-NEXT:    [[INC16]] = add nuw nsw i32 [[I_037]], 1
+; CHECK-NEXT:    [[SCEVGEP49]] = getelementptr i16, i16* [[LSR_IV48]], i32 1
+; CHECK-NEXT:    [[LSR_IV_NEXT]] = add i32 [[LSR_IV53]], -1
+; CHECK-NEXT:    [[EXITCOND39:%.*]] = icmp eq i32 [[INC16]], [[CONV]]
+; CHECK-NEXT:    br i1 [[EXITCOND39]], label [[FOR_END17]], label [[FOR_BODY]]
+; CHECK:       for.end17:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %conv = sext i16 %N to i32
+  %cmp36 = icmp sgt i16 %N, 0
+  br i1 %cmp36, label %for.body.lr.ph, label %for.end17
+
+for.body.lr.ph:                                   ; preds = %entry
+  %conv2 = sext i16 %Size to i32
+  %conv1032 = zext i16 %Scale to i32
+  %0 = add i32 %conv2, 3
+  br label %for.body
+
+for.body:                                         ; preds = %for.end, %for.body.lr.ph
+  %lsr.iv53 = phi i32 [ %lsr.iv.next, %for.end ], [ %0, %for.body.lr.ph ]
+  %lsr.iv48 = phi i16* [ %scevgep49, %for.end ], [ %Input, %for.body.lr.ph ]
+  %i.037 = phi i32 [ 0, %for.body.lr.ph ], [ %inc16, %for.end ]
+  %1 = mul nsw i32 %i.037, -1
+  %2 = add i32 %0, %1
+  %3 = lshr i32 %2, 2
+  %4 = shl nuw i32 %3, 2
+  %5 = add i32 %4, -4
+  %6 = lshr i32 %5, 2
+  %7 = add nuw nsw i32 %6, 1
+  %8 = sub i32 %conv2, %i.037
+  %cmp433 = icmp slt i32 %i.037, %conv2
+  br i1 %cmp433, label %vector.ph, label %for.end
+
+vector.ph:                                        ; preds = %for.body
+  %trip.count.minus.1 = add i32 %8, -1
+  call void @llvm.set.loop.iterations.i32(i32 %7)
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %lsr.iv50 = phi i16* [ %scevgep51, %vector.body ], [ %lsr.iv48, %vector.ph ]
+  %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %Input, %vector.ph ]
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %19, %vector.body ]
+  %9 = phi i32 [ %7, %vector.ph ], [ %20, %vector.body ]
+  %lsr.iv5052 = bitcast i16* %lsr.iv50 to <4 x i16>*
+  %lsr.iv47 = bitcast i16* %lsr.iv to <4 x i16>*
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
+  %10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %11 = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> zeroinitializer
+  %12 = icmp ule <4 x i32> %induction, %11
+  %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv47, i32 2, <4 x i1> %12, <4 x i16> undef)
+  %13 = sext <4 x i16> %wide.masked.load to <4 x i32>
+  %wide.masked.load42 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv5052, i32 2, <4 x i1> %12, <4 x i16> undef)
+  %14 = sext <4 x i16> %wide.masked.load42 to <4 x i32>
+  %15 = mul nsw <4 x i32> %14, %13
+  %16 = insertelement <4 x i32> undef, i32 %conv1032, i32 0
+  %17 = shufflevector <4 x i32> %16, <4 x i32> undef, <4 x i32> zeroinitializer
+  %18 = ashr <4 x i32> %15, %17
+  %19 = add <4 x i32> %18, %vec.phi
+  %index.next = add i32 %index, 4
+  %scevgep = getelementptr i16, i16* %lsr.iv, i32 4
+  %scevgep51 = getelementptr i16, i16* %lsr.iv50, i32 4
+  %20 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %9, i32 1)
+  %21 = icmp ne i32 %20, 0
+  br i1 %21, label %vector.body, label %middle.block
+
+middle.block:                                     ; preds = %vector.body
+  %22 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %23 = shufflevector <4 x i32> %22, <4 x i32> undef, <4 x i32> zeroinitializer
+  %24 = icmp ule <4 x i32> %induction, %23
+  %25 = select <4 x i1> %24, <4 x i32> %19, <4 x i32> %vec.phi
+  %26 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %25)
+  br label %for.end
+
+for.end:                                          ; preds = %middle.block, %for.body
+  %Sum.0.lcssa = phi i32 [ 0, %for.body ], [ %26, %middle.block ]
+  %27 = lshr i32 %Sum.0.lcssa, 16
+  %conv13 = trunc i32 %27 to i16
+  %arrayidx14 = getelementptr inbounds i16, i16* %Output, i32 %i.037
+  store i16 %conv13, i16* %arrayidx14, align 2
+  %inc16 = add nuw nsw i32 %i.037, 1
+  %scevgep49 = getelementptr i16, i16* %lsr.iv48, i32 1
+  %lsr.iv.next = add i32 %lsr.iv53, -1
+  %exitcond39 = icmp eq i32 %inc16, %conv
+  br i1 %exitcond39, label %for.end17, label %for.body
+
+for.end17:                                        ; preds = %for.end, %entry
+  ret void
+}
+
+; This is almost the same as SCEVAddRecExpr_2d, except that a statement that
+; is not loop invariant has been added to vector.body.
+;
+define dso_local void @SCEVAddRecExpr_2d_not_invariant(i16* nocapture readonly %Input, i16* nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr #0 {
+; CHECK-LABEL: @SCEVAddRecExpr_2d_not_invariant(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[N:%.*]] to i32
+; CHECK-NEXT:    [[CMP36:%.*]] = icmp sgt i16 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP36]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END17:%.*]]
+; CHECK:       for.body.lr.ph:
+; CHECK-NEXT:    [[CONV2:%.*]] = sext i16 [[SIZE:%.*]] to i32
+; CHECK-NEXT:    [[CONV1032:%.*]] = zext i16 [[SCALE:%.*]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[CONV2]], 3
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[LSR_IV53:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[FOR_END:%.*]] ], [ [[TMP0]], [[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[LSR_IV48:%.*]] = phi i16* [ [[SCEVGEP49:%.*]], [[FOR_END]] ], [ [[INPUT:%.*]], [[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[I_037:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC16:%.*]], [[FOR_END]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nsw i32 [[I_037]], -1
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = lshr i32 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw i32 [[TMP3]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], -4
+; CHECK-NEXT:    [[TMP6:%.*]] = lshr i32 [[TMP5]], 2
+; CHECK-NEXT:    [[TMP7:%.*]] = add nuw nsw i32 [[TMP6]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = sub i32 [[CONV2]], [[I_037]]
+; CHECK-NEXT:    [[CMP433:%.*]] = icmp slt i32 [[I_037]], [[CONV2]]
+; CHECK-NEXT:    br i1 [[CMP433]], label [[VECTOR_PH:%.*]], label [[FOR_END]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[TMP8]], -1
+; CHECK-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[TMP7]])
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[LSR_IV50:%.*]] = phi i16* [ [[SCEVGEP51:%.*]], [[VECTOR_BODY]] ], [ [[LSR_IV48]], [[VECTOR_PH]] ]
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i16* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[INPUT]], [[VECTOR_PH]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP9:%.*]] = phi i32 [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[CONV2_PHI:%.*]] = phi i32 [ [[CONV2]], [[VECTOR_PH]] ], [ [[CONV2_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[CONV2_NEXT]] = add i32 [[CONV2_PHI]], 4
+; CHECK-NEXT:    [[LSR_IV5052:%.*]] = bitcast i16* [[LSR_IV50]] to <4 x i16>*
+; CHECK-NEXT:    [[LSR_IV47:%.*]] = bitcast i16* [[LSR_IV]] to <4 x i16>*
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = or <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ule <4 x i32> [[INDUCTION]], [[TMP11]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[LSR_IV47]], i32 2, <4 x i1> [[TMP12]], <4 x i16> undef)
+; CHECK-NEXT:    [[TMP13:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32>
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD42:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[LSR_IV5052]], i32 2, <4 x i1> [[TMP12]], <4 x i16> undef)
+; CHECK-NEXT:    [[TMP14:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD42]] to <4 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = mul nsw <4 x i32> [[TMP14]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> undef, i32 [[CONV1032]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP16]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP18:%.*]] = ashr <4 x i32> [[TMP15]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19]] = add <4 x i32> [[TMP18]], [[VEC_PHI]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i16, i16* [[LSR_IV]], i32 4
+; CHECK-NEXT:    [[SCEVGEP51]] = getelementptr i16, i16* [[LSR_IV50]], i32 4
+; CHECK-NEXT:    [[TMP20]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[TMP9]], i32 1)
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0
+; CHECK-NEXT:    br i1 [[TMP21]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <4 x i32> [[TMP22]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP24:%.*]] = icmp ule <4 x i32> [[INDUCTION]], [[TMP23]]
+; CHECK-NEXT:    [[TMP25:%.*]] = select <4 x i1> [[TMP24]], <4 x i32> [[TMP19]], <4 x i32> [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP26:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP25]])
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[TMP27:%.*]] = lshr i32 [[SUM_0_LCSSA]], 16
+; CHECK-NEXT:    [[CONV13:%.*]] = trunc i32 [[TMP27]] to i16
+; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i16, i16* [[OUTPUT:%.*]], i32 [[I_037]]
+; CHECK-NEXT:    store i16 [[CONV13]], i16* [[ARRAYIDX14]], align 2
+; CHECK-NEXT:    [[INC16]] = add nuw nsw i32 [[I_037]], 1
+; CHECK-NEXT:    [[SCEVGEP49]] = getelementptr i16, i16* [[LSR_IV48]], i32 1
+; CHECK-NEXT:    [[LSR_IV_NEXT]] = add i32 [[LSR_IV53]], -1
+; CHECK-NEXT:    [[EXITCOND39:%.*]] = icmp eq i32 [[INC16]], [[CONV]]
+; CHECK-NEXT:    br i1 [[EXITCOND39]], label [[FOR_END17]], label [[FOR_BODY]]
+; CHECK:       for.end17:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %conv = sext i16 %N to i32
+  %cmp36 = icmp sgt i16 %N, 0
+  br i1 %cmp36, label %for.body.lr.ph, label %for.end17
+
+for.body.lr.ph:                                   ; preds = %entry
+  %conv2 = sext i16 %Size to i32
+  %conv1032 = zext i16 %Scale to i32
+  %0 = add i32 %conv2, 3
+  br label %for.body
+
+for.body:                                         ; preds = %for.end, %for.body.lr.ph
+  %lsr.iv53 = phi i32 [ %lsr.iv.next, %for.end ], [ %0, %for.body.lr.ph ]
+  %lsr.iv48 = phi i16* [ %scevgep49, %for.end ], [ %Input, %for.body.lr.ph ]
+  %i.037 = phi i32 [ 0, %for.body.lr.ph ], [ %inc16, %for.end ]
+  %1 = mul nsw i32 %i.037, -1
+  %2 = add i32 %0, %1
+  %3 = lshr i32 %2, 2
+  %4 = shl nuw i32 %3, 2
+  %5 = add i32 %4, -4
+  %6 = lshr i32 %5, 2
+  %7 = add nuw nsw i32 %6, 1
+  %8 = sub i32 %conv2, %i.037
+  %cmp433 = icmp slt i32 %i.037, %conv2
+  br i1 %cmp433, label %vector.ph, label %for.end
+
+vector.ph:                                        ; preds = %for.body
+  %trip.count.minus.1 = add i32 %8, -1
+  call void @llvm.set.loop.iterations.i32(i32 %7)
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %lsr.iv50 = phi i16* [ %scevgep51, %vector.body ], [ %lsr.iv48, %vector.ph ]
+  %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %Input, %vector.ph ]
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %19, %vector.body ]
+  %9 = phi i32 [ %7, %vector.ph ], [ %20, %vector.body ]
+
+; Statement that is not loop invariant added here:
+
+  %conv2.phi = phi i32 [ %conv2, %vector.ph ], [ %conv2.next, %vector.body ]
+  %conv2.next = add i32 %conv2.phi, 4
+
+  %lsr.iv5052 = bitcast i16* %lsr.iv50 to <4 x i16>*
+  %lsr.iv47 = bitcast i16* %lsr.iv to <4 x i16>*
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
+  %10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %11 = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> zeroinitializer
+  %12 = icmp ule <4 x i32> %induction, %11
+  %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv47, i32 2, <4 x i1> %12, <4 x i16> undef)
+  %13 = sext <4 x i16> %wide.masked.load to <4 x i32>
+  %wide.masked.load42 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv5052, i32 2, <4 x i1> %12, <4 x i16> undef)
+  %14 = sext <4 x i16> %wide.masked.load42 to <4 x i32>
+  %15 = mul nsw <4 x i32> %14, %13
+  %16 = insertelement <4 x i32> undef, i32 %conv1032, i32 0
+  %17 = shufflevector <4 x i32> %16, <4 x i32> undef, <4 x i32> zeroinitializer
+  %18 = ashr <4 x i32> %15, %17
+  %19 = add <4 x i32> %18, %vec.phi
+  %index.next = add i32 %index, 4
+  %scevgep = getelementptr i16, i16* %lsr.iv, i32 4
+  %scevgep51 = getelementptr i16, i16* %lsr.iv50, i32 4
+  %20 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %9, i32 1)
+  %21 = icmp ne i32 %20, 0
+  br i1 %21, label %vector.body, label %middle.block
+
+middle.block:                                     ; preds = %vector.body
+  %22 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %23 = shufflevector <4 x i32> %22, <4 x i32> undef, <4 x i32> zeroinitializer
+  %24 = icmp ule <4 x i32> %induction, %23
+  %25 = select <4 x i1> %24, <4 x i32> %19, <4 x i32> %vec.phi
+  %26 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %25)
+  br label %for.end
+
+for.end:                                          ; preds = %middle.block, %for.body
+  %Sum.0.lcssa = phi i32 [ 0, %for.body ], [ %26, %middle.block ]
+  %27 = lshr i32 %Sum.0.lcssa, 16
+  %conv13 = trunc i32 %27 to i16
+  %arrayidx14 = getelementptr inbounds i16, i16* %Output, i32 %i.037
+  store i16 %conv13, i16* %arrayidx14, align 2
+  %inc16 = add nuw nsw i32 %i.037, 1
+  %scevgep49 = getelementptr i16, i16* %lsr.iv48, i32 1
+  %lsr.iv.next = add i32 %lsr.iv53, -1
+  %exitcond39 = icmp eq i32 %inc16, %conv
+  br i1 %exitcond39, label %for.end17, label %for.body
+
+for.end17:                                        ; preds = %for.end, %entry
+  ret void
+}
+
+
+; This IR corresponds to this 3d loop:
+;
+;   for (k = 0; k < N; k++)
+;     for (i = 0; i < N; i++)
+;       M = Size - i;
+;       for (j = 0; j < M; j++)
+;         // reduction
+;
+; Inner loop j depends on its outer loop i, but not on the outermost loop k.
+;
+define dso_local void @SCEVAddRecExpr_3d(i16* nocapture readonly %Input, i16* nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr #0 {
+; CHECK-LABEL: @SCEVAddRecExpr_3d(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[N:%.*]] to i32
+; CHECK-NEXT:    [[CMP52:%.*]] = icmp sgt i16 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP52]], label [[FOR_COND2_PREHEADER_LR_PH:%.*]], label [[FOR_END26:%.*]]
+; CHECK:       for.cond2.preheader.lr.ph:
+; CHECK-NEXT:    [[CONV7:%.*]] = sext i16 [[SIZE:%.*]] to i32
+; CHECK-NEXT:    [[CONV1645:%.*]] = zext i16 [[SCALE:%.*]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[CONV7]], 3
+; CHECK-NEXT:    br label [[FOR_COND2_PREHEADER_US:%.*]]
+; CHECK:       for.cond2.preheader.us:
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i16* [ [[SCEVGEP:%.*]], [[FOR_COND2_FOR_INC24_CRIT_EDGE_US:%.*]] ], [ [[INPUT:%.*]], [[FOR_COND2_PREHEADER_LR_PH]] ]
+; CHECK-NEXT:    [[K_053_US:%.*]] = phi i32 [ 0, [[FOR_COND2_PREHEADER_LR_PH]] ], [ [[INC25_US:%.*]], [[FOR_COND2_FOR_INC24_CRIT_EDGE_US]] ]
+; CHECK-NEXT:    br label [[FOR_BODY6_US:%.*]]
+; CHECK:       for.body6.us:
+; CHECK-NEXT:    [[LSR_IV72:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[FOR_END_US:%.*]] ], [ [[TMP0]], [[FOR_COND2_PREHEADER_US]] ]
+; CHECK-NEXT:    [[LSR_IV67:%.*]] = phi i16* [ [[SCEVGEP68:%.*]], [[FOR_END_US]] ], [ [[INPUT]], [[FOR_COND2_PREHEADER_US]] ]
+; CHECK-NEXT:    [[I_050_US:%.*]] = phi i32 [ 0, [[FOR_COND2_PREHEADER_US]] ], [ [[INC22_US:%.*]], [[FOR_END_US]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[I_050_US]], -1
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[CONV7]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw i32 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], -4
+; CHECK-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 2
+; CHECK-NEXT:    [[TMP8:%.*]] = shl i32 [[TMP7]], 2
+; CHECK-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP2]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nsw i32 [[I_050_US]], -1
+; CHECK-NEXT:    [[TMP11:%.*]] = add i32 [[TMP0]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = lshr i32 [[TMP11]], 2
+; CHECK-NEXT:    [[TMP13:%.*]] = shl nuw i32 [[TMP12]], 2
+; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP13]], -4
+; CHECK-NEXT:    [[TMP15:%.*]] = lshr i32 [[TMP14]], 2
+; CHECK-NEXT:    [[TMP16:%.*]] = add nuw nsw i32 [[TMP15]], 1
+; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[CONV7]], [[I_050_US]]
+; CHECK-NEXT:    [[CMP946_US:%.*]] = icmp slt i32 [[I_050_US]], [[CONV7]]
+; CHECK-NEXT:    br i1 [[CMP946_US]], label [[VECTOR_PH:%.*]], label [[FOR_END_US]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[TMP16]])
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[LSR_IV69:%.*]] = phi i16* [ [[SCEVGEP70:%.*]], [[VECTOR_BODY]] ], [ [[LSR_IV67]], [[VECTOR_PH]] ]
+; CHECK-NEXT:    [[LSR_IV64:%.*]] = phi i16* [ [[SCEVGEP65:%.*]], [[VECTOR_BODY]] ], [ [[LSR_IV]], [[VECTOR_PH]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP18:%.*]] = phi i32 [ [[TMP16]], [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP19:%.*]] = phi i32 [ [[TMP17]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[LSR_IV6971:%.*]] = bitcast i16* [[LSR_IV69]] to <4 x i16>*
+; CHECK-NEXT:    [[LSR_IV6466:%.*]] = bitcast i16* [[LSR_IV64]] to <4 x i16>*
+; CHECK-NEXT:    [[TMP20:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP19]])
+; CHECK-NEXT:    [[TMP21]] = sub i32 [[TMP19]], 4
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[LSR_IV6466]], i32 2, <4 x i1> [[TMP20]], <4 x i16> undef)
+; CHECK-NEXT:    [[TMP22:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32>
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD59:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[LSR_IV6971]], i32 2, <4 x i1> [[TMP20]], <4 x i16> undef)
+; CHECK-NEXT:    [[TMP23:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD59]] to <4 x i32>
+; CHECK-NEXT:    [[TMP24:%.*]] = mul nsw <4 x i32> [[TMP23]], [[TMP22]]
+; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <4 x i32> undef, i32 [[CONV1645]], i32 0
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <4 x i32> [[TMP25]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP27:%.*]] = ashr <4 x i32> [[TMP24]], [[TMP26]]
+; CHECK-NEXT:    [[TMP28]] = add <4 x i32> [[TMP27]], [[VEC_PHI]]
+; CHECK-NEXT:    [[SCEVGEP65]] = getelementptr i16, i16* [[LSR_IV64]], i32 4
+; CHECK-NEXT:    [[SCEVGEP70]] = getelementptr i16, i16* [[LSR_IV69]], i32 4
+; CHECK-NEXT:    [[TMP29]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[TMP18]], i32 1)
+; CHECK-NEXT:    [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0
+; CHECK-NEXT:    br i1 [[TMP30]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[VEC_PHI_LCSSA:%.*]] = phi <4 x i32> [ [[VEC_PHI]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi <4 x i32> [ [[TMP28]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP31:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP9]])
+; CHECK-NEXT:    [[TMP32:%.*]] = select <4 x i1> [[TMP31]], <4 x i32> [[DOTLCSSA]], <4 x i32> [[VEC_PHI_LCSSA]]
+; CHECK-NEXT:    [[TMP33:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP32]])
+; CHECK-NEXT:    br label [[FOR_END_US]]
+; CHECK:       for.end.us:
+; CHECK-NEXT:    [[SUM_0_LCSSA_US:%.*]] = phi i32 [ 0, [[FOR_BODY6_US]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[TMP34:%.*]] = lshr i32 [[SUM_0_LCSSA_US]], 16
+; CHECK-NEXT:    [[CONV19_US:%.*]] = trunc i32 [[TMP34]] to i16
+; CHECK-NEXT:    [[ARRAYIDX20_US:%.*]] = getelementptr inbounds i16, i16* [[OUTPUT:%.*]], i32 [[I_050_US]]
+; CHECK-NEXT:    store i16 [[CONV19_US]], i16* [[ARRAYIDX20_US]], align 2
+; CHECK-NEXT:    [[INC22_US]] = add nuw nsw i32 [[I_050_US]], 1
+; CHECK-NEXT:    [[SCEVGEP68]] = getelementptr i16, i16* [[LSR_IV67]], i32 1
+; CHECK-NEXT:    [[LSR_IV_NEXT]] = add i32 [[LSR_IV72]], -1
+; CHECK-NEXT:    [[EXITCOND55:%.*]] = icmp eq i32 [[INC22_US]], [[CONV]]
+; CHECK-NEXT:    br i1 [[EXITCOND55]], label [[FOR_COND2_FOR_INC24_CRIT_EDGE_US]], label [[FOR_BODY6_US]]
+; CHECK:       for.cond2.for.inc24_crit_edge.us:
+; CHECK-NEXT:    [[INC25_US]] = add nuw nsw i32 [[K_053_US]], 1
+; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i16, i16* [[LSR_IV]], i32 1
+; CHECK-NEXT:    [[EXITCOND56:%.*]] = icmp eq i32 [[INC25_US]], [[CONV]]
+; CHECK-NEXT:    br i1 [[EXITCOND56]], label [[FOR_END26]], label [[FOR_COND2_PREHEADER_US]]
+; CHECK:       for.end26:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %conv = sext i16 %N to i32
+  %cmp52 = icmp sgt i16 %N, 0
+  br i1 %cmp52, label %for.cond2.preheader.lr.ph, label %for.end26
+
+for.cond2.preheader.lr.ph:                        ; preds = %entry
+  %conv7 = sext i16 %Size to i32
+  %conv1645 = zext i16 %Scale to i32
+  %0 = add i32 %conv7, 3
+  br label %for.cond2.preheader.us
+
+for.cond2.preheader.us:                           ; preds = %for.cond2.for.inc24_crit_edge.us, %for.cond2.preheader.lr.ph
+  %lsr.iv = phi i16* [ %scevgep, %for.cond2.for.inc24_crit_edge.us ], [ %Input, %for.cond2.preheader.lr.ph ]
+  %k.053.us = phi i32 [ 0, %for.cond2.preheader.lr.ph ], [ %inc25.us, %for.cond2.for.inc24_crit_edge.us ]
+  br label %for.body6.us
+
+for.body6.us:                                     ; preds = %for.end.us, %for.cond2.preheader.us
+  %lsr.iv72 = phi i32 [ %lsr.iv.next, %for.end.us ], [ %0, %for.cond2.preheader.us ]
+  %lsr.iv67 = phi i16* [ %scevgep68, %for.end.us ], [ %Input, %for.cond2.preheader.us ]
+  %i.050.us = phi i32 [ 0, %for.cond2.preheader.us ], [ %inc22.us, %for.end.us ]
+  %1 = mul nsw i32 %i.050.us, -1
+  %2 = add i32 %0, %1
+  %3 = lshr i32 %2, 2
+  %4 = shl nuw i32 %3, 2
+  %5 = add i32 %4, -4
+  %6 = lshr i32 %5, 2
+  %7 = add nuw nsw i32 %6, 1
+  %8 = sub i32 %conv7, %i.050.us
+  %cmp946.us = icmp slt i32 %i.050.us, %conv7
+  br i1 %cmp946.us, label %vector.ph, label %for.end.us
+
+vector.ph:                                        ; preds = %for.body6.us
+  %trip.count.minus.1 = add i32 %8, -1
+  call void @llvm.set.loop.iterations.i32(i32 %7)
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %lsr.iv69 = phi i16* [ %scevgep70, %vector.body ], [ %lsr.iv67, %vector.ph ]
+  %lsr.iv64 = phi i16* [ %scevgep65, %vector.body ], [ %lsr.iv, %vector.ph ]
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %19, %vector.body ]
+  %9 = phi i32 [ %7, %vector.ph ], [ %20, %vector.body ]
+  %lsr.iv6971 = bitcast i16* %lsr.iv69 to <4 x i16>*
+  %lsr.iv6466 = bitcast i16* %lsr.iv64 to <4 x i16>*
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
+  %10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %11 = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> zeroinitializer
+  %12 = icmp ule <4 x i32> %induction, %11
+  %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv6466, i32 2, <4 x i1> %12, <4 x i16> undef)
+  %13 = sext <4 x i16> %wide.masked.load to <4 x i32>
+  %wide.masked.load59 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv6971, i32 2, <4 x i1> %12, <4 x i16> undef)
+  %14 = sext <4 x i16> %wide.masked.load59 to <4 x i32>
+  %15 = mul nsw <4 x i32> %14, %13
+  %16 = insertelement <4 x i32> undef, i32 %conv1645, i32 0
+  %17 = shufflevector <4 x i32> %16, <4 x i32> undef, <4 x i32> zeroinitializer
+  %18 = ashr <4 x i32> %15, %17
+  %19 = add <4 x i32> %18, %vec.phi
+  %index.next = add i32 %index, 4
+  %scevgep65 = getelementptr i16, i16* %lsr.iv64, i32 4
+  %scevgep70 = getelementptr i16, i16* %lsr.iv69, i32 4
+  %20 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %9, i32 1)
+  %21 = icmp ne i32 %20, 0
+  br i1 %21, label %vector.body, label %middle.block
+
+middle.block:                                     ; preds = %vector.body
+  %22 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %23 = shufflevector <4 x i32> %22, <4 x i32> undef, <4 x i32> zeroinitializer
+  %24 = icmp ule <4 x i32> %induction, %23
+  %25 = select <4 x i1> %24, <4 x i32> %19, <4 x i32> %vec.phi
+  %26 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %25)
+  br label %for.end.us
+
+for.end.us:                                       ; preds = %middle.block, %for.body6.us
+  %Sum.0.lcssa.us = phi i32 [ 0, %for.body6.us ], [ %26, %middle.block ]
+  %27 = lshr i32 %Sum.0.lcssa.us, 16
+  %conv19.us = trunc i32 %27 to i16
+  %arrayidx20.us = getelementptr inbounds i16, i16* %Output, i32 %i.050.us
+  store i16 %conv19.us, i16* %arrayidx20.us, align 2
+  %inc22.us = add nuw nsw i32 %i.050.us, 1
+  %scevgep68 = getelementptr i16, i16* %lsr.iv67, i32 1
+  %lsr.iv.next = add i32 %lsr.iv72, -1
+  %exitcond55 = icmp eq i32 %inc22.us, %conv
+  br i1 %exitcond55, label %for.cond2.for.inc24_crit_edge.us, label %for.body6.us
+
+for.cond2.for.inc24_crit_edge.us:                 ; preds = %for.end.us
+  %inc25.us = add nuw nsw i32 %k.053.us, 1
+  %scevgep = getelementptr i16, i16* %lsr.iv, i32 1
+  %exitcond56 = icmp eq i32 %inc25.us, %conv
+  br i1 %exitcond56, label %for.end26, label %for.cond2.preheader.us
+
+for.end26:                                        ; preds = %for.cond2.for.inc24_crit_edge.us, %entry
+  ret void
+}
+
+declare void @llvm.set.loop.iterations.i32(i32)
+declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
+declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
+declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32 immarg, <2 x i1>, <2 x i64>)
+declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
+declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
+declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
+declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32 immarg, <2 x i1>)
+declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
+declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>)
+declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)