Index: llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -514,12 +514,27 @@
       WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
   }
 
-  // Int inductions are special because we only allow one IV.
-  if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
-      ID.getConstIntStepValue() && ID.getConstIntStepValue()->isOne() &&
-      isa<Constant>(ID.getStartValue()) &&
-      cast<Constant>(ID.getStartValue())->isNullValue()) {
+  // True if the induction has a constant integer step of +1 and starts at the
+  // constant zero. A null constant step value yields false.
+  auto IsStepValueOne = [&]() -> bool {
+    const auto *ConstStep = ID.getConstIntStepValue();
+    return ConstStep && ConstStep->isOne() &&
+           isa<Constant>(ID.getStartValue()) &&
+           cast<Constant>(ID.getStartValue())->isNullValue();
+  };
+
+  // True if the induction has a constant integer step of -1. Unlike
+  // IsStepValueOne, the start value is not inspected.
+  auto IsStepValueMinusOne = [&]() -> bool {
+    const auto *ConstStep = ID.getConstIntStepValue();
+    return ConstStep && ConstStep->isMinusOne();
+  };
 
+  // Int inductions are special because we only allow one IV.
+  // NOTE(review): the comment below claims a zero start, but only the
+  // step == 1 path checks that -- confirm this is intended for step == -1.
+  if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
+      (IsStepValueOne() || IsStepValueMinusOne())) {
     // Use the phi node with the widest type as induction. Use the last
     // one if there are multiple (no good reason for doing this other
     // than it is expedient). We've checked that it begins at zero and
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1825,13 +1825,21 @@
     return cast<SCEVUnknown>(Step)->getValue();
   };
 
+  // True if the induction described by ID has a constant integer step of -1;
+  // false when the constant step value is null.
+  auto IsStepValueMinusOne = [&]() -> bool {
+    const auto *ConstStep = ID.getConstIntStepValue();
+    return ConstStep && ConstStep->isMinusOne();
+  };
+
   // The scalar value to broadcast. This is derived from the canonical
   // induction variable. If a truncation type is given, truncate the canonical
   // induction variable and step. Otherwise, derive these values from the
   // induction descriptor.
   auto CreateScalarIV = [&](Value *&Step) -> Value * {
     Value *ScalarIV = Induction;
-    if (IV != OldInduction) {
+
+    if (IV != OldInduction || IsStepValueMinusOne()) {
       ScalarIV = IV->getType()->isIntegerTy()
                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
                      : Builder.CreateCast(Instruction::SIToFP, Induction,
Index: llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
+++ llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
@@ -1,14 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+
+; Check that when we can't predicate this loop, it is still vectorised (with an epilogue).
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s --check-prefix=CHECK-NO-FOLD
+; CHECK-NO-FOLD:      vector.body
+; CHECK-NO-FOLD-NOT:  masked
+
+; Check tail-folding and predicated vector body.
 ; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -S | FileCheck %s
 ; RUN: opt < %s -loop-vectorize -disable-mve-tail-predication=false -S | FileCheck %s
 
-; Check that when we can't predicate this loop that it is still vectorised (with
-; an epilogue).
-; TODO: the reason this can't be predicated is because a primary induction
-; variable can't be found (not yet) for this counting down loop. But with that
-; fixed, this should be able to be predicated.
-
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv8.1m.main-arm-unknown-eabihf"
 
@@ -18,61 +18,62 @@
 ; CHECK-NEXT:    [[CMP6:%.*]] = icmp eq i32 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP6]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
 ; CHECK:       while.body.preheader:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 16
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N]], 16
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = sub i32 [[N]], [[N_VEC]]
-; CHECK-NEXT:    [[IND_END2:%.*]] = getelementptr i8, i8* [[C:%.*]], i32 [[N_VEC]]
-; CHECK-NEXT:    [[IND_END4:%.*]] = getelementptr i8, i8* [[B:%.*]], i32 [[N_VEC]]
-; CHECK-NEXT:    [[IND_END6:%.*]] = getelementptr i8, i8* [[A:%.*]], i32 [[N_VEC]]
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 15
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 16
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, i8* [[C:%.*]], i32 [[N_VEC]]
+; CHECK-NEXT:    [[IND_END3:%.*]] = getelementptr i8, i8* [[B:%.*]], i32 [[N_VEC]]
+; CHECK-NEXT:    [[IND_END5:%.*]] = getelementptr i8, i8* [[A:%.*]], i32 [[N_VEC]]
+; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[N]], 1
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <16 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT9:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT8]], <16 x i32> undef, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[INDEX]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> undef, i32 [[OFFSET_IDX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> undef, i32 [[INDEX]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> undef, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <16 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 -1, i32 -2, i32 -3, i32 -4, i32 -5, i32 -6, i32 -7, i32 -8, i32 -9, i32 -10, i32 -11, i32 -12, i32 -13, i32 -14, i32 -15>
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[C]], i32 [[TMP1]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[NEXT_GEP7:%.*]] = getelementptr i8, i8* [[B]], i32 [[TMP2]]
+; CHECK-NEXT:    [[NEXT_GEP6:%.*]] = getelementptr i8, i8* [[B]], i32 [[TMP2]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[NEXT_GEP8:%.*]] = getelementptr i8, i8* [[A]], i32 [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* [[NEXT_GEP8]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <16 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP5]], align 1
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8* [[NEXT_GEP7]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <16 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <16 x i8>, <16 x i8>* [[TMP7]], align 1
-; CHECK-NEXT:    [[TMP8:%.*]] = add <16 x i8> [[WIDE_LOAD9]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to <16 x i8>*
-; CHECK-NEXT:    store <16 x i8> [[TMP8]], <16 x i8>* [[TMP10]], align 1
+; CHECK-NEXT:    [[NEXT_GEP7:%.*]] = getelementptr i8, i8* [[A]], i32 [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ule <16 x i32> [[INDUCTION]], [[BROADCAST_SPLAT9]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[NEXT_GEP7]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <16 x i8>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP6]], i32 1, <16 x i1> [[TMP4]], <16 x i8> undef)
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, i8* [[NEXT_GEP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <16 x i8>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD10:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP8]], i32 1, <16 x i1> [[TMP4]], <16 x i8> undef)
+; CHECK-NEXT:    [[TMP9:%.*]] = add <16 x i8> [[WIDE_MASKED_LOAD10]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <16 x i8>*
+; CHECK-NEXT:    call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[TMP9]], <16 x i8>* [[TMP11]], i32 1, <16 x i1> [[TMP4]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT:    br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i8* [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[C]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i8* [ [[IND_END4]], [[MIDDLE_BLOCK]] ], [ [[B]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL5:%.*]] = phi i8* [ [[IND_END6]], [[MIDDLE_BLOCK]] ], [ [[A]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ [[N]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i8* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[C]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i8* [ [[IND_END3]], [[MIDDLE_BLOCK]] ], [ [[B]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL4:%.*]] = phi i8* [ [[IND_END5]], [[MIDDLE_BLOCK]] ], [ [[A]], [[WHILE_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
 ; CHECK:       while.body:
 ; CHECK-NEXT:    [[N_ADDR_010:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[C_ADDR_09:%.*]] = phi i8* [ [[INCDEC_PTR4:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[B_ADDR_08:%.*]] = phi i8* [ [[INCDEC_PTR1:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[A_ADDR_07:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL5]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[B_ADDR_08:%.*]] = phi i8* [ [[INCDEC_PTR1:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[A_ADDR_07:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL4]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[A_ADDR_07]], i32 1
-; CHECK-NEXT:    [[TMP12:%.*]] = load i8, i8* [[A_ADDR_07]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, i8* [[A_ADDR_07]], align 1
 ; CHECK-NEXT:    [[INCDEC_PTR1]] = getelementptr inbounds i8, i8* [[B_ADDR_08]], i32 1
-; CHECK-NEXT:    [[TMP13:%.*]] = load i8, i8* [[B_ADDR_08]], align 1
-; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[TMP13]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, i8* [[B_ADDR_08]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[TMP14]], [[TMP13]]
 ; CHECK-NEXT:    [[INCDEC_PTR4]] = getelementptr inbounds i8, i8* [[C_ADDR_09]], i32 1
 ; CHECK-NEXT:    store i8 [[ADD]], i8* [[C_ADDR_09]], align 1
 ; CHECK-NEXT:    [[DEC]] = add i32 [[N_ADDR_010]], -1
Index: llvm/test/Transforms/LoopVectorize/reverse_induction.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/reverse_induction.ll
+++ llvm/test/Transforms/LoopVectorize/reverse_induction.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -5,12 +6,82 @@
 ; Make sure consecutive vector generates correct negative indices.
 ; PR15882
 
-; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; CHECK: %offset.idx = sub i64 %startval, %index
-; CHECK: %[[a0:.+]] = add i64 %offset.idx, 0
-; CHECK: %[[a4:.+]] = add i64 %offset.idx, -4
-
 define i32 @reverse_induction_i64(i64 %startval, i32 * %ptr) {
+; CHECK-LABEL: @reverse_induction_i64(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], 1024
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI8:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+
+; This is the required offset.idx calculation that needs to be preserved:
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[INDEX]]
+
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[OFFSET_IDX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 -1, i64 -2, i64 -3>
+; CHECK-NEXT:    [[INDUCTION2:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 -4, i64 -5, i64 -6, i64 -7>
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -4
+; CHECK-NEXT:    [[OFFSET_IDX3:%.*]] = trunc i64 [[INDEX]] to i32
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> undef, i32 [[OFFSET_IDX3]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION6:%.*]] = add <4 x i32> [[BROADCAST_SPLAT5]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[INDUCTION7:%.*]] = add <4 x i32> [[BROADCAST_SPLAT5]], <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[OFFSET_IDX3]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[OFFSET_IDX3]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[TMP0]], -1
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP1]], -1
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 -3
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4
+; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 -4
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 -3
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4
+; CHECK-NEXT:    [[REVERSE10:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD9]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP14]] = add <4 x i32> [[REVERSE]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP15]] = add <4 x i32> [[REVERSE10]], [[VEC_PHI8]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF12:%.*]] = shufflevector <4 x i32> [[BIN_RDX11]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX13:%.*]] = add <4 x i32> [[BIN_RDX11]], [[RDX_SHUF12]]
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x i32> [[BIN_RDX13]], i32 0
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOPEND:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[STARTVAL]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[ADD_I7:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD_I:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[I_06:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC4:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[REDUX5:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[INC_REDUX:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ADD_I]] = add i64 [[ADD_I7]], -1
+; CHECK-NEXT:    [[KIND__I:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[ADD_I]]
+; CHECK-NEXT:    [[TMP_I1:%.*]] = load i32, i32* [[KIND__I]], align 4
+; CHECK-NEXT:    [[INC_REDUX]] = add i32 [[TMP_I1]], [[REDUX5]]
+; CHECK-NEXT:    [[INC4]] = add i32 [[I_06]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i32 [[INC4]], 1024
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[LOOPEND]], !llvm.loop !2
+; CHECK:       loopend:
+; CHECK-NEXT:    [[INC_REDUX_LCSSA:%.*]] = phi i32 [ [[INC_REDUX]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[INC_REDUX_LCSSA]]
+;
 entry:
   br label %for.body
 
@@ -30,13 +101,79 @@
   ret i32 %inc.redux
 }
 
-; CHECK-LABEL: @reverse_induction_i128(
-; CHECK: %index = phi i128 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; CHECK: %offset.idx = sub i128 %startval, %index
-; CHECK: %[[a0:.+]] = add i128 %offset.idx, 0
-; CHECK: %[[a4:.+]] = add i128 %offset.idx, -4
-
 define i32 @reverse_induction_i128(i128 %startval, i32 * %ptr) {
+; CHECK-LABEL: @reverse_induction_i128(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[IND_END:%.*]] = sub i128 [[STARTVAL:%.*]], 1024
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i128 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI8:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i128 [[STARTVAL]], [[INDEX]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i128> undef, i128 [[OFFSET_IDX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i128> [[BROADCAST_SPLATINSERT]], <4 x i128> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i128> [[BROADCAST_SPLAT]], <i128 0, i128 -1, i128 -2, i128 -3>
+; CHECK-NEXT:    [[INDUCTION2:%.*]] = add <4 x i128> [[BROADCAST_SPLAT]], <i128 -4, i128 -5, i128 -6, i128 -7>
+; CHECK-NEXT:    [[TMP0:%.*]] = add i128 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i128 [[OFFSET_IDX]], -4
+; CHECK-NEXT:    [[OFFSET_IDX3:%.*]] = trunc i128 [[INDEX]] to i32
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> undef, i32 [[OFFSET_IDX3]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION6:%.*]] = add <4 x i32> [[BROADCAST_SPLAT5]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[INDUCTION7:%.*]] = add <4 x i32> [[BROADCAST_SPLAT5]], <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[OFFSET_IDX3]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[OFFSET_IDX3]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = add i128 [[TMP0]], -1
+; CHECK-NEXT:    [[TMP5:%.*]] = add i128 [[TMP1]], -1
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i128 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i128 [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 -3
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4
+; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 -4
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 -3
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4
+; CHECK-NEXT:    [[REVERSE10:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD9]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP14]] = add <4 x i32> [[REVERSE]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP15]] = add <4 x i32> [[REVERSE10]], [[VEC_PHI8]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i128 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i128 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF12:%.*]] = shufflevector <4 x i32> [[BIN_RDX11]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX13:%.*]] = add <4 x i32> [[BIN_RDX11]], [[RDX_SHUF12]]
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x i32> [[BIN_RDX13]], i32 0
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i128 1024, 1024
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOPEND:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i128 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[STARTVAL]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[ADD_I7:%.*]] = phi i128 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD_I:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[I_06:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC4:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[REDUX5:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[INC_REDUX:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ADD_I]] = add i128 [[ADD_I7]], -1
+; CHECK-NEXT:    [[KIND__I:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i128 [[ADD_I]]
+; CHECK-NEXT:    [[TMP_I1:%.*]] = load i32, i32* [[KIND__I]], align 4
+; CHECK-NEXT:    [[INC_REDUX]] = add i32 [[TMP_I1]], [[REDUX5]]
+; CHECK-NEXT:    [[INC4]] = add i32 [[I_06]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i32 [[INC4]], 1024
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[LOOPEND]], !llvm.loop !5
+; CHECK:       loopend:
+; CHECK-NEXT:    [[INC_REDUX_LCSSA:%.*]] = phi i32 [ [[INC_REDUX]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[INC_REDUX_LCSSA]]
+;
 entry:
   br label %for.body
 
@@ -56,13 +193,92 @@
   ret i32 %inc.redux
 }
 
-; CHECK-LABEL: @reverse_induction_i16(
-; CHECK: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; CHECK: %offset.idx = sub i16 %startval, {{.*}}
-; CHECK: %[[a0:.+]] = add i16 %offset.idx, 0
-; CHECK: %[[a4:.+]] = add i16 %offset.idx, -4
-
 define i32 @reverse_induction_i16(i16 %startval, i32 * %ptr) {
+; CHECK-LABEL: @reverse_induction_i16(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK:       vector.scevcheck:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i16 [[STARTVAL:%.*]], -1
+; CHECK-NEXT:    [[MUL:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 1, i16 1023)
+; CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i16, i1 } [[MUL]], 0
+; CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i16, i1 } [[MUL]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = add i16 [[TMP0]], [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i16 [[TMP0]], [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt i16 [[TMP2]], [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt i16 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = select i1 true, i1 [[TMP3]], i1 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = or i1 [[TMP5]], [[MUL_OVERFLOW]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or i1 false, [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[IND_END:%.*]] = sub i16 [[STARTVAL]], 1024
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI7:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = trunc i32 [[INDEX]] to i16
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i16 [[STARTVAL]], [[TMP8]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> undef, i16 [[OFFSET_IDX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i16> [[BROADCAST_SPLAT]], <i16 0, i16 -1, i16 -2, i16 -3>
+; CHECK-NEXT:    [[INDUCTION2:%.*]] = add <4 x i16> [[BROADCAST_SPLAT]], <i16 -4, i16 -5, i16 -6, i16 -7>
+; CHECK-NEXT:    [[TMP9:%.*]] = add i16 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP10:%.*]] = add i16 [[OFFSET_IDX]], -4
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT3]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION5:%.*]] = add <4 x i32> [[BROADCAST_SPLAT4]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[INDUCTION6:%.*]] = add <4 x i32> [[BROADCAST_SPLAT4]], <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP11:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP13:%.*]] = add i16 [[TMP9]], -1
+; CHECK-NEXT:    [[TMP14:%.*]] = add i16 [[TMP10]], -1
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i16 [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i16 [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 0
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 -3
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 -4
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 -3
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP22]], align 4
+; CHECK-NEXT:    [[REVERSE9:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD8]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP23]] = add <4 x i32> [[REVERSE]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP24]] = add <4 x i32> [[REVERSE9]], [[VEC_PHI7]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP24]], [[TMP23]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF11:%.*]] = shufflevector <4 x i32> [[BIN_RDX10]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX12:%.*]] = add <4 x i32> [[BIN_RDX10]], [[RDX_SHUF11]]
+; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i32> [[BIN_RDX12]], i32 0
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 1024, 1024
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOPEND:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[STARTVAL]], [[ENTRY:%.*]] ], [ [[STARTVAL]], [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[ADD_I7:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD_I:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[I_06:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC4:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[REDUX5:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[INC_REDUX:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ADD_I]] = add i16 [[ADD_I7]], -1
+; CHECK-NEXT:    [[KIND__I:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i16 [[ADD_I]]
+; CHECK-NEXT:    [[TMP_I1:%.*]] = load i32, i32* [[KIND__I]], align 4
+; CHECK-NEXT:    [[INC_REDUX]] = add i32 [[TMP_I1]], [[REDUX5]]
+; CHECK-NEXT:    [[INC4]] = add i32 [[I_06]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i32 [[INC4]], 1024
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[LOOPEND]], !llvm.loop !7
+; CHECK:       loopend:
+; CHECK-NEXT:    [[INC_REDUX_LCSSA:%.*]] = phi i32 [ [[INC_REDUX]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[INC_REDUX_LCSSA]]
+;
 entry:
   br label %for.body
 
@@ -99,13 +315,72 @@
 ;   }
 ; }
 
-; CHECK-LABEL: @reverse_forward_induction_i64_i8(
-; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; CHECK: %offset.idx = sub i64 1023, %index
-; CHECK: %[[a0:.+]] = add i64 %offset.idx, 0
-; CHECK: %[[a4:.+]] = add i64 %offset.idx, -4
-
 define void @reverse_forward_induction_i64_i8() {
+; CHECK-LABEL: @reverse_forward_induction_i64_i8(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i8> [ <i8 0, i8 1, i8 2, i8 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[OFFSET_IDX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 -1, i64 -2, i64 -3>
+; CHECK-NEXT:    [[INDUCTION2:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 -4, i64 -5, i64 -6, i64 -7>
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -4
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i8> [[VEC_IND]], <i8 4, i8 4, i8 4, i8 4>
+; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i8> [[VEC_IND]], <i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i8> [[STEP_ADD]], <i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[TMP3]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[TMP1]]
+; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 -3
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[REVERSE]], <4 x i32>* [[TMP10]], align 4
+; CHECK-NEXT:    [[REVERSE4:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 -4
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 -3
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[REVERSE4]], <4 x i32>* [[TMP13]], align 4
+; CHECK-NEXT:    [[OFFSET_IDX5:%.*]] = sub i64 1023, [[INDEX]]
+; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[OFFSET_IDX5]] to i32
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> undef, i32 [[TMP14]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT6]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION8:%.*]] = add <4 x i32> [[BROADCAST_SPLAT7]], <i32 0, i32 -1, i32 -2, i32 -3>
+; CHECK-NEXT:    [[INDUCTION9:%.*]] = add <4 x i32> [[BROADCAST_SPLAT7]], <i32 -4, i32 -5, i32 -6, i32 -7>
+; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP14]], 0
+; CHECK-NEXT:    [[TMP16:%.*]] = add i32 [[TMP14]], -4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i8> [[STEP_ADD]], <i8 4, i8 4, i8 4, i8 4>
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i8 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    [[FORWARD_INDUCTION_05:%.*]] = phi i8 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    [[INC]] = add i8 [[FORWARD_INDUCTION_05]], 1
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[INC]] to i32
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[CONV]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP18]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[WHILE_BODY]], label [[WHILE_END]], !llvm.loop !9
+; CHECK:       while.end:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %while.body
 
@@ -125,13 +400,72 @@
   ret void
 }
 
-; CHECK-LABEL: @reverse_forward_induction_i64_i8_signed(
-; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; CHECK: %offset.idx = sub i64 1023, %index
-; CHECK: %[[a0:.+]] = add i64 %offset.idx, 0
-; CHECK: %[[a4:.+]] = add i64 %offset.idx, -4
-
 define void @reverse_forward_induction_i64_i8_signed() {
+; CHECK-LABEL: @reverse_forward_induction_i64_i8_signed(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i8> [ <i8 -127, i8 -126, i8 -125, i8 -124>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[OFFSET_IDX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 -1, i64 -2, i64 -3>
+; CHECK-NEXT:    [[INDUCTION2:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 -4, i64 -5, i64 -6, i64 -7>
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -4
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i8> [[VEC_IND]], <i8 4, i8 4, i8 4, i8 4>
+; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i8> [[VEC_IND]], <i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i8> [[STEP_ADD]], <i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = sext <4 x i8> [[TMP3]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[TMP1]]
+; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 -3
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[REVERSE]], <4 x i32>* [[TMP10]], align 4
+; CHECK-NEXT:    [[REVERSE4:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 -4
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 -3
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[REVERSE4]], <4 x i32>* [[TMP13]], align 4
+; CHECK-NEXT:    [[OFFSET_IDX5:%.*]] = sub i64 1023, [[INDEX]]
+; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[OFFSET_IDX5]] to i32
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> undef, i32 [[TMP14]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT6]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION8:%.*]] = add <4 x i32> [[BROADCAST_SPLAT7]], <i32 0, i32 -1, i32 -2, i32 -3>
+; CHECK-NEXT:    [[INDUCTION9:%.*]] = add <4 x i32> [[BROADCAST_SPLAT7]], <i32 -4, i32 -5, i32 -6, i32 -7>
+; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP14]], 0
+; CHECK-NEXT:    [[TMP16:%.*]] = add i32 [[TMP14]], -4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i8> [[STEP_ADD]], <i8 4, i8 4, i8 4, i8 4>
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !10
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i8 [ -127, [[MIDDLE_BLOCK]] ], [ -127, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    [[FORWARD_INDUCTION_05:%.*]] = phi i8 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    [[INC]] = add i8 [[FORWARD_INDUCTION_05]], 1
+; CHECK-NEXT:    [[CONV:%.*]] = sext i8 [[INC]] to i32
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[CONV]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP18]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[WHILE_BODY]], label [[WHILE_END]], !llvm.loop !11
+; CHECK:       while.end:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %while.body