Index: llvm/lib/Analysis/IVDescriptors.cpp
===================================================================
--- llvm/lib/Analysis/IVDescriptors.cpp
+++ llvm/lib/Analysis/IVDescriptors.cpp
@@ -1058,7 +1058,7 @@
   // to check for a pair of icmp/select, for which we use getNextInstruction and
   // isCorrectOpcode functions to step the right number of instruction, and
   // check the icmp/select pair.
-  // FIXME: We also do not attempt to look through Phi/Select's yet, which might
+  // FIXME: We also do not attempt to look through Select's yet, which might
   // be part of the reduction chain, or attempt to looks through And's to find a
   // smaller bitwidth. Subs are also currently not allowed (which are usually
   // treated as part of a add reduction) as they are expected to generally be
@@ -1068,16 +1068,21 @@
   if (RedOp == Instruction::ICmp || RedOp == Instruction::FCmp)
     ExpectedUses = 2;
 
-  auto getNextInstruction = [&](Instruction *Cur) {
-    if (RedOp == Instruction::ICmp || RedOp == Instruction::FCmp) {
-      // We are expecting a icmp/select pair, which we go to the next select
-      // instruction if we can. We already know that Cur has 2 uses.
-      if (isa<SelectInst>(*Cur->user_begin()))
-        return cast<Instruction>(*Cur->user_begin());
-      else
-        return cast<Instruction>(*std::next(Cur->user_begin()));
+  auto getNextInstruction = [&](Instruction *Cur) -> Instruction * {
+    for (auto User : Cur->users()) {
+      Instruction *UI = cast<Instruction>(User);
+      if (isa<PHINode>(UI))
+        continue;
+      if (RedOp == Instruction::ICmp || RedOp == Instruction::FCmp) {
+        // We are expecting a icmp/select pair, which we go to the next select
+        // instruction if we can. We already know that Cur has 2 uses.
+        if (isa<SelectInst>(UI))
+          return UI;
+        continue;
+      }
+      return UI;
     }
-    return cast<Instruction>(*Cur->user_begin());
+    return nullptr;
   };
   auto isCorrectOpcode = [&](Instruction *Cur) {
     if (RedOp == Instruction::ICmp || RedOp == Instruction::FCmp) {
@@ -1092,22 +1097,45 @@
     return Cur->getOpcode() == RedOp;
   };
 
+  // Attempt to look through Phis which are part of the reduction chain
+  unsigned ExtraPhiUses = 0;
+  Instruction *RdxInstr = LoopExitInstr;
+  if (auto ExitPhi = dyn_cast<PHINode>(LoopExitInstr)) {
+    if (ExitPhi->getNumIncomingValues() != 2)
+      return {};
+
+    Instruction *Inc0 = dyn_cast<Instruction>(ExitPhi->getIncomingValue(0));
+    Instruction *Inc1 = dyn_cast<Instruction>(ExitPhi->getIncomingValue(1));
+
+    Instruction *Chain = nullptr;
+    if (Inc0 && Inc0 == Phi)
+      Chain = Inc1;
+    else if (Inc1 && Inc1 == Phi)
+      Chain = Inc0;
+    else
+      return {};
+
+    RdxInstr = Chain;
+    ExtraPhiUses = 1;
+  }
+
   // The loop exit instruction we check first (as a quick test) but add last. We
   // check the opcode is correct (and dont allow them to be Subs) and that they
   // have expected to have the expected number of uses. They will have one use
   // from the phi and one from a LCSSA value, no matter the type.
-  if (!isCorrectOpcode(LoopExitInstr) || !LoopExitInstr->hasNUses(2))
+  if (!isCorrectOpcode(RdxInstr) || !LoopExitInstr->hasNUses(2))
     return {};
 
-  // Check that the Phi has one (or two for min/max) uses.
-  if (!Phi->hasNUses(ExpectedUses))
+  // Check that the Phi has one (or two for min/max or conditional) uses.
+  if (!Phi->hasNUses(ExpectedUses + ExtraPhiUses))
     return {};
+
   Instruction *Cur = getNextInstruction(Phi);
 
   // Each other instruction in the chain should have the expected number of uses
   // and be the correct opcode.
-  while (Cur != LoopExitInstr) {
-    if (!isCorrectOpcode(Cur) || !Cur->hasNUses(ExpectedUses))
+  while (Cur != RdxInstr) {
+    if (!Cur || !isCorrectOpcode(Cur) || !Cur->hasNUses(ExpectedUses))
       return {};
 
     ReductionOperations.push_back(Cur);
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8593,6 +8593,17 @@
     return Operands[0];
   }
 
+  // For in-loop reductions, we do not need to create an additional select.
+  if (Phi->getNumIncomingValues() == 2) {
+    PHINode *In0 = dyn_cast_or_null<PHINode>(Operands[0]->getUnderlyingValue());
+    if (In0 && CM.isInLoopReduction(In0))
+      return Operands[1];
+
+    PHINode *In1 = dyn_cast_or_null<PHINode>(Operands[1]->getUnderlyingValue());
+    if (In1 && CM.isInLoopReduction(In1))
+      return Operands[0];
+  }
+
   // We know that all PHIs in non-header blocks are converted into selects, so
   // we don't have to worry about the insertion order and we can just use the
   // builder. At this point we generate the predication tree. There may be
@@ -9423,7 +9434,7 @@
           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
 
-      auto *CondOp = CM.foldTailByMasking()
+      auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent())
                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
                          : nullptr;
 
Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll
@@ -0,0 +1,186 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -force-vector-interleave=1 -force-vector-width=4 -prefer-inloop-reductions -S | FileCheck %s
+
+define float @cond_fadd(float* noalias nocapture readonly %a, float* noalias nocapture readonly %cond, i64 %N){
+; CHECK-LABEL: @cond_fadd(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[COND:%.*]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <vscale x 4 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 2.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr float, float* [[A:%.*]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr float, float* [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <vscale x 4 x float>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP11]], i32 4, <vscale x 4 x i1> [[TMP8]], <vscale x 4 x float> poison)
+; CHECK-NEXT:    [[TMP12:%.*]] = select fast <vscale x 4 x i1> [[TMP8]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP12]])
+; CHECK-NEXT:    [[TMP14]] = fadd fast float [[TMP13]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT:    [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RES:%.*]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[COND]], i64 [[INDVARS]]
+; CHECK-NEXT:    [[TMP18:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TOBOOL:%.*]] = fcmp une float [[TMP18]], 2.000000e+00
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS]]
+; CHECK-NEXT:    [[TMP19:%.*]] = load float, float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[FADD:%.*]] = fadd fast float [[RDX]], [[TMP19]]
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[RES]] = phi float [ [[FADD]], [[IF_THEN]] ], [ [[RDX]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[INDVARS_NEXT]] = add nuw nsw i64 [[INDVARS]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[RES_LCSSA:%.*]] = phi float [ [[RES]], [[FOR_INC]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret float [[RES_LCSSA]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %indvars = phi i64 [ 0, %entry ], [ %indvars.next, %for.inc ]
+  %rdx = phi float [ 1.000000e+00, %entry ], [ %res, %for.inc ]
+  %arrayidx = getelementptr inbounds float, float* %cond, i64 %indvars
+  %0 = load float, float* %arrayidx
+  %tobool = fcmp une float %0, 2.000000e+00
+  br i1 %tobool, label %if.then, label %for.inc
+
+if.then:
+  %arrayidx2 = getelementptr inbounds float, float* %a, i64 %indvars
+  %1 = load float, float* %arrayidx2
+  %fadd = fadd fast float %rdx, %1
+  br label %for.inc
+
+for.inc:
+  %res = phi float [ %fadd, %if.then ], [ %rdx, %for.body ]
+  %indvars.next = add nuw nsw i64 %indvars, 1
+  %exitcond.not = icmp eq i64 %indvars.next, %N
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret float %res
+}
+
+define float @cond_cmp_sel(float* noalias %a, float* noalias %cond, i64 %N) {
+; CHECK-LABEL: @cond_cmp_sel(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[RDX_MINMAX_SELECT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[COND:%.*]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <vscale x 4 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr float, float* [[A:%.*]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr float, float* [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <vscale x 4 x float>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP11]], i32 4, <vscale x 4 x i1> [[TMP8]], <vscale x 4 x float> poison)
+; CHECK-NEXT:    [[TMP12:%.*]] = select fast <vscale x 4 x i1> [[TMP8]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 0xFFF0000000000000, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> [[TMP12]])
+; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt float [[TMP13]], [[VEC_PHI]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT]] = select fast i1 [[RDX_MINMAX_CMP]], float [[TMP13]], float [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT:    [[RDX:%.*]] = phi float [ [[RES:%.*]], [[FOR_INC]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[COND]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP17:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TOBOOL:%.*]] = fcmp une float [[TMP17]], 3.000000e+00
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP18:%.*]] = load float, float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[FCMP:%.*]] = fcmp fast olt float [[RDX]], [[TMP18]]
+; CHECK-NEXT:    [[FSEL:%.*]] = select fast i1 [[FCMP]], float [[RDX]], float [[TMP18]]
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[RES]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[FSEL]], [[IF_THEN]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[RES_LCSSA:%.*]] = phi float [ [[RES]], [[FOR_INC]] ], [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret float [[RES_LCSSA]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+  %rdx = phi float [ %res, %for.inc ], [ 1.000000e+00, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %cond, i64 %iv
+  %0 = load float, float* %arrayidx
+  %tobool = fcmp une float %0, 3.000000e+00
+  br i1 %tobool, label %if.then, label %for.inc
+
+if.then:
+  %arrayidx2 = getelementptr inbounds float, float* %a, i64 %iv
+  %1 = load float, float* %arrayidx2
+  %fcmp = fcmp fast olt float %rdx, %1
+  %fsel = select fast i1 %fcmp, float %rdx, float %1
+  br label %for.inc
+
+for.inc:
+  %res = phi float [ %rdx, %for.body ], [ %fsel, %if.then ]
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret float %res
+}
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
@@ -590,33 +590,29 @@
 ; CHECK-NEXT:    br label %vector.body
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> zeroinitializer, i32 7, i32 0), %vector.ph ], [ [[PREDPHI:%.*]], %vector.body ]
-; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP8]], i64 [[N]])
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 5, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP14:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP12]], <vscale x 4 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i32, i32* [[TMP13]], i32 0
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP16]], i32 4, <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i32> poison)
-; CHECK-NEXT:    [[TMP17:%.*]] = xor <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD1]]
-; CHECK-NEXT:    [[TMP18:%.*]] = xor <vscale x 4 x i1> [[TMP12]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP19:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP18]], <vscale x 4 x i1> zeroinitializer
-; CHECK-NEXT:    [[PREDPHI]] = select <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i32> [[TMP17]], <vscale x 4 x i32> [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP20:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[PREDPHI]], <vscale x 4 x i32> [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP21]], 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP22]]
-; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label %vector.body, !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 7, %vector.ph ], [ [[TMP16:%.*]], %vector.body ]
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[N]])
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 5, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP13]], i32 4, <vscale x 4 x i1> [[TMP11]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP14:%.*]] = select <vscale x 4 x i1> [[TMP11]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD1]], <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> [[TMP14]])
+; CHECK-NEXT:    [[TMP16]] = xor i32 [[TMP15]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP18]]
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP24:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> [[TMP20]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label %scalar.ph
-;
 entry:
   br label %for.body
 
Index: llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll
@@ -0,0 +1,729 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -prefer-inloop-reductions -dce -instcombine -S | FileCheck %s
+
+define float @cond_fadd(float* noalias nocapture readonly %a, float* noalias nocapture readonly %cond, i64 %N){
+; CHECK-LABEL: @cond_fadd(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -4
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[PRED_LOAD_CONTINUE6]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, float* [[COND:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fcmp une <4 x float> [[WIDE_LOAD]], <float 5.000000e+00, float 5.000000e+00, float 5.000000e+00, float 5.000000e+00>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i1> [[TMP2]], i64 0
+; CHECK-NEXT:    br i1 [[TMP3]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; CHECK:       pred.load.if:
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; CHECK:       pred.load.continue:
+; CHECK-NEXT:    [[TMP7:%.*]] = phi <4 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_LOAD_IF]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i1> [[TMP2]], i64 1
+; CHECK-NEXT:    br i1 [[TMP8]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
+; CHECK:       pred.load.if1:
+; CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load float, float* [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP11]], i64 1
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
+; CHECK:       pred.load.continue2:
+; CHECK-NEXT:    [[TMP13:%.*]] = phi <4 x float> [ [[TMP7]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], [[PRED_LOAD_IF1]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i1> [[TMP2]], i64 2
+; CHECK-NEXT:    br i1 [[TMP14]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
+; CHECK:       pred.load.if3:
+; CHECK-NEXT:    [[TMP15:%.*]] = or i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = load float, float* [[TMP16]], align 4
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP17]], i64 2
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
+; CHECK:       pred.load.continue4:
+; CHECK-NEXT:    [[TMP19:%.*]] = phi <4 x float> [ [[TMP13]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP18]], [[PRED_LOAD_IF3]] ]
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i1> [[TMP2]], i64 3
+; CHECK-NEXT:    br i1 [[TMP20]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]]
+; CHECK:       pred.load.if5:
+; CHECK-NEXT:    [[TMP21:%.*]] = or i64 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP21]]
+; CHECK-NEXT:    [[TMP23:%.*]] = load float, float* [[TMP22]], align 4
+; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP23]], i64 3
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
+; CHECK:       pred.load.continue6:
+; CHECK-NEXT:    [[TMP25:%.*]] = phi <4 x float> [ [[TMP19]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], [[PRED_LOAD_IF5]] ]
+; CHECK-NEXT:    [[TMP26:%.*]] = select fast <4 x i1> [[TMP2]], <4 x float> [[TMP25]], <4 x float> zeroinitializer
+; CHECK-NEXT:    [[TMP27]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[VEC_PHI]], <4 x float> [[TMP26]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP27]], [[MIDDLE_BLOCK]] ], [ 1.000000e+00, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT:    [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RES:%.*]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[COND]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP29:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TOBOOL:%.*]] = fcmp une float [[TMP29]], 5.000000e+00
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP30:%.*]] = load float, float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[FADD:%.*]] = fadd fast float [[RDX]], [[TMP30]]
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[RES]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[FADD]], [[IF_THEN]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[RES_LCSSA:%.*]] = phi float [ [[RES]], [[FOR_INC]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret float [[RES_LCSSA]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+  %rdx = phi float [ 1.000000e+00, %entry ], [ %res, %for.inc ]
+  %arrayidx = getelementptr inbounds float, float* %cond, i64 %iv
+  %0 = load float, float* %arrayidx
+  %tobool = fcmp une float %0, 5.000000e+00
+  br i1 %tobool, label %if.then, label %for.inc
+
+if.then:
+  %arrayidx2 = getelementptr inbounds float, float* %a, i64 %iv
+  %1 = load float, float* %arrayidx2
+  %fadd = fadd fast float %rdx, %1
+  br label %for.inc
+
+for.inc:
+  %res = phi float [ %rdx, %for.body ], [ %fadd, %if.then ]
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret float %res
+}
+
+define float @cond_cmp_sel(float* noalias %a, float* noalias %cond, i64 %N) {
+; CHECK-LABEL: @cond_cmp_sel(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -4
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[PRED_LOAD_CONTINUE6]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, float* [[COND:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fcmp une <4 x float> [[WIDE_LOAD]], <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i1> [[TMP2]], i64 0
+; CHECK-NEXT:    br i1 [[TMP3]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; CHECK:       pred.load.if:
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; CHECK:       pred.load.continue:
+; CHECK-NEXT:    [[TMP7:%.*]] = phi <4 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_LOAD_IF]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i1> [[TMP2]], i64 1
+; CHECK-NEXT:    br i1 [[TMP8]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
+; CHECK:       pred.load.if1:
+; CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load float, float* [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP11]], i64 1
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
+; CHECK:       pred.load.continue2:
+; CHECK-NEXT:    [[TMP13:%.*]] = phi <4 x float> [ [[TMP7]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], [[PRED_LOAD_IF1]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i1> [[TMP2]], i64 2
+; CHECK-NEXT:    br i1 [[TMP14]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
+; CHECK:       pred.load.if3:
+; CHECK-NEXT:    [[TMP15:%.*]] = or i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = load float, float* [[TMP16]], align 4
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP17]], i64 2
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
+; CHECK:       pred.load.continue4:
+; CHECK-NEXT:    [[TMP19:%.*]] = phi <4 x float> [ [[TMP13]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP18]], [[PRED_LOAD_IF3]] ]
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i1> [[TMP2]], i64 3
+; CHECK-NEXT:    br i1 [[TMP20]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]]
+; CHECK:       pred.load.if5:
+; CHECK-NEXT:    [[TMP21:%.*]] = or i64 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP21]]
+; CHECK-NEXT:    [[TMP23:%.*]] = load float, float* [[TMP22]], align 4
+; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP23]], i64 3
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
+; CHECK:       pred.load.continue6:
+; CHECK-NEXT:    [[TMP25:%.*]] = phi <4 x float> [ [[TMP19]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], [[PRED_LOAD_IF5]] ]
+; CHECK-NEXT:    [[TMP26:%.*]] = select fast <4 x i1> [[TMP2]], <4 x float> [[TMP25]], <4 x float> <float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000>
+; CHECK-NEXT:    [[TMP27:%.*]] = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP26]])
+; CHECK-NEXT:    [[TMP28]] = call fast float @llvm.minnum.f32(float [[TMP27]], float [[VEC_PHI]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP28]], [[MIDDLE_BLOCK]] ], [ 1.000000e+00, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT:    [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RES:%.*]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[COND]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP30:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TOBOOL:%.*]] = fcmp une float [[TMP30]], 3.000000e+00
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP31:%.*]] = load float, float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[TMP32:%.*]] = call fast float @llvm.minnum.f32(float [[RDX]], float [[TMP31]])
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[RES]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP32]], [[IF_THEN]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[RES_LCSSA:%.*]] = phi float [ [[RES]], [[FOR_INC]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret float [[RES_LCSSA]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+  %rdx = phi float [ 1.000000e+00, %entry ], [ %res, %for.inc ]
+  %arrayidx = getelementptr inbounds float, float* %cond, i64 %iv
+  %0 = load float, float* %arrayidx
+  %tobool = fcmp une float %0, 3.000000e+00
+  br i1 %tobool, label %if.then, label %for.inc
+
+if.then:
+  %arrayidx2 = getelementptr inbounds float, float* %a, i64 %iv
+  %1 = load float, float* %arrayidx2
+  %fcmp = fcmp fast olt float %rdx, %1
+  %fsel = select fast i1 %fcmp, float %rdx, float %1
+  br label %for.inc
+
+for.inc:
+  %res = phi float [ %rdx, %for.body ], [ %fsel, %if.then ]
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret float %res
+}
+
+define i32 @conditional_and(i32* noalias %A, i32* noalias %B, i32 %cond, i64 noundef %N) #0 {
+; CHECK-LABEL: @conditional_and(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -4
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[COND:%.*]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 7, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[PRED_LOAD_CONTINUE6]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i1> [[TMP2]], i64 0
+; CHECK-NEXT:    br i1 [[TMP3]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; CHECK:       pred.load.if:
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i64 0
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; CHECK:       pred.load.continue:
+; CHECK-NEXT:    [[TMP7:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_LOAD_IF]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i1> [[TMP2]], i64 1
+; CHECK-NEXT:    br i1 [[TMP8]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
+; CHECK:       pred.load.if1:
+; CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP11]], i64 1
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
+; CHECK:       pred.load.continue2:
+; CHECK-NEXT:    [[TMP13:%.*]] = phi <4 x i32> [ [[TMP7]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], [[PRED_LOAD_IF1]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i1> [[TMP2]], i64 2
+; CHECK-NEXT:    br i1 [[TMP14]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
+; CHECK:       pred.load.if3:
+; CHECK-NEXT:    [[TMP15:%.*]] = or i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP17]], i64 2
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
+; CHECK:       pred.load.continue4:
+; CHECK-NEXT:    [[TMP19:%.*]] = phi <4 x i32> [ [[TMP13]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP18]], [[PRED_LOAD_IF3]] ]
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i1> [[TMP2]], i64 3
+; CHECK-NEXT:    br i1 [[TMP20]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]]
+; CHECK:       pred.load.if5:
+; CHECK-NEXT:    [[TMP21:%.*]] = or i64 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP21]]
+; CHECK-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4
+; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP23]], i64 3
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
+; CHECK:       pred.load.continue6:
+; CHECK-NEXT:    [[TMP25:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], [[PRED_LOAD_IF5]] ]
+; CHECK-NEXT:    [[TMP26:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP25]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT:    [[TMP27:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP26]])
+; CHECK-NEXT:    [[TMP28]] = and i32 [[TMP27]], [[VEC_PHI]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP28]], [[MIDDLE_BLOCK]] ], [ 7, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT:    [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RES:%.*]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP30:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[TMP30]], [[COND]]
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP31:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[TMP31]], [[RDX]]
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[RES]] = phi i32 [ [[AND]], [[IF_THEN]] ], [ [[RDX]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[RES_LCSSA:%.*]] = phi i32 [ [[RES]], [[FOR_INC]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[RES_LCSSA]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+  %rdx = phi i32 [ 7, %entry ], [ %res, %for.inc ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %iv
+  %0 = load i32, i32* %arrayidx
+  %tobool = icmp eq i32 %0, %cond
+  br i1 %tobool, label %if.then, label %for.inc
+
+if.then:
+  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %iv
+  %1 = load i32, i32* %arrayidx2
+  %and = and i32 %1, %rdx
+  br label %for.inc
+
+for.inc:
+  %res = phi i32 [ %and, %if.then ], [ %rdx, %for.body ]
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret i32 %res
+}
+
+define i32 @simple_chained_rdx(i32* noalias %a, i32* noalias %b, i32* noalias %cond, i64 noundef %N) {
+; CHECK-LABEL: @simple_chained_rdx(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -4
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE14:%.*]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 5, [[VECTOR_PH]] ], [ [[TMP51:%.*]], [[PRED_LOAD_CONTINUE14]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = or i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP2:%.*]] = or i64 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <4 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i64 0
+; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; CHECK:       pred.load.if:
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> poison, i32 [[TMP8]], i64 0
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; CHECK:       pred.load.continue:
+; CHECK-NEXT:    [[TMP10:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP9]], [[PRED_LOAD_IF]] ]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i1> [[TMP5]], i64 1
+; CHECK-NEXT:    br i1 [[TMP11]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
+; CHECK:       pred.load.if1:
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP13]], i64 1
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
+; CHECK:       pred.load.continue2:
+; CHECK-NEXT:    [[TMP15:%.*]] = phi <4 x i32> [ [[TMP10]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i1> [[TMP5]], i64 2
+; CHECK-NEXT:    br i1 [[TMP16]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
+; CHECK:       pred.load.if3:
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP18]], i64 2
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
+; CHECK:       pred.load.continue4:
+; CHECK-NEXT:    [[TMP20:%.*]] = phi <4 x i32> [ [[TMP15]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP19]], [[PRED_LOAD_IF3]] ]
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i1> [[TMP5]], i64 3
+; CHECK-NEXT:    br i1 [[TMP21]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
+; CHECK:       pred.load.if5:
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4
+; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[TMP23]], i64 3
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
+; CHECK:       pred.load.continue6:
+; CHECK-NEXT:    [[TMP25:%.*]] = phi <4 x i32> [ [[TMP20]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], [[PRED_LOAD_IF5]] ]
+; CHECK-NEXT:    [[TMP26:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP25]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP26]])
+; CHECK-NEXT:    [[TMP28:%.*]] = add i32 [[TMP27]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <4 x i1> [[TMP5]], i64 0
+; CHECK-NEXT:    br i1 [[TMP29]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]]
+; CHECK:       pred.load.if7:
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4
+; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <4 x i32> poison, i32 [[TMP31]], i64 0
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE8]]
+; CHECK:       pred.load.continue8:
+; CHECK-NEXT:    [[TMP33:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE6]] ], [ [[TMP32]], [[PRED_LOAD_IF7]] ]
+; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <4 x i1> [[TMP5]], i64 1
+; CHECK-NEXT:    br i1 [[TMP34]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]]
+; CHECK:       pred.load.if9:
+; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4
+; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i32> [[TMP33]], i32 [[TMP36]], i64 1
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE10]]
+; CHECK:       pred.load.continue10:
+; CHECK-NEXT:    [[TMP38:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP37]], [[PRED_LOAD_IF9]] ]
+; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <4 x i1> [[TMP5]], i64 2
+; CHECK-NEXT:    br i1 [[TMP39]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]]
+; CHECK:       pred.load.if11:
+; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4
+; CHECK-NEXT:    [[TMP42:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP41]], i64 2
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE12]]
+; CHECK:       pred.load.continue12:
+; CHECK-NEXT:    [[TMP43:%.*]] = phi <4 x i32> [ [[TMP38]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP42]], [[PRED_LOAD_IF11]] ]
+; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <4 x i1> [[TMP5]], i64 3
+; CHECK-NEXT:    br i1 [[TMP44]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14]]
+; CHECK:       pred.load.if13:
+; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4
+; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i32> [[TMP43]], i32 [[TMP46]], i64 3
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE14]]
+; CHECK:       pred.load.continue14:
+; CHECK-NEXT:    [[TMP48:%.*]] = phi <4 x i32> [ [[TMP43]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP47]], [[PRED_LOAD_IF13]] ]
+; CHECK-NEXT:    [[TMP49:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP48]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP50:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP49]])
+; CHECK-NEXT:    [[TMP51]] = add i32 [[TMP50]], [[TMP28]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP51]], [[MIDDLE_BLOCK]] ], [ 5, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[RDX:%.*]] = phi i32 [ [[RES:%.*]], [[FOR_INC]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[COND]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP53:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP53]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP54:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP54]], [[RDX]]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP55:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ADD3:%.*]] = add nsw i32 [[ADD]], [[TMP55]]
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[RES]] = phi i32 [ [[ADD3]], [[IF_THEN]] ], [ [[RDX]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[RES_LCSSA:%.*]] = phi i32 [ [[RES]], [[FOR_INC]] ], [ [[TMP51]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[RES_LCSSA]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.inc ], [ 0, %entry ]
+  %rdx = phi i32 [ %res, %for.inc ], [ 5, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %cond, i64 %iv
+  %0 = load i32, i32* %arrayidx
+  %tobool.not = icmp eq i32 %0, 0
+  br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:
+  %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 %iv
+  %1 = load i32, i32* %arrayidx1
+  %add = add nsw i32 %1, %rdx
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %2 = load i32, i32* %arrayidx2
+  %add3 = add nsw i32 %add, %2
+  br label %for.inc
+
+for.inc:
+  %res = phi i32 [ %add3, %if.then ], [ %rdx, %for.body ]
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret i32 %res
+}
+
+;
+; Negative Tests
+;
+
+;
+; Reduction not performed in loop as the phi has more than two incoming values
+;
+define i64 @nested_cond_and(i64* noalias nocapture readonly %a, i64* noalias nocapture readonly %b, i64* noalias nocapture readonly %cond, i64 %N){
+; CHECK-LABEL: @nested_cond_and(
+; CHECK:       vector.body:
+; CHECK-NOT:     @llvm.vector.reduce.and
+; CHECK:       middle.block:
+; CHECK:         @llvm.vector.reduce.and
+; CHECK:       scalar.ph
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+  %rdx = phi i64 [ 5, %entry ], [ %res, %for.inc ]
+  %arrayidx = getelementptr inbounds i64, i64* %cond, i64 %iv
+  %0 = load i64, i64* %arrayidx
+  %tobool = icmp eq i64 %0, 0
+  br i1 %tobool, label %if.then, label %for.inc
+
+if.then:
+  %arrayidx2 = getelementptr inbounds i64, i64* %a, i64 %iv
+  %1 = load i64, i64* %arrayidx2
+  %and1 = and i64 %rdx, %1
+  %tobool2 = icmp eq i64 %1, 3
+  br i1 %tobool2, label %if.then.2, label %for.inc
+
+if.then.2:
+  %arrayidx3 = getelementptr inbounds i64, i64* %b, i64 %iv
+  %2 = load i64, i64* %arrayidx3
+  %and2 = and i64 %rdx, %2
+  br label %for.inc
+
+for.inc:
+  %res = phi i64 [ %and2, %if.then.2 ], [ %and1, %if.then ], [ %rdx, %for.body ]
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret i64 %res
+}
+
+; Chain of conditional & unconditional reductions. We currently only support conditional reductions
+; if they are the last in the chain, i.e. the loop exit instruction is a Phi node. Therefore we reject
+; the Phi (%rdx1) as it has more than one use.
+;
+define i32 @cond-uncond(i32* noalias %src1, i32* noalias %src2, i32* noalias %cond, i64 noundef %n) #0 {
+; CHECK-LABEL: @cond-uncond(
+; CHECK:       pred.load.continue6:
+; CHECK-NOT:     @llvm.vector.reduce.add
+; CHECK:       middle.block:
+; CHECK:         @llvm.vector.reduce.add
+; CHECK:       scalar.ph
+entry:
+  br label %for.body
+
+for.body:
+  %rdx1 = phi i32 [ %add2, %if.end ], [ 0, %entry ]
+  %iv = phi i64 [ %iv.next, %if.end ], [ 0, %entry]
+  %arrayidx = getelementptr inbounds i32, i32* %cond, i64 %iv
+  %0 = load i32, i32* %arrayidx
+  %tobool.not = icmp eq i32 %0, 0
+  br i1 %tobool.not, label %if.end, label %if.then
+
+if.then:
+  %arrayidx1 = getelementptr inbounds i32, i32* %src2, i64 %iv
+  %1 = load i32, i32* %arrayidx1
+  %add = add nsw i32 %1, %rdx1
+  br label %if.end
+
+if.end:
+  %res = phi i32 [ %add, %if.then ], [ %rdx1, %for.body ]
+  %arrayidx2 = getelementptr inbounds i32, i32* %src1, i64 %iv
+  %2 = load i32, i32* %arrayidx2
+  %add2 = add nsw i32 %2, %res
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret i32 %add2
+}
+
+;
+; Chain of two conditional reductions. We do not vectorise this with in-loop reductions as neither
+; of the incoming values of the LoopExitInstruction (%res) is the reduction Phi (%rdx1).
+;
+define float @cond_cond(float* noalias %src1, float* noalias %src2, float* noalias %cond, i64 %n) #0 {
+; CHECK-LABEL: @cond_cond(
+; CHECK:       pred.load.continue14:
+; CHECK-NOT:     @llvm.vector.reduce.fadd
+; CHECK:       middle.block:
+; CHECK:         @llvm.vector.reduce.fadd
+; CHECK:       scalar.ph
+entry:
+  br label %for.body
+
+for.body:
+  %rdx1 = phi float [ %res, %for.inc ], [ 2.000000e+00, %entry ]
+  %iv = phi i64 [ %iv.next, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %cond, i64 %iv
+  %0 = load float, float* %arrayidx
+  %cmp1 = fcmp fast oeq float %0, 3.000000e+00
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:
+  %arrayidx2 = getelementptr inbounds float, float* %src1, i64 %iv
+  %1 = load float, float* %arrayidx2
+  %add = fadd fast float %1, %rdx1
+  br label %if.end
+
+if.end:
+  %rdx2 = phi float [ %add, %if.then ], [ %rdx1, %for.body ]
+  %cmp5 = fcmp fast oeq float %0, 7.000000e+00
+  br i1 %cmp5, label %if.then6, label %for.inc
+
+if.then6:
+  %arrayidx7 = getelementptr inbounds float, float* %src2, i64 %iv
+  %2 = load float, float* %arrayidx7
+  %add2 = fadd fast float %2, %rdx2
+  br label %for.inc
+
+for.inc:
+  %res = phi float [ %add2, %if.then6 ], [ %rdx2, %if.end ]
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret float %res
+}
+
+;
+; Chain of an unconditional & a conditional reduction. We do not vectorise this in-loop as neither of the
+; incoming values of the LoopExitInstruction (%res) is the reduction Phi (%rdx).
+;
+define i32 @uncond_cond(i32* noalias %src1, i32* noalias %src2, i32* noalias %cond, i64 %N) #0 {
+; CHECK-LABEL: @uncond_cond(
+; CHECK:       pred.load.continue7:
+; CHECK-NOT:     @llvm.vector.reduce.add
+; CHECK:       middle.block:
+; CHECK:         @llvm.vector.reduce.add
+; CHECK:       scalar.ph
+entry:
+  br label %for.body
+
+for.body:
+  %rdx = phi i32 [ %res, %for.inc ], [ 0, %entry ]
+  %iv = phi i64 [ %iv.next, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %src1, i64 %iv
+  %0 = load i32, i32* %arrayidx
+  %add1 = add nsw i32 %0, %rdx
+  %arrayidx1 = getelementptr inbounds i32, i32* %cond, i64 %iv
+  %1 = load i32, i32* %arrayidx1
+  %tobool.not = icmp eq i32 %1, 0
+  br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:
+  %arrayidx2 = getelementptr inbounds i32, i32* %src2, i64 %iv
+  %2 = load i32, i32* %arrayidx2
+  %add2 = add nsw i32 %2, %add1
+  br label %for.inc
+
+for.inc:
+  %res = phi i32 [ %add2, %if.then ], [ %add1, %for.body ]
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret i32 %res
+}
+
+;
+; Chain of multiple unconditional & conditional reductions. Does not vectorise in-loop as when we look back
+; through the chain and check the number of uses of %add1, we find more than the expected one use.
+;
+define i32 @uncond_cond_uncond(i32* noalias %src1, i32* noalias %src2, i32* noalias %cond, i64 noundef %N) {
+; CHECK-LABEL: @uncond_cond_uncond(
+; CHECK:       pred.load.continue7:
+; CHECK-NOT:     @llvm.vector.reduce.add
+; CHECK:       middle.block:
+; CHECK:         @llvm.vector.reduce.add
+; CHECK:       scalar.ph
+entry:
+  br label %for.body
+
+for.body:
+  %rdx = phi i32 [ %add3, %if.end ], [ 0, %entry ]
+  %iv = phi i64 [ %iv.next, %if.end ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %src1, i64 %iv
+  %0 = load i32, i32* %arrayidx
+  %add1 = add nsw i32 %0, %rdx
+  %arrayidx1 = getelementptr inbounds i32, i32* %cond, i64 %iv
+  %1 = load i32, i32* %arrayidx1
+  %tobool.not = icmp eq i32 %1, 0
+  br i1 %tobool.not, label %if.end, label %if.then
+
+if.then:
+  %arrayidx2 = getelementptr inbounds i32, i32* %src2, i64 %iv
+  %2 = load i32, i32* %arrayidx2
+  %add2 = add nsw i32 %2, %add1
+  br label %if.end
+
+if.end:
+  %res = phi i32 [ %add2, %if.then ], [ %add1, %for.body ]
+  %add3 = add nsw i32 %res, %0
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret i32 %add3
+}
Index: llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll
+++ llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll
@@ -297,6 +297,264 @@
   ret i32 %sum.0.lcssa
 }
 
+define i32 @cond_rdx_pred(i32 %cond, i32* noalias %a, i64 %N) {
+; CHECK-LABEL: @cond_rdx_pred(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 15
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N_RND_UP]], -16
+; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = add i64 [[N]], -1
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i32> poison, i32 [[COND:%.*]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x i32> poison, i32 [[COND]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <4 x i32> poison, i32 [[COND]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <4 x i32> poison, i32 [[COND]], i64 0
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE44:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE44]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 4, [[VECTOR_PH]] ], [ [[TMP113:%.*]], [[PRED_LOAD_CONTINUE44]] ]
+; CHECK-NEXT:    [[VEC_PHI4:%.*]] = phi i32 [ 1, [[VECTOR_PH]] ], [ [[TMP116:%.*]], [[PRED_LOAD_CONTINUE44]] ]
+; CHECK-NEXT:    [[VEC_PHI5:%.*]] = phi i32 [ 1, [[VECTOR_PH]] ], [ [[TMP119:%.*]], [[PRED_LOAD_CONTINUE44]] ]
+; CHECK-NEXT:    [[VEC_PHI6:%.*]] = phi i32 [ 1, [[VECTOR_PH]] ], [ [[TMP122:%.*]], [[PRED_LOAD_CONTINUE44]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT:    [[STEP_ADD1:%.*]] = add <4 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8>
+; CHECK-NEXT:    [[STEP_ADD2:%.*]] = add <4 x i64> [[VEC_IND]], <i64 12, i64 12, i64 12, i64 12>
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ule <4 x i64> [[STEP_ADD1]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ule <4 x i64> [[STEP_ADD2]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp sgt <4 x i32> [[BROADCAST_SPLATINSERT7]], <i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP4]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp sgt <4 x i32> [[BROADCAST_SPLATINSERT9]], <i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i1> [[TMP6]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt <4 x i32> [[BROADCAST_SPLATINSERT11]], <i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i1> [[TMP8]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp sgt <4 x i32> [[BROADCAST_SPLATINSERT13]], <i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i1> [[TMP10]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[TMP0]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[TMP7]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = select <4 x i1> [[TMP2]], <4 x i1> [[TMP9]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = select <4 x i1> [[TMP3]], <4 x i1> [[TMP11]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i1> [[TMP12]], i64 0
+; CHECK-NEXT:    br i1 [[TMP16]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; CHECK:       pred.load.if:
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i32> poison, i32 [[TMP18]], i64 0
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; CHECK:       pred.load.continue:
+; CHECK-NEXT:    [[TMP20:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP19]], [[PRED_LOAD_IF]] ]
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i1> [[TMP12]], i64 1
+; CHECK-NEXT:    br i1 [[TMP21]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]]
+; CHECK:       pred.load.if15:
+; CHECK-NEXT:    [[TMP22:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP22]]
+; CHECK-NEXT:    [[TMP24:%.*]] = load i32, i32* [[TMP23]], align 4
+; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[TMP24]], i64 1
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE16]]
+; CHECK:       pred.load.continue16:
+; CHECK-NEXT:    [[TMP26:%.*]] = phi <4 x i32> [ [[TMP20]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP25]], [[PRED_LOAD_IF15]] ]
+; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i1> [[TMP12]], i64 2
+; CHECK-NEXT:    br i1 [[TMP27]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]]
+; CHECK:       pred.load.if17:
+; CHECK-NEXT:    [[TMP28:%.*]] = or i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP28]]
+; CHECK-NEXT:    [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4
+; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP30]], i64 2
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE18]]
+; CHECK:       pred.load.continue18:
+; CHECK-NEXT:    [[TMP32:%.*]] = phi <4 x i32> [ [[TMP26]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP31]], [[PRED_LOAD_IF17]] ]
+; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <4 x i1> [[TMP12]], i64 3
+; CHECK-NEXT:    br i1 [[TMP33]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]]
+; CHECK:       pred.load.if19:
+; CHECK-NEXT:    [[TMP34:%.*]] = or i64 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP34]]
+; CHECK-NEXT:    [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4
+; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP36]], i64 3
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE20]]
+; CHECK:       pred.load.continue20:
+; CHECK-NEXT:    [[TMP38:%.*]] = phi <4 x i32> [ [[TMP32]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP37]], [[PRED_LOAD_IF19]] ]
+; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <4 x i1> [[TMP13]], i64 0
+; CHECK-NEXT:    br i1 [[TMP39]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]]
+; CHECK:       pred.load.if21:
+; CHECK-NEXT:    [[TMP40:%.*]] = or i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP40]]
+; CHECK-NEXT:    [[TMP42:%.*]] = load i32, i32* [[TMP41]], align 4
+; CHECK-NEXT:    [[TMP43:%.*]] = insertelement <4 x i32> poison, i32 [[TMP42]], i64 0
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE22]]
+; CHECK:       pred.load.continue22:
+; CHECK-NEXT:    [[TMP44:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE20]] ], [ [[TMP43]], [[PRED_LOAD_IF21]] ]
+; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <4 x i1> [[TMP13]], i64 1
+; CHECK-NEXT:    br i1 [[TMP45]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]]
+; CHECK:       pred.load.if23:
+; CHECK-NEXT:    [[TMP46:%.*]] = or i64 [[INDEX]], 5
+; CHECK-NEXT:    [[TMP47:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP46]]
+; CHECK-NEXT:    [[TMP48:%.*]] = load i32, i32* [[TMP47]], align 4
+; CHECK-NEXT:    [[TMP49:%.*]] = insertelement <4 x i32> [[TMP44]], i32 [[TMP48]], i64 1
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE24]]
+; CHECK:       pred.load.continue24:
+; CHECK-NEXT:    [[TMP50:%.*]] = phi <4 x i32> [ [[TMP44]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP49]], [[PRED_LOAD_IF23]] ]
+; CHECK-NEXT:    [[TMP51:%.*]] = extractelement <4 x i1> [[TMP13]], i64 2
+; CHECK-NEXT:    br i1 [[TMP51]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]]
+; CHECK:       pred.load.if25:
+; CHECK-NEXT:    [[TMP52:%.*]] = or i64 [[INDEX]], 6
+; CHECK-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP52]]
+; CHECK-NEXT:    [[TMP54:%.*]] = load i32, i32* [[TMP53]], align 4
+; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i32> [[TMP50]], i32 [[TMP54]], i64 2
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE26]]
+; CHECK:       pred.load.continue26:
+; CHECK-NEXT:    [[TMP56:%.*]] = phi <4 x i32> [ [[TMP50]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP55]], [[PRED_LOAD_IF25]] ]
+; CHECK-NEXT:    [[TMP57:%.*]] = extractelement <4 x i1> [[TMP13]], i64 3
+; CHECK-NEXT:    br i1 [[TMP57]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]]
+; CHECK:       pred.load.if27:
+; CHECK-NEXT:    [[TMP58:%.*]] = or i64 [[INDEX]], 7
+; CHECK-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP58]]
+; CHECK-NEXT:    [[TMP60:%.*]] = load i32, i32* [[TMP59]], align 4
+; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i32> [[TMP56]], i32 [[TMP60]], i64 3
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE28]]
+; CHECK:       pred.load.continue28:
+; CHECK-NEXT:    [[TMP62:%.*]] = phi <4 x i32> [ [[TMP56]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP61]], [[PRED_LOAD_IF27]] ]
+; CHECK-NEXT:    [[TMP63:%.*]] = extractelement <4 x i1> [[TMP14]], i64 0
+; CHECK-NEXT:    br i1 [[TMP63]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]]
+; CHECK:       pred.load.if29:
+; CHECK-NEXT:    [[TMP64:%.*]] = or i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP64]]
+; CHECK-NEXT:    [[TMP66:%.*]] = load i32, i32* [[TMP65]], align 4
+; CHECK-NEXT:    [[TMP67:%.*]] = insertelement <4 x i32> poison, i32 [[TMP66]], i64 0
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE30]]
+; CHECK:       pred.load.continue30:
+; CHECK-NEXT:    [[TMP68:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE28]] ], [ [[TMP67]], [[PRED_LOAD_IF29]] ]
+; CHECK-NEXT:    [[TMP69:%.*]] = extractelement <4 x i1> [[TMP14]], i64 1
+; CHECK-NEXT:    br i1 [[TMP69]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]]
+; CHECK:       pred.load.if31:
+; CHECK-NEXT:    [[TMP70:%.*]] = or i64 [[INDEX]], 9
+; CHECK-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP70]]
+; CHECK-NEXT:    [[TMP72:%.*]] = load i32, i32* [[TMP71]], align 4
+; CHECK-NEXT:    [[TMP73:%.*]] = insertelement <4 x i32> [[TMP68]], i32 [[TMP72]], i64 1
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE32]]
+; CHECK:       pred.load.continue32:
+; CHECK-NEXT:    [[TMP74:%.*]] = phi <4 x i32> [ [[TMP68]], [[PRED_LOAD_CONTINUE30]] ], [ [[TMP73]], [[PRED_LOAD_IF31]] ]
+; CHECK-NEXT:    [[TMP75:%.*]] = extractelement <4 x i1> [[TMP14]], i64 2
+; CHECK-NEXT:    br i1 [[TMP75]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]]
+; CHECK:       pred.load.if33:
+; CHECK-NEXT:    [[TMP76:%.*]] = or i64 [[INDEX]], 10
+; CHECK-NEXT:    [[TMP77:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP76]]
+; CHECK-NEXT:    [[TMP78:%.*]] = load i32, i32* [[TMP77]], align 4
+; CHECK-NEXT:    [[TMP79:%.*]] = insertelement <4 x i32> [[TMP74]], i32 [[TMP78]], i64 2
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE34]]
+; CHECK:       pred.load.continue34:
+; CHECK-NEXT:    [[TMP80:%.*]] = phi <4 x i32> [ [[TMP74]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP79]], [[PRED_LOAD_IF33]] ]
+; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <4 x i1> [[TMP14]], i64 3
+; CHECK-NEXT:    br i1 [[TMP81]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]]
+; CHECK:       pred.load.if35:
+; CHECK-NEXT:    [[TMP82:%.*]] = or i64 [[INDEX]], 11
+; CHECK-NEXT:    [[TMP83:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP82]]
+; CHECK-NEXT:    [[TMP84:%.*]] = load i32, i32* [[TMP83]], align 4
+; CHECK-NEXT:    [[TMP85:%.*]] = insertelement <4 x i32> [[TMP80]], i32 [[TMP84]], i64 3
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE36]]
+; CHECK:       pred.load.continue36:
+; CHECK-NEXT:    [[TMP86:%.*]] = phi <4 x i32> [ [[TMP80]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP85]], [[PRED_LOAD_IF35]] ]
+; CHECK-NEXT:    [[TMP87:%.*]] = extractelement <4 x i1> [[TMP15]], i64 0
+; CHECK-NEXT:    br i1 [[TMP87]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38:%.*]]
+; CHECK:       pred.load.if37:
+; CHECK-NEXT:    [[TMP88:%.*]] = or i64 [[INDEX]], 12
+; CHECK-NEXT:    [[TMP89:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP88]]
+; CHECK-NEXT:    [[TMP90:%.*]] = load i32, i32* [[TMP89]], align 4
+; CHECK-NEXT:    [[TMP91:%.*]] = insertelement <4 x i32> poison, i32 [[TMP90]], i64 0
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE38]]
+; CHECK:       pred.load.continue38:
+; CHECK-NEXT:    [[TMP92:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE36]] ], [ [[TMP91]], [[PRED_LOAD_IF37]] ]
+; CHECK-NEXT:    [[TMP93:%.*]] = extractelement <4 x i1> [[TMP15]], i64 1
+; CHECK-NEXT:    br i1 [[TMP93]], label [[PRED_LOAD_IF39:%.*]], label [[PRED_LOAD_CONTINUE40:%.*]]
+; CHECK:       pred.load.if39:
+; CHECK-NEXT:    [[TMP94:%.*]] = or i64 [[INDEX]], 13
+; CHECK-NEXT:    [[TMP95:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP94]]
+; CHECK-NEXT:    [[TMP96:%.*]] = load i32, i32* [[TMP95]], align 4
+; CHECK-NEXT:    [[TMP97:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP96]], i64 1
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE40]]
+; CHECK:       pred.load.continue40:
+; CHECK-NEXT:    [[TMP98:%.*]] = phi <4 x i32> [ [[TMP92]], [[PRED_LOAD_CONTINUE38]] ], [ [[TMP97]], [[PRED_LOAD_IF39]] ]
+; CHECK-NEXT:    [[TMP99:%.*]] = extractelement <4 x i1> [[TMP15]], i64 2
+; CHECK-NEXT:    br i1 [[TMP99]], label [[PRED_LOAD_IF41:%.*]], label [[PRED_LOAD_CONTINUE42:%.*]]
+; CHECK:       pred.load.if41:
+; CHECK-NEXT:    [[TMP100:%.*]] = or i64 [[INDEX]], 14
+; CHECK-NEXT:    [[TMP101:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP100]]
+; CHECK-NEXT:    [[TMP102:%.*]] = load i32, i32* [[TMP101]], align 4
+; CHECK-NEXT:    [[TMP103:%.*]] = insertelement <4 x i32> [[TMP98]], i32 [[TMP102]], i64 2
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE42]]
+; CHECK:       pred.load.continue42:
+; CHECK-NEXT:    [[TMP104:%.*]] = phi <4 x i32> [ [[TMP98]], [[PRED_LOAD_CONTINUE40]] ], [ [[TMP103]], [[PRED_LOAD_IF41]] ]
+; CHECK-NEXT:    [[TMP105:%.*]] = extractelement <4 x i1> [[TMP15]], i64 3
+; CHECK-NEXT:    br i1 [[TMP105]], label [[PRED_LOAD_IF43:%.*]], label [[PRED_LOAD_CONTINUE44]]
+; CHECK:       pred.load.if43:
+; CHECK-NEXT:    [[TMP106:%.*]] = or i64 [[INDEX]], 15
+; CHECK-NEXT:    [[TMP107:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP106]]
+; CHECK-NEXT:    [[TMP108:%.*]] = load i32, i32* [[TMP107]], align 4
+; CHECK-NEXT:    [[TMP109:%.*]] = insertelement <4 x i32> [[TMP104]], i32 [[TMP108]], i64 3
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE44]]
+; CHECK:       pred.load.continue44:
+; CHECK-NEXT:    [[TMP110:%.*]] = phi <4 x i32> [ [[TMP104]], [[PRED_LOAD_CONTINUE42]] ], [ [[TMP109]], [[PRED_LOAD_IF43]] ]
+; CHECK-NEXT:    [[TMP111:%.*]] = select <4 x i1> [[TMP12]], <4 x i32> [[TMP38]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP112:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP111]])
+; CHECK-NEXT:    [[TMP113]] = mul i32 [[TMP112]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP114:%.*]] = select <4 x i1> [[TMP13]], <4 x i32> [[TMP62]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP115:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP114]])
+; CHECK-NEXT:    [[TMP116]] = mul i32 [[TMP115]], [[VEC_PHI4]]
+; CHECK-NEXT:    [[TMP117:%.*]] = select <4 x i1> [[TMP14]], <4 x i32> [[TMP86]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP118:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP117]])
+; CHECK-NEXT:    [[TMP119]] = mul i32 [[TMP118]], [[VEC_PHI5]]
+; CHECK-NEXT:    [[TMP120:%.*]] = select <4 x i1> [[TMP15]], <4 x i32> [[TMP110]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP121:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP120]])
+; CHECK-NEXT:    [[TMP122]] = mul i32 [[TMP121]], [[VEC_PHI6]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 16, i64 16, i64 16, i64 16>
+; CHECK-NEXT:    [[TMP123:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP123]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = mul i32 [[TMP116]], [[TMP113]]
+; CHECK-NEXT:    [[BIN_RDX45:%.*]] = mul i32 [[TMP119]], [[BIN_RDX]]
+; CHECK-NEXT:    [[BIN_RDX46:%.*]] = mul i32 [[TMP122]], [[BIN_RDX45]]
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    br i1 undef, label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[RES_LCSSA:%.*]] = phi i32 [ undef, [[FOR_INC]] ], [ [[BIN_RDX46]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[RES_LCSSA]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %inc, %for.inc ], [ 0, %entry ]
+  %sum = phi i32 [ %res, %for.inc ], [ 4, %entry ]
+  %cmp1 = icmp sgt i32 %cond, 7
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %load = load i32, i32* %arrayidx
+  %mul = mul nsw i32 %load, %sum
+  br label %for.inc
+
+for.inc:
+  %res = phi i32 [ %mul, %if.then ], [ %sum, %for.body ]
+  %inc = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !6
+
+for.end:
+  ret i32 %res
+}
+
 !6 = distinct !{!6, !7, !8}
 !7 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
 !8 = !{!"llvm.loop.vectorize.enable", i1 true}