diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -240,6 +240,10 @@
   Optional<ElementCount> BestVF = None;
   unsigned BestUF = 0;
 
+  /// Values the VPlan-based sinkScalarOperands could not sink itself; their
+  /// scalar instructions are candidates for sinking after code-generation.
+  SetVector<VPValue *> SinkCandidates;
+
 public:
   LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
                            const TargetTransformInfo *TTI,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -523,7 +523,7 @@
   /// inclusive. Uses the VPValue operands from \p Operands instead of \p
   /// Instr's operands.
   void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
-                            const VPIteration &Instance, bool IfPredicateInstr,
+                            const VPIteration &Instance,
                             VPTransformState &State);
 
   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
@@ -612,9 +612,9 @@
   /// block.
   void fixLCSSAPHIs(VPTransformState &State);
 
-  /// Iteratively sink the scalarized operands of a predicated instruction into
-  /// the block that was created for it.
-  void sinkScalarOperands(Instruction *PredInst);
+  /// Iteratively sink \p SinkCandidate and its scalarized operands into the
+  /// block containing all of its users, if there is a single such block.
+  void sinkScalarOperands(Instruction *SinkCandidate);
   void sinkScalarOperands(VPlan &Plan);
 
   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
@@ -839,9 +839,6 @@
   /// The induction variable of the old basic block.
   PHINode *OldInduction = nullptr;
 
-  /// Store instructions that were predicated.
-  SmallVector<Instruction *, 4> PredicatedInstructions;
-
   /// Trip count of the original loop.
   Value *TripCount = nullptr;
 
@@ -2976,7 +2973,6 @@
 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
                                                VPUser &User,
                                                const VPIteration &Instance,
-                                               bool IfPredicateInstr,
                                                VPTransformState &State) {
   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
 
@@ -3018,10 +3014,6 @@
   // If we just cloned a new assumption, add it the assumption cache.
   if (auto *II = dyn_cast<AssumeInst>(Cloned))
     AC->registerAssumption(II);
-
-  // End if-block.
-  if (IfPredicateInstr)
-    PredicatedInstructions.push_back(Cloned);
 }
 
 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
@@ -4030,7 +4022,20 @@
                  IVEndValues[Entry.first], LoopMiddleBlock);
 
   fixLCSSAPHIs(State);
-  for (Instruction *PI : PredicatedInstructions)
+  // Gather the scalar instructions generated for every sink candidate, over
+  // all unrolled parts and lanes; the SetVector drops duplicates.
+  SetVector<Instruction *> SinkCands;
+  const unsigned NumLanes = State.VF.getKnownMinValue();
+  for (VPValue *VPCand : State.SinkCandidates)
+    for (unsigned Part = 0; Part < State.UF; ++Part)
+      for (unsigned Lane = 0; Lane < NumLanes; ++Lane) {
+        const VPIteration Instance(Part, Lane);
+        if (!State.hasScalarValue(VPCand, Instance))
+          continue;
+        if (auto *I = dyn_cast<Instruction>(State.get(VPCand, Instance)))
+          SinkCands.insert(I);
+      }
+  for (Instruction *PI : SinkCands)
     sinkScalarOperands(&*PI);
 
   // Remove redundant induction instructions.
@@ -4502,13 +4504,28 @@
   }
 }
 
-void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
-  // The basic block and loop containing the predicated instruction.
-  auto *PredBB = PredInst->getParent();
+void InnerLoopVectorizer::sinkScalarOperands(Instruction *SinkCandidate) {
+  // Find the single block containing all users of SinkCandidate; that is the
+  // block the candidate may be sunk into. If the users are spread over
+  // different blocks, bail out.
+  BasicBlock *PredBB = nullptr;
+  for (User *U : SinkCandidate->users()) {
+    auto *UserI = cast<Instruction>(U);
+    assert(!isa<PHINode>(UserI) && "sink candidates must not have PHI users");
+    BasicBlock *BB = UserI->getParent();
+    if (!PredBB)
+      PredBB = BB;
+    else if (PredBB != BB)
+      return;
+  }
+  // A candidate without any users has no block it could be sunk into.
+  if (!PredBB)
+    return;
+
   auto *VectorLoop = LI->getLoopFor(PredBB);
 
-  // Initialize a worklist with the operands of the predicated instruction.
-  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
+  // Seed the worklist with the sink candidate itself.
+  SetVector<Value *> Worklist;
+  Worklist.insert(SinkCandidate);
 
   // Holds instructions that we need to analyze again. An instruction may be
   // reanalyzed if we don't yet know if we can sink it or not.
@@ -7891,6 +7904,7 @@
   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
   State.TripCount = ILV.getOrCreateTripCount(nullptr);
   State.CanonicalIV = ILV.Induction;
+  State.SinkCandidates = SinkCandidates;
 
   ILV.printDebugTracesAtStart();
 
@@ -9026,7 +9040,7 @@
     }
   }
 
-  VPlanTransforms::sinkScalarOperands(*Plan);
+  VPlanTransforms::sinkScalarOperands(*Plan, SinkCandidates);
 
   std::string PlanName;
   raw_string_ostream RSO(PlanName);
@@ -9281,7 +9295,7 @@
   if (State.Instance) { // Generate a single instance.
     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
     State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
-                                    *State.Instance, IsPredicated, State);
+                                    *State.Instance, State);
     // Insert scalar instance packing it into a vector.
     if (AlsoPack && State.VF.isVector()) {
       // If we're constructing lane 0, initialize to start from poison.
@@ -9305,8 +9319,7 @@
   for (unsigned Part = 0; Part < State.UF; ++Part)
     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
       State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
-                                      VPIteration(Part, Lane), IsPredicated,
-                                      State);
+                                      VPIteration(Part, Lane), State);
 }
 
 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -338,6 +338,10 @@
 
   /// Pointer to the VPlan code is generated for.
   VPlan *Plan;
+
+  /// Values the VPlan-based sinkScalarOperands could not sink itself; their
+  /// scalar instructions are candidates for sinking after code-generation.
+  SetVector<VPValue *> SinkCandidates;
 };
 
 /// VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -14,6 +14,7 @@
 #define LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H
 
 #include "VPlan.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
 
 namespace llvm {
@@ -29,7 +30,8 @@
       LoopVectorizationLegality::InductionList &Inductions,
       SmallPtrSetImpl<Instruction *> &DeadInstructions, ScalarEvolution &SE);
 
-  static bool sinkScalarOperands(VPlan &Plan);
+  static bool sinkScalarOperands(VPlan &Plan,
+                                 SetVector<VPValue *> &SinkCandidates);
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -100,7 +100,8 @@
   }
 }
 
-bool VPlanTransforms::sinkScalarOperands(VPlan &Plan) {
+bool VPlanTransforms::sinkScalarOperands(VPlan &Plan,
+                                         SetVector<VPValue *> &SinkCandidates) {
   ReversePostOrderTraversal<VPBlockRecursiveTraversalWrapper<VPBlockBase *>>
       RPOT(VPBlockRecursiveTraversalWrapper<VPBlockBase *>(Plan.getEntry()));
 
@@ -118,19 +119,26 @@
 
       while (!WorkList.empty()) {
         auto *C = WorkList.pop_back_val();
-        auto *Current = dyn_cast_or_null<VPReplicateRecipe>(C->Def);
-        if (!Current || Current->getParent() == RepR->getParent() ||
+        auto *Current = dyn_cast_or_null<VPRecipeBase>(C->Def);
+        if (!Current || Current->getNumDefinedValues() == 0 ||
+            Current->getParent() == RepR->getParent() ||
             Current->mayHaveSideEffects() || Current->mayReadOrWriteMemory())
           continue;
 
-        if (any_of(Current->users(), [VPBB](VPUser *U) {
+        // Recipes other than VPReplicateRecipes, and recipes with users outside
+        // VPBB, cannot be sunk directly. Queue the values they define for after
+        // code-generation, where instructions for some lanes can still be sunk.
+        if (!isa<VPReplicateRecipe>(Current) ||
+            any_of(Current->getVPValue(0)->users(), [VPBB](VPUser *U) {
               auto *UI = dyn_cast<VPRecipeBase>(U);
               return UI && UI->getParent() != VPBB;
-            }))
+            })) {
+          for (VPValue *D : Current->definedValues())
+            SinkCandidates.insert(D);
           continue;
+        }
 
         Current->moveBefore(*RepR->getParent(), RepR->getParent()->begin());
-        Current->setIsPredicated();
         WorkList.insert(Current->op_begin(), Current->op_end());
         Changed = true;
       }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll
@@ -64,9 +64,10 @@
 ; instead scalarized if Cost-Model so decided as part of its
 ; sink-scalar-operands optimization for predicated instructions.
 ;
+; SINK-GATHER-LABEL: @scalarize_and_sink_gather
 ; SINK-GATHER: vector.body:
-; SINK-GATHER: pred.udiv.if:
 ; SINK-GATHER:   %[[T0:.+]] = load i32, i32* %{{.*}}, align 4
+; SINK-GATHER: pred.udiv.if:
 ; SINK-GATHER:   %{{.*}} = udiv i32 %[[T0]], %{{.*}}
 ; SINK-GATHER: pred.udiv.continue:
 define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) {
diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
--- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
@@ -34,7 +34,7 @@
 ; UNROLL:       pred.store.continue3:
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
 ; UNROLL-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; UNROLL-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
+; UNROLL-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; UNROLL:       middle.block:
 ; UNROLL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 128, 128
 ; UNROLL-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
@@ -51,7 +51,7 @@
 ; UNROLL:       for.inc:
 ; UNROLL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; UNROLL-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 128
-; UNROLL-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP2:!llvm.loop !.*]]
+; UNROLL-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; UNROLL:       for.end:
 ; UNROLL-NEXT:    ret i32 0
 ;
@@ -84,7 +84,7 @@
 ; UNROLL-NOSIMPLIFY:       pred.store.continue3:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
+; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; UNROLL-NOSIMPLIFY:       middle.block:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[CMP_N:%.*]] = icmp eq i64 128, 128
 ; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -104,7 +104,7 @@
 ; UNROLL-NOSIMPLIFY:       for.inc:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; UNROLL-NOSIMPLIFY-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 128
-; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP2:!llvm.loop !.*]]
+; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; UNROLL-NOSIMPLIFY:       for.end:
 ; UNROLL-NOSIMPLIFY-NEXT:    ret i32 0
 ;
@@ -139,7 +139,7 @@
 ; VEC:       pred.store.continue2:
 ; VEC-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
 ; VEC-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; VEC-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
+; VEC-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; VEC:       middle.block:
 ; VEC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 128, 128
 ; VEC-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
@@ -156,7 +156,7 @@
 ; VEC:       for.inc:
 ; VEC-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; VEC-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 128
-; VEC-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP2:!llvm.loop !.*]]
+; VEC-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; VEC:       for.end:
 ; VEC-NEXT:    ret i32 0
 ;
@@ -229,7 +229,7 @@
 ; UNROLL-NEXT:    [[PREDPHI5]] = select i1 undef, i32 [[VEC_PHI2]], i32 [[TMP9]]
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
 ; UNROLL-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP3:!llvm.loop !.*]]
+; UNROLL-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; UNROLL:       middle.block:
 ; UNROLL-NEXT:    [[BIN_RDX:%.*]] = add i32 [[PREDPHI5]], [[PREDPHI]]
 ; UNROLL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
@@ -302,7 +302,7 @@
 ; UNROLL-NOSIMPLIFY-NEXT:    [[PREDPHI5]] = select i1 undef, i32 [[VEC_PHI2]], i32 [[TMP8]]
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP3:!llvm.loop !.*]]
+; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; UNROLL-NOSIMPLIFY:       middle.block:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[BIN_RDX:%.*]] = add i32 [[PREDPHI5]], [[PREDPHI]]
 ; UNROLL-NOSIMPLIFY-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
@@ -326,7 +326,7 @@
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INDVARS_IV_NEXT4]] = add nsw i64 [[INDVARS_IV3]], 1
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP1:%.*]] = trunc i64 [[INDVARS_IV3]] to i32
 ; UNROLL-NOSIMPLIFY-NEXT:    [[CMP13:%.*]] = icmp slt i32 [[TMP1]], 0
-; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[CMP13]], label [[FOR_BODY14]], label [[FOR_INC26_LOOPEXIT]], [[LOOP4:!llvm.loop !.*]]
+; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[CMP13]], label [[FOR_BODY14]], label [[FOR_INC26_LOOPEXIT]], !llvm.loop [[LOOP4:![0-9]+]]
 ; UNROLL-NOSIMPLIFY:       for.inc26.loopexit:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INEWCHUNKS_2_LCSSA:%.*]] = phi i32 [ [[INEWCHUNKS_2]], [[FOR_INC23]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ]
 ; UNROLL-NOSIMPLIFY-NEXT:    br label [[FOR_INC26]]
@@ -367,7 +367,7 @@
 ; VEC-NEXT:    [[PREDPHI]] = select <2 x i1> undef, <2 x i32> [[VEC_PHI]], <2 x i32> [[TMP8]]
 ; VEC-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
 ; VEC-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VEC-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
+; VEC-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; VEC:       middle.block:
 ; VEC-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PREDPHI]])
 ; VEC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
@@ -436,28 +436,28 @@
 ; UNROLL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL:       vector.body:
 ; UNROLL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
+; UNROLL-NEXT:    [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
+; UNROLL-NEXT:    [[INDUCTION2:%.*]] = add i64 [[INDEX]], 1
 ; UNROLL-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 undef, [[INDEX]]
 ; UNROLL-NEXT:    [[INDUCTION3:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; UNROLL-NEXT:    [[INDUCTION4:%.*]] = add i64 [[OFFSET_IDX]], -1
+; UNROLL-NEXT:    [[TMP0:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION]]
+; UNROLL-NEXT:    [[TMP1:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION2]]
+; UNROLL-NEXT:    [[TMP2:%.*]] = load i8, i8* [[TMP0]], align 1
+; UNROLL-NEXT:    [[TMP3:%.*]] = load i8, i8* [[TMP1]], align 1
 ; UNROLL-NEXT:    br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE6]]
 ; UNROLL:       pred.store.if:
-; UNROLL-NEXT:    [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; UNROLL-NEXT:    [[TMP0:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION]]
-; UNROLL-NEXT:    [[TMP1:%.*]] = load i8, i8* [[TMP0]], align 1
-; UNROLL-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i32
-; UNROLL-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
-; UNROLL-NEXT:    store i8 [[TMP3]], i8* [[TMP0]], align 1
-; UNROLL-NEXT:    [[INDUCTION2:%.*]] = add i64 [[INDEX]], 1
-; UNROLL-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION2]]
-; UNROLL-NEXT:    [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1
-; UNROLL-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
+; UNROLL-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP2]] to i32
+; UNROLL-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
+; UNROLL-NEXT:    store i8 [[TMP5]], i8* [[TMP0]], align 1
+; UNROLL-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP3]] to i32
 ; UNROLL-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i8
-; UNROLL-NEXT:    store i8 [[TMP7]], i8* [[TMP4]], align 1
+; UNROLL-NEXT:    store i8 [[TMP7]], i8* [[TMP1]], align 1
 ; UNROLL-NEXT:    br label [[PRED_STORE_CONTINUE6]]
 ; UNROLL:       pred.store.continue6:
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
 ; UNROLL-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
-; UNROLL-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
+; UNROLL-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; UNROLL:       middle.block:
 ; UNROLL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 undef, undef
 ; UNROLL-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
@@ -476,7 +476,7 @@
 ; UNROLL-NEXT:    [[TMP6]] = add nuw nsw i64 [[TMP0]], 1
 ; UNROLL-NEXT:    [[TMP7]] = add i64 [[TMP1]], -1
 ; UNROLL-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 0
-; UNROLL-NEXT:    br i1 [[TMP8]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP5:!llvm.loop !.*]]
+; UNROLL-NEXT:    br i1 [[TMP8]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; UNROLL:       for.end:
 ; UNROLL-NEXT:    ret void
 ;
@@ -487,32 +487,32 @@
 ; UNROLL-NOSIMPLIFY-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NOSIMPLIFY:       vector.body:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
+; UNROLL-NOSIMPLIFY-NEXT:    [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
+; UNROLL-NOSIMPLIFY-NEXT:    [[INDUCTION2:%.*]] = add i64 [[INDEX]], 1
 ; UNROLL-NOSIMPLIFY-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 undef, [[INDEX]]
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INDUCTION3:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INDUCTION4:%.*]] = add i64 [[OFFSET_IDX]], -1
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP0:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP1:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION2]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP2:%.*]] = load i8, i8* [[TMP0]], align 1
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP3:%.*]] = load i8, i8* [[TMP1]], align 1
 ; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; UNROLL-NOSIMPLIFY:       pred.store.if:
-; UNROLL-NOSIMPLIFY-NEXT:    [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP0:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION]]
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP1:%.*]] = load i8, i8* [[TMP0]], align 1
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i32
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
-; UNROLL-NOSIMPLIFY-NEXT:    store i8 [[TMP3]], i8* [[TMP0]], align 1
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP2]] to i32
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
+; UNROLL-NOSIMPLIFY-NEXT:    store i8 [[TMP5]], i8* [[TMP0]], align 1
 ; UNROLL-NOSIMPLIFY-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; UNROLL-NOSIMPLIFY:       pred.store.continue:
 ; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[C]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
 ; UNROLL-NOSIMPLIFY:       pred.store.if5:
-; UNROLL-NOSIMPLIFY-NEXT:    [[INDUCTION2:%.*]] = add i64 [[INDEX]], 1
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION2]]
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP3]] to i32
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i8
-; UNROLL-NOSIMPLIFY-NEXT:    store i8 [[TMP7]], i8* [[TMP4]], align 1
+; UNROLL-NOSIMPLIFY-NEXT:    store i8 [[TMP7]], i8* [[TMP1]], align 1
 ; UNROLL-NOSIMPLIFY-NEXT:    br label [[PRED_STORE_CONTINUE6]]
 ; UNROLL-NOSIMPLIFY:       pred.store.continue6:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
-; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]]
+; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; UNROLL-NOSIMPLIFY:       middle.block:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[CMP_N:%.*]] = icmp eq i64 undef, undef
 ; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -535,7 +535,7 @@
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP6]] = add nuw nsw i64 [[TMP0]], 1
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP7]] = add i64 [[TMP1]], -1
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 0
-; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[TMP8]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP6:!llvm.loop !.*]]
+; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[TMP8]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; UNROLL-NOSIMPLIFY:       for.end:
 ; UNROLL-NOSIMPLIFY-NEXT:    ret void
 ;
@@ -575,7 +575,7 @@
 ; VEC:       pred.store.continue3:
 ; VEC-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
 ; VEC-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
-; VEC-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]]
+; VEC-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; VEC:       middle.block:
 ; VEC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 undef, undef
 ; VEC-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
@@ -594,7 +594,7 @@
 ; VEC-NEXT:    [[TMP6]] = add nuw nsw i64 [[TMP0]], 1
 ; VEC-NEXT:    [[TMP7]] = add i64 [[TMP1]], -1
 ; VEC-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 0
-; VEC-NEXT:    br i1 [[TMP8]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP6:!llvm.loop !.*]]
+; VEC-NEXT:    br i1 [[TMP8]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; VEC:       for.end:
 ; VEC-NEXT:    ret void
 ;
@@ -637,16 +637,16 @@
 ; UNROLL-NEXT:    [[INDUCTION4:%.*]] = add i64 [[OFFSET_IDX]], -1
 ; UNROLL-NEXT:    [[TMP0:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION]]
 ; UNROLL-NEXT:    [[TMP1:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION2]]
+; UNROLL-NEXT:    [[TMP2:%.*]] = load i8, i8* [[TMP0]], align 1
+; UNROLL-NEXT:    [[TMP3:%.*]] = load i8, i8* [[TMP1]], align 1
 ; UNROLL-NEXT:    store i8 0, i8* [[TMP0]], align 1
 ; UNROLL-NEXT:    store i8 0, i8* [[TMP1]], align 1
 ; UNROLL-NEXT:    br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE6]]
 ; UNROLL:       pred.store.if:
-; UNROLL-NEXT:    [[TMP2:%.*]] = load i8, i8* [[TMP0]], align 1
-; UNROLL-NEXT:    [[TMP3:%.*]] = zext i8 [[TMP2]] to i32
-; UNROLL-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
-; UNROLL-NEXT:    store i8 [[TMP4]], i8* [[TMP0]], align 1
-; UNROLL-NEXT:    [[TMP5:%.*]] = load i8, i8* [[TMP1]], align 1
-; UNROLL-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
+; UNROLL-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP2]] to i32
+; UNROLL-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
+; UNROLL-NEXT:    store i8 [[TMP5]], i8* [[TMP0]], align 1
+; UNROLL-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP3]] to i32
 ; UNROLL-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i8
 ; UNROLL-NEXT:    store i8 [[TMP7]], i8* [[TMP1]], align 1
 ; UNROLL-NEXT:    br label [[PRED_STORE_CONTINUE6]]
@@ -691,20 +691,20 @@
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INDUCTION4:%.*]] = add i64 [[OFFSET_IDX]], -1
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP0:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION]]
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP1:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION2]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP2:%.*]] = load i8, i8* [[TMP0]], align 1
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP3:%.*]] = load i8, i8* [[TMP1]], align 1
 ; UNROLL-NOSIMPLIFY-NEXT:    store i8 0, i8* [[TMP0]], align 1
 ; UNROLL-NOSIMPLIFY-NEXT:    store i8 0, i8* [[TMP1]], align 1
 ; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; UNROLL-NOSIMPLIFY:       pred.store.if:
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP2:%.*]] = load i8, i8* [[TMP0]], align 1
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP3:%.*]] = zext i8 [[TMP2]] to i32
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
-; UNROLL-NOSIMPLIFY-NEXT:    store i8 [[TMP4]], i8* [[TMP0]], align 1
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP2]] to i32
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
+; UNROLL-NOSIMPLIFY-NEXT:    store i8 [[TMP5]], i8* [[TMP0]], align 1
 ; UNROLL-NOSIMPLIFY-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; UNROLL-NOSIMPLIFY:       pred.store.continue:
 ; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[C]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
 ; UNROLL-NOSIMPLIFY:       pred.store.if5:
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP5:%.*]] = load i8, i8* [[TMP1]], align 1
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP3]] to i32
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i8
 ; UNROLL-NOSIMPLIFY-NEXT:    store i8 [[TMP7]], i8* [[TMP1]], align 1
 ; UNROLL-NOSIMPLIFY-NEXT:    br label [[PRED_STORE_CONTINUE6]]