diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -623,10 +623,6 @@
   /// the block that was created for it.
   void sinkScalarOperands(Instruction *PredInst);
 
-  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
-  /// represented as.
-  void truncateToMinimalBitwidths(VPTransformState &State);
-
   /// Returns (and creates if needed) the trip count of the widened loop.
   Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
 
@@ -3386,151 +3382,8 @@
   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
 }
 
-void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
-  // For every instruction `I` in MinBWs, truncate the operands, create a
-  // truncated version of `I` and reextend its result. InstCombine runs
-  // later and will remove any ext/trunc pairs.
-  SmallPtrSet<Value *, 4> Erased;
-  for (const auto &KV : Cost->getMinimalBitwidths()) {
-    // If the value wasn't vectorized, we must maintain the original scalar
-    // type. The absence of the value from State indicates that it
-    // wasn't vectorized.
-    // FIXME: Should not rely on getVPValue at this point.
-    VPValue *Def = State.Plan->getVPValue(KV.first, true);
-    if (!State.hasAnyVectorValue(Def))
-      continue;
-    // If the instruction is defined outside the loop, only update the first
-    // part; the first part will be re-used for all other parts.
-    unsigned UFToUse = OrigLoop->contains(KV.first) ? UF : 1;
-    for (unsigned Part = 0; Part < UFToUse; ++Part) {
-      Value *I = State.get(Def, Part);
-      if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
-        continue;
-      Type *OriginalTy = I->getType();
-      Type *ScalarTruncatedTy =
-          IntegerType::get(OriginalTy->getContext(), KV.second);
-      auto *TruncatedTy = VectorType::get(
-          ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
-      if (TruncatedTy == OriginalTy)
-        continue;
-
-      IRBuilder<> B(cast<Instruction>(I));
-      auto ShrinkOperand = [&](Value *V) -> Value * {
-        if (auto *ZI = dyn_cast<ZExtInst>(V))
-          if (ZI->getSrcTy() == TruncatedTy)
-            return ZI->getOperand(0);
-        return B.CreateZExtOrTrunc(V, TruncatedTy);
-      };
-
-      // The actual instruction modification depends on the instruction type,
-      // unfortunately.
-      Value *NewI = nullptr;
-      if (auto *BO = dyn_cast<BinaryOperator>(I)) {
-        Value *Op0 = ShrinkOperand(BO->getOperand(0));
-        Value *Op1 = ShrinkOperand(BO->getOperand(1));
-        NewI = B.CreateBinOp(BO->getOpcode(), Op0, Op1);
-
-        // Any wrapping introduced by shrinking this operation shouldn't be
-        // considered undefined behavior. So, we can't unconditionally copy
-        // arithmetic wrapping flags to NewI.
-        cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
-      } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
-        Value *Op0 = ShrinkOperand(CI->getOperand(0));
-        Value *Op1 = ShrinkOperand(CI->getOperand(1));
-        NewI = B.CreateICmp(CI->getPredicate(), Op0, Op1);
-      } else if (auto *SI = dyn_cast<SelectInst>(I)) {
-        Value *TV = ShrinkOperand(SI->getTrueValue());
-        Value *FV = ShrinkOperand(SI->getFalseValue());
-        NewI = B.CreateSelect(SI->getCondition(), TV, FV);
-      } else if (auto *CI = dyn_cast<CastInst>(I)) {
-        switch (CI->getOpcode()) {
-        default:
-          llvm_unreachable("Unhandled cast!");
-        case Instruction::Trunc:
-          NewI = ShrinkOperand(CI->getOperand(0));
-          break;
-        case Instruction::SExt:
-          NewI = B.CreateSExtOrTrunc(
-              CI->getOperand(0),
-              smallestIntegerVectorType(OriginalTy, TruncatedTy));
-          break;
-        case Instruction::ZExt:
-          NewI = B.CreateZExtOrTrunc(
-              CI->getOperand(0),
-              smallestIntegerVectorType(OriginalTy, TruncatedTy));
-          break;
-        }
-      } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
-        auto Elements0 =
-            cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
-        auto *O0 = B.CreateZExtOrTrunc(
-            SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
-        auto Elements1 =
-            cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
-        auto *O1 = B.CreateZExtOrTrunc(
-            SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
-
-        NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
-      } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
-        // Don't do anything with the operands, just extend the result.
-        continue;
-      } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
-        auto Elements =
-            cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
-        auto *O0 = B.CreateZExtOrTrunc(
-            IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
-        auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
-        NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
-      } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
-        auto Elements =
-            cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
-        auto *O0 = B.CreateZExtOrTrunc(
-            EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
-        NewI = B.CreateExtractElement(O0, EE->getOperand(2));
-      } else {
-        // If we don't know what to do, be conservative and don't do anything.
-        continue;
-      }
-
-      // Lastly, extend the result.
-      NewI->takeName(cast<Instruction>(I));
-      Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
-      I->replaceAllUsesWith(Res);
-      cast<Instruction>(I)->eraseFromParent();
-      Erased.insert(I);
-      State.reset(Def, Res, Part);
-    }
-  }
-
-  // We'll have created a bunch of ZExts that are now parentless. Clean up.
-  for (const auto &KV : Cost->getMinimalBitwidths()) {
-    // If the value wasn't vectorized, we must maintain the original scalar
-    // type. The absence of the value from State indicates that it
-    // wasn't vectorized.
-    // FIXME: Should not rely on getVPValue at this point.
-    VPValue *Def = State.Plan->getVPValue(KV.first, true);
-    if (!State.hasAnyVectorValue(Def))
-      continue;
-    unsigned UFToUse = OrigLoop->contains(KV.first) ? UF : 1;
-    for (unsigned Part = 0; Part < UFToUse; ++Part) {
-      Value *I = State.get(Def, Part);
-      ZExtInst *Inst = dyn_cast<ZExtInst>(I);
-      if (Inst && Inst->use_empty()) {
-        Value *NewI = Inst->getOperand(0);
-        Inst->eraseFromParent();
-        State.reset(Def, NewI, Part);
-      }
-    }
-  }
-}
-
 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
                                             VPlan &Plan) {
-  // Insert truncates and extends for any truncated instructions as hints to
-  // InstCombine.
-  if (VF.isVector())
-    truncateToMinimalBitwidths(State);
-
   // Fix widened non-induction PHIs by setting up the PHI operands.
   if (EnableVPlanNativePath)
     fixNonInductionPHIs(Plan, State);
 
@@ -8671,7 +8524,7 @@
       VFRange SubRange = {VF, MaxVFTimes2};
       if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
        // Now optimize the initial VPlan.
-        VPlanTransforms::optimize(*Plan, *PSE.getSE());
+        VPlanTransforms::optimize(*Plan, *PSE.getSE(), CM.getMinimalBitwidths());
        assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
        VPlans.push_back(std::move(Plan));
      }
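Note: the function removed above is re-implemented as a VPlan-to-VPlan transform in VPlanTransforms.cpp below. For illustration only (a minimal IR sketch with made-up values, not taken from this patch's tests), the rewrite it performs looks like this when the cost model records a minimal bitwidth of 16 for a multiply of zero-extended i8 operands:

    ; before shrinking: everything widened to the original i32 type
    %a.ext = zext <16 x i8> %a to <16 x i32>
    %b.ext = zext <16 x i8> %b to <16 x i32>
    %mul   = mul <16 x i32> %a.ext, %b.ext

    ; after shrinking: operate on i16 and re-extend the result; InstCombine
    ; later removes the leftover ext/trunc pairs.
    %a.ext = zext <16 x i8> %a to <16 x i16>
    %b.ext = zext <16 x i8> %b to <16 x i16>
    %mul   = mul <16 x i16> %a.ext, %b.ext
    %res   = zext <16 x i16> %mul to <16 x i32>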
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -274,10 +274,6 @@
            I->second[Part];
   }
 
-  bool hasAnyVectorValue(VPValue *Def) const {
-    return Data.PerPartOutput.contains(Def);
-  }
-
   bool hasScalarValue(VPValue *Def, VPIteration Instance) {
     auto I = Data.PerPartScalars.find(Def);
     if (I == Data.PerPartScalars.end())
@@ -2706,6 +2702,8 @@
   VPBasicBlock *getPreheader() { return Preheader; }
   const VPBasicBlock *getPreheader() const { return Preheader; }
 
+  ArrayRef<VPValue *> getLiveIns() const { return VPLiveInsToFree; }
+
 private:
   /// Add to the given dominator tree the header block and every new basic block
   /// that was created between it and the latch block, inclusive.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -59,7 +59,8 @@
   /// Apply VPlan-to-VPlan optimizations to \p Plan, including induction recipe
   /// optimizations, dead recipe removal, replicate region optimizations and
   /// block merging.
-  static void optimize(VPlan &Plan, ScalarEvolution &SE);
+  static void optimize(VPlan &Plan, ScalarEvolution &SE,
+                       const MapVector<Instruction *, uint64_t> &MinBWs);
 
   /// Wrap predicated VPReplicateRecipes with a mask operand in an if-then
   /// region block and remove the mask operand. Optimize the created regions by
@@ -79,6 +80,12 @@
                     bool UseActiveLaneMaskForControlFlow,
                     bool DataAndControlFlowWithoutRuntimeCheck);
 
+  /// Insert truncates and extends for any truncated instructions as hints to
+  /// InstCombine.
+  static void
+  truncateToMinimalBitwidths(VPlan &Plan,
+                             const MapVector<Instruction *, uint64_t> &MinBWs);
+
 private:
   /// Remove redundant VPBasicBlocks by merging them into their predecessor if
   /// the predecessor has a single successor.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -868,12 +868,130 @@
   }
 }
 
-void VPlanTransforms::optimize(VPlan &Plan, ScalarEvolution &SE) {
+void VPlanTransforms::truncateToMinimalBitwidths(
+    VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
+#ifndef NDEBUG
+  unsigned ProcessedRecipes = 0;
+#endif
+  VPBasicBlock *PH =
+      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor());
+
+  // First truncate live-ins that represent relevant instructions.
+  for (VPValue *VPV : Plan.getLiveIns()) {
+    auto *LiveInInst = dyn_cast<Instruction>(VPV->getLiveInIRValue());
+    unsigned NewResSizeInBits = MinBWs.lookup(LiveInInst);
+    if (!LiveInInst || !NewResSizeInBits)
+      continue;
+
+    Type *ResTy = LiveInInst->getType();
+    if (!ResTy->isIntegerTy())
+      continue;
+
+    LLVMContext &Ctx = ResTy->getContext();
+    auto *NewResTy = IntegerType::get(Ctx, NewResSizeInBits);
+    auto *Shrunk = new VPWidenCastRecipe(Instruction::Trunc, VPV, NewResTy);
+    PH->appendRecipe(Shrunk);
+    VPV->replaceAllUsesWith(Shrunk);
+    Shrunk->setOperand(0, VPV);
+#ifndef NDEBUG
+    ProcessedRecipes++;
+#endif
+  }
+
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+           vp_depth_first_deep(Plan.getEntry()))) {
+    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+      if (auto *Mem = dyn_cast<VPWidenMemoryInstructionRecipe>(&R)) {
+#ifndef NDEBUG
+        ProcessedRecipes += MinBWs.count(&Mem->getIngredient());
+#endif
+        continue;
+      }
+      if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
+               VPWidenSelectRecipe>(&R))
+        continue;
+
+      VPValue *ResultVPV = R.getVPSingleValue();
+      auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
+      unsigned NewResSizeInBits = MinBWs.lookup(UI);
+      if (!UI || !NewResSizeInBits)
+        continue;
+
+#ifndef NDEBUG
+      ProcessedRecipes++;
+#endif
+
+      // Only widen recipes are handled at the moment, but there may be entries
+      // for replicate recipes in MinBWs. Skip those here, after incrementing
+      // ProcessedRecipes.
+      if (isa<VPReplicateRecipe>(&R))
+        continue;
+
+      unsigned ResSizeInBits = getTypeSizeInBits(ResultVPV);
+      Type *ResTy = UI->getType();
+      assert(ResTy->isIntegerTy() && "only integer types supported");
+      if (ResSizeInBits == NewResSizeInBits)
+        continue;
+
+      LLVMContext &Ctx = ResTy->getContext();
+      auto *NewResTy = IntegerType::get(Ctx, NewResSizeInBits);
+
+      // Try to replace wider SExt/ZExts with narrower ones if possible.
+      if (auto *VPC = dyn_cast<VPWidenCastRecipe>(&R)) {
+        unsigned Opc = VPC->getOpcode();
+        if (Opc == Instruction::SExt || Opc == Instruction::ZExt) {
+          assert(ResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
+          // SExt/Zext is redundant - stick with its operand.
+          Instruction::CastOps Opcode = VPC->getOpcode();
+          VPValue *Op = R.getOperand(0);
+          if (getTypeSizeInBits(Op) > NewResSizeInBits)
+            Opcode = Instruction::Trunc;
+          auto *C = new VPWidenCastRecipe(Opcode, Op, NewResTy);
+          C->insertBefore(VPC);
+          VPC->replaceAllUsesWith(C);
+          continue;
+        }
+      }
+
+      // Shrink operands by introducing truncates as needed.
+      unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
+      for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
+        auto *Op = R.getOperand(Idx);
+        unsigned OpSizeInBits = getTypeSizeInBits(Op);
+        if (OpSizeInBits == NewResSizeInBits)
+          continue;
+        assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
+        auto *Shrunk = new VPWidenCastRecipe(Instruction::Trunc, Op, NewResTy);
+        Shrunk->insertBefore(&R);
+        R.setOperand(Idx, Shrunk);
+      }
+
+      if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
+        VPW->dropPoisonGeneratingFlags();
+
+      // Extend result to original width.
+      auto *Ext = new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, ResTy);
+      Ext->insertAfter(&R);
+      ResultVPV->replaceAllUsesWith(Ext);
+      Ext->setOperand(0, ResultVPV);
+    }
+  }
+
+  assert(MinBWs.size() == ProcessedRecipes &&
+         "some entries in MinBWs haven't been processed");
+}
+
+void VPlanTransforms::optimize(
+    VPlan &Plan, ScalarEvolution &SE,
+    const MapVector<Instruction *, uint64_t> &MinBWs) {
   removeRedundantCanonicalIVs(Plan);
   removeRedundantInductionCasts(Plan);
-  optimizeInductions(Plan, SE);
+
+  if (!Plan.hasVF(ElementCount::getFixed(1)))
+    truncateToMinimalBitwidths(Plan, MinBWs);
+
   simplifyRecipes(Plan);
+
   removeDeadRecipes(Plan);
   createAndOptimizeReplicateRegions(Plan);
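The VPlan version above rewrites recipes rather than IR instructions: operands are truncated via new VPWidenCastRecipes, the result is re-extended, and redundant SExt/ZExt recipes are narrowed in place. Dropping poison-generating flags on the shrunk recipe plays the same role as the old /*IncludeWrapFlags=*/false: wrapping introduced by narrowing must not become immediate UB. A minimal sketch of the cast-narrowing case (hypothetical values, not from this patch): with a minimal bitwidth of 16 recorded for the user of a zext from i8 to i32, the wide extend is rebuilt with a narrower destination type instead of truncating its result:

    ; before                                ; after
    %e = zext <16 x i8> %x to <16 x i32>    %e = zext <16 x i8> %x to <16 x i16>
    %a = add <16 x i32> %e, %e              %a = add <16 x i16> %e, %e
                                            %r = zext <16 x i16> %a to <16 x i32>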
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll
@@ -28,25 +28,24 @@
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i16>
-; CHECK-NEXT:    [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw <16 x i16> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw <16 x i16> [[TMP4]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = lshr <16 x i16> [[TMP5]],
 ; CHECK-NEXT:    [[TMP7:%.*]] = trunc <16 x i16> [[TMP6]] to <16 x i8>
-; CHECK-NEXT:    store <16 x i8> [[TMP7]], ptr [[TMP2]], align 1
+; CHECK-NEXT:    store <16 x i8> [[TMP7]], ptr [[TMP3]], align 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i16>
-; CHECK-NEXT:    [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i16>
-; CHECK-NEXT:    [[TMP11:%.*]] = mul nuw <16 x i16> [[TMP9]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = lshr <16 x i16> [[TMP11]],
-; CHECK-NEXT:    [[TMP13:%.*]] = trunc <16 x i16> [[TMP12]] to <16 x i8>
-; CHECK-NEXT:    store <16 x i8> [[TMP13]], ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw <16 x i16> [[TMP9]], [[TMP2]]
+; CHECK-NEXT:    [[TMP11:%.*]] = lshr <16 x i16> [[TMP10]],
+; CHECK-NEXT:    [[TMP12:%.*]] = trunc <16 x i16> [[TMP11]] to <16 x i8>
+; CHECK-NEXT:    store <16 x i8> [[TMP12]], ptr [[TMP8]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
@@ -60,27 +59,26 @@
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
 ; CHECK-NEXT:    [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX7]]
-; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <8 x i8>, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX7]]
+; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <8 x i8>, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP15:%.*]] = zext <8 x i8> [[WIDE_LOAD8]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX7]]
 ; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <8 x i8>, ptr [[TMP16]], align 1
 ; CHECK-NEXT:    [[TMP17:%.*]] = zext <8 x i8> [[WIDE_LOAD9]] to <8 x i16>
-; CHECK-NEXT:    [[TMP18:%.*]] = zext <8 x i8> [[WIDE_LOAD8]] to <8 x i16>
-; CHECK-NEXT:    [[TMP19:%.*]] = mul nuw <8 x i16> [[TMP17]], [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = lshr <8 x i16> [[TMP19]],
-; CHECK-NEXT:    [[TMP21:%.*]] = trunc <8 x i16> [[TMP20]] to <8 x i8>
-; CHECK-NEXT:    store <8 x i8> [[TMP21]], ptr [[TMP16]], align 1
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX7]]
-; CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = load <8 x i8>, ptr [[TMP22]], align 1
-; CHECK-NEXT:    [[TMP23:%.*]] = zext <8 x i8> [[WIDE_LOAD10]] to <8 x i16>
-; CHECK-NEXT:    [[TMP24:%.*]] = zext <8 x i8> [[WIDE_LOAD8]] to <8 x i16>
-; CHECK-NEXT:    [[TMP25:%.*]] = mul nuw <8 x i16> [[TMP23]], [[TMP24]]
-; CHECK-NEXT:    [[TMP26:%.*]] = lshr <8 x i16> [[TMP25]],
-; CHECK-NEXT:    [[TMP27:%.*]] = trunc <8 x i16> [[TMP26]] to <8 x i8>
-; CHECK-NEXT:    store <8 x i8> [[TMP27]], ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = mul nuw <8 x i16> [[TMP17]], [[TMP15]]
+; CHECK-NEXT:    [[TMP19:%.*]] = lshr <8 x i16> [[TMP18]],
+; CHECK-NEXT:    [[TMP20:%.*]] = trunc <8 x i16> [[TMP19]] to <8 x i8>
+; CHECK-NEXT:    store <8 x i8> [[TMP20]], ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX7]]
+; CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = load <8 x i8>, ptr [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP22:%.*]] = zext <8 x i8> [[WIDE_LOAD10]] to <8 x i16>
+; CHECK-NEXT:    [[TMP23:%.*]] = mul nuw <8 x i16> [[TMP22]], [[TMP15]]
+; CHECK-NEXT:    [[TMP24:%.*]] = lshr <8 x i16> [[TMP23]],
+; CHECK-NEXT:    [[TMP25:%.*]] = trunc <8 x i16> [[TMP24]] to <8 x i8>
+; CHECK-NEXT:    store <8 x i8> [[TMP25]], ptr [[TMP21]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT11]] = add nuw i64 [[INDEX7]], 8
-; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC5]]
-; CHECK-NEXT:    br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC5]]
+; CHECK-NEXT:    br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
 ; CHECK-NEXT:    [[CMP_N6:%.*]] = icmp eq i64 [[N_VEC5]], [[TMP0]]
 ; CHECK-NEXT:    br i1 [[CMP_N6]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -94,18 +92,18 @@
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP29:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP29]] to i32
+; CHECK-NEXT:    [[TMP27:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP27]] to i32
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP30:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
-; CHECK-NEXT:    [[CONV3:%.*]] = zext i8 [[TMP30]] to i32
+; CHECK-NEXT:    [[TMP28:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[CONV3:%.*]] = zext i8 [[TMP28]] to i32
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[CONV3]], [[CONV]]
 ; CHECK-NEXT:    [[SHR_26:%.*]] = lshr i32 [[MUL]], 8
 ; CHECK-NEXT:    [[CONV4:%.*]] = trunc i32 [[SHR_26]] to i8
 ; CHECK-NEXT:    store i8 [[CONV4]], ptr [[ARRAYIDX2]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP31:%.*]] = load i8, ptr [[ARRAYIDX8]], align 1
-; CHECK-NEXT:    [[CONV9:%.*]] = zext i8 [[TMP31]] to i32
+; CHECK-NEXT:    [[TMP29:%.*]] = load i8, ptr [[ARRAYIDX8]], align 1
+; CHECK-NEXT:    [[CONV9:%.*]] = zext i8 [[TMP29]] to i32
 ; CHECK-NEXT:    [[MUL10:%.*]] = mul nuw nsw i32 [[CONV9]], [[CONV]]
 ; CHECK-NEXT:    [[SHR11_27:%.*]] = lshr i32 [[MUL10]], 8
 ; CHECK-NEXT:    [[CONV12:%.*]] = trunc i32 [[SHR11_27]] to i8
@@ -158,54 +156,57 @@
 ; CHECK-LABEL: define void @test_shrink_zext_in_preheader
 ; CHECK-SAME: (ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[A:%.*]], i16 [[B:%.*]]) {
 ; CHECK-NEXT:  iter.check:
+; CHECK-NEXT:    [[CONV10:%.*]] = zext i16 [[B]] to i32
 ; CHECK-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; CHECK:       vector.main.loop.iter.check:
 ; CHECK-NEXT:    br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[A]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[CONV10]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <16 x i16> undef, i16 [[B]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i16> [[TMP0]], <16 x i16> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT]] to <16 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT]] to <16 x i16>
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i32> poison, i32 [[A]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT1]], <16 x i32> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT]] to <16 x i16>
-; CHECK-NEXT:    [[TMP2:%.*]] = mul <16 x i16> [[BROADCAST_SPLAT2]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT]] to <16 x i16>
-; CHECK-NEXT:    [[TMP4:%.*]] = mul <16 x i16> [[BROADCAST_SPLAT2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = lshr <16 x i16> [[TMP2]],
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT2]] to <16 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT2]] to <16 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <16 x i16> [[TMP2]], [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = mul <16 x i16> [[TMP3]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = lshr <16 x i16> [[TMP4]],
-; CHECK-NEXT:    [[TMP7:%.*]] = trunc <16 x i16> [[TMP5]] to <16 x i8>
+; CHECK-NEXT:    [[TMP7:%.*]] = lshr <16 x i16> [[TMP5]],
 ; CHECK-NEXT:    [[TMP8:%.*]] = trunc <16 x i16> [[TMP6]] to <16 x i8>
-; CHECK-NEXT:    [[TMP9:%.*]] = sext i32 [[INDEX]] to i64
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP9]]
-; CHECK-NEXT:    store <16 x i8> [[TMP7]], ptr [[TMP10]], align 1
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 16
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc <16 x i16> [[TMP7]] to <16 x i8>
+; CHECK-NEXT:    [[TMP10:%.*]] = sext i32 [[INDEX]] to i64
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP10]]
 ; CHECK-NEXT:    store <16 x i8> [[TMP8]], ptr [[TMP11]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 16
+; CHECK-NEXT:    store <16 x i8> [[TMP9]], ptr [[TMP12]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992
+; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
 ; CHECK-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <8 x i16> undef, i16 [[B]], i64 0
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <8 x i16> undef, i16 [[B]], i64 0
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[INDEX4:%.*]] = phi i32 [ 992, [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP14:%.*]] = trunc i32 [[A]] to i16
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <8 x i16> undef, i16 [[TMP14]], i64 0
-; CHECK-NEXT:    [[TMP16:%.*]] = mul <8 x i16> [[TMP15]], [[TMP13]]
-; CHECK-NEXT:    [[TMP17:%.*]] = lshr <8 x i16> [[TMP16]],
-; CHECK-NEXT:    [[TMP18:%.*]] = trunc <8 x i16> [[TMP17]] to <8 x i8>
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <8 x i8> [[TMP18]], <8 x i8> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP20:%.*]] = sext i32 [[INDEX4]] to i64
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP20]]
-; CHECK-NEXT:    store <8 x i8> [[TMP19]], ptr [[TMP21]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT9]] = add nuw i32 [[INDEX4]], 8
-; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT9]], 1000
-; CHECK-NEXT:    br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[INDEX6:%.*]] = phi i32 [ 992, [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = trunc i32 [[A]] to i16
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <8 x i16> undef, i16 [[TMP15]], i64 0
+; CHECK-NEXT:    [[TMP17:%.*]] = mul <8 x i16> [[TMP16]], [[TMP14]]
+; CHECK-NEXT:    [[TMP18:%.*]] = lshr <8 x i16> [[TMP17]],
+; CHECK-NEXT:    [[TMP19:%.*]] = trunc <8 x i16> [[TMP18]] to <8 x i8>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <8 x i8> [[TMP19]], <8 x i8> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = sext i32 [[INDEX6]] to i64
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP21]]
+; CHECK-NEXT:    store <8 x i8> [[TMP20]], ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT9]] = add nuw i32 [[INDEX6]], 8
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[INDEX_NEXT9]], 1000
+; CHECK-NEXT:    br i1 [[TMP23]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
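The live-in handling is visible in test_shrink_zext_in_preheader above: the broadcast of the zero-extended argument is truncated once in the vector preheader instead of on every loop iteration. A minimal sketch of the emitted preheader pattern (names illustrative, not copied from the checks):

    vector.ph:
      %splatinsert = insertelement <16 x i32> poison, i32 %conv, i64 0
      %splat = shufflevector <16 x i32> %splatinsert, <16 x i32> poison, <16 x i32> zeroinitializer
      %splat.i16 = trunc <16 x i32> %splat to <16 x i16>   ; hoisted out of vector.body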
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
@@ -307,17 +307,15 @@
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <16 x i16> [[TMP4]] to <16 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = trunc <16 x i32> [[TMP5]] to <16 x i16>
-; CHECK-NEXT:    [[TMP7:%.*]] = add <16 x i16> [[TMP6]],
-; CHECK-NEXT:    [[TMP8:%.*]] = zext <16 x i16> [[TMP7]] to <16 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = trunc <16 x i32> [[TMP8]] to <16 x i16>
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0
-; CHECK-NEXT:    store <16 x i16> [[TMP9]], ptr [[TMP11]], align 2
+; CHECK-NEXT:    [[TMP5:%.*]] = add <16 x i16> [[TMP4]],
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <16 x i16> [[TMP5]] to <16 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = trunc <16 x i32> [[TMP6]] to <16 x i16>
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    store <16 x i16> [[TMP7]], ptr [[TMP9]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
@@ -332,22 +330,20 @@
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
 ; CHECK-NEXT:    [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX5]], 0
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <8 x i8>, ptr [[TMP15]], align 1
-; CHECK-NEXT:    [[TMP16:%.*]] = zext <8 x i8> [[WIDE_LOAD6]] to <8 x i16>
-; CHECK-NEXT:    [[TMP17:%.*]] = zext <8 x i16> [[TMP16]] to <8 x i32>
-; CHECK-NEXT:    [[TMP18:%.*]] = trunc <8 x i32> [[TMP17]] to <8 x i16>
-; CHECK-NEXT:    [[TMP19:%.*]] = add <8 x i16> [[TMP18]],
-; CHECK-NEXT:    [[TMP20:%.*]] = zext <8 x i16> [[TMP19]] to <8 x i32>
-; CHECK-NEXT:    [[TMP21:%.*]] = trunc <8 x i32> [[TMP20]] to <8 x i16>
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[TMP22]], i32 0
-; CHECK-NEXT:    store <8 x i16> [[TMP21]], ptr [[TMP23]], align 2
+; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX5]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <8 x i8>, ptr [[TMP13]], align 1
+; CHECK-NEXT:    [[TMP14:%.*]] = zext <8 x i8> [[WIDE_LOAD6]] to <8 x i16>
+; CHECK-NEXT:    [[TMP15:%.*]] = add <8 x i16> [[TMP14]],
+; CHECK-NEXT:    [[TMP16:%.*]] = zext <8 x i16> [[TMP15]] to <8 x i32>
+; CHECK-NEXT:    [[TMP17:%.*]] = trunc <8 x i32> [[TMP16]] to <8 x i16>
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[TMP18]], i32 0
+; CHECK-NEXT:    store <8 x i16> [[TMP17]], ptr [[TMP19]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT7]] = add nuw i64 [[INDEX5]], 8
-; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT7]], [[N_VEC3]]
-; CHECK-NEXT:    br i1 [[TMP24]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT7]], [[N_VEC3]]
+; CHECK-NEXT:    br i1 [[TMP20]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
 ; CHECK-NEXT:    [[CMP_N4:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]]
 ; CHECK-NEXT:    br i1 [[CMP_N4]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -361,8 +357,8 @@
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP25:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP25]] to i32
+; CHECK-NEXT:    [[TMP21:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP21]] to i32
 ; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i32 [[CONV]], 2
 ; CHECK-NEXT:    [[CONV1:%.*]] = trunc i32 [[ADD]] to i16
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDVARS_IV]]
@@ -485,52 +481,48 @@
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[CONV13]], i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc <16 x i32> [[BROADCAST_SPLATINSERT]] to <16 x i8>
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32>
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT]] to <16 x i8>
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <16 x i32> poison, i32 [[CONV11]], i64 0
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc <16 x i32> [[BROADCAST_SPLATINSERT2]] to <16 x i8>
-; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT3]] to <16 x i32>
+; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT2]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT3]] to <16 x i8>
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
-; CHECK-NEXT:    [[TMP9:%.*]] = shl <16 x i8> [[TMP8]],
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = shl <16 x i8> [[WIDE_LOAD]],
+; CHECK-NEXT:    [[TMP7:%.*]] = zext <16 x i8> [[TMP6]] to <16 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = trunc <16 x i32> [[TMP7]] to <16 x i8>
+; CHECK-NEXT:    [[TMP9:%.*]] = add <16 x i8> [[TMP8]],
 ; CHECK-NEXT:    [[TMP10:%.*]] = zext <16 x i8> [[TMP9]] to <16 x i32>
-; CHECK-NEXT:    [[TMP11:%.*]] = trunc <16 x i32> [[TMP10]] to <16 x i8>
-; CHECK-NEXT:    [[TMP12:%.*]] = add <16 x i8> [[TMP11]],
-; CHECK-NEXT:    [[TMP13:%.*]] = zext <16 x i8> [[TMP12]] to <16 x i32>
-; CHECK-NEXT:    [[TMP14:%.*]] = or <16 x i8> [[TMP8]],
+; CHECK-NEXT:    [[TMP11:%.*]] = or <16 x i8> [[WIDE_LOAD]],
+; CHECK-NEXT:    [[TMP12:%.*]] = zext <16 x i8> [[TMP11]] to <16 x i32>
+; CHECK-NEXT:    [[TMP13:%.*]] = trunc <16 x i32> [[TMP12]] to <16 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = mul <16 x i8> [[TMP13]],
 ; CHECK-NEXT:    [[TMP15:%.*]] = zext <16 x i8> [[TMP14]] to <16 x i32>
-; CHECK-NEXT:    [[TMP16:%.*]] = trunc <16 x i32> [[TMP15]] to <16 x i8>
-; CHECK-NEXT:    [[TMP17:%.*]] = mul <16 x i8> [[TMP16]],
+; CHECK-NEXT:    [[TMP16:%.*]] = trunc <16 x i32> [[TMP10]] to <16 x i8>
+; CHECK-NEXT:    [[TMP17:%.*]] = and <16 x i8> [[TMP16]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = zext <16 x i8> [[TMP17]] to <16 x i32>
-; CHECK-NEXT:    [[TMP19:%.*]] = trunc <16 x i32> [[TMP13]] to <16 x i8>
-; CHECK-NEXT:    [[TMP20:%.*]] = trunc <16 x i32> [[TMP2]] to <16 x i8>
-; CHECK-NEXT:    [[TMP21:%.*]] = and <16 x i8> [[TMP19]], [[TMP20]]
-; CHECK-NEXT:    [[TMP22:%.*]] = zext <16 x i8> [[TMP21]] to <16 x i32>
-; CHECK-NEXT:    [[TMP23:%.*]] = trunc <16 x i32> [[TMP18]] to <16 x i8>
-; CHECK-NEXT:    [[TMP24:%.*]] = and <16 x i8> [[TMP23]],
-; CHECK-NEXT:    [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32>
-; CHECK-NEXT:    [[TMP26:%.*]] = trunc <16 x i32> [[TMP25]] to <16 x i8>
-; CHECK-NEXT:    [[TMP27:%.*]] = trunc <16 x i32> [[TMP4]] to <16 x i8>
-; CHECK-NEXT:    [[TMP28:%.*]] = xor <16 x i8> [[TMP26]], [[TMP27]]
-; CHECK-NEXT:    [[TMP29:%.*]] = zext <16 x i8> [[TMP28]] to <16 x i32>
-; CHECK-NEXT:    [[TMP30:%.*]] = trunc <16 x i32> [[TMP29]] to <16 x i8>
-; CHECK-NEXT:    [[TMP31:%.*]] = trunc <16 x i32> [[TMP22]] to <16 x i8>
-; CHECK-NEXT:    [[TMP32:%.*]] = mul <16 x i8> [[TMP30]], [[TMP31]]
-; CHECK-NEXT:    [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i32>
-; CHECK-NEXT:    [[TMP34:%.*]] = trunc <16 x i32> [[TMP33]] to <16 x i8>
-; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i32 0
-; CHECK-NEXT:    store <16 x i8> [[TMP34]], ptr [[TMP36]], align 1
+; CHECK-NEXT:    [[TMP19:%.*]] = trunc <16 x i32> [[TMP15]] to <16 x i8>
+; CHECK-NEXT:    [[TMP20:%.*]] = and <16 x i8> [[TMP19]],
+; CHECK-NEXT:    [[TMP21:%.*]] = zext <16 x i8> [[TMP20]] to <16 x i32>
+; CHECK-NEXT:    [[TMP22:%.*]] = trunc <16 x i32> [[TMP21]] to <16 x i8>
+; CHECK-NEXT:    [[TMP23:%.*]] = xor <16 x i8> [[TMP22]], [[TMP2]]
+; CHECK-NEXT:    [[TMP24:%.*]] = zext <16 x i8> [[TMP23]] to <16 x i32>
+; CHECK-NEXT:    [[TMP25:%.*]] = trunc <16 x i32> [[TMP24]] to <16 x i8>
+; CHECK-NEXT:    [[TMP26:%.*]] = trunc <16 x i32> [[TMP18]] to <16 x i8>
+; CHECK-NEXT:    [[TMP27:%.*]] = mul <16 x i8> [[TMP25]], [[TMP26]]
+; CHECK-NEXT:    [[TMP28:%.*]] = zext <16 x i8> [[TMP27]] to <16 x i32>
+; CHECK-NEXT:    [[TMP29:%.*]] = trunc <16 x i32> [[TMP28]] to <16 x i8>
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 0
+; CHECK-NEXT:    store <16 x i8> [[TMP29]], ptr [[TMP31]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
@@ -542,53 +534,49 @@
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF4:%.*]] = urem i64 [[TMP0]], 8
 ; CHECK-NEXT:    [[N_VEC5:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF4]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <8 x i32> poison, i32 [[CONV13]], i64 0
-; CHECK-NEXT:    [[TMP38:%.*]] = trunc <8 x i32> [[BROADCAST_SPLATINSERT8]] to <8 x i8>
-; CHECK-NEXT:    [[BROADCAST_SPLAT9:%.*]] = shufflevector <8 x i8> [[TMP38]], <8 x i8> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP39:%.*]] = zext <8 x i8> [[BROADCAST_SPLAT9]] to <8 x i32>
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <8 x i32> poison, i32 [[CONV11]], i64 0
-; CHECK-NEXT:    [[TMP40:%.*]] = trunc <8 x i32> [[BROADCAST_SPLATINSERT10]] to <8 x i8>
-; CHECK-NEXT:    [[BROADCAST_SPLAT11:%.*]] = shufflevector <8 x i8> [[TMP40]], <8 x i8> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP41:%.*]] = zext <8 x i8> [[BROADCAST_SPLAT11]] to <8 x i32>
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <8 x i32> poison, i32 [[CONV13]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT7]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP33:%.*]] = trunc <8 x i32> [[BROADCAST_SPLAT8]] to <8 x i8>
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <8 x i32> poison, i32 [[CONV11]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT10:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT9]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP34:%.*]] = trunc <8 x i32> [[BROADCAST_SPLAT10]] to <8 x i8>
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP42:%.*]] = add i64 [[INDEX7]], 0
-; CHECK-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP42]]
-; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr inbounds i8, ptr [[TMP43]], i32 0
-; CHECK-NEXT:    [[TMP45:%.*]] = load <8 x i8>, ptr [[TMP44]], align 1
-; CHECK-NEXT:    [[TMP46:%.*]] = shl <8 x i8> [[TMP45]],
+; CHECK-NEXT:    [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP35:%.*]] = add i64 [[INDEX11]], 0
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP35]]
+; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[TMP36]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD12:%.*]] = load <8 x i8>, ptr [[TMP37]], align 1
+; CHECK-NEXT:    [[TMP38:%.*]] = shl <8 x i8> [[WIDE_LOAD12]],
+; CHECK-NEXT:    [[TMP39:%.*]] = zext <8 x i8> [[TMP38]] to <8 x i32>
+; CHECK-NEXT:    [[TMP40:%.*]] = trunc <8 x i32> [[TMP39]] to <8 x i8>
+; CHECK-NEXT:    [[TMP41:%.*]] = add <8 x i8> [[TMP40]],
+; CHECK-NEXT:    [[TMP42:%.*]] = zext <8 x i8> [[TMP41]] to <8 x i32>
+; CHECK-NEXT:    [[TMP43:%.*]] = or <8 x i8> [[WIDE_LOAD12]],
+; CHECK-NEXT:    [[TMP44:%.*]] = zext <8 x i8> [[TMP43]] to <8 x i32>
+; CHECK-NEXT:    [[TMP45:%.*]] = trunc <8 x i32> [[TMP44]] to <8 x i8>
+; CHECK-NEXT:    [[TMP46:%.*]] = mul <8 x i8> [[TMP45]],
 ; CHECK-NEXT:    [[TMP47:%.*]] = zext <8 x i8> [[TMP46]] to <8 x i32>
-; CHECK-NEXT:    [[TMP48:%.*]] = trunc <8 x i32> [[TMP47]] to <8 x i8>
-; CHECK-NEXT:    [[TMP49:%.*]] = add <8 x i8> [[TMP48]],
+; CHECK-NEXT:    [[TMP48:%.*]] = trunc <8 x i32> [[TMP42]] to <8 x i8>
+; CHECK-NEXT:    [[TMP49:%.*]] = and <8 x i8> [[TMP48]], [[TMP33]]
 ; CHECK-NEXT:    [[TMP50:%.*]] = zext <8 x i8> [[TMP49]] to <8 x i32>
-; CHECK-NEXT:    [[TMP51:%.*]] = or <8 x i8> [[TMP45]],
-; CHECK-NEXT:    [[TMP52:%.*]] = zext <8 x i8> [[TMP51]] to <8 x i32>
-; CHECK-NEXT:    [[TMP53:%.*]] = trunc <8 x i32> [[TMP52]] to <8 x i8>
-; CHECK-NEXT:    [[TMP54:%.*]] = mul <8 x i8> [[TMP53]],
-; CHECK-NEXT:    [[TMP55:%.*]] = zext <8 x i8> [[TMP54]] to <8 x i32>
-; CHECK-NEXT:    [[TMP56:%.*]] = trunc <8 x i32> [[TMP50]] to <8 x i8>
-; CHECK-NEXT:    [[TMP57:%.*]] = trunc <8 x i32> [[TMP39]] to <8 x i8>
-; CHECK-NEXT:    [[TMP58:%.*]] = and <8 x i8> [[TMP56]], [[TMP57]]
-; CHECK-NEXT:    [[TMP59:%.*]] = zext <8 x i8> [[TMP58]] to <8 x i32>
-; CHECK-NEXT:    [[TMP60:%.*]] = trunc <8 x i32> [[TMP55]] to <8 x i8>
-; CHECK-NEXT:    [[TMP61:%.*]] = and <8 x i8> [[TMP60]],
-; CHECK-NEXT:    [[TMP62:%.*]] = zext <8 x i8> [[TMP61]] to <8 x i32>
-; CHECK-NEXT:    [[TMP63:%.*]] = trunc <8 x i32> [[TMP62]] to <8 x i8>
-; CHECK-NEXT:    [[TMP64:%.*]] = trunc <8 x i32> [[TMP41]] to <8 x i8>
-; CHECK-NEXT:    [[TMP65:%.*]] = xor <8 x i8> [[TMP63]], [[TMP64]]
-; CHECK-NEXT:    [[TMP66:%.*]] = zext <8 x i8> [[TMP65]] to <8 x i32>
-; CHECK-NEXT:    [[TMP67:%.*]] = trunc <8 x i32> [[TMP66]] to <8 x i8>
-; CHECK-NEXT:    [[TMP68:%.*]] = trunc <8 x i32> [[TMP59]] to <8 x i8>
-; CHECK-NEXT:    [[TMP69:%.*]] = mul <8 x i8> [[TMP67]], [[TMP68]]
-; CHECK-NEXT:    [[TMP70:%.*]] = zext <8 x i8> [[TMP69]] to <8 x i32>
-; CHECK-NEXT:    [[TMP71:%.*]] = trunc <8 x i32> [[TMP70]] to <8 x i8>
-; CHECK-NEXT:    [[TMP72:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP42]]
-; CHECK-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[TMP72]], i32 0
-; CHECK-NEXT:    store <8 x i8> [[TMP71]], ptr [[TMP73]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT12]] = add nuw i64 [[INDEX7]], 8
-; CHECK-NEXT:    [[TMP74:%.*]] = icmp eq i64 [[INDEX_NEXT12]], [[N_VEC5]]
-; CHECK-NEXT:    br i1 [[TMP74]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT:    [[TMP51:%.*]] = trunc <8 x i32> [[TMP47]] to <8 x i8>
+; CHECK-NEXT:    [[TMP52:%.*]] = and <8 x i8> [[TMP51]],
+; CHECK-NEXT:    [[TMP53:%.*]] = zext <8 x i8> [[TMP52]] to <8 x i32>
+; CHECK-NEXT:    [[TMP54:%.*]] = trunc <8 x i32> [[TMP53]] to <8 x i8>
+; CHECK-NEXT:    [[TMP55:%.*]] = xor <8 x i8> [[TMP54]], [[TMP34]]
+; CHECK-NEXT:    [[TMP56:%.*]] = zext <8 x i8> [[TMP55]] to <8 x i32>
+; CHECK-NEXT:    [[TMP57:%.*]] = trunc <8 x i32> [[TMP56]] to <8 x i8>
+; CHECK-NEXT:    [[TMP58:%.*]] = trunc <8 x i32> [[TMP50]] to <8 x i8>
+; CHECK-NEXT:    [[TMP59:%.*]] = mul <8 x i8> [[TMP57]], [[TMP58]]
+; CHECK-NEXT:    [[TMP60:%.*]] = zext <8 x i8> [[TMP59]] to <8 x i32>
+; CHECK-NEXT:    [[TMP61:%.*]] = trunc <8 x i32> [[TMP60]] to <8 x i8>
+; CHECK-NEXT:    [[TMP62:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP35]]
+; CHECK-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[TMP62]], i32 0
+; CHECK-NEXT:    store <8 x i8> [[TMP61]], ptr [[TMP63]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT13]] = add nuw i64 [[INDEX11]], 8
+; CHECK-NEXT:    [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC5]]
+; CHECK-NEXT:    br i1 [[TMP64]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
 ; CHECK-NEXT:    [[CMP_N6:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]]
 ; CHECK-NEXT:    br i1 [[CMP_N6]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -602,8 +590,8 @@
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP75:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP75]] to i32
+; CHECK-NEXT:    [[TMP65:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP65]] to i32
 ; CHECK-NEXT:    [[ADD:%.*]] = shl i32 [[CONV]], 4
 ; CHECK-NEXT:    [[CONV2:%.*]] = add nuw nsw i32 [[ADD]], 32
 ; CHECK-NEXT:    [[OR:%.*]] = or i32 [[CONV]], 51
@@ -673,58 +661,52 @@
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[CONV13]], i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc <16 x i32> [[BROADCAST_SPLATINSERT]] to <16 x i8>
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32>
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT]] to <16 x i8>
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <16 x i32> poison, i32 [[CONV11]], i64 0
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc <16 x i32> [[BROADCAST_SPLATINSERT2]] to <16 x i8>
-; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT3]] to <16 x i32>
+; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT2]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT3]] to <16 x i8>
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i16>, ptr [[TMP7]], align 2
-; CHECK-NEXT:    [[TMP8:%.*]] = trunc <16 x i16> [[WIDE_LOAD]] to <16 x i8>
-; CHECK-NEXT:    [[TMP9:%.*]] = zext <16 x i8> [[TMP8]] to <16 x i32>
-; CHECK-NEXT:    [[TMP10:%.*]] = trunc <16 x i32> [[TMP9]] to <16 x i8>
-; CHECK-NEXT:    [[TMP11:%.*]] = shl <16 x i8> [[TMP10]],
-; CHECK-NEXT:    [[TMP12:%.*]] = zext <16 x i8> [[TMP11]] to <16 x i32>
-; CHECK-NEXT:    [[TMP13:%.*]] = trunc <16 x i32> [[TMP12]] to <16 x i8>
-; CHECK-NEXT:    [[TMP14:%.*]] = add <16 x i8> [[TMP13]],
-; CHECK-NEXT:    [[TMP15:%.*]] = zext <16 x i8> [[TMP14]] to <16 x i32>
-; CHECK-NEXT:    [[TMP16:%.*]] = and <16 x i8> [[TMP8]],
-; CHECK-NEXT:    [[TMP17:%.*]] = zext <16 x i8> [[TMP16]] to <16 x i32>
-; CHECK-NEXT:    [[TMP18:%.*]] = trunc <16 x i32> [[TMP17]] to <16 x i8>
-; CHECK-NEXT:    [[TMP19:%.*]] = or <16 x i8> [[TMP18]],
-; CHECK-NEXT:    [[TMP20:%.*]] = zext <16 x i8> [[TMP19]] to <16 x i32>
-; CHECK-NEXT:    [[TMP21:%.*]] = trunc <16 x i32> [[TMP20]] to <16 x i8>
-; CHECK-NEXT:    [[TMP22:%.*]] = mul <16 x i8> [[TMP21]],
-; CHECK-NEXT:    [[TMP23:%.*]] = zext <16 x i8> [[TMP22]] to <16 x i32>
-; CHECK-NEXT:    [[TMP24:%.*]] = trunc <16 x i32> [[TMP15]] to <16 x i8>
-; CHECK-NEXT:    [[TMP25:%.*]] = trunc <16 x i32> [[TMP2]] to <16 x i8>
-; CHECK-NEXT:    [[TMP26:%.*]] = and <16 x i8> [[TMP24]], [[TMP25]]
-; CHECK-NEXT:    [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i32>
-; CHECK-NEXT:    [[TMP28:%.*]] = trunc <16 x i32> [[TMP23]] to <16 x i8>
-; CHECK-NEXT:    [[TMP29:%.*]] = and <16 x i8> [[TMP28]],
-; CHECK-NEXT:    [[TMP30:%.*]] = zext <16 x i8> [[TMP29]] to <16 x i32>
-; CHECK-NEXT:    [[TMP31:%.*]] = trunc <16 x i32> [[TMP30]] to <16 x i8>
-; CHECK-NEXT:    [[TMP32:%.*]] = trunc <16 x i32> [[TMP4]] to <16 x i8>
-; CHECK-NEXT:    [[TMP33:%.*]] = xor <16 x i8> [[TMP31]], [[TMP32]]
-; CHECK-NEXT:    [[TMP34:%.*]] = zext <16 x i8> [[TMP33]] to <16 x i32>
-; CHECK-NEXT:    [[TMP35:%.*]] = trunc <16 x i32> [[TMP34]] to <16 x i8>
-; CHECK-NEXT:    [[TMP36:%.*]] = trunc <16 x i32> [[TMP27]] to <16 x i8>
-; CHECK-NEXT:    [[TMP37:%.*]] = mul <16 x i8> [[TMP35]], [[TMP36]]
-; CHECK-NEXT:    [[TMP38:%.*]] = zext <16 x i8> [[TMP37]] to <16 x i32>
-; CHECK-NEXT:    [[TMP39:%.*]] = trunc <16 x i32> [[TMP38]] to <16 x i8>
-; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i8, ptr [[TMP40]], i32 0
-; CHECK-NEXT:    store <16 x i8> [[TMP39]], ptr [[TMP41]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i16>, ptr [[TMP5]], align 2
+; CHECK-NEXT:    [[TMP6:%.*]] = trunc <16 x i16> [[WIDE_LOAD]] to <16 x i8>
+; CHECK-NEXT:    [[TMP7:%.*]] = shl <16 x i8> [[TMP6]],
+; CHECK-NEXT:    [[TMP8:%.*]] = zext <16 x i8> [[TMP7]] to <16 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc <16 x i32> [[TMP8]] to <16 x i8>
+; CHECK-NEXT:    [[TMP10:%.*]] = add <16 x i8> [[TMP9]],
+; CHECK-NEXT:    [[TMP11:%.*]] = zext <16 x i8> [[TMP10]] to <16 x i32>
+; CHECK-NEXT:    [[TMP12:%.*]] = and <16 x i8> [[TMP6]],
+; CHECK-NEXT:    [[TMP13:%.*]] = zext <16 x i8> [[TMP12]] to <16 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = trunc <16 x i32> [[TMP13]] to <16 x i8>
+; CHECK-NEXT:    [[TMP15:%.*]] = or <16 x i8> [[TMP14]],
+; CHECK-NEXT:    [[TMP16:%.*]] = zext <16 x i8> [[TMP15]] to <16 x i32>
+; CHECK-NEXT:    [[TMP17:%.*]] = trunc <16 x i32> [[TMP16]] to <16 x i8>
+; CHECK-NEXT:    [[TMP18:%.*]] = mul <16 x i8> [[TMP17]],
+; CHECK-NEXT:    [[TMP19:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i32>
+; CHECK-NEXT:    [[TMP20:%.*]] = trunc <16 x i32> [[TMP11]] to <16 x i8>
+; CHECK-NEXT:    [[TMP21:%.*]] = and <16 x i8> [[TMP20]], [[TMP1]]
+; CHECK-NEXT:    [[TMP22:%.*]] = zext <16 x i8> [[TMP21]] to <16 x i32>
+; CHECK-NEXT:    [[TMP23:%.*]] = trunc <16 x i32> [[TMP19]] to <16 x i8>
+; CHECK-NEXT:    [[TMP24:%.*]] = and <16 x i8> [[TMP23]],
+; CHECK-NEXT:    [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32>
+; CHECK-NEXT:    [[TMP26:%.*]] = trunc <16 x i32> [[TMP25]] to <16 x i8>
+; CHECK-NEXT:    [[TMP27:%.*]] = xor <16 x i8> [[TMP26]], [[TMP2]]
+; CHECK-NEXT:    [[TMP28:%.*]] = zext <16 x i8> [[TMP27]] to <16 x i32>
+; CHECK-NEXT:    [[TMP29:%.*]] = trunc <16 x i32> [[TMP28]] to <16 x i8>
+; CHECK-NEXT:    [[TMP30:%.*]] = trunc <16 x i32> [[TMP22]] to <16 x i8>
+; CHECK-NEXT:    [[TMP31:%.*]] = mul <16 x i8> [[TMP29]], [[TMP30]]
+; CHECK-NEXT:    [[TMP32:%.*]] = zext <16 x i8> [[TMP31]] to <16 x i32>
+; CHECK-NEXT:    [[TMP33:%.*]] = trunc <16 x i32> [[TMP32]] to <16 x i8>
+; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i8, ptr [[TMP34]], i32 0
+; CHECK-NEXT:    store <16 x i8> [[TMP33]], ptr [[TMP35]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP42]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
@@ -736,59 +718,53 @@
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF4:%.*]] = urem i64 [[TMP0]], 8
 ; CHECK-NEXT:    [[N_VEC5:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF4]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <8 x i32> poison, i32 [[CONV13]], i64 0
-; CHECK-NEXT:    [[TMP43:%.*]] = trunc <8 x i32> [[BROADCAST_SPLATINSERT9]] to <8 x i8>
-; CHECK-NEXT:    [[BROADCAST_SPLAT10:%.*]] = shufflevector <8 x i8> [[TMP43]], <8 x i8> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP44:%.*]] = zext <8 x i8> [[BROADCAST_SPLAT10]] to <8 x i32>
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <8 x i32> poison, i32 [[CONV11]], i64 0
-; CHECK-NEXT:    [[TMP45:%.*]] = trunc <8 x i32> [[BROADCAST_SPLATINSERT11]] to <8 x i8>
-; CHECK-NEXT:    [[BROADCAST_SPLAT12:%.*]] = shufflevector <8 x i8> [[TMP45]], <8 x i8> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP46:%.*]] = zext <8 x i8> [[BROADCAST_SPLAT12]] to <8 x i32>
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <8 x i32> poison, i32 [[CONV13]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT7]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP37:%.*]] = trunc <8 x i32> [[BROADCAST_SPLAT8]] to <8 x i8>
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <8 x i32> poison, i32 [[CONV11]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT10:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT9]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP38:%.*]] = trunc <8 x i32> [[BROADCAST_SPLAT10]] to <8 x i8>
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP47:%.*]] = add i64 [[INDEX7]], 0
-; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP47]]
-; CHECK-NEXT:    [[TMP49:%.*]] = getelementptr inbounds i16, ptr [[TMP48]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <8 x i16>, ptr [[TMP49]], align 2
-; CHECK-NEXT:    [[TMP50:%.*]] = trunc <8 x i16> [[WIDE_LOAD8]] to <8 x i8>
-; CHECK-NEXT:    [[TMP51:%.*]] = zext <8 x i8> [[TMP50]] to <8 x i32>
-; CHECK-NEXT:    [[TMP52:%.*]] = trunc <8 x i32> [[TMP51]] to <8 x i8>
-; CHECK-NEXT:    [[TMP53:%.*]] = shl <8 x i8> [[TMP52]],
-; CHECK-NEXT:    [[TMP54:%.*]] = zext <8 x i8> [[TMP53]] to <8 x i32>
-; CHECK-NEXT:    [[TMP55:%.*]] = trunc <8 x i32> [[TMP54]] to <8 x i8>
-; CHECK-NEXT:    [[TMP56:%.*]] = add <8 x i8> [[TMP55]],
-; CHECK-NEXT:    [[TMP57:%.*]] = zext <8 x i8> [[TMP56]] to <8 x i32>
-; CHECK-NEXT:    [[TMP58:%.*]] = and <8 x i8> [[TMP50]],
-; CHECK-NEXT:    [[TMP59:%.*]] = zext <8 x i8> [[TMP58]] to <8 x i32>
-; CHECK-NEXT:    [[TMP60:%.*]] = trunc <8 x i32> [[TMP59]] to <8 x i8>
-; CHECK-NEXT:    [[TMP61:%.*]] = or <8 x i8> [[TMP60]],
-; CHECK-NEXT:    [[TMP62:%.*]] = zext <8 x i8> [[TMP61]] to <8 x i32>
-; CHECK-NEXT:    [[TMP63:%.*]] = trunc <8 x i32> [[TMP62]] to <8 x i8>
-; CHECK-NEXT:    [[TMP64:%.*]] = mul <8 x i8> [[TMP63]],
-; CHECK-NEXT:    [[TMP65:%.*]] = zext <8 x i8> [[TMP64]] to <8 x i32>
-; CHECK-NEXT:    [[TMP66:%.*]] = trunc <8 x i32> [[TMP57]] to <8 x i8>
-; CHECK-NEXT:    [[TMP67:%.*]] = trunc <8 x i32> [[TMP44]] to <8 x i8>
-; CHECK-NEXT:    [[TMP68:%.*]] = and <8 x i8> [[TMP66]], [[TMP67]]
-; CHECK-NEXT:    [[TMP69:%.*]] = zext <8 x i8> [[TMP68]] to <8 x i32>
-; CHECK-NEXT:    [[TMP70:%.*]] = trunc <8 x i32> [[TMP65]] to <8 x i8>
-; CHECK-NEXT:    [[TMP71:%.*]] = and <8 x i8> [[TMP70]],
-; CHECK-NEXT:    [[TMP72:%.*]] = zext <8 x i8> [[TMP71]] to <8 x i32>
-; CHECK-NEXT:    [[TMP73:%.*]] = trunc <8 x i32> [[TMP72]] to <8 x i8>
-; CHECK-NEXT:    [[TMP74:%.*]] = trunc <8 x i32> [[TMP46]] to <8 x i8>
-; CHECK-NEXT:    [[TMP75:%.*]] = xor <8 x i8> [[TMP73]], [[TMP74]]
-; CHECK-NEXT:    [[TMP76:%.*]] = zext <8 x i8> [[TMP75]] to <8 x i32>
-; CHECK-NEXT:    [[TMP77:%.*]] = trunc <8 x i32> [[TMP76]] to <8 x i8>
-; CHECK-NEXT:    [[TMP78:%.*]] = trunc <8 x i32> [[TMP69]] to <8 x i8>
-; CHECK-NEXT:    [[TMP79:%.*]] = mul <8 x i8> [[TMP77]], [[TMP78]]
-; CHECK-NEXT:    [[TMP80:%.*]] = zext <8 x i8> [[TMP79]] to <8 x i32>
-; CHECK-NEXT:    [[TMP81:%.*]] = trunc <8 x i32> [[TMP80]] to <8 x i8>
-; CHECK-NEXT:    [[TMP82:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP47]]
-; CHECK-NEXT:    [[TMP83:%.*]] = getelementptr inbounds i8, ptr [[TMP82]], i32 0
-; CHECK-NEXT:    store <8 x i8> [[TMP81]], ptr [[TMP83]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT13]] = add nuw i64 [[INDEX7]], 8
-; CHECK-NEXT:    [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC5]]
-; CHECK-NEXT:    br i1 [[TMP84]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-NEXT:    [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP39:%.*]] = add i64 [[INDEX11]], 0
+; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP39]]
+; CHECK-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i16, ptr [[TMP40]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD12:%.*]] = load <8 x i16>, ptr [[TMP41]], align 2
+; CHECK-NEXT:    [[TMP42:%.*]] = trunc <8 x i16> [[WIDE_LOAD12]] to <8 x i8>
+; CHECK-NEXT:    [[TMP43:%.*]] = shl <8 x i8> [[TMP42]],
+; CHECK-NEXT:    [[TMP44:%.*]] = zext <8 x i8> [[TMP43]] to <8 x i32>
+; CHECK-NEXT:    [[TMP45:%.*]] = trunc <8 x i32> [[TMP44]] to <8 x i8>
+; CHECK-NEXT:    [[TMP46:%.*]] = add <8 x i8> [[TMP45]],
+; CHECK-NEXT:    [[TMP47:%.*]] = zext <8 x i8> [[TMP46]] to <8 x i32>
+; CHECK-NEXT:    [[TMP48:%.*]] = and <8 x i8> [[TMP42]],
+; CHECK-NEXT:    [[TMP49:%.*]] = zext <8 x i8> [[TMP48]] to <8 x i32>
+; CHECK-NEXT:    [[TMP50:%.*]] = trunc <8 x i32> [[TMP49]] to <8 x i8>
+; CHECK-NEXT:    [[TMP51:%.*]] = or <8 x i8> [[TMP50]],
+; CHECK-NEXT:    [[TMP52:%.*]] = zext <8 x i8> [[TMP51]] to <8 x i32>
+; CHECK-NEXT:    [[TMP53:%.*]] = trunc <8 x i32> [[TMP52]] to <8 x i8>
+; CHECK-NEXT:    [[TMP54:%.*]] = mul <8 x i8> [[TMP53]],
+; CHECK-NEXT:    [[TMP55:%.*]] = zext <8 x i8> [[TMP54]] to <8 x i32>
+; CHECK-NEXT:    [[TMP56:%.*]] = trunc <8 x i32> [[TMP47]] to <8 x i8>
+; CHECK-NEXT:    [[TMP57:%.*]] = and <8 x i8> [[TMP56]], [[TMP37]]
+; CHECK-NEXT:    [[TMP58:%.*]] = zext <8 x i8> [[TMP57]] to <8 x i32>
+; CHECK-NEXT:    [[TMP59:%.*]] = trunc <8 x i32> [[TMP55]] to <8 x i8>
+; CHECK-NEXT:    [[TMP60:%.*]] = and <8 x i8> [[TMP59]],
+; CHECK-NEXT:    [[TMP61:%.*]] = zext <8 x i8> [[TMP60]] to <8 x i32>
+; CHECK-NEXT:    [[TMP62:%.*]] = trunc <8 x i32> [[TMP61]] to <8 x i8>
+; CHECK-NEXT:    [[TMP63:%.*]] = xor <8 x i8> [[TMP62]], [[TMP38]]
+; CHECK-NEXT:    [[TMP64:%.*]] = zext <8 x i8> [[TMP63]] to <8 x i32>
+; CHECK-NEXT:    [[TMP65:%.*]] = trunc <8 x i32> [[TMP64]] to <8 x i8>
+; CHECK-NEXT:    [[TMP66:%.*]] = trunc <8 x i32> [[TMP58]] to <8 x i8>
+; CHECK-NEXT:    [[TMP67:%.*]] = mul <8 x i8> [[TMP65]], [[TMP66]]
+; CHECK-NEXT:    [[TMP68:%.*]] = zext <8 x i8> [[TMP67]] to <8 x i32>
+; CHECK-NEXT:    [[TMP69:%.*]] = trunc <8 x i32> [[TMP68]] to <8 x i8>
+; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP39]]
+; CHECK-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i8, ptr [[TMP70]], i32 0
+; CHECK-NEXT:    store <8 x i8> [[TMP69]], ptr [[TMP71]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT13]] = add nuw i64 [[INDEX11]], 8
+; CHECK-NEXT:    [[TMP72:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC5]]
+; CHECK-NEXT:    br i1 [[TMP72]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
 ; CHECK-NEXT:    [[CMP_N6:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]]
 ; CHECK-NEXT:    br i1 [[CMP_N6]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -802,8 +778,8 @@
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP85:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
-; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP85]] to i32
+; CHECK-NEXT:    [[TMP73:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP73]] to i32
 ; CHECK-NEXT:    [[ADD:%.*]] = shl i32 [[CONV]], 4
 ; CHECK-NEXT:    [[CONV2:%.*]] = add nsw i32 [[ADD]], 32
 ; CHECK-NEXT:    [[OR:%.*]] = and i32 [[CONV]], 204
diff --git a/llvm/test/Transforms/LoopVectorize/trunc-shifts.ll b/llvm/test/Transforms/LoopVectorize/trunc-shifts.ll
--- a/llvm/test/Transforms/LoopVectorize/trunc-shifts.ll
+++ b/llvm/test/Transforms/LoopVectorize/trunc-shifts.ll
@@ -328,16 +328,14 @@
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = trunc <4 x i32> [[TMP5]] to <4 x i16>
-; CHECK-NEXT:    [[TMP7:%.*]] = lshr <4 x i16> [[TMP6]],
-; CHECK-NEXT:    [[TMP8:%.*]] = zext <4 x i16> [[TMP7]] to <4 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = trunc <4 x i32> [[TMP8]] to <4 x i16>
-; CHECK-NEXT:    [[TMP10:%.*]] = trunc <4 x i16> [[TMP9]] to <4 x i8>
-; CHECK-NEXT:    store <4 x i8> [[TMP10]], ptr [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr <4 x i16> [[TMP4]],
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = trunc <4 x i32> [[TMP6]] to <4 x i16>
+; CHECK-NEXT:    [[TMP8:%.*]] = trunc <4 x i16> [[TMP7]] to <4 x i8>
+; CHECK-NEXT:    store <4 x i8> [[TMP8]], ptr [[TMP3]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
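The zext/trunc chains that remain in the loop-vectorization-factors and trunc-shifts checks above are expected: the transform re-extends every shrunk result to its original width and truncates it again at the next narrow user, e.g. (an illustrative sketch, not a line from the tests):

    %op  = add <16 x i8> %x, %y
    %ext = zext <16 x i8> %op to <16 x i32>    ; re-extend inserted after the shrunk recipe
    %use = trunc <16 x i32> %ext to <16 x i8>  ; trunc inserted for the next shrunk user

As the new doc comment on truncateToMinimalBitwidths says, these pairs are deliberately left in as hints for InstCombine, which removes them later.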