diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9343,6 +9343,7 @@
 
   VPlanTransforms::sinkScalarOperands(*Plan);
   VPlanTransforms::mergeReplicateRegions(*Plan);
+  VPlanTransforms::optimizeRecipes(*Plan);
 
   std::string PlanName;
   raw_string_ostream RSO(PlanName);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -32,6 +32,9 @@
   static bool sinkScalarOperands(VPlan &Plan);
 
   static bool mergeReplicateRegions(VPlan &Plan);
+
+  /// Apply local optimizations to the recipes in \p Plan.
+  static void optimizeRecipes(VPlan &Plan);
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -282,3 +282,39 @@
     delete ToDelete;
   return Changed;
 }
+
+void VPlanTransforms::optimizeRecipes(VPlan &Plan) {
+  auto Iter = depth_first(
+      VPBlockRecursiveTraversalWrapper<VPBlockBase *>(Plan.getEntry()));
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
+    for (auto &Recipe : make_early_inc_range(*VPBB)) {
+      // Try to widen replicate recipes of GEPs, if the recipe has users that
+      // use the vector value.
+      if (auto *RepR = dyn_cast<VPReplicateRecipe>(&Recipe)) {
+        if (RepR->isPredicated() || RepR->isUniform())
+          continue;
+
+        auto *GEP = dyn_cast<GetElementPtrInst>(RepR->getUnderlyingValue());
+        if (!GEP)
+          continue;
+
+        auto IsUsedAsVector = [RepR](VPUser *U) {
+          auto *UI = dyn_cast<VPRecipeBase>(U);
+          if (!UI)
+            return false;
+          if (auto *Mem = dyn_cast<VPWidenMemoryInstructionRecipe>(UI))
+            return Mem->getNumOperands() > 1 && RepR == Mem->getOperand(1);
+          return isa<VPWidenRecipe>(UI);
+        };
+        if (!any_of(RepR->users(), IsUsedAsVector))
+          continue;
+
+        // NOTE(review): confirm the widened GEP uses a per-lane base pointer.
+        auto *W = new VPWidenGEPRecipe(GEP, RepR->operands());
+        W->insertBefore(RepR);
+        RepR->replaceAllUsesWith(W);
+        RepR->eraseFromParent();
+      }
+    }
+  }
+}
diff --git a/llvm/test/Transforms/LoopVectorize/gep-used-as-scalar-and-vector.ll b/llvm/test/Transforms/LoopVectorize/gep-used-as-scalar-and-vector.ll
--- a/llvm/test/Transforms/LoopVectorize/gep-used-as-scalar-and-vector.ll
+++ b/llvm/test/Transforms/LoopVectorize/gep-used-as-scalar-and-vector.ll
@@ -17,23 +17,20 @@
 ; CHECK-NEXT:    [[NEXT_GEP6:%.*]] = getelementptr i8, i8* %src, i64 [[TMP2]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX1]], 1
 ; CHECK-NEXT:    [[NEXT_GEP7:%.*]] = getelementptr i8, i8* %src, i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP6]], i64 1
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP7]], i64 1
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i8*> poison, i8* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i8*> [[TMP6]], i8* [[TMP5]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i8** [[TMP8]] to <2 x i8*>*
-; CHECK-NEXT:    store <2 x i8*> [[TMP7]], <2 x i8*>* [[TMP9]], align 8
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, i8* [[NEXT_GEP6]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <2 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i8>, <2 x i8>* [[TMP11]], align 1
-; CHECK-NEXT:    [[TMP12:%.*]] = add <2 x i8> [[WIDE_LOAD]], <i8 1, i8 1>
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP10]] to <2 x i8>*
-; CHECK-NEXT:    store <2 x i8> [[TMP12]], <2 x i8>* [[TMP13]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP6]], <2 x i64> <i64 1, i64 1>
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8** [[TMP5]] to <2 x i8*>*
+; CHECK-NEXT:    store <2 x i8*> [[TMP4]], <2 x i8*>* [[TMP6]], align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, i8* [[NEXT_GEP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <2 x i8>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i8>, <2 x i8>* [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i8> [[WIDE_LOAD]], <i8 1, i8 1>
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP7]] to <2 x i8>*
+; CHECK-NEXT:    store <2 x i8> [[TMP9]], <2 x i8>* [[TMP10]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 2
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT2]],
-; CHECK-NEXT:    br i1 [[TMP14]], label %middle.block, label %vector.body
-;
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT2]],
+; CHECK-NEXT:    br i1 [[TMP11]], label %middle.block, label %vector.body
+
 
 entry:
   br label %loop.body