diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6881,6 +6881,42 @@
         (void)E;
         return VecCost - ScalarCost;
       };
+  // Calculate cost difference from vectorizing set of GEPs.
+  // Negative value means vectorizing is profitable.
+  auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
+    InstructionCost CostSavings = 0;
+    for (Value *V : Ptrs) {
+      if (V == BasePtr)
+        continue;
+      auto *Ptr = dyn_cast<GetElementPtrInst>(V);
+      // GEPs may contain just addresses without instructions, considered free.
+      // GEPs with all constant indices also considered to have zero cost.
+      if (!Ptr || Ptr->hasAllConstantIndices())
+        continue;
+
+      // Here we differentiate two cases: when GEPs represent a regular
+      // vectorization tree node (and hence vectorized) and when the set is
+      // arguments of a set of loads or stores being vectorized. In the former
+      // case all the scalar GEPs will be removed as a result of vectorization.
+      // For any external uses of some lanes extract element instructions will
+      // be generated (which cost is estimated separately). For the latter case
+      // since the set of GEPs itself is not vectorized those used more than
+      // once will remain staying in vectorized code as well. So we should not
+      // count them as savings.
+      if (!Ptr->hasOneUse() && isa<LoadInst, StoreInst>(VL0))
+        continue;
+
+      // TODO: it is target dependent, so need to implement and then use a TTI
+      // interface.
+      CostSavings += TTI->getArithmeticInstrCost(Instruction::Add,
+                                                 Ptr->getType(), CostKind);
+    }
+    LLVM_DEBUG(dbgs() << "SLP: Calculated GEPs cost savings or Tree:\n";
+               E->dump());
+    LLVM_DEBUG(dbgs() << "SLP:     GEP cost saving = " << CostSavings << "\n");
+    return InstructionCost() - CostSavings;
+  };
+
   switch (ShuffleOrOp) {
   case Instruction::PHI: {
     // Count reused scalars.
@@ -7155,52 +7191,39 @@
   case Instruction::AShr:
   case Instruction::And:
   case Instruction::Or:
-  case Instruction::Xor:
-  case Instruction::GetElementPtr: {
-    unsigned Opcode = ShuffleOrOp == Instruction::GetElementPtr
-                          ? static_cast<unsigned>(Instruction::Add)
-                          : ShuffleOrOp;
+  case Instruction::Xor: {
     auto GetScalarCost = [=](unsigned Idx) {
-      auto *VI = dyn_cast<Instruction>(VL[Idx]);
-      // GEPs may contain just addresses without instructions, consider
-      // their cost 0.
-      if (!VI)
-        return InstructionCost();
+      auto *VI = cast<Instruction>(VL[Idx]);
       unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
       TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
       TTI::OperandValueInfo Op2Info =
           TTI::getOperandInfo(VI->getOperand(OpIdx));
       SmallVector<const Value *> Operands(VI->operand_values());
-      return TTI->getArithmeticInstrCost(Opcode, ScalarTy, CostKind, Op1Info,
-                                         Op2Info, Operands, VI);
+      return TTI->getArithmeticInstrCost(ShuffleOrOp, ScalarTy, CostKind,
+                                         Op1Info, Op2Info, Operands, VI);
     };
     auto GetVectorCost = [=](InstructionCost CommonCost) {
       unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
       TTI::OperandValueInfo Op1Info = getOperandInfo(VL, 0);
       TTI::OperandValueInfo Op2Info = getOperandInfo(VL, OpIdx);
-      return TTI->getArithmeticInstrCost(Opcode, VecTy, CostKind, Op1Info,
+      return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
                                          Op2Info) +
              CommonCost;
     };
     return GetCostDiff(GetScalarCost, GetVectorCost);
   }
+  case Instruction::GetElementPtr: {
+    return CommonCost + GetGEPCostDiff(VL, VL0);
+  }
   case Instruction::Load: {
     auto GetScalarCost = [=](unsigned Idx) {
       auto *VI = cast<LoadInst>(VL[Idx]);
-      InstructionCost GEPCost = 0;
-      if (VI != VL0) {
-        auto *Ptr = dyn_cast<GetElementPtrInst>(VI->getPointerOperand());
-        if (Ptr && Ptr->hasOneUse() && !Ptr->hasAllConstantIndices())
-          GEPCost = TTI->getArithmeticInstrCost(Instruction::Add,
-                                                Ptr->getType(), CostKind);
-      }
-      return GEPCost +
-             TTI->getMemoryOpCost(Instruction::Load, ScalarTy, VI->getAlign(),
+      return TTI->getMemoryOpCost(Instruction::Load, ScalarTy, VI->getAlign(),
                                   VI->getPointerAddressSpace(), CostKind,
                                   TTI::OperandValueInfo(), VI);
     };
+    auto *LI0 = cast<LoadInst>(VL0);
     auto GetVectorCost = [=](InstructionCost CommonCost) {
-      auto *LI0 = cast<LoadInst>(VL0);
       InstructionCost VecLdCost;
       if (E->State == TreeEntry::Vectorize) {
         VecLdCost = TTI->getMemoryOpCost(
@@ -7218,34 +7241,46 @@
       }
       return VecLdCost + CommonCost;
     };
-    return GetCostDiff(GetScalarCost, GetVectorCost);
+
+    InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
+    // If this node generates masked gather load then it is not a terminal node.
+    // Hence address operand cost is estimated separately.
+    if (E->State == TreeEntry::ScatterVectorize)
+      return Cost;
+
+    // Estimate cost of GEPs since this tree node is a terminator.
+    SmallVector<Value *> PointerOps(VL.size());
+    for (auto [I, V] : enumerate(VL))
+      PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
+    return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
   }
   case Instruction::Store: {
     bool IsReorder = !E->ReorderIndices.empty();
-    auto *SI = cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
     auto GetScalarCost = [=](unsigned Idx) {
       auto *VI = cast<StoreInst>(VL[Idx]);
-      InstructionCost GEPCost = 0;
-      if (VI != SI) {
-        auto *Ptr = dyn_cast<GetElementPtrInst>(VI->getPointerOperand());
-        if (Ptr && Ptr->hasOneUse() && !Ptr->hasAllConstantIndices())
-          GEPCost = TTI->getArithmeticInstrCost(Instruction::Add,
-                                                Ptr->getType(), CostKind);
-      }
       TTI::OperandValueInfo OpInfo = getOperandInfo(VI, 0);
-      return GEPCost + TTI->getMemoryOpCost(
-                           Instruction::Store, ScalarTy, VI->getAlign(),
-                           VI->getPointerAddressSpace(), CostKind, OpInfo, VI);
+      return TTI->getMemoryOpCost(Instruction::Store, ScalarTy, VI->getAlign(),
+                                  VI->getPointerAddressSpace(), CostKind,
+                                  OpInfo, VI);
     };
+    auto *BaseSI =
+        cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
     auto GetVectorCost = [=](InstructionCost CommonCost) {
       // We know that we can merge the stores. Calculate the cost.
       TTI::OperandValueInfo OpInfo = getOperandInfo(VL, 0);
-      return TTI->getMemoryOpCost(Instruction::Store, VecTy, SI->getAlign(),
-                                  SI->getPointerAddressSpace(), CostKind,
+      return TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
+                                  BaseSI->getPointerAddressSpace(), CostKind,
                                   OpInfo) +
              CommonCost;
     };
-    return GetCostDiff(GetScalarCost, GetVectorCost);
+    SmallVector<Value *> PointerOps(VL.size());
+    for (auto [I, V] : enumerate(VL)) {
+      unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
+      PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
+    }
+
+    return GetCostDiff(GetScalarCost, GetVectorCost) +
+           GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
   }
   case Instruction::Call: {
     auto GetScalarCost = [=](unsigned Idx) {
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=slp-vectorizer -S -o - -mtriple=x86_64-unknown-linux -mcpu=haswell < %s | FileCheck %s
+; RUN: opt -passes=slp-vectorizer -S -o - -mtriple=x86_64-unknown-linux -mcpu=haswell -slp-threshold=-3 < %s | FileCheck %s
 @e = dso_local local_unnamed_addr global i32 0, align 4
 @f = dso_local local_unnamed_addr global i32 0, align 4
 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
@@ -22,7 +22,7 @@
   ; YAML-NEXT: Function:        test
   ; YAML-NEXT: Args:
   ; YAML-NEXT:   - String:          'Vectorized horizontal reduction with cost '
-  ; YAML-NEXT:   - Cost:            '-17'
+  ; YAML-NEXT:   - Cost:            '-3'
   ; YAML-NEXT:   - String:          ' and with tree size '
   ; YAML-NEXT:   - TreeSize:        '7'
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes=slp-vectorizer < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-12 | FileCheck %s
+; RUN: opt -S -passes=slp-vectorizer < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-13 | FileCheck %s
 
 define void @test(i1 %c, ptr %arg) {
 ; CHECK-LABEL: @test(