Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2457,7 +2457,31 @@ unsigned BundleWidth = VectorizableTree[0].Scalars.size(); - for (TreeEntry &TE : VectorizableTree) { + for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { + TreeEntry &TE = VectorizableTree[I]; + + // We create duplicate tree entries for gather sequences that have multiple + // uses. However, we should only compute the cost of duplicate sequences + // once. For example, if we have a build vector (i.e., insertelement + // sequence) that is used by more than one vector instruction, we only need + // to compute the cost of the insertelement instructions once. The + // redundant instructions will be eliminated by CSE. + // + // We should consider not creating duplicate tree entries for gather + // sequences, and instead add additional edges to the tree representing + // their uses. Since such an approach results in fewer total entries, + // existing heuristics based on tree size may yield different results. 
+ // + if (TE.NeedToGather) { + auto It = + std::find_if(std::next(VectorizableTree.begin(), I + 1), + VectorizableTree.end(), [TE](TreeEntry &Entry) { + return Entry.NeedToGather && Entry.isSame(TE.Scalars); + }); + if (It != VectorizableTree.end()) + continue; + } + int C = getEntryCost(&TE); DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with " << *TE.Scalars[0] << ".\n"); Index: test/Transforms/SLPVectorizer/AArch64/gather-cost.ll =================================================================== --- /dev/null +++ test/Transforms/SLPVectorizer/AArch64/gather-cost.ll @@ -0,0 +1,52 @@ +; RUN: opt < %s -S -slp-vectorizer -instcombine -pass-remarks-output=%t | FileCheck %s +; RUN: cat %t | FileCheck -check-prefix=REMARK %s +; RUN: opt < %s -S -passes='slp-vectorizer,instcombine' -pass-remarks-output=%t | FileCheck %s +; RUN: cat %t | FileCheck -check-prefix=REMARK %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +; CHECK-LABEL: @gather_multiple_use( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 %a, i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 %b, i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 %d, i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i32> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = and <4 x i32> [[TMP5]], +; CHECK-NEXT: [[TMP7:%.*]] = mul nuw <4 x i32> [[TMP6]], +; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i32> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP9]]) +; CHECK-NEXT: ret i32 [[TMP10]] +; +; REMARK-LABEL: Function: gather_multiple_use +; REMARK: Args: +; REMARK-NEXT: - String: 'Vectorized horizontal reduction with cost ' +; REMARK-NEXT: - Cost: '-7' +; +define internal i32 
@gather_multiple_use(i32 %a, i32 %b, i32 %c, i32 %d) { + %tmp00 = lshr i32 %a, 15 + %tmp01 = and i32 %tmp00, 65537 + %tmp02 = mul nuw i32 %tmp01, 65535 + %tmp03 = add i32 %tmp02, %a + %tmp04 = xor i32 %tmp03, %tmp02 + %tmp05 = lshr i32 %c, 15 + %tmp06 = and i32 %tmp05, 65537 + %tmp07 = mul nuw i32 %tmp06, 65535 + %tmp08 = add i32 %tmp07, %c + %tmp09 = xor i32 %tmp08, %tmp07 + %tmp10 = lshr i32 %b, 15 + %tmp11 = and i32 %tmp10, 65537 + %tmp12 = mul nuw i32 %tmp11, 65535 + %tmp13 = add i32 %tmp12, %b + %tmp14 = xor i32 %tmp13, %tmp12 + %tmp15 = lshr i32 %d, 15 + %tmp16 = and i32 %tmp15, 65537 + %tmp17 = mul nuw i32 %tmp16, 65535 + %tmp18 = add i32 %tmp17, %d + %tmp19 = xor i32 %tmp18, %tmp17 + %tmp20 = add i32 %tmp09, %tmp04 + %tmp21 = add i32 %tmp20, %tmp14 + %tmp22 = add i32 %tmp21, %tmp19 + ret i32 %tmp22 +} Index: test/Transforms/SLPVectorizer/X86/blending-shuffle.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/blending-shuffle.ll +++ test/Transforms/SLPVectorizer/X86/blending-shuffle.ll @@ -141,15 +141,16 @@ ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: ; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3 +; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i8> [[X]], i32 1 +; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i8> [[X]], i32 2 ; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]] ; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]] -; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[X]], [[X]] -; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[X0X0]], [[X3X3]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = add i8 [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = sdiv i8 [[TMP2]], [[TMP5]] -; CHECK-NEXT: ret i8 [[TMP6]] +; CHECK-NEXT: [[X1X1:%.*]] = mul i8 [[X1]], [[X1]] +; CHECK-NEXT: [[X2X2:%.*]] = mul i8 [[X2]], [[X2]] +; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]] +; 
CHECK-NEXT: [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]] +; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret i8 [[TMP3]] ; %x0 = extractelement <4 x i8> %x, i32 0 br label %bb1