diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -70,9 +70,11 @@
 #include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
 #include "llvm/IR/ValueHandle.h"
+#include "llvm/Support/FormattedStream.h"
 #ifdef EXPENSIVE_CHECKS
 #include "llvm/IR/Verifier.h"
 #endif
+#include "llvm/IR/AssemblyAnnotationWriter.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
@@ -117,6 +119,10 @@
                      cl::desc("Only vectorize if you gain more than this "
                               "number "));
 
+static cl::opt<bool>
+    SLPDumpIRWithCosts("slp-dump-ir-with-costs", cl::init(false), cl::Hidden,
+                       cl::desc("Dump IR annotated with SLP tree costs"));
+
 static cl::opt<bool>
     ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
                        cl::desc("Attempt to vectorize horizontal reductions"));
@@ -2044,6 +2050,16 @@
 
   ~BoUpSLP();
 
+#ifndef NDEBUG
+  llvm::Optional<InstructionCost>
+  getVectorizedInstrCost(const Instruction *I) const {
+    auto It = VectorizedInstrCostMap.find(I);
+    return It != VectorizedInstrCostMap.end()
+               ? llvm::Optional<InstructionCost>(It->second)
+               : None;
+  }
+#endif
+
 private:
   /// Check if the operands on the edges \p Edges of the \p UserTE allows
   /// reordering (i.e. the operands can be reordered because they have only one
@@ -2246,6 +2262,9 @@
     /// have multiple users so the data structure is not truly a tree.
     SmallVector<EdgeInfo, 1> UserTreeIndices;
 
+    /// The cost (vector minus scalar) of this entry if we decide to
+    /// vectorize it.
+    InstructionCost Cost;
+
     /// The index of this treeEntry in VectorizableTree.
     int Idx = -1;
 
@@ -2373,6 +2392,10 @@
       return FoundLane;
     }
 
+    void setCost(InstructionCost Cost) { this->Cost = Cost; }
+
+    InstructionCost getCost() const { return Cost; }
+
 #ifndef NDEBUG
     /// Debug printer.
     LLVM_DUMP_METHOD void dump() const {
@@ -2397,6 +2420,9 @@
         dbgs() << "NeedToGather\n";
         break;
       }
+      dbgs() << "Cost: "
+             << (Cost.getValue() ? std::to_string(*Cost.getValue()) : "-")
+             << "\n";
       dbgs() << "MainOp: ";
       if (MainOp)
         dbgs() << *MainOp << "\n";
@@ -2549,6 +2575,12 @@
   /// A list of scalars that we found that we need to keep as scalars.
   ValueSet MustGather;
 
+#ifndef NDEBUG
+  /// Helper map for holding the cost of vectorized instructions. Used only in
+  /// lit tests with -slp-dump-ir-with-costs.
+  SmallDenseMap<const Instruction *, InstructionCost> VectorizedInstrCostMap;
+#endif
+
   /// This POD struct describes one external user in the vectorized tree.
   struct ExternalUser {
     ExternalUser(Value *S, llvm::User *U, int L)
@@ -6188,6 +6220,7 @@
     TreeEntry &TE = *VectorizableTree[I];
 
     InstructionCost C = getEntryCost(&TE, VectorizedVals);
+    TE.setCost(C);
     Cost += C;
     LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                       << " for bundle that starts with " << *TE.Scalars[0]
@@ -6905,6 +6938,11 @@
     ShuffleBuilder.addMask(ReuseShuffleIndicies);
     Vec = ShuffleBuilder.finalize(Vec);
   }
+#ifndef NDEBUG
+  if (SLPDumpIRWithCosts)
+    if (Instruction *I = dyn_cast<Instruction>(Vec))
+      VectorizedInstrCostMap[I] = getGatherCost(VL);
+#endif
   return Vec;
 }
@@ -7495,7 +7533,18 @@
 
 Value *BoUpSLP::vectorizeTree() {
   ExtraValueToDebugLocsMap ExternallyUsedValues;
-  return vectorizeTree(ExternallyUsedValues);
+  Value *V = vectorizeTree(ExternallyUsedValues);
+#ifndef NDEBUG
+  if (SLPDumpIRWithCosts) {
+    // Collect the cost of all vectorized instructions.
+    for (const auto &TE : VectorizableTree) {
+      if (Value *VectorizedValue = TE->VectorizedValue)
+        if (Instruction *I = dyn_cast<Instruction>(VectorizedValue))
+          VectorizedInstrCostMap[I] = TE->getCost();
+    }
+  }
+#endif
+  return V;
 }
 
 Value *
@@ -8830,6 +8879,24 @@
     R.optimizeGatherSequence();
     LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
   }
+
+  if (SLPDumpIRWithCosts) {
+    /// Helper class for annotating vector instructions with their entry cost.
+    class SLPAssemblyAnnotationWriter : public AssemblyAnnotationWriter {
+      BoUpSLP *R;
+
+    public:
+      SLPAssemblyAnnotationWriter(BoUpSLP *R) : R(R) {}
+      void printInfoComment(const Value &V,
+                            formatted_raw_ostream &OS) override {
+        if (const Instruction *I = dyn_cast<Instruction>(&V))
+          if (Optional<InstructionCost> CostOpt = R->getVectorizedInstrCost(I))
+            OS << " Cost:" << CostOpt->getValue();
+      }
+    };
+    SLPAssemblyAnnotationWriter SLPAAW(&R);
+    F.print(outs(), &SLPAAW, true, true);
+  }
   return Changed;
 }
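Note on the mechanism: AssemblyAnnotationWriter::printInfoComment is invoked by LLVM's AsmWriter after each value it prints, so whatever the override writes lands at the end of that value's line; that is how the Cost: markers end up inline in the test below. A minimal standalone sketch of the same hook, independent of this patch, with a hypothetical constant annotation in place of the pass's cost-map lookup:

// annotate.cpp - sketch only; compile against LLVM, e.g.:
//   clang++ annotate.cpp $(llvm-config --cxxflags --ldflags --libs core asmparser support)
#include "llvm/AsmParser/Parser.h"
#include "llvm/IR/AssemblyAnnotationWriter.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

namespace {
struct CostAnnotator : public AssemblyAnnotationWriter {
  // Called once per printed value; output is appended to that value's line.
  void printInfoComment(const Value &V, formatted_raw_ostream &OS) override {
    if (isa<Instruction>(&V))
      OS << " ; Cost:42"; // hypothetical constant; the pass looks up a map here
  }
};
} // namespace

int main() {
  LLVMContext Ctx;
  SMDiagnostic Err;
  std::unique_ptr<Module> M = parseAssemblyString("define i32 @f(i32 %a) {\n"
                                                  "  %r = add i32 %a, 1\n"
                                                  "  ret i32 %r\n"
                                                  "}\n",
                                                  Err, Ctx);
  if (!M) {
    Err.print("annotate", errs());
    return 1;
  }
  CostAnnotator AAW;
  M->print(outs(), &AAW); // every instruction line gains " ; Cost:42"
  return 0;
}

Printing through outs() rather than dbgs() is what lets the lit test below pipe the annotated IR straight into FileCheck.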
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/broadcast.ll b/llvm/test/Transforms/SLPVectorizer/X86/broadcast.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/broadcast.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/broadcast.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -slp-threshold=-999 < %s | FileCheck %s
+; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -slp-threshold=-999 -disable-output -slp-dump-ir-with-costs < %s | FileCheck %s
 
 ; S[0] = %v1 + %v2
@@ -18,13 +18,24 @@
 ; CHECK-NEXT:    [[V1:%.*]] = sub i64 [[A0]], 1
 ; CHECK-NEXT:    [[V2:%.*]] = sub i64 [[B0]], 1
 ; CHECK-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds i64, i64* [[S:%.*]], i64 0
+; CHECK-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds i64, i64* [[S]], i64 1
+; CHECK-NEXT:    [[IDXS2:%.*]] = getelementptr inbounds i64, i64* [[S]], i64 2
+; CHECK-NEXT:    [[IDXS3:%.*]] = getelementptr inbounds i64, i64* [[S]], i64 3
+; CHECK-NEXT:    [[ADD3:%.*]] = add i64 [[V1]], [[V2]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[V1]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <4 x i32> zeroinitializer Cost:5
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i64> poison, i64 [[V2]], i32 0
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i64> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <4 x i32> zeroinitializer Cost:5
+; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i64> [[SHUFFLE]], [[SHUFFLE1]] Cost:0
+; CHECK-NEXT:    [[ADD2:%.*]] = add i64 [[V2]], [[V1]]
+; CHECK-NEXT:    [[ADD1:%.*]] = add i64 [[V2]], [[V1]]
+; CHECK-NEXT:    [[ADD0:%.*]] = add i64 [[V1]], [[V2]]
+; CHECK-NEXT:    store i64 [[ADD3]], i64* [[IDXS3]], align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[IDXS0]] to <4 x i64>*
-; CHECK-NEXT:    store <4 x i64> [[TMP2]], <4 x i64>* [[TMP3]], align 8
+; CHECK-NEXT:    store <4 x i64> [[TMP2]], <4 x i64>* [[TMP3]], align 8 Cost:-2
+; CHECK-NEXT:    store i64 [[ADD2]], i64* [[IDXS2]], align 8
+; CHECK-NEXT:    store i64 [[ADD1]], i64* [[IDXS1]], align 8
+; CHECK-NEXT:    store i64 [[ADD0]], i64* [[IDXS0]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -65,20 +76,35 @@
 ; CHECK-NEXT:    [[A0:%.*]] = load i16, i16* [[A:%.*]], align 8
 ; CHECK-NEXT:    [[V1:%.*]] = sext i16 [[A0]] to i32
 ; CHECK-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds i32, i32* [[S:%.*]], i64 0
+; CHECK-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 1
+; CHECK-NEXT:    [[IDXS2:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 2
+; CHECK-NEXT:    [[IDXS3:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 3
 ; CHECK-NEXT:    [[B0:%.*]] = load i16, i16* [[B:%.*]], align 8
 ; CHECK-NEXT:    [[C0:%.*]] = load i16, i16* [[C:%.*]], align 8
 ; CHECK-NEXT:    [[D0:%.*]] = load i16, i16* [[D:%.*]], align 8
 ; CHECK-NEXT:    [[E0:%.*]] = load i16, i16* [[E:%.*]], align 8
+; CHECK-NEXT:    [[V4:%.*]] = sext i16 [[D0]] to i32
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[B0]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> [[TMP0]], i16 [[C0]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[E0]], i32 2
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[D0]], i32 3
-; CHECK-NEXT:    [[TMP4:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[D0]], i32 3 Cost:4
+; CHECK-NEXT:    [[TMP4:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32> Cost:1
+; CHECK-NEXT:    [[V5:%.*]] = sext i16 [[E0]] to i32
+; CHECK-NEXT:    [[V3:%.*]] = sext i16 [[C0]] to i32
+; CHECK-NEXT:    [[V2:%.*]] = sext i16 [[B0]] to i32
+; CHECK-NEXT:    [[ADD3:%.*]] = add i32 [[V1]], [[V4]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[V1]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[SHUFFLE]], [[TMP4]]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> zeroinitializer Cost:2
+; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[SHUFFLE]], [[TMP4]] Cost:-3
+; CHECK-NEXT:    [[ADD2:%.*]] = add i32 [[V5]], [[V1]]
+; CHECK-NEXT:    [[ADD1:%.*]] = add i32 [[V3]], [[V1]]
+; CHECK-NEXT:    [[ADD0:%.*]] = add i32 [[V1]], [[V2]]
+; CHECK-NEXT:    store i32 [[ADD3]], i32* [[IDXS3]], align 8
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[IDXS0]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* [[TMP7]], align 8
+; CHECK-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* [[TMP7]], align 8 Cost:-3
+; CHECK-NEXT:    store i32 [[ADD2]], i32* [[IDXS2]], align 8
+; CHECK-NEXT:    store i32 [[ADD1]], i32* [[IDXS1]], align 8
+; CHECK-NEXT:    store i32 [[ADD0]], i32* [[IDXS0]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
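Note on the updated RUN line: everything added to the pass is guarded by #ifndef NDEBUG, so -slp-dump-ir-with-costs only exists in assertion-enabled builds, and the test presupposes one. With -disable-output, opt suppresses its usual module printing, so the pass's own F.print(outs(), ...) call is the only IR that reaches FileCheck; it runs before the now-dead scalar code is cleaned up, which is presumably why the original scalar adds and stores still appear in the CHECK lines interleaved with the vector instructions, with Cost: annotations only on the values the SLP tree produced. The invocation mirrors the RUN line (input.ll is a placeholder):

  opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux -mcpu=corei7-avx \
      -slp-threshold=-999 -disable-output -slp-dump-ir-with-costs input.ll

The sign follows the entry-cost convention in the patch (vector minus scalar), so a negative value such as Cost:-3 on the vector store marks a bundle that is cheaper vectorized than left scalar.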