Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4443,19 +4443,45 @@
   unsigned Sz = R.getVectorElementSize(I0);
   unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
   unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
-  if (MaxVF < 2)
-    return false;
+  if (MaxVF < 2) {
+     R.getORE()->emit(
+         OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
+         << "Cannot SLP vectorize list: vectorization factor "
+         << "less than 2 is non-sense");
+     return false;
+  }
 
   for (Value *V : VL) {
     Type *Ty = V->getType();
-    if (!isValidElementType(Ty))
+    if (!isValidElementType(Ty)) {
+      std::string type_str;
+      llvm::raw_string_ostream rso(type_str);
+      Ty->print(rso);
+      // NOTE: this prints the LLVM IR type name, which may mean little to end users.
+      R.getORE()->emit(
+          OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
+          << "Cannot SLP vectorize list: type "
+          << rso.str() + " is unsupported by vectorizer");
       return false;
+    }
     Instruction *Inst = dyn_cast<Instruction>(V);
-    if (!Inst || Inst->getOpcode() != Opcode0)
+
+    if (!Inst)
+        return false;
+    if (Inst->getOpcode() != Opcode0) {
+      R.getORE()->emit(
+          OptimizationRemarkMissed(SV_NAME, "InequableTypes", I0)
+          << "Cannot SLP vectorize list: not all of the "
+          << "parts of scalar instructions are of the same type: "
+          << ore::NV("Instruction1Opcode", I0) << " and "
+          << ore::NV("Instruction2Opcode", Inst));
       return false;
+    }
   }
 
   bool Changed = false;
+  bool CandidateFound = false;
+  int MinCost = SLPCostThreshold;
 
   // Keep track of values that were deleted by vectorizing in the loop below.
   SmallVector<WeakTrackingVH, 8> TrackValues(VL.begin(), VL.end());
@@ -4509,14 +4535,17 @@
 
       R.computeMinimumValueSizes();
       int Cost = R.getTreeCost();
+      CandidateFound = true;
+      MinCost = std::min(MinCost, Cost);
 
       if (Cost < -SLPCostThreshold) {
         DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
-        R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
-                                            cast<Instruction>(Ops[0]))
-                         << "SLP vectorized with cost " << ore::NV("Cost", Cost)
-                         << " and with tree size "
-                         << ore::NV("TreeSize", R.getTreeSize()));
+        R.getORE()->emit(
+            OptimizationRemark(SV_NAME, "VectorizedList",
+                cast<Instruction>(Ops[0]))
+            << "SLP vectorized with cost " << ore::NV("Cost", Cost)
+            << " and with tree size "
+            << ore::NV("TreeSize", R.getTreeSize()));
 
         Value *VectorizedRoot = R.vectorizeTree();
 
@@ -4551,6 +4580,18 @@
     }
   }
 
+  if (!Changed && CandidateFound) {
+    R.getORE()->emit(
+        OptimizationRemarkMissed(SV_NAME, "NotBeneficial",  I0)
+        << "List vectorization was possible but not beneficial with cost "
+        << ore::NV("Cost", MinCost) << " >= "
+        << ore::NV("Treshold", -SLPCostThreshold));
+  } else if (!Changed) {
+    R.getORE()->emit(
+        OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
+        << "Cannot SLP vectorize list: vectorization was impossible"
+        << " with available vectorization factors");
+  }
   return Changed;
 }
 
@@ -5251,6 +5292,7 @@
         SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend());
         V.buildTree(Reversed, ExternallyUsedValues, IgnoreList);
       }
+
       if (V.isTreeTinyAndNotFullyVectorizable())
         break;
 
@@ -5259,14 +5301,21 @@
       // Estimate cost.
       int Cost =
           V.getTreeCost() + getReductionCost(TTI, ReducedVals[i], ReduxWidth);
-      if (Cost >= -SLPCostThreshold)
+      if (Cost >= -SLPCostThreshold) {
+          V.getORE()->emit(
+              OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
+                  cast<Instruction>(VL[0]))
+              << "Vectorizing horizontal reduction is possible"
+              << "but not beneficial with cost "
+              << ore::NV("Cost", Cost));
         break;
+      }
 
       DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
                    << ". (HorRdx)\n");
-      auto *I0 = cast<Instruction>(VL[0]);
       V.getORE()->emit(
-          OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", I0)
+          OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
+              cast<Instruction>(VL[0]))
           << "Vectorized horizontal reduction with cost "
           << ore::NV("Cost", Cost) << " and with tree size "
           << ore::NV("TreeSize", V.getTreeSize()));
Index: test/Transforms/SLPVectorizer/X86/remark_horcost.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/X86/remark_horcost.ll
@@ -0,0 +1,76 @@
+; RUN: opt -S -mtriple=x86_64-pc-linux-gnu -mcpu=generic -slp-vectorizer -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: FileCheck --input-file=%t --check-prefix=YAML %s
+; @foo: per iteration i, add diff[8*i+k] and diff[8*i+k+4] for k=0..3, store each sum to m2[i][k], and accumulate all sums.
+define i32 @foo(i32* %diff) #0 {
+entry:
+  %m2 = alloca [8 x [8 x i32]], align 16
+  %0 = bitcast [8 x [8 x i32]]* %m2 to i8*
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %a.088 = phi i32 [ 0, %entry ], [ %add52, %for.body ]
+  %1 = shl i64 %indvars.iv, 3
+  %arrayidx = getelementptr inbounds i32, i32* %diff, i64 %1
+  %2 = load i32, i32* %arrayidx, align 4
+  %3 = or i64 %1, 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %diff, i64 %3
+  %4 = load i32, i32* %arrayidx2, align 4
+  %add3 = add nsw i32 %4, %2
+  %arrayidx6 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 0
+  store i32 %add3, i32* %arrayidx6, align 16
+  ; Fold lane 0 into the running sum, then form lane 1 (diff[8*i+1] + diff[8*i+5]).
+  %add10 = add nsw i32 %add3, %a.088
+  %5 = or i64 %1, 1
+  %arrayidx13 = getelementptr inbounds i32, i32* %diff, i64 %5
+  %6 = load i32, i32* %arrayidx13, align 4
+  %7 = or i64 %1, 5
+  %arrayidx16 = getelementptr inbounds i32, i32* %diff, i64 %7
+  %8 = load i32, i32* %arrayidx16, align 4
+  %add17 = add nsw i32 %8, %6
+  %arrayidx20 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 1
+  store i32 %add17, i32* %arrayidx20, align 4
+  ; Fold lane 1 into the running sum, then form lane 2 (diff[8*i+2] + diff[8*i+6]).
+  %add24 = add nsw i32 %add10, %add17
+  %9 = or i64 %1, 2
+  %arrayidx27 = getelementptr inbounds i32, i32* %diff, i64 %9
+  %10 = load i32, i32* %arrayidx27, align 4
+  %11 = or i64 %1, 6
+  %arrayidx30 = getelementptr inbounds i32, i32* %diff, i64 %11
+  %12 = load i32, i32* %arrayidx30, align 4
+  %add31 = add nsw i32 %12, %10
+  %arrayidx34 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 2
+  store i32 %add31, i32* %arrayidx34, align 8
+  ; Fold lane 2 into the running sum, then form lane 3 (diff[8*i+3] + diff[8*i+7]).
+  %add38 = add nsw i32 %add24, %add31
+  %13 = or i64 %1, 3
+  %arrayidx41 = getelementptr inbounds i32, i32* %diff, i64 %13
+  %14 = load i32, i32* %arrayidx41, align 4
+  %15 = or i64 %1, 7
+  %arrayidx44 = getelementptr inbounds i32, i32* %diff, i64 %15
+  %16 = load i32, i32* %arrayidx44, align 4
+  
+  %add45 = add nsw i32 %16, %14
+  %arrayidx48 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 3
+  store i32 %add45, i32* %arrayidx48, align 4
+  ; Fold lane 3 into the running sum; %add52 feeds the %a.088 phi and is the return value.
+  %add52 = add nsw i32 %add38, %add45
+ ; CHECK: add nsw <{{[0-9]+}} x i32>
+ ; CHECK-NOT: add nsw <{{[0-9]+}} x i32>
+ ; Expected: one vector add nsw and no further ones (above); a HorSLPNotBeneficial missed remark with cost 1 (below).
+ ; YAML:      --- !Missed
+ ; YAML-NEXT: Pass:            slp-vectorizer
+ ; YAML-NEXT: Name:            HorSLPNotBeneficial
+ ; YAML-NEXT: Function:        foo
+ ; YAML-NEXT: Args:
+ ; YAML-NEXT:   - String:          Vectorizing horizontal reduction is possible
+ ; YAML-NEXT:   - String:          'but not beneficial with cost ' 
+ ; YAML-NEXT:   - Cost:            '1'
+
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 8
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret i32 %add52
+}
Index: test/Transforms/SLPVectorizer/X86/remark_listcost.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/X86/remark_listcost.ll
@@ -0,0 +1,43 @@
+; RUN: opt -S -mtriple=x86_64-pc-linux-gnu -mcpu=generic -slp-vectorizer -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: FileCheck --input-file=%t --check-prefix=YAML %s
+; @vsub2_test: scalar subtract loop; expected to stay scalar and yield a NotBeneficial missed remark (cost 0, threshold 0).
+define void @vsub2_test(i32* %pin1, i32* %pin2, i32* %pout) #0 {
+  br label %1
+  ; Loop body (implicit label %1): store *pin1 - *pin2 to *pout, then advance all three pointers by one i32.
+  %idx.04 = phi i32 [ 0, %0 ], [ %8, %1 ]
+  %po.03 = phi i32* [ %pout, %0 ], [ %7, %1 ]
+  %ptmpi2.02 = phi i32* [ %pin2, %0 ], [ %4, %1 ]
+  %ptmpi1.01 = phi i32* [ %pin1, %0 ], [ %2, %1 ]
+  %2 = getelementptr inbounds i32, i32* %ptmpi1.01, i64 1
+  %3 = load i32, i32* %ptmpi1.01, align 4, !tbaa !1
+  %4 = getelementptr inbounds i32, i32* %ptmpi2.02, i64 1
+  %5 = load i32, i32* %ptmpi2.02, align 4, !tbaa !1
+  %6 = sub nsw i32 %3, %5
+  %7 = getelementptr inbounds i32, i32* %po.03, i64 1
+ ; CHECK-NOT: <{{[0-9]+}} x i32>
+ ; YAML:      Pass:            slp-vectorizer
+ ; YAML-NEXT: Name:            NotBeneficial
+ ; YAML-NEXT: Function:        vsub2_test
+ ; YAML-NEXT: Args:
+ ; YAML-NEXT:   - String:          'List vectorization was possible but not beneficial with cost '
+ ; YAML-NEXT:   - Cost:            '0'
+ ; YAML-NEXT:   - String:          ' >= '
+ ; YAML-NEXT:   - Treshold:        '0'
+  store i32 %6, i32* %po.03, align 4, !tbaa !1
+  %8 = add nuw nsw i32 %idx.04, 1
+  %exitcond = icmp eq i32 %8, 64
+  br i1 %exitcond, label %9, label %1, !llvm.loop !5
+  ; Exit block (implicit label %9), reached once %8 equals 64.
+  ret void
+}
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 3.8.0-2ubuntu4 (tags/RELEASE_380/final)"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
+!5 = distinct !{!5, !6, !7}
+!6 = !{!"llvm.loop.vectorize.width", i32 1}
+!7 = !{!"llvm.loop.interleave.count", i32 1}
Index: test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll
@@ -0,0 +1,60 @@
+; RUN: opt -S -mtriple=x86_64-pc-linux-gnu -mcpu=generic -slp-vectorizer -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: FileCheck --input-file=%t --check-prefix=YAML %s
+; @foo: two-lane case: per iteration i, add diff[8*i+k] and diff[8*i+k+4] for k=0..1, store to m2[i][k], and accumulate.
+define i32 @foo(i32* nocapture readonly %diff) #0 {
+entry:
+  %m2 = alloca [8 x [8 x i32]], align 16
+  %0 = bitcast [8 x [8 x i32]]* %m2 to i8*
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %a.088 = phi i32 [ 0, %entry ], [ %add24, %for.body ]
+  %1 = shl i64 %indvars.iv, 3
+  %arrayidx = getelementptr inbounds i32, i32* %diff, i64 %1
+  %2 = load i32, i32* %arrayidx, align 4
+  %3 = or i64 %1, 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %diff, i64 %3
+  %4 = load i32, i32* %arrayidx2, align 4
+  %add3 = add nsw i32 %4, %2
+  %arrayidx6 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 0
+  store i32 %add3, i32* %arrayidx6, align 16
+  %add10 = add nsw i32 %add3, %a.088
+  %5 = or i64 %1, 1
+  %arrayidx13 = getelementptr inbounds i32, i32* %diff, i64 %5
+  %6 = load i32, i32* %arrayidx13, align 4
+  %7 = or i64 %1, 5
+  %arrayidx16 = getelementptr inbounds i32, i32* %diff, i64 %7
+  %8 = load i32, i32* %arrayidx16, align 4
+  %add17 = add nsw i32 %8, %6
+  %arrayidx20 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 1
+  store i32 %add17, i32* %arrayidx20, align 4
+  %add24 = add nsw i32 %add10, %add17
+ ; Expected: no vector add nsw, an InequableTypes missed remark (add vs. phi), and a NotPossible missed remark.
+ ; CHECK-NOT: add nsw <{{[0-9]+}} x i32> 
+ ; YAML:      Pass:            slp-vectorizer
+ ; YAML-NEXT: Name:            InequableTypes
+ ; YAML-NEXT: Function:        foo
+ ; YAML-NEXT: Args:
+ ; YAML-NEXT:   - String:          'Cannot SLP vectorize list: not all of the '
+ ; YAML-NEXT:   - String:          'parts of scalar instructions are of the same type: '
+ ; YAML-NEXT:   - Instruction1Opcode: add
+ ; YAML-NEXT:   - String:          ' and '
+ ; YAML-NEXT:   - Instruction2Opcode: phi
+
+ ; YAML:      Pass:            slp-vectorizer
+ ; YAML-NEXT: Name:            NotPossible
+ ; YAML-NEXT: Function:        foo
+ ; YAML-NEXT: Args:
+ ; YAML-NEXT:   - String:          'Cannot SLP vectorize list: vectorization was impossible'
+ ; YAML-NEXT:   - String:          ' with available vectorization factors'
+
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 8
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  %arraydecay = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 0
+  ret i32 %add24
+}
+
Index: test/Transforms/SLPVectorizer/X86/remark_unsupported.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/X86/remark_unsupported.ll
@@ -0,0 +1,34 @@
+; RUN: opt -S -mtriple=x86_64-pc-linux-gnu -mcpu=generic -slp-vectorizer -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: FileCheck --input-file=%t --check-prefix=YAML %s
+
+; x86_fp80 is not a valid element type for the SLP vectorizer, so the paired phis/stores below are expected to stay scalar.
+define void @test(x86_fp80* %i1, x86_fp80* %i2, x86_fp80* %o) {
+  ; %entry loads two adjacent x86_fp80 values from %i1; %then loads the matching pair from %i2.
+entry:
+  %i1.0 = load x86_fp80, x86_fp80* %i1, align 16
+  %i1.gep1 = getelementptr x86_fp80, x86_fp80* %i1, i64 1
+  %i1.1 = load x86_fp80, x86_fp80* %i1.gep1, align 16
+  br i1 undef, label %then, label %end
+then:
+  %i2.gep0 = getelementptr inbounds x86_fp80, x86_fp80* %i2, i64 0
+  %i2.0 = load x86_fp80, x86_fp80* %i2.gep0, align 16
+  %i2.gep1 = getelementptr inbounds x86_fp80, x86_fp80* %i2, i64 1
+  %i2.1 = load x86_fp80, x86_fp80* %i2.gep1, align 16
+  br label %end
+end:
+  %phi0 = phi x86_fp80 [ %i1.0, %entry ], [ %i2.0, %then ]
+
+  %phi1 = phi x86_fp80 [ %i1.1, %entry ], [ %i2.1, %then ]
+  store x86_fp80 %phi0, x86_fp80* %o, align 16
+  %o.gep1 = getelementptr inbounds x86_fp80, x86_fp80* %o, i64 1
+  store x86_fp80 %phi1, x86_fp80* %o.gep1, align 16
+ ; CHECK-NOT: <{{[0-9]+}} x x86_fp80>
+ ; YAML:      Pass:            slp-vectorizer
+ ; YAML-NEXT: Name:            UnsupportedType
+ ; YAML-NEXT: Function:        test
+ ; YAML-NEXT: Args:
+ ; YAML-NEXT:   - String:          'Cannot SLP vectorize list: type '
+ ; YAML-NEXT:   - String:          x86_fp80 is unsupported by vectorizer
+
+  ret void
+}