Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4433,19 +4433,41 @@
   unsigned Sz = R.getVectorElementSize(I0);
   unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
   unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
-  if (MaxVF < 2)
-    return false;
+  if (MaxVF < 2) {
+    R.getORE()->emit(OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
+                     << "Cannot SLP vectorize list: vectorization factor "
+                     << "less than 2 is not supported");
+    return false;
+  }
 
   for (Value *V : VL) {
     Type *Ty = V->getType();
-    if (!isValidElementType(Ty))
+    if (!isValidElementType(Ty)) {
+      std::string type_str;
+      llvm::raw_string_ostream rso(type_str);
+      Ty->print(rso);
+      // NOTE: this prints the internal LLVM type name, which may not be useful
+      R.getORE()->emit(OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
+                       << "Cannot SLP vectorize list: type "
+                       << rso.str() << " is unsupported by vectorizer");
       return false;
+    }
     Instruction *Inst = dyn_cast<Instruction>(V);
-    if (!Inst || Inst->getOpcode() != Opcode0)
+
+    if (!Inst)
+      return false;
+    if (Inst->getOpcode() != Opcode0) {
+      // FIXME: the check compares opcodes, not types; reword name and message.
+      R.getORE()->emit(OptimizationRemarkMissed(SV_NAME, "InequableTypes", I0)
+                       << "Cannot SLP vectorize list: not all of the "
+                       << "parts of scalar instructions are of the same type");
       return false;
+    }
   }
 
   bool Changed = false;
+  bool WasPossible = false;
+  int MinCost = SLPCostThreshold;
 
   // Keep track of values that were deleted by vectorizing in the loop below.
   SmallVector<WeakTrackingVH, 8> TrackValues(VL.begin(), VL.end());
@@ -4499,6 +4521,8 @@
 
       R.computeMinimumValueSizes();
       int Cost = R.getTreeCost();
+      MinCost = WasPossible ? std::min(MinCost, Cost) : Cost;
+      WasPossible = true; // MinCost now holds a real tree cost.
 
       if (Cost < -SLPCostThreshold) {
         DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
@@ -4541,6 +4565,16 @@
     }
   }
 
+  if (!Changed && WasPossible) {
+    R.getORE()->emit(OptimizationRemark(SV_NAME, "NotBeneficial", I0)
+                     << "List vectorization was possible but not beneficial "
+                     << "with cost " << ore::NV("Cost", MinCost) << " >= "
+                     << ore::NV("Threshold", -SLPCostThreshold));
+  } else if (!Changed) {
+    R.getORE()->emit(OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
+                     << "Cannot vectorize list: vectorization was impossible"
+                     << " with available vectorization factors");
+  }
   return Changed;
 }
 
@@ -5234,6 +5268,9 @@
         SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend());
         V.buildTree(Reversed, ExternallyUsedValues, IgnoreList);
       }
+
+      // Instruction used to anchor the optimization remarks emitted below.
+      auto *I0 = cast<Instruction>(VL[0]);
       if (V.isTreeTinyAndNotFullyVectorizable())
         break;
 
@@ -5242,12 +5279,16 @@
       // Estimate cost.
       int Cost =
           V.getTreeCost() + getReductionCost(TTI, ReducedVals[i], ReduxWidth);
-      if (Cost >= -SLPCostThreshold)
+      if (Cost >= -SLPCostThreshold) {
+        V.getORE()->emit(
+            OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial", I0)
+            << "Vectorizing horizontal reduction is possible but not "
+            << "beneficial with cost " << ore::NV("Cost", Cost));
         break;
+      }
 
       DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
                    << ". (HorRdx)\n");
-      auto *I0 = cast<Instruction>(VL[0]);
       V.getORE()->emit(
           OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", I0)
           << "Vectorized horizontal reduction with cost "
Index: test/Transforms/SLPVectorizer/X86/remark_horcost.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/X86/remark_horcost.ll
@@ -0,0 +1,64 @@
+; RUN: opt -S -mtriple=x86_64-pc-linux-gnu -mcpu=generic -slp-vectorizer -pass-remarks-missed=slp-vectorizer -o /dev/null < %s 2>&1 | FileCheck %s
+
+define i32 @foo(i32* %diff) #0 {
+entry:
+  %m2 = alloca [8 x [8 x i32]], align 16
+  %0 = bitcast [8 x [8 x i32]]* %m2 to i8*
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %a.088 = phi i32 [ 0, %entry ], [ %add52, %for.body ]
+  %1 = shl i64 %indvars.iv, 3
+  %arrayidx = getelementptr inbounds i32, i32* %diff, i64 %1
+  %2 = load i32, i32* %arrayidx, align 4
+  %3 = or i64 %1, 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %diff, i64 %3
+  %4 = load i32, i32* %arrayidx2, align 4
+  %add3 = add nsw i32 %4, %2
+  %arrayidx6 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 0
+  store i32 %add3, i32* %arrayidx6, align 16
+ 
+  %add10 = add nsw i32 %add3, %a.088
+  %5 = or i64 %1, 1
+  %arrayidx13 = getelementptr inbounds i32, i32* %diff, i64 %5
+  %6 = load i32, i32* %arrayidx13, align 4
+  %7 = or i64 %1, 5
+  %arrayidx16 = getelementptr inbounds i32, i32* %diff, i64 %7
+  %8 = load i32, i32* %arrayidx16, align 4
+  %add17 = add nsw i32 %8, %6
+  %arrayidx20 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 1
+  store i32 %add17, i32* %arrayidx20, align 4
+  
+  %add24 = add nsw i32 %add10, %add17
+  %9 = or i64 %1, 2
+  %arrayidx27 = getelementptr inbounds i32, i32* %diff, i64 %9
+  %10 = load i32, i32* %arrayidx27, align 4
+  %11 = or i64 %1, 6
+  %arrayidx30 = getelementptr inbounds i32, i32* %diff, i64 %11
+  %12 = load i32, i32* %arrayidx30, align 4
+  %add31 = add nsw i32 %12, %10
+  %arrayidx34 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 2
+  store i32 %add31, i32* %arrayidx34, align 8
+  
+  %add38 = add nsw i32 %add24, %add31
+  %13 = or i64 %1, 3
+  %arrayidx41 = getelementptr inbounds i32, i32* %diff, i64 %13
+  %14 = load i32, i32* %arrayidx41, align 4
+  %15 = or i64 %1, 7
+  %arrayidx44 = getelementptr inbounds i32, i32* %diff, i64 %15
+  %16 = load i32, i32* %arrayidx44, align 4
+  
+  %add45 = add nsw i32 %16, %14
+  %arrayidx48 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 3
+  store i32 %add45, i32* %arrayidx48, align 4
+  
+  %add52 = add nsw i32 %add38, %add45
+ ; CHECK: remark: {{.*}}: Vectorizing horizontal reduction is possible but not beneficial with cost {{[0-9]+}}  
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 8
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret i32 %add52
+}
Index: test/Transforms/SLPVectorizer/X86/remark_listcost.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/X86/remark_listcost.ll
@@ -0,0 +1,34 @@
+; RUN: opt -S -slp-vectorizer -mtriple=x86_64-pc-linux-gnu -mcpu=generic -pass-remarks=slp-vectorizer -o /dev/null < %s 2>&1 | FileCheck %s
+
+define void @vsub2_test(i32* %pin1, i32* %pin2, i32* %pout) #0 {
+  br label %1
+
+  %idx.04 = phi i32 [ 0, %0 ], [ %8, %1 ]
+  %po.03 = phi i32* [ %pout, %0 ], [ %7, %1 ]
+  %ptmpi2.02 = phi i32* [ %pin2, %0 ], [ %4, %1 ]
+  %ptmpi1.01 = phi i32* [ %pin1, %0 ], [ %2, %1 ]
+  %2 = getelementptr inbounds i32, i32* %ptmpi1.01, i64 1
+  %3 = load i32, i32* %ptmpi1.01, align 4, !tbaa !1
+  %4 = getelementptr inbounds i32, i32* %ptmpi2.02, i64 1
+  %5 = load i32, i32* %ptmpi2.02, align 4, !tbaa !1
+  %6 = sub nsw i32 %3, %5
+  %7 = getelementptr inbounds i32, i32* %po.03, i64 1
+; CHECK: remark: {{.*}}: List vectorization was possible but not beneficial with cost {{[0-9]+}} >= {{[0-9]+}}
+  store i32 %6, i32* %po.03, align 4, !tbaa !1
+  %8 = add nuw nsw i32 %idx.04, 1
+  %exitcond = icmp eq i32 %8, 64
+  br i1 %exitcond, label %9, label %1, !llvm.loop !5
+
+  ret void
+}
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 3.8.0-2ubuntu4 (tags/RELEASE_380/final)"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
+!5 = distinct !{!5, !6, !7}
+!6 = !{!"llvm.loop.vectorize.width", i32 1}
+!7 = !{!"llvm.loop.interleave.count", i32 1}
Index: test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll
@@ -0,0 +1,46 @@
+; RUN: opt -S -slp-vectorizer -mtriple=x86_64-pc-linux-gnu -mcpu=generic -pass-remarks-missed=slp-vectorizer -o /dev/null < %s 2>&1 | FileCheck %s
+
+; NOTE: This test was developed from X86/scheduling.ll
+define i32 @foo(i32* nocapture readonly %diff) #0 {
+entry:
+  %m2 = alloca [8 x [8 x i32]], align 16
+  %0 = bitcast [8 x [8 x i32]]* %m2 to i8*
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %a.088 = phi i32 [ 0, %entry ], [ %add24, %for.body ]
+  %1 = shl i64 %indvars.iv, 3
+  %arrayidx = getelementptr inbounds i32, i32* %diff, i64 %1
+  %2 = load i32, i32* %arrayidx, align 4
+  %3 = or i64 %1, 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %diff, i64 %3
+  %4 = load i32, i32* %arrayidx2, align 4
+  %add3 = add nsw i32 %4, %2
+  %arrayidx6 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 0
+  store i32 %add3, i32* %arrayidx6, align 16
+  %add10 = add nsw i32 %add3, %a.088
+  %5 = or i64 %1, 1
+  %arrayidx13 = getelementptr inbounds i32, i32* %diff, i64 %5
+  %6 = load i32, i32* %arrayidx13, align 4
+  %7 = or i64 %1, 5
+  %arrayidx16 = getelementptr inbounds i32, i32* %diff, i64 %7
+  %8 = load i32, i32* %arrayidx16, align 4
+  %add17 = add nsw i32 %8, %6
+  %arrayidx20 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 1
+  store i32 %add17, i32* %arrayidx20, align 4
+  %add24 = add nsw i32 %add10, %add17
+
+; CHECK: remark: {{.*}}: Cannot SLP vectorize list: not all of the parts of scalar instructions are of the same type
+; CHECK: remark: {{.*}}: Cannot vectorize list: vectorization was impossible with available vectorization factors
+
+
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 8
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  %arraydecay = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 0
+  ret i32 %add24
+}
+
Index: test/Transforms/SLPVectorizer/X86/remark_unsupported.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/X86/remark_unsupported.ll
@@ -0,0 +1,26 @@
+; RUN: opt -S -slp-vectorizer -mtriple=x86_64-pc-linux-gnu -mcpu=generic -pass-remarks-missed=slp-vectorizer -o /dev/null < %s 2>&1 | FileCheck %s
+
+; x86_fp80 is not a valid SLP vector element type, so a missed remark is expected.
+define void @test(x86_fp80* %i1, x86_fp80* %i2, x86_fp80* %o) {
+
+entry:
+  %i1.0 = load x86_fp80, x86_fp80* %i1, align 16
+  %i1.gep1 = getelementptr x86_fp80, x86_fp80* %i1, i64 1
+  %i1.1 = load x86_fp80, x86_fp80* %i1.gep1, align 16
+  br i1 undef, label %then, label %end
+then:
+  %i2.gep0 = getelementptr inbounds x86_fp80, x86_fp80* %i2, i64 0
+  %i2.0 = load x86_fp80, x86_fp80* %i2.gep0, align 16
+  %i2.gep1 = getelementptr inbounds x86_fp80, x86_fp80* %i2, i64 1
+  %i2.1 = load x86_fp80, x86_fp80* %i2.gep1, align 16
+  br label %end
+end:
+  %phi0 = phi x86_fp80 [ %i1.0, %entry ], [ %i2.0, %then ]
+
+  %phi1 = phi x86_fp80 [ %i1.1, %entry ], [ %i2.1, %then ]
+  store x86_fp80 %phi0, x86_fp80* %o, align 16
+  %o.gep1 = getelementptr inbounds x86_fp80, x86_fp80* %o, i64 1
+  store x86_fp80 %phi1, x86_fp80* %o.gep1, align 16
+; CHECK: remark: {{.*}}: Cannot SLP vectorize list: type x86_fp80 is unsupported by vectorizer
+  ret void
+}