Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6175,6 +6175,9 @@
   float Cost = expectedCost(1).first;
   const float ScalarCost = Cost;
   unsigned Width = 1;
+  float MinVectCost = 0.0f;       // Lowest vector cost compared so far.
+  unsigned BestVectCostWidth = 0; // Width that produced MinVectCost.
+  unsigned Count = 0;             // Widths that reached the cost comparison.
   DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
 
   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
@@ -6198,10 +6201,19 @@
                  << " because it will not generate any vector instructions.\n");
       continue;
     }
+    ++Count;
     if (VectorCost < Cost) {
       Cost = VectorCost;
       Width = i;
     }
+    // Independently of the comparison above, remember the cheapest vector
+    // candidate so it can be reported in an analysis remark. The first
+    // candidate seeds the minimum; testing Count first also keeps MinVectCost
+    // from being read before it has been assigned.
+    if (Count == 1 || VectorCost < MinVectCost) {
+      MinVectCost = VectorCost;
+      BestVectCostWidth = i;
+    }
   }
 
   if (!EnableCondStoresVectorization && NumPredStores) {
@@ -6212,6 +6224,29 @@
     Cost = ScalarCost;
   }
 
+  // When MaxVF permits at least one vector width, report what the cost model
+  // saw: the cheapest vector cost and its width next to the scalar cost, or
+  // the fact that no width reached the cost comparison.
+  if (MaxVF >= 2) {
+    ORE->emit([&]() {
+      OptimizationRemarkAnalysis R(Hints->vectorizeAnalysisPassName(),
+                                   "VectorizationNotBeneficial",
+                                   TheLoop->getStartLoc(),
+                                   TheLoop->getHeader());
+      // MinVectCost and BestVectCostWidth are only assigned once a width has
+      // been costed, so they must not be read while Count is still zero.
+      if (Count == 0) {
+        R << "none of the width generate any vector instructions";
+        return R;
+      }
+      // Otherwise report the best candidate next to the scalar baseline.
+      R << "the best vector cost is : " << std::to_string(MinVectCost)
+        << " with vector width : " << std::to_string(BestVectCostWidth)
+        << ", Scalar loop costs: " << std::to_string(ScalarCost);
+      return R;
+    });
+  }
+
   DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
         << "LV: Vectorization seems to be not beneficial, "
         << "but was forced by a user.\n");
Index: test/Transforms/LoopVectorize/X86/print_cost.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/X86/print_cost.ll
@@ -0,0 +1,230 @@
+; RUN: opt < %s -loop-vectorize -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s
+; CHECK: remark: source.c:6:4: the best vector cost is : 74.625000 with vector width : 8, Scalar loop costs: 25.000000
+; CHECK: remark: source.c:16:6: none of the width generate any vector instructions
+
+; ModuleID = 'source.c'
+source_filename = "source.c"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: norecurse nounwind readonly uwtable. Test input: float reduction r += A[i] * B[C[i]] (B is indexed through a value loaded from C), loop body unrolled four times; the loop location (!8, source.c:6:4) is the one named by the first CHECK line.
+define dso_local float @foo(float* nocapture readonly %A, float* nocapture readonly %B, i32* nocapture readonly %C, i32 %N) local_unnamed_addr #0 !dbg !6 {
+entry:
+  br label %for.body, !dbg !8
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next.3, %for.body ]
+  %r.012 = phi float [ 0.000000e+00, %entry ], [ %add.3, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv, !dbg !9
+  %0 = load float, float* %arrayidx, align 4, !dbg !9, !tbaa !10
+  %arrayidx2 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv, !dbg !14
+  %1 = load i32, i32* %arrayidx2, align 4, !dbg !14, !tbaa !15
+  %idxprom3 = sext i32 %1 to i64, !dbg !17
+  %arrayidx4 = getelementptr inbounds float, float* %B, i64 %idxprom3, !dbg !17 ; address of B[C[i]]
+  %2 = load float, float* %arrayidx4, align 4, !dbg !17, !tbaa !10
+  %mul = fmul fast float %2, %0, !dbg !18
+  %add = fadd fast float %mul, %r.012, !dbg !19
+  %indvars.iv.next = or i64 %indvars.iv, 1, !dbg !20
+  %arrayidx.1 = getelementptr inbounds float, float* %A, i64 %indvars.iv.next, !dbg !9
+  %3 = load float, float* %arrayidx.1, align 4, !dbg !9, !tbaa !10
+  %arrayidx2.1 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv.next, !dbg !14
+  %4 = load i32, i32* %arrayidx2.1, align 4, !dbg !14, !tbaa !15
+  %idxprom3.1 = sext i32 %4 to i64, !dbg !17
+  %arrayidx4.1 = getelementptr inbounds float, float* %B, i64 %idxprom3.1, !dbg !17
+  %5 = load float, float* %arrayidx4.1, align 4, !dbg !17, !tbaa !10
+  %mul.1 = fmul fast float %5, %3, !dbg !18
+  %add.1 = fadd fast float %mul.1, %add, !dbg !19
+  %indvars.iv.next.1 = or i64 %indvars.iv, 2, !dbg !20
+  %arrayidx.2 = getelementptr inbounds float, float* %A, i64 %indvars.iv.next.1, !dbg !9
+  %6 = load float, float* %arrayidx.2, align 4, !dbg !9, !tbaa !10
+  %arrayidx2.2 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv.next.1, !dbg !14
+  %7 = load i32, i32* %arrayidx2.2, align 4, !dbg !14, !tbaa !15
+  %idxprom3.2 = sext i32 %7 to i64, !dbg !17
+  %arrayidx4.2 = getelementptr inbounds float, float* %B, i64 %idxprom3.2, !dbg !17
+  %8 = load float, float* %arrayidx4.2, align 4, !dbg !17, !tbaa !10
+  %mul.2 = fmul fast float %8, %6, !dbg !18
+  %add.2 = fadd fast float %mul.2, %add.1, !dbg !19
+  %indvars.iv.next.2 = or i64 %indvars.iv, 3, !dbg !20
+  %arrayidx.3 = getelementptr inbounds float, float* %A, i64 %indvars.iv.next.2, !dbg !9
+  %9 = load float, float* %arrayidx.3, align 4, !dbg !9, !tbaa !10
+  %arrayidx2.3 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv.next.2, !dbg !14
+  %10 = load i32, i32* %arrayidx2.3, align 4, !dbg !14, !tbaa !15
+  %idxprom3.3 = sext i32 %10 to i64, !dbg !17
+  %arrayidx4.3 = getelementptr inbounds float, float* %B, i64 %idxprom3.3, !dbg !17
+  %11 = load float, float* %arrayidx4.3, align 4, !dbg !17, !tbaa !10
+  %mul.3 = fmul fast float %11, %9, !dbg !18
+  %add.3 = fadd fast float %mul.3, %add.2, !dbg !19
+  %indvars.iv.next.3 = add nuw nsw i64 %indvars.iv, 4, !dbg !20 ; four elements are consumed per iteration
+  %exitcond.3 = icmp eq i64 %indvars.iv.next.3, 1000, !dbg !21
+  br i1 %exitcond.3, label %for.end, label %for.body, !dbg !8, !llvm.loop !22
+
+for.end:                                          ; preds = %for.body
+  ret float %add.3, !dbg !24
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
+
+; Function Attrs: nounwind readnone uwtable. Test input: loop whose body holds only the induction update, the exit compare and the branch; the loop location (!26, source.c:16:6) is the one named by the second CHECK line.
+define dso_local i32 @func() local_unnamed_addr #2 !dbg !25 {
+entry:
+  br label %for.body, !dbg !26
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret i32 0, !dbg !27
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 100, %entry ], [ %indvars.iv.next.24, %for.body ] ; counter starts at 100
+  %indvars.iv.next.24 = add nuw nsw i64 %indvars.iv, 25, !dbg !28 ; and advances by 25 per iteration
+  %exitcond.24 = icmp eq i64 %indvars.iv.next.24, 1000, !dbg !29
+  br i1 %exitcond.24, label %for.cond.cleanup, label %for.body, !dbg !26, !llvm.loop !30
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) #1
+
+; Function Attrs: nounwind readnone uwtable. Test input that already contains a vector loop: vector.body works on <8 x i32> values and its loop metadata !40 carries llvm.loop.isvectorized = 1 (!42); no CHECK line in this file names a location inside this function.
+define dso_local i32 @func2() local_unnamed_addr #2 !dbg !32 {
+entry:
+  %sum = alloca [1000 x i32], align 16
+  %0 = bitcast [1000 x i32]* %sum to i8*, !dbg !33
+  call void @llvm.lifetime.start.p0i8(i64 4000, i8* nonnull %0) #3, !dbg !33
+  %1 = getelementptr inbounds [1000 x i32], [1000 x i32]* %sum, i64 0, i64 4
+  %2 = bitcast i32* %1 to i8*
+  call void @llvm.memset.p0i8.i64(i8* nonnull align 16 %2, i8 0, i64 3984, i1 false), !dbg !34
+  %3 = bitcast [1000 x i32]* %sum to <4 x i32>*, !dbg !34
+  store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32>* %3, align 16, !dbg !34
+  br label %vector.body, !dbg !35
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %vector.recur = phi <8 x i32> [ <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 0>, %entry ], [ %wide.load18, %vector.body ]
+  %4 = add i64 %index, 100
+  %5 = add i64 %index, 101, !dbg !36
+  %6 = getelementptr inbounds [1000 x i32], [1000 x i32]* %sum, i64 0, i64 %5, !dbg !37
+  %7 = bitcast i32* %6 to <8 x i32>*, !dbg !37
+  %wide.load = load <8 x i32>, <8 x i32>* %7, align 4, !dbg !37, !tbaa !15
+  %8 = getelementptr i32, i32* %6, i64 8, !dbg !37
+  %9 = bitcast i32* %8 to <8 x i32>*, !dbg !37
+  %wide.load16 = load <8 x i32>, <8 x i32>* %9, align 4, !dbg !37, !tbaa !15
+  %10 = getelementptr i32, i32* %6, i64 16, !dbg !37
+  %11 = bitcast i32* %10 to <8 x i32>*, !dbg !37
+  %wide.load17 = load <8 x i32>, <8 x i32>* %11, align 4, !dbg !37, !tbaa !15
+  %12 = getelementptr i32, i32* %6, i64 24, !dbg !37
+  %13 = bitcast i32* %12 to <8 x i32>*, !dbg !37
+  %wide.load18 = load <8 x i32>, <8 x i32>* %13, align 4, !dbg !37, !tbaa !15
+  %14 = shufflevector <8 x i32> %vector.recur, <8 x i32> %wide.load, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>, !dbg !38
+  %15 = shufflevector <8 x i32> %wide.load, <8 x i32> %wide.load16, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>, !dbg !38
+  %16 = shufflevector <8 x i32> %wide.load16, <8 x i32> %wide.load17, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>, !dbg !38
+  %17 = shufflevector <8 x i32> %wide.load17, <8 x i32> %wide.load18, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>, !dbg !38
+  %18 = getelementptr inbounds [1000 x i32], [1000 x i32]* %sum, i64 0, i64 %4, !dbg !38
+  %19 = add nsw <8 x i32> %14, %wide.load, !dbg !39
+  %20 = add nsw <8 x i32> %15, %wide.load16, !dbg !39
+  %21 = add nsw <8 x i32> %16, %wide.load17, !dbg !39
+  %22 = add nsw <8 x i32> %17, %wide.load18, !dbg !39
+  %23 = bitcast i32* %18 to <8 x i32>*, !dbg !39
+  store <8 x i32> %19, <8 x i32>* %23, align 16, !dbg !39, !tbaa !15
+  %24 = getelementptr i32, i32* %18, i64 8, !dbg !39
+  %25 = bitcast i32* %24 to <8 x i32>*, !dbg !39
+  store <8 x i32> %20, <8 x i32>* %25, align 16, !dbg !39, !tbaa !15
+  %26 = getelementptr i32, i32* %18, i64 16, !dbg !39
+  %27 = bitcast i32* %26 to <8 x i32>*, !dbg !39
+  store <8 x i32> %21, <8 x i32>* %27, align 16, !dbg !39, !tbaa !15
+  %28 = getelementptr i32, i32* %18, i64 24, !dbg !39
+  %29 = bitcast i32* %28 to <8 x i32>*, !dbg !39
+  store <8 x i32> %22, <8 x i32>* %29, align 16, !dbg !39, !tbaa !15
+  %index.next = add i64 %index, 32
+  %30 = icmp eq i64 %index.next, 896
+  br i1 %30, label %for.body, label %vector.body, !llvm.loop !40 ; !40 includes !42, the llvm.loop.isvectorized marker
+
+for.body:                                         ; preds = %vector.body (straight-line code, not a loop: the block ends in ret)
+  %vector.recur.extract = extractelement <8 x i32> %wide.load18, i32 7, !dbg !35
+  %arrayidx = getelementptr inbounds [1000 x i32], [1000 x i32]* %sum, i64 0, i64 997, !dbg !37
+  %31 = load i32, i32* %arrayidx, align 4, !dbg !37, !tbaa !15
+  %arrayidx2 = getelementptr inbounds [1000 x i32], [1000 x i32]* %sum, i64 0, i64 996, !dbg !38
+  %add3 = add nsw i32 %vector.recur.extract, %31, !dbg !39
+  store i32 %add3, i32* %arrayidx2, align 16, !dbg !39, !tbaa !15
+  %arrayidx.1 = getelementptr inbounds [1000 x i32], [1000 x i32]* %sum, i64 0, i64 998, !dbg !37
+  %32 = load i32, i32* %arrayidx.1, align 8, !dbg !37, !tbaa !15
+  %arrayidx2.1 = getelementptr inbounds [1000 x i32], [1000 x i32]* %sum, i64 0, i64 997, !dbg !38
+  %add3.1 = add nsw i32 %31, %32, !dbg !39
+  store i32 %add3.1, i32* %arrayidx2.1, align 4, !dbg !39, !tbaa !15
+  %arrayidx.2 = getelementptr inbounds [1000 x i32], [1000 x i32]* %sum, i64 0, i64 999, !dbg !37
+  %33 = load i32, i32* %arrayidx.2, align 4, !dbg !37, !tbaa !15
+  %arrayidx2.2 = getelementptr inbounds [1000 x i32], [1000 x i32]* %sum, i64 0, i64 998, !dbg !38
+  %add3.2 = add nsw i32 %32, %33, !dbg !39
+  store i32 %add3.2, i32* %arrayidx2.2, align 8, !dbg !39, !tbaa !15
+  %arrayidx.3 = getelementptr inbounds [1000 x i32], [1000 x i32]* %sum, i64 0, i64 1000, !dbg !37
+  %34 = load i32, i32* %arrayidx.3, align 16, !dbg !37, !tbaa !15
+  %arrayidx2.3 = getelementptr inbounds [1000 x i32], [1000 x i32]* %sum, i64 0, i64 999, !dbg !38
+  %add3.3 = add nsw i32 %33, %34, !dbg !39
+  store i32 %add3.3, i32* %arrayidx2.3, align 4, !dbg !39, !tbaa !15
+  call void @llvm.lifetime.end.p0i8(i64 4000, i8* nonnull %0) #3, !dbg !43
+  ret i32 0, !dbg !44
+}
+
+; Function Attrs: nounwind readnone uwtable. Contains no loop; it only returns the constant 5.
+define dso_local i32 @main() local_unnamed_addr #2 !dbg !45 {
+entry:
+  ret i32 5, !dbg !46
+}
+
+attributes #0 = { norecurse nounwind readonly uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+avx,+avx2,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { nounwind readnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+avx,+avx2,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 7.0.0 (https://git.llvm.org/git/clang.git/ f4ab7b42ebfeff74222e4fe541a878f0f6a83d48) (https://git.llvm.org/git/llvm.git/ db5834789d3042785c79007738c6e812e9c38eb4)", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug, enums: !2)
+!1 = !DIFile(filename: "source.c", directory: "/home/cs15btech11042/llvm-remarks/test_cases")
+!2 = !{}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = !{!"clang version 7.0.0 (https://git.llvm.org/git/clang.git/ f4ab7b42ebfeff74222e4fe541a878f0f6a83d48) (https://git.llvm.org/git/llvm.git/ db5834789d3042785c79007738c6e812e9c38eb4)"}
+!6 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 2, type: !7, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !2)
+!7 = !DISubroutineType(types: !2)
+!8 = !DILocation(line: 6, column: 4, scope: !6)
+!9 = !DILocation(line: 8, column: 11, scope: !6)
+!10 = !{!11, !11, i64 0}
+!11 = !{!"float", !12, i64 0}
+!12 = !{!"omnipotent char", !13, i64 0}
+!13 = !{!"Simple C/C++ TBAA"}
+!14 = !DILocation(line: 8, column: 20, scope: !6)
+!15 = !{!16, !16, i64 0}
+!16 = !{!"int", !12, i64 0}
+!17 = !DILocation(line: 8, column: 18, scope: !6)
+!18 = !DILocation(line: 8, column: 16, scope: !6)
+!19 = !DILocation(line: 8, column: 8, scope: !6)
+!20 = !DILocation(line: 6, column: 27, scope: !6)
+!21 = !DILocation(line: 6, column: 18, scope: !6)
+!22 = distinct !{!22, !8, !23}
+!23 = !DILocation(line: 9, column: 4, scope: !6)
+!24 = !DILocation(line: 10, column: 4, scope: !6)
+!25 = distinct !DISubprogram(name: "func", scope: !1, file: !1, line: 12, type: !7, isLocal: false, isDefinition: true, scopeLine: 13, isOptimized: true, unit: !0, variables: !2)
+!26 = !DILocation(line: 16, column: 6, scope: !25)
+!27 = !DILocation(line: 20, column: 5, scope: !25)
+!28 = !DILocation(line: 16, column: 29, scope: !25)
+!29 = !DILocation(line: 16, column: 22, scope: !25)
+!30 = distinct !{!30, !26, !31}
+!31 = !DILocation(line: 19, column: 5, scope: !25)
+!32 = distinct !DISubprogram(name: "func2", scope: !1, file: !1, line: 22, type: !7, isLocal: false, isDefinition: true, scopeLine: 23, isOptimized: true, unit: !0, variables: !2)
+!33 = !DILocation(line: 24, column: 5, scope: !32)
+!34 = !DILocation(line: 24, column: 9, scope: !32)
+!35 = !DILocation(line: 26, column: 6, scope: !32)
+!36 = !DILocation(line: 28, column: 24, scope: !32)
+!37 = !DILocation(line: 28, column: 19, scope: !32)
+!38 = !DILocation(line: 28, column: 9, scope: !32)
+!39 = !DILocation(line: 28, column: 16, scope: !32)
+!40 = distinct !{!40, !35, !41, !42}
+!41 = !DILocation(line: 29, column: 5, scope: !32)
+!42 = !{!"llvm.loop.isvectorized", i32 1}
+!43 = !DILocation(line: 31, column: 1, scope: !32)
+!44 = !DILocation(line: 30, column: 5, scope: !32)
+!45 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 32, type: !7, isLocal: false, isDefinition: true, scopeLine: 33, isOptimized: true, unit: !0, variables: !2)
+!46 = !DILocation(line: 41, column: 5, scope: !45)