Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6175,6 +6175,11 @@
   float Cost = expectedCost(1).first;
   const float ScalarCost = Cost;
   unsigned Width = 1;
+  // Cheapest vector candidate seen so far. It is tracked separately from Cost
+  // and Width so it can be reported no matter which width ends up selected.
+  float MinVectCost = 0.0f;
+  unsigned BestVectCostWidth = 0;
+  unsigned Count = 0; // Number of widths that were not skipped below.
   DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
 
   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
@@ -6198,10 +6203,17 @@
                  << " because it will not generate any vector instructions.\n");
       continue;
     }
+    ++Count;
     if (VectorCost < Cost) {
       Cost = VectorCost;
       Width = i;
     }
+    // The first candidate that reaches this point seeds the minimum; later
+    // candidates replace it only when they are strictly cheaper.
+    if (Count == 1 || VectorCost < MinVectCost) {
+      MinVectCost = VectorCost;
+      BestVectCostWidth = i;
+    }
   }
 
   if (!EnableCondStoresVectorization && NumPredStores) {
@@ -6212,6 +6224,36 @@
     Cost = ScalarCost;
   }
 
+  // Emit an analysis remark summarising the cost comparison when MaxVF is at
+  // least 2: either the cheapest vector candidate next to the scalar cost, or
+  // a note that no width produced vector instructions.
+  // NOTE(review): the remark is named "VectorizationNotBeneficial" but is
+  // emitted regardless of which width was selected -- confirm the name.
+  if (MaxVF >= 2) {
+    if (Count >= 1) {
+      ORE->emit([&]() {
+        return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
+                                          "VectorizationNotBeneficial",
+                                          TheLoop->getStartLoc(),
+                                          TheLoop->getHeader())
+               << "the best vector cost is : "
+               << StringRef(std::to_string(MinVectCost))
+               << " with vector width : "
+               << StringRef(std::to_string(BestVectCostWidth))
+               << ", Scalar loop costs: "
+               << StringRef(std::to_string(ScalarCost));
+      });
+    } else {
+      ORE->emit([&]() {
+        return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
+                                          "VectorizationNotBeneficial",
+                                          TheLoop->getStartLoc(),
+                                          TheLoop->getHeader())
+               << "none of the width generate any vector instructions";
+      });
+    }
+  }
+
   DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
         << "LV: Vectorization seems to be not beneficial, "
         << "but was forced by a user.\n");
Index: test/Transforms/LoopVectorize/X86/print_cost.ll
===================================================================
--- /dev/null
+++ 
test/Transforms/LoopVectorize/X86/print_cost.ll
@@ -0,0 +1,230 @@
+; RUN: opt < %s -loop-vectorize -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s
+; CHECK: remark: source.c:6:4: the best vector cost is : 74.625000 with vector width : 8, Scalar loop costs: 25.000000
+; CHECK: remark: source.c:16:6: none of the width generate any vector instructions
+; The two expected remark locations correspond to the loop start locations of @foo (!8, line 6 column 4) and @func (!26, line 16 column 6).
+; ModuleID = 'source.c'
+source_filename = "source.c"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: norecurse nounwind readonly uwtable
+define dso_local float @foo(float* nocapture readonly %A, float* nocapture readonly %B, i32* nocapture readonly %C, i32 %N) local_unnamed_addr #0 !dbg !6 {
+entry:
+  br label %for.body, !dbg !8
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next.3, %for.body ]
+  %r.012 = phi float [ 0.000000e+00, %entry ], [ %add.3, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv, !dbg !9
+  %0 = load float, float* %arrayidx, align 4, !dbg !9, !tbaa !10
+  %arrayidx2 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv, !dbg !14
+  %1 = load i32, i32* %arrayidx2, align 4, !dbg !14, !tbaa !15
+  %idxprom3 = sext i32 %1 to i64, !dbg !17
+  %arrayidx4 = getelementptr inbounds float, float* %B, i64 %idxprom3, !dbg !17
+  %2 = load float, float* %arrayidx4, align 4, !dbg !17, !tbaa !10
+  %mul = fmul fast float %2, %0, !dbg !18
+  %add = fadd fast float %mul, %r.012, !dbg !19
+  %indvars.iv.next = or i64 %indvars.iv, 1, !dbg !20
+  %arrayidx.1 = getelementptr inbounds float, float* %A, i64 %indvars.iv.next, !dbg !9
+  %3 = load float, float* %arrayidx.1, align 4, !dbg !9, !tbaa !10
+  %arrayidx2.1 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv.next, !dbg !14
+  %4 = load i32, i32* %arrayidx2.1, align 4, !dbg !14, !tbaa !15
+  %idxprom3.1 = sext i32 %4 to i64, !dbg !17
+  %arrayidx4.1 = getelementptr inbounds float, float* %B, i64 %idxprom3.1, !dbg !17
+  %5 = load float, float* %arrayidx4.1, align 4, !dbg !17, !tbaa !10
+  %mul.1 = fmul fast float %5, %3, !dbg !18
+  %add.1 = fadd fast float %mul.1, %add, !dbg !19
+  %indvars.iv.next.1 = or i64 %indvars.iv, 2, !dbg !20
+  %arrayidx.2 = getelementptr inbounds float, float* %A, i64 %indvars.iv.next.1, !dbg !9
+  %6 = load float, float* %arrayidx.2, align 4, !dbg !9, !tbaa !10
+  %arrayidx2.2 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv.next.1, !dbg !14
+  %7 = load i32, i32* %arrayidx2.2, align 4, !dbg !14, !tbaa !15
+  %idxprom3.2 = sext i32 %7 to i64, !dbg !17
+  %arrayidx4.2 = getelementptr inbounds float, float* %B, i64 %idxprom3.2, !dbg !17
+  %8 = load float, float* %arrayidx4.2, align 4, !dbg !17, !tbaa !10
+  %mul.2 = fmul fast float %8, %6, !dbg !18
+  %add.2 = fadd fast float %mul.2, %add.1, !dbg !19
+  %indvars.iv.next.2 = or i64 %indvars.iv, 3, !dbg !20
+  %arrayidx.3 = getelementptr inbounds float, float* %A, i64 %indvars.iv.next.2, !dbg !9
+  %9 = load float, float* %arrayidx.3, align 4, !dbg !9, !tbaa !10
+  %arrayidx2.3 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv.next.2, !dbg !14
+  %10 = load i32, i32* %arrayidx2.3, align 4, !dbg !14, !tbaa !15
+  %idxprom3.3 = sext i32 %10 to i64, !dbg !17
+  %arrayidx4.3 = getelementptr inbounds float, float* %B, i64 %idxprom3.3, !dbg !17
+  %11 = load float, float* %arrayidx4.3, align 4, !dbg !17, !tbaa !10
+  %mul.3 = fmul fast float %11, %9, !dbg !18
+  %add.3 = fadd fast float %mul.3, %add.2, !dbg !19
+  %indvars.iv.next.3 = add nuw nsw i64 %indvars.iv, 4, !dbg !20
+  %exitcond.3 = icmp eq i64 %indvars.iv.next.3, 1000, !dbg !21
+  br i1 %exitcond.3, label %for.end, label %for.body, !dbg !8, !llvm.loop !22
+
+for.end:                                          ; preds = %for.body
+  ret float %add.3, !dbg !24
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
+
+; Function Attrs: nounwind readnone uwtable
+define dso_local i32 @func() local_unnamed_addr #2 !dbg !25 {
+entry:
+  br label %for.body, !dbg !26
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret i32 0, !dbg !27
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 100, %entry ], [ %indvars.iv.next.24, %for.body ]
+  %indvars.iv.next.24 = add nuw nsw i64 %indvars.iv, 25, !dbg !28
+  %exitcond.24 = icmp eq i64 %indvars.iv.next.24, 1000, !dbg !29
+  br i1 %exitcond.24, label %for.cond.cleanup, label %for.body, !dbg !26, !llvm.loop !30
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) #1
+; @func2 already contains vector code and its loop is tagged llvm.loop.isvectorized (!42); neither expected remark points at it.
+; Function Attrs: nounwind readnone uwtable
+define dso_local i32 @func2() local_unnamed_addr #2 !dbg !32 {
+entry:
+  %sum = alloca [1000 x i32], align 16
+  %0 = bitcast [1000 x i32]* %sum to i8*, !dbg !33
+  call void @llvm.lifetime.start.p0i8(i64 4000, i8* nonnull %0) #3, !dbg !33
+  %1 = getelementptr inbounds [1000 x i32], [1000 x i32]* %sum, i64 0, i64 4
+  %2 = bitcast i32* %1 to i8*
+  call void @llvm.memset.p0i8.i64(i8* nonnull align 16 %2, i8 0, i64 3984, i1 false), !dbg !34
+  %3 = bitcast [1000 x i32]* %sum to <4 x i32>*, !dbg !34
+  store <4 x i32> , <4 x i32>* %3, align 16, !dbg !34 ; NOTE(review): stored vector constant is missing (empty before the comma) - presumably lost in extraction; restore from the original test
+  br label %vector.body, !dbg !35
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %vector.recur = phi <8 x i32> [ , %entry ], [ %wide.load18, %vector.body ] ; NOTE(review): incoming value for %entry is missing - presumably lost in extraction; restore from the original test
+  %4 = add i64 %index, 100
+  %5 = add i64 %index, 101, !dbg !36
+  %6 = getelementptr inbounds [1000 x i32], [1000 x i32]* %sum, i64 0, i64 %5, !dbg !37
+  %7 = bitcast i32* %6 to <8 x i32>*, !dbg !37
+  %wide.load = load <8 x i32>, <8 x i32>* %7, align 4, !dbg !37, !tbaa !15
+  %8 = getelementptr i32, i32* %6, i64 8, !dbg !37
+  %9 = bitcast i32* %8 to <8 x i32>*, !dbg !37
+  %wide.load16 = load <8 x i32>, <8 x i32>* %9, align 4, !dbg !37, !tbaa !15
+  %10 = getelementptr i32, i32* %6, i64 16, !dbg !37
+  %11 = bitcast i32* %10 to <8 x i32>*, !dbg !37
+  %wide.load17 = load <8 x i32>, <8 x i32>* %11, align 4, !dbg !37, !tbaa !15
+  %12 = getelementptr i32, i32* %6, i64 24, !dbg !37
+  %13 = bitcast i32* %12 to <8 x i32>*, !dbg !37
+  %wide.load18 = load <8 x i32>, <8 x i32>* %13, align 4, !dbg !37, !tbaa !15
+  %14 = shufflevector <8 x i32> %vector.recur, <8 x i32> %wide.load, <8 x i32> , !dbg !38 ; NOTE(review): shuffle mask constant is missing - presumably lost in extraction; restore from the original test
+  %15 = shufflevector <8 x i32> %wide.load, <8 x i32> %wide.load16, <8 x i32> , !dbg !38 ; NOTE(review): shuffle mask constant is missing - see note on %14
+  %16 = shufflevector <8 x i32> %wide.load16, <8 x i32> %wide.load17, <8 x i32> , !dbg !38 ; NOTE(review): shuffle mask constant is missing - see note on %14
+  %17 = shufflevector <8 x i32> %wide.load17, <8 x i32> %wide.load18, <8 x i32> , !dbg !38 ; NOTE(review): shuffle mask constant is missing - see note on %14
+  %18 = getelementptr inbounds [1000 x i32], [1000 x i32]* %sum, i64 0, i64 %4, !dbg !38
+  %19 = add nsw <8 x i32> %14, %wide.load, !dbg !39
+  %20 = add nsw <8 x i32> %15, %wide.load16, !dbg !39
+  %21 = add nsw <8 x i32> %16, %wide.load17, !dbg !39
+  %22 = add nsw <8 x i32> %17, %wide.load18, !dbg !39
+  %23 = bitcast i32* %18 to <8 x i32>*, !dbg !39
+  store <8 x i32> %19, <8 x i32>* %23, align 16, !dbg !39, !tbaa !15
+  %24 = getelementptr i32, i32* %18, i64 8, !dbg !39
+  %25 = bitcast i32* %24 to <8 x i32>*, !dbg !39
+  store <8 x i32> %20, <8 x i32>* %25, align 16, !dbg !39, !tbaa !15
+  %26 = getelementptr i32, i32* %18, i64 16, !dbg !39
+  %27 = bitcast i32* %26 to <8 x i32>*, !dbg !39
+  store <8 x i32> %21, <8 x i32>* %27, align 16, !dbg !39, !tbaa !15
+  %28 = getelementptr i32, i32* %18, i64 24, !dbg !39
+  %29 = bitcast i32* %28 to <8 x i32>*, !dbg !39
+  store <8 x i32> %22, <8 x i32>* %29, align 16, !dbg !39, !tbaa !15
+  %index.next = add i64 %index, 32
+  %30 = icmp eq i64 %index.next, 896
+  br i1 %30, label %for.body, label %vector.body, !llvm.loop !40
+
+for.body:                                         ; preds = %vector.body
+  %vector.recur.extract = extractelement <8 x i32> %wide.load18, i32 7, !dbg !35
+  %arrayidx = getelementptr inbounds [1000 x i32], [1000 x i32]* %sum, i64 0, i64 997, !dbg !37
+  %31 = load i32, i32* %arrayidx, align 4, !dbg !37, !tbaa !15
+  %arrayidx2 = getelementptr inbounds [1000 x i32], [1000 x i32]* %sum, i64 0, i64 996, !dbg !38
+  %add3 = add nsw i32 %vector.recur.extract, %31, !dbg !39
+  store i32 %add3, i32* %arrayidx2, align 16, !dbg !39, !tbaa !15
+  %arrayidx.1 = getelementptr inbounds [1000 x i32], [1000 x i32]* %sum, i64 0, i64 998, !dbg !37
+  %32 = load i32, i32* %arrayidx.1, align 8, !dbg !37, !tbaa !15
+  %arrayidx2.1 = getelementptr inbounds [1000 x i32], [1000 x i32]* %sum, i64 0, i64 997, !dbg !38
+  %add3.1 = add nsw i32 %31, %32, !dbg !39
+  store i32 %add3.1, i32* %arrayidx2.1, align 4, !dbg !39, !tbaa !15
+  %arrayidx.2 = getelementptr inbounds [1000 x i32], [1000 x i32]* %sum, i64 0, i64 999, !dbg !37
+  %33 = load i32, i32* %arrayidx.2, align 4, !dbg !37, !tbaa !15
+  %arrayidx2.2 = getelementptr inbounds [1000 x i32], [1000 x i32]* %sum, i64 0, i64 998, !dbg !38
+  %add3.2 = add nsw i32 %32, %33, !dbg !39
+  store i32 %add3.2, i32* %arrayidx2.2, align 8, !dbg !39, !tbaa !15
+  %arrayidx.3 = getelementptr inbounds [1000 x i32], [1000 x i32]* %sum, i64 0, i64 1000, !dbg !37
+  %34 = load i32, i32* %arrayidx.3, align 16, !dbg !37, !tbaa !15
+  %arrayidx2.3 = getelementptr inbounds [1000 x i32], [1000 x i32]* %sum, i64 0, i64 999, !dbg !38
+  %add3.3 = add nsw i32 %33, %34, !dbg !39
+  store i32 %add3.3, i32* %arrayidx2.3, align 4, !dbg !39, !tbaa !15
+  call void @llvm.lifetime.end.p0i8(i64 4000, i8* nonnull %0) #3, !dbg !43
+  ret i32 0, !dbg !44
+}
+
+; Function Attrs: nounwind readnone uwtable
+define dso_local i32 @main() local_unnamed_addr #2 !dbg !45 {
+entry:
+  ret i32 5, !dbg !46
+}
+
+attributes #0 = { norecurse nounwind readonly uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+avx,+avx2,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { nounwind readnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+avx,+avx2,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 7.0.0 (https://git.llvm.org/git/clang.git/ f4ab7b42ebfeff74222e4fe541a878f0f6a83d48) (https://git.llvm.org/git/llvm.git/ db5834789d3042785c79007738c6e812e9c38eb4)", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug, enums: !2)
+!1 = !DIFile(filename: "source.c", directory: "/home/cs15btech11042/llvm-remarks/test_cases")
+!2 = !{}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = !{!"clang version 7.0.0 (https://git.llvm.org/git/clang.git/ f4ab7b42ebfeff74222e4fe541a878f0f6a83d48) (https://git.llvm.org/git/llvm.git/ db5834789d3042785c79007738c6e812e9c38eb4)"}
+!6 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 2, type: !7, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !2)
+!7 = !DISubroutineType(types: !2)
+!8 = !DILocation(line: 6, column: 4, scope: !6)
+!9 = !DILocation(line: 8, column: 11, scope: !6)
+!10 = !{!11, !11, i64 0}
+!11 = !{!"float", !12, i64 0}
+!12 = !{!"omnipotent char", !13, i64 0}
+!13 = !{!"Simple C/C++ TBAA"}
+!14 = !DILocation(line: 8, column: 20, scope: !6)
+!15 = !{!16, !16, i64 0}
+!16 = !{!"int", !12, i64 0}
+!17 = !DILocation(line: 8, column: 18, scope: !6)
+!18 = !DILocation(line: 8, column: 16, scope: !6)
+!19 = !DILocation(line: 8, column: 8, scope: !6)
+!20 = !DILocation(line: 6, column: 27, scope: !6)
+!21 = !DILocation(line: 6, column: 18, scope: !6)
+!22 = distinct !{!22, !8, !23}
+!23 = !DILocation(line: 9, column: 4, scope: !6)
+!24 = !DILocation(line: 10, column: 4, scope: !6)
+!25 = distinct !DISubprogram(name: "func", scope: !1, file: !1, line: 12, type: !7, isLocal: false, isDefinition: true, scopeLine: 13, isOptimized: true, unit: !0, variables: !2)
+!26 = !DILocation(line: 16, column: 6, scope: !25)
+!27 = !DILocation(line: 20, column: 5, scope: !25)
+!28 = !DILocation(line: 16, column: 29, scope: !25)
+!29 = !DILocation(line: 16, column: 22, scope: !25)
+!30 = distinct !{!30, !26, !31}
+!31 = !DILocation(line: 19, column: 5, scope: !25)
+!32 = distinct !DISubprogram(name: "func2", scope: !1, file: !1, line: 22, type: !7, isLocal: false, isDefinition: true, scopeLine: 23, isOptimized: true, unit: !0, variables: !2)
+!33 = !DILocation(line: 24, column: 5, scope: !32)
+!34 = !DILocation(line: 24, column: 9, scope: !32)
+!35 = !DILocation(line: 26, column: 6, scope: !32)
+!36 = !DILocation(line: 28, column: 24, scope: !32)
+!37 = !DILocation(line: 28, column: 19, scope: !32)
+!38 = !DILocation(line: 28, column: 9, scope: !32)
+!39 = !DILocation(line: 28, column: 16, scope: !32)
+!40 = distinct !{!40, !35, !41, !42}
+!41 = !DILocation(line: 29, column: 5, scope: !32)
+!42 = !{!"llvm.loop.isvectorized", i32 1}
+!43 = !DILocation(line: 31, column: 1, scope: !32)
+!44 = !DILocation(line: 30, column: 5, scope: !32)
+!45 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 32, type: !7, isLocal: false, isDefinition: true, scopeLine: 33, isOptimized: true, unit: !0, variables: !2)
+!46 = !DILocation(line: 41, column: 5, scope: !45)