diff --git a/llvm/include/llvm/Transforms/Utils/SizeOpts.h b/llvm/include/llvm/Transforms/Utils/SizeOpts.h --- a/llvm/include/llvm/Transforms/Utils/SizeOpts.h +++ b/llvm/include/llvm/Transforms/Utils/SizeOpts.h @@ -22,12 +22,12 @@ /// Returns true if function \p F is suggested to be size-optimized base on the /// profile. -bool shouldOptimizeForSize(Function *F, ProfileSummaryInfo *PSI, - BlockFrequencyInfo *BFI); +Optional<bool> shouldOptimizeForSize(Function *F, ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI); /// Returns true if basic block \p BB is suggested to be size-optimized base /// on the profile. -bool shouldOptimizeForSize(BasicBlock *BB, ProfileSummaryInfo *PSI, - BlockFrequencyInfo *BFI); +Optional<bool> shouldOptimizeForSize(BasicBlock *BB, ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI); } // end namespace llvm diff --git a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp --- a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -552,8 +552,9 @@ unsigned NumUses = 0; bool OptForSize = Entry->getParent()->hasOptSize() || - llvm::shouldOptimizeForSize(Entry->getParent(), PSI, BFI); - if (!OptForSize || std::distance(S,E) > 100) { + llvm::shouldOptimizeForSize(Entry->getParent(), PSI, BFI) + .getValueOr(false); + if (!OptForSize || std::distance(S, E) > 100) { for (auto ConstCand = S; ConstCand != E; ++ConstCand) { NumUses += ConstCand->Uses.size(); if (ConstCand->CumulativeCost > MaxCostItr->CumulativeCost) diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp --- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -542,8 +542,9 @@ auto *HeaderBB = L->getHeader(); auto *F = HeaderBB->getParent(); - bool OptForSize = F->hasOptSize() || - llvm::shouldOptimizeForSize(HeaderBB, PSI, BFI); + bool OptForSize = + F->hasOptSize() || + 
llvm::shouldOptimizeForSize(HeaderBB, PSI, BFI).getValueOr(false); if (OptForSize) { LLVM_DEBUG( dbgs() << "Versioning is needed but not allowed when optimizing " diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -210,8 +210,9 @@ TTI.getUnrollingPreferences(L, SE, UP); // Apply size attributes - bool OptForSize = L->getHeader()->getParent()->hasOptSize() || - llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI); + bool OptForSize = + L->getHeader()->getParent()->hasOptSize() || + llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI).getValueOr(false); if (OptForSize) { UP.Threshold = UP.OptSizeThreshold; UP.PartialThreshold = UP.PartialOptSizeThreshold; diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -2596,8 +2596,9 @@ // Don't rewrite fputs to fwrite when optimising for size because fwrite // requires more arguments and thus extra MOVs are required. - bool OptForSize = CI->getFunction()->hasOptSize() || - llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI); + bool OptForSize = + CI->getFunction()->hasOptSize() || + llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI).getValueOr(false); if (OptForSize) return nullptr; diff --git a/llvm/lib/Transforms/Utils/SizeOpts.cpp b/llvm/lib/Transforms/Utils/SizeOpts.cpp --- a/llvm/lib/Transforms/Utils/SizeOpts.cpp +++ b/llvm/lib/Transforms/Utils/SizeOpts.cpp @@ -20,18 +20,19 @@ "pgso", cl::Hidden, cl::init(true), cl::desc("Enable the profile guided size optimization. 
")); -bool llvm::shouldOptimizeForSize(Function *F, ProfileSummaryInfo *PSI, - BlockFrequencyInfo *BFI) { +Optional llvm::shouldOptimizeForSize(Function *F, ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI) { assert(F); - if (!PSI || !BFI || !PSI->hasProfileSummary()) - return false; - return ProfileGuidedSizeOpt && PSI->isFunctionColdInCallGraph(F, *BFI); + if (!ProfileGuidedSizeOpt || !PSI || !BFI || !PSI->hasProfileSummary()) + return None; + return PSI->isFunctionColdInCallGraph(F, *BFI); } -bool llvm::shouldOptimizeForSize(BasicBlock *BB, ProfileSummaryInfo *PSI, - BlockFrequencyInfo *BFI) { +Optional llvm::shouldOptimizeForSize(BasicBlock *BB, + ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI) { assert(BB); - if (!PSI || !BFI || !PSI->hasProfileSummary()) - return false; - return ProfileGuidedSizeOpt && PSI->isColdBlock(BB, BFI); + if (!ProfileGuidedSizeOpt || !PSI || !BFI || !PSI->hasProfileSummary()) + return None; + return PSI->isColdBlock(BB, BFI); } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -293,6 +293,15 @@ "vectorize-loops", cl::init(true), cl::Hidden, cl::desc("Run the Loop vectorization passes")); +static cl::opt LocalHotnessThreshold( + "local-hotness-threshold", cl::init(500), cl::Hidden, + cl::desc( + "In cases when there is no info on block hotness available from module " + "profile we define \"local hotness\" as a ratio of the block to " + "function entry execution counts. If the ration is greater than the " + "threshold defined by this parameter the block is said to be locally " + "hot.")); + /// A helper function for converting Scalar types to vector types. /// If the incoming type is void, we return void. If the VF is 1, we return /// the scalar type. @@ -930,7 +939,7 @@ // Vectorization with OptForSize: don't allow epilogues. 
CM_ScalarEpilogueNotAllowedOptSize, - // A special case of vectorisation with OptForSize: loops with a very small + // A special case of vectorization with OptForSize: loops with a very small // trip count are considered for vectorization under OptForSize, thereby // making sure the cost of their loop body is dominant, free of runtime // guards and scalar iteration overheads. @@ -7365,15 +7374,48 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, - ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) { + ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, + ScalarEvolution &SE) { ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed; + auto IsColdByProfile = llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI); if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && - (F->hasOptSize() || - llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI))) + (F->hasOptSize() || IsColdByProfile.getValueOr(false))) SEL = CM_ScalarEpilogueNotAllowedOptSize; else if (PreferPredicateOverEpilog || Hints.getPredicate()) SEL = CM_ScalarEpilogueNotNeededUsePredicate; + else { + auto ExpectedTC = getSmallBestKnownTC(SE, L); + // Check the loop for a trip count threshold: vectorize loops with a tiny + // trip count by optimizing for size, to minimize overheads. + if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { + // Even short trip count loops may be hot (part of hot region). + // In absence of profile summary estimate loop hotness relative to + // function entry using execution frequency information. 
+ if (!IsColdByProfile && LoopVectorizeWithBlockFrequency && BFI) { + Optional<uint64_t> LoopCount = + BFI->getBlockProfileCount(L->getHeader(), true); + Optional<uint64_t> FunctionCount = + BFI->getBlockProfileCount(&F->getEntryBlock(), true); + if (LoopCount && FunctionCount && + (*LoopCount > *FunctionCount * LocalHotnessThreshold)) { + LLVM_DEBUG(dbgs() << "Allow epilog for short trip count loop due to " + "hotness considerations."); + return CM_ScalarEpilogueAllowed; + } + } + + LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " + << "This loop is worth vectorizing only if no scalar " + << "iteration overheads are incurred."); + if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) + LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); + else { + LLVM_DEBUG(dbgs() << "\n"); + SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; + } + } + } return SEL; } @@ -7391,7 +7433,8 @@ assert(EnableVPlanNativePath && "VPlan-native path is disabled."); Function *F = L->getHeader()->getParent(); InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); - ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI); + ScalarEpilogueLowering SEL = + getScalarEpilogueLowering(F, L, Hints, PSI, BFI, *PSE.getSE()); LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, &Hints, IAI); @@ -7483,7 +7526,8 @@ // Check the function attributes and profiles to find out if this function // should be optimized for size. - ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI); + ScalarEpilogueLowering SEL = + getScalarEpilogueLowering(F, L, Hints, PSI, BFI, *SE); // Entrance to the VPlan-native vectorization path. Outer loops are processed // here. 
They may require CFG and instruction level transformations before @@ -7496,21 +7540,6 @@ assert(L->empty() && "Inner loop expected."); - // Check the loop for a trip count threshold: vectorize loops with a tiny trip - // count by optimizing for size, to minimize overheads. - auto ExpectedTC = getSmallBestKnownTC(*SE, L); - if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { - LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " - << "This loop is worth vectorizing only if no scalar " - << "iteration overheads are incurred."); - if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) - LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); - else { - LLVM_DEBUG(dbgs() << "\n"); - SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; - } - } - // Check the function attributes to see if implicit floats are allowed. // FIXME: This check doesn't seem possibly correct -- what if the loop is // an integer loop and the vector instructions selected are purely integer diff --git a/llvm/test/Transforms/LoopVectorize/hot_short_tc_loop.ll b/llvm/test/Transforms/LoopVectorize/hot_short_tc_loop.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/hot_short_tc_loop.ll @@ -0,0 +1,209 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes="print,loop-vectorize" -S < %s 2>&1 | FileCheck %s + +; Check vectorization of hot short trip count with epilog. In this case inner +; loop trip count is not constant and its value is estimated by profile. 
+ +; ModuleID = 'test.cpp' +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@a = dso_local global [5 x i32] zeroinitializer, align 16 +@b = dso_local global [5 x i32] zeroinitializer, align 16 + +; Function Attrs: uwtable +define dso_local void @_Z3fooi(i32 %M) local_unnamed_addr #0 !prof !11 { +; CHECK: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP15:%.*]] +; CHECK: [[TMP18:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[VEC_IND6:%.*]] +; CHECK: [[WIDE_LOAD10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP23:%.*]] +; CHECK: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD10]], [[TMP18]] +; CHECK: store <4 x i32> [[TMP26]], <4 x i32>* [[TMP28:%.*]] +; +entry: + %a = alloca [5 x i32], align 16 + %b = alloca [5 x i32], align 16 + %0 = bitcast [5 x i32]* %a to i8* + call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %0) #3 + %1 = bitcast [5 x i32]* %b to i8* + call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %1) #3 + %arraydecay = getelementptr inbounds [5 x i32], [5 x i32]* %a, i64 0, i64 0 + br label %for.body.us.preheader + +for.body.us.preheader: ; preds = %entry + %wide.trip.count = zext i32 %M to i64 + br label %for.body.us + +for.body.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.body.us.preheader + %j.019.us = phi i32 [ %inc8.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.body.us.preheader ] + call void @_Z3barPi(i32* nonnull %arraydecay) + br label %for.body4.us + +for.body4.us: ; preds = %for.body4.us, %for.body.us + %indvars.iv = phi i64 [ 0, %for.body.us ], [ %indvars.iv.next, %for.body4.us ] + %arrayidx.us = getelementptr inbounds [5 x i32], [5 x i32]* %b, i64 0, i64 %indvars.iv + %2 = load i32, i32* %arrayidx.us, align 4, !tbaa !2 + %3 = trunc i64 %indvars.iv to i32 + %mul.us = mul nsw i32 %2, %3 + %arrayidx6.us = getelementptr inbounds [5 x i32], [5 x i32]* %a, i64 0, i64 %indvars.iv + %4 = load i32, i32* %arrayidx6.us, align 4, 
!tbaa !2 + %add.us = add nsw i32 %4, %mul.us + store i32 %add.us, i32* %arrayidx6.us, align 4, !tbaa !2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us, !prof !10 + +for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us + %inc8.us = add nuw nsw i32 %j.019.us, 1 + %exitcond21 = icmp eq i32 %inc8.us, 20 + br i1 %exitcond21, label %for.cond.cleanup.loopexit, label %for.body.us, !prof !12 + +for.cond.cleanup.loopexit: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us + br label %for.cond.cleanup + +for.cond.cleanup.loopexit24: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit24, %for.cond.cleanup.loopexit + call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull %1) #3 + call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull %0) #3 + ret void +} + +; Check vectorization of hot short trip count with epilog. In this case inner +; loop trip count is known constant value. 
+ +; Function Attrs: uwtable +define dso_local void @_Z3fooi2() local_unnamed_addr #0 !prof !14 { +; CHECK: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP15:%.*]] +; CHECK: [[TMP18:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[VEC_IND6:%.*]] +; CHECK: [[WIDE_LOAD10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP23:%.*]] +; CHECK: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD10]], [[TMP18]] +; CHECK: store <4 x i32> [[TMP26]], <4 x i32>* [[TMP28:%.*]] +; +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup3 + ret void + +for.body: ; preds = %entry, %for.cond.cleanup3 + %j.018 = phi i32 [ 0, %entry ], [ %inc8, %for.cond.cleanup3 ] + tail call void @_Z3barPi(i32* getelementptr inbounds ([5 x i32], [5 x i32]* @a, i64 0, i64 0)) + br label %for.body4 + +for.cond.cleanup3: ; preds = %for.body4 + %inc8 = add nuw nsw i32 %j.018, 1 + %cmp = icmp ult i32 %inc8, 1000 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !prof !13 + +for.body4: ; preds = %for.body, %for.body4 + %i.017 = phi i32 [ 0, %for.body ], [ %inc, %for.body4 ] + %idxprom = zext i32 %i.017 to i64 + %arrayidx = getelementptr inbounds [5 x i32], [5 x i32]* @b, i64 0, i64 %idxprom + %0 = load i32, i32* %arrayidx, align 4, !tbaa !2 + %mul = mul nsw i32 %0, %i.017 + %arrayidx6 = getelementptr inbounds [5 x i32], [5 x i32]* @a, i64 0, i64 %idxprom + %1 = load i32, i32* %arrayidx6, align 4, !tbaa !2 + %add = add nsw i32 %1, %mul + store i32 %add, i32* %arrayidx6, align 4, !tbaa !2 + %inc = add nuw nsw i32 %i.017, 1 + %cmp2 = icmp ult i32 %inc, 5 + br i1 %cmp2, label %for.body4, label %for.cond.cleanup3 +} + +; This is negative test. Check that vectorization is not performed for COLD +; short trip count loop requiring epilog. Note that outer loop has only 20 +; iterations and there is no associated profile info. 
+ + +; Function Attrs: uwtable +define dso_local void @_Z3fooi3(i32 %M) local_unnamed_addr #0 !prof !16 { +; CHECK: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_US:%.*]] +; CHECK: [[MUL_US:%.*]] = mul nsw i32 [[TMP2]], [[TMP3:%.*]] +; CHECK: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX6_US:%.*]] +; CHECK: [[ADD_US:%.*]] = add nsw i32 [[TMP4]], [[MUL_US]] +; CHECK: store i32 [[ADD_US]], i32* [[ARRAYIDX6_US]] +; +entry: + %a = alloca [5 x i32], align 16 + %b = alloca [5 x i32], align 16 + %0 = bitcast [5 x i32]* %a to i8* + call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %0) #3 + %1 = bitcast [5 x i32]* %b to i8* + call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %1) #3 + %arraydecay = getelementptr inbounds [5 x i32], [5 x i32]* %a, i64 0, i64 0 + br label %for.body.us.preheader + +for.body.us.preheader: ; preds = %entry + %wide.trip.count = zext i32 %M to i64 + br label %for.body.us + +for.body.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.body.us.preheader + %j.019.us = phi i32 [ %inc8.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.body.us.preheader ] + call void @_Z3barPi(i32* nonnull %arraydecay) + br label %for.body4.us + +for.body4.us: ; preds = %for.body4.us, %for.body.us + %indvars.iv = phi i64 [ 0, %for.body.us ], [ %indvars.iv.next, %for.body4.us ] + %arrayidx.us = getelementptr inbounds [5 x i32], [5 x i32]* %b, i64 0, i64 %indvars.iv + %2 = load i32, i32* %arrayidx.us, align 4, !tbaa !2 + %3 = trunc i64 %indvars.iv to i32 + %mul.us = mul nsw i32 %2, %3 + %arrayidx6.us = getelementptr inbounds [5 x i32], [5 x i32]* %a, i64 0, i64 %indvars.iv + %4 = load i32, i32* %arrayidx6.us, align 4, !tbaa !2 + %add.us = add nsw i32 %4, %mul.us + store i32 %add.us, i32* %arrayidx6.us, align 4, !tbaa !2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us, !prof !15 + 
+for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us + %inc8.us = add nuw nsw i32 %j.019.us, 1 + %exitcond21 = icmp eq i32 %inc8.us, 20 + br i1 %exitcond21, label %for.cond.cleanup.loopexit, label %for.body.us + +for.cond.cleanup.loopexit: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us + br label %for.cond.cleanup + +for.cond.cleanup.loopexit24: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit24, %for.cond.cleanup.loopexit + call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull %1) #3 + call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull %0) #3 + ret void +} + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1 + +declare dso_local void @_Z3barPi(i32*) local_unnamed_addr #2 + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1 + +attributes #0 = { uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind willreturn } +attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = 
!{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project f379dd57b978c4e1483d721f422c79e3c0c5ccdc)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C++ TBAA"} +!6 = distinct !{!6, !7} +!7 = !{!"llvm.loop.isvectorized", i32 1} +!8 = distinct !{!8, !9, !7} +!9 = !{!"llvm.loop.unroll.runtime.disable"} +!10 = !{!"branch_weights", i32 999, i32 4995} +!12 = !{!"branch_weights", i32 1, i32 999} +!11 = !{!"function_entry_count", i64 1} +!13 = !{!"branch_weights", i32 1000, i32 1} +!14 = !{!"function_entry_count", i64 1} +!15 = !{!"branch_weights", i32 9, i32 45} +!16 = !{!"function_entry_count", i64 1}