diff --git a/llvm/include/llvm/Support/MathExtras.h b/llvm/include/llvm/Support/MathExtras.h --- a/llvm/include/llvm/Support/MathExtras.h +++ b/llvm/include/llvm/Support/MathExtras.h @@ -732,6 +732,11 @@ return alignTo(Numerator, Denominator) / Denominator; } +/// Returns the integer nearest(Numerator / Denominator). +inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) { + return (Numerator + (Denominator / 2)) / Denominator; +} + /// Returns the largest uint64_t less than or equal to \p Value and is /// \p Skew mod \p Align. \p Align must be non-zero inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) { diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -261,10 +261,22 @@ void addStringMetadataToLoop(Loop *TheLoop, const char *MDString, unsigned V = 0); -/// Get a loop's estimated trip count based on branch weight metadata. +/// Returns a loop's estimated trip count based on branch weight metadata. +/// In addition if \p EstimatedBackEdgeExitWeight is not null it is initialized +/// with weight of loop's latch leading to the exit. /// Returns 0 when the count is estimated to be 0, or None when a meaningful /// estimate can not be made. -Optional getLoopEstimatedTripCount(Loop *L); +Optional +getLoopEstimatedTripCount(Loop *L, + unsigned *EstimatedBackEdgeExitWeight = nullptr); + +/// Set a loop's branch weight metadata to reflect that loop has \p +/// EstimatedTripCount iterations and \p EstimatedBackEdgeExitWeight exits +/// through latch. Returns true if metadata is successfully updated, false +/// otherwise. Note that loop must have a latch block which controls loop exit +/// in order to succeed. +bool setLoopEstimatedTripCount(Loop *L, unsigned EstimatedTripCount, + unsigned EstimatedBackEdgeExitWeight); /// Check inner loop (L) backedge count is known to be invariant on all /// iterations of its outer loop. If the loop has no parent, this is trivially @@ -357,6 +369,22 @@ bool cannotBeMinInLoop(const SCEV *S, const Loop *L, ScalarEvolution &SE, bool Signed); +/// Set weights for the \p UnrolledLoop \p RemainderLoop based on weights for +/// the \p OrigLoop and the following distribution of \p OrigLoop iterations +/// among \p UnrolledLoop and \p RemainderLoop. \p UnrolledLoop receives weights +/// that reflect TC/UF iterations, and \p RemainderLoop receives weights that +/// reflect the remaining TC%UF iterations. +/// +/// Note that \p OrigLoop may be equal to either \p UnrolledLoop or \p +/// RemainderLoop in which case weights for \p OrigLoop are updated accordingly. +/// Note also behavior is undefined if \p UnrolledLoop or \p RemainderLoop are +/// equal. If \p OrigLoop has no profile info associated nothing happens. +/// +/// This utility may be useful for such optimizations as unroller and +/// vectorizer as it's typical transformation for them. +void setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop, + Loop *RemainderLoop, uint64_t UF); + } // end namespace llvm #endif // LLVM_TRANSFORMS_UTILS_LOOPUTILS_H diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -32,6 +32,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ValueHandle.h" @@ -677,17 +678,14 @@ } } -Optional llvm::getLoopEstimatedTripCount(Loop *L) { - // Support loops with an exiting latch and other existing exists only - // deoptimize. - +static bool isLoopLatchControlsExit(Loop *L) { // Get the branch weights for the loop's backedge. BasicBlock *Latch = L->getLoopLatch(); if (!Latch) - return None; + return false; BranchInst *LatchBR = dyn_cast(Latch->getTerminator()); if (!LatchBR || LatchBR->getNumSuccessors() != 2 || !L->isLoopExiting(Latch)) - return None; + return false; assert((LatchBR->getSuccessor(0) == L->getHeader() || LatchBR->getSuccessor(1) == L->getHeader()) && @@ -698,24 +696,76 @@ if (any_of(ExitBlocks, [](const BasicBlock *EB) { return !EB->getTerminatingDeoptimizeCall(); })) + return false; + return true; +} + +Optional +llvm::getLoopEstimatedTripCount(Loop *L, + unsigned *EstimatedBackEdgeExitWeight) { + // Support loops with an exiting latch and other existing exists only + // deoptimize. + if (!isLoopLatchControlsExit(L)) return None; + BranchInst *LatchBR = + dyn_cast(L->getLoopLatch()->getTerminator()); + // To estimate the number of times the loop body was executed, we want to // know the number of times the backedge was taken, vs. the number of times // we exited the loop. - uint64_t TrueVal, FalseVal; - if (!LatchBR->extractProfMetadata(TrueVal, FalseVal)) + uint64_t BackEdgeTakenWeight, BackEdgeExitWeight; + if (!LatchBR->extractProfMetadata(BackEdgeTakenWeight, BackEdgeExitWeight)) return None; - if (!TrueVal || !FalseVal) + if (LatchBR->getSuccessor(0) != L->getHeader()) + std::swap(BackEdgeTakenWeight, BackEdgeExitWeight); + + if (EstimatedBackEdgeExitWeight) + *EstimatedBackEdgeExitWeight = BackEdgeExitWeight; + + if (!BackEdgeTakenWeight || !BackEdgeExitWeight) return 0; - // Divide the count of the backedge by the count of the edge exiting the loop, - // rounding to nearest. - if (LatchBR->getSuccessor(0) == L->getHeader()) - return (TrueVal + (FalseVal / 2)) / FalseVal; - else - return (FalseVal + (TrueVal / 2)) / TrueVal; + // Trip count is ratio of loop header block execution weight to loop exit + // weight. + uint64_t HeaderBlockWeight = + BackEdgeTakenWeight + BackEdgeExitWeight; + return llvm::divideNearest(HeaderBlockWeight, BackEdgeExitWeight); +} + +bool llvm::setLoopEstimatedTripCount(Loop *L, unsigned EstimatedTripCount, + unsigned EstimatedBackEdgeExitWeight) { + // Support loops with an exiting latch and other existing exists only + // deoptimize. + if (!isLoopLatchControlsExit(L)) + return false; + + // Calculate taken and exit weights. + unsigned BackEdgeExitWeight = 0; + unsigned BackEdgeTakenWeight = 0; + + if (EstimatedTripCount > 0) { + BackEdgeExitWeight = EstimatedBackEdgeExitWeight; + BackEdgeTakenWeight = (EstimatedTripCount - 1) * BackEdgeExitWeight; + } + + BasicBlock *LoopLatch = L->getLoopLatch(); + BranchInst *LatchBranch = dyn_cast(LoopLatch->getTerminator()); + bool IsBackEdgeTakenOnTrue = L->contains(*succ_begin(LoopLatch)); + + // Make a swap if back edge is taken when condition is "false". + if (!IsBackEdgeTakenOnTrue) + std::swap(BackEdgeTakenWeight, BackEdgeExitWeight); + + MDBuilder MDB(LatchBranch->getContext()); + + // Set/Update profile metadata. + LatchBranch->setMetadata( + LLVMContext::MD_prof, + MDB.createBranchWeights(BackEdgeTakenWeight, BackEdgeExitWeight)); + + return true; } bool llvm::hasIterationCountInvariantInParent(Loop *InnerLoop, @@ -1031,3 +1081,28 @@ SE.isLoopEntryGuardedByCond(L, Predicate, S, SE.getConstant(Max)); } + +/// Set weights for the \p UnrolledLoop \p RemainderLoop based on weights for +/// the \p OrigLoop. +void llvm::setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop, + Loop *RemainderLoop, uint64_t UF) { + assert(UnrolledLoop != RemainderLoop && + "Unrolled and Remainder loops are expected to not be the same"); + + // Get number of iterations in the original scalar loop. + unsigned OrigBackEdgeExitWeight = 0; + Optional OrigAverageTripCount = + getLoopEstimatedTripCount(OrigLoop, &OrigBackEdgeExitWeight); + if (!OrigAverageTripCount) + return; + + // Calculate number of iterations in unrolled loop. + unsigned UnrolledAverageTripCount = *OrigAverageTripCount / UF; + // Calculate number of iterations for remainder loop. + unsigned RemainderAverageTripCount = *OrigAverageTripCount % UF; + + setLoopEstimatedTripCount(UnrolledLoop, UnrolledAverageTripCount, + OrigBackEdgeExitWeight); + setLoopEstimatedTripCount(RemainderLoop, RemainderAverageTripCount, + OrigBackEdgeExitWeight); +} diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3460,6 +3460,19 @@ // Remove redundant induction instructions. cse(LoopVectorBody); + + // Set/update profile weights for the vector and remainder loops as original + // loop iterations are now distributed among them. Note that original loop + // represented by LoopScalarBody becomes remainder loop after vectorization. + // + // For cases like foldTailByMasking() and requiresScalarEpiloque() we may + // end up getting slightly roughened result but that should be OK since + // profile is not inherently precise anyway. Note also possible bypass of + // vector code caused by legality checks is not taken into account as unlikely + // case. + setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), + LI->getLoopFor(LoopVectorBody), + LI->getLoopFor(LoopScalarBody), VF * UF); } void InnerLoopVectorizer::fixCrossIterationPHIs() { diff --git a/llvm/test/Transforms/LoopVectorize/check-prof-info.ll b/llvm/test/Transforms/LoopVectorize/check-prof-info.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/check-prof-info.ll @@ -0,0 +1,97 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes="print,loop-vectorize" -force-vector-width=4 -force-vector-interleave=1 -S < %s | FileCheck %s +; RUN: opt -passes="print,loop-vectorize" -force-vector-width=4 -force-vector-interleave=4 -S < %s | FileCheck %s -check-prefix=CHECK-MASKED + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@a = dso_local global [1024 x i32] zeroinitializer, align 16 +@b = dso_local global [1024 x i32] zeroinitializer, align 16 + +; Check correctness of profile info for vectorization without epilog. +; Function Attrs: nofree norecurse nounwind uwtable +define dso_local void @_Z3foov() local_unnamed_addr #0 { +; CHECK-LABEL: @_Z3foov( +; CHECK: [[VECTOR_BODY:vector\.body]]: +; CHECK: br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_255:\!.*]], +; CHECK: [[FOR_BODY:for\.body]]: +; CHECK: br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP0_0:\!.*]], +; CHECK-MASKED: [[VECTOR_BODY:vector\.body]]: +; CHECK-MASKED: br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_63:\!.*]], +; CHECK-MASKED: [[FOR_BODY:for\.body]]: +; CHECK-MASKED: br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP0_0:\!.*]], +; +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4, !tbaa !2 + %1 = trunc i64 %indvars.iv to i32 + %mul = mul nsw i32 %0, %1 + %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %indvars.iv + %2 = load i32, i32* %arrayidx2, align 4, !tbaa !2 + %add = add nsw i32 %2, %mul + store i32 %add, i32* %arrayidx2, align 4, !tbaa !2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !6 +} + +; Check correctness of profile info for vectorization with epilog. +; Function Attrs: nofree norecurse nounwind uwtable +define dso_local void @_Z3foo2v() local_unnamed_addr #0 { +; CHECK-LABEL: @_Z3foo2v( +; CHECK: [[VECTOR_BODY:vector\.body]]: +; CHECK: br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_255:\!.*]], +; CHECK: [[FOR_BODY:for\.body]]: +; CHECK: br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP1_2:\!.*]], +; CHECK-MASKED: [[VECTOR_BODY:vector\.body]]: +; CHECK-MASKED: br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_63:\!.*]], +; CHECK-MASKED: [[FOR_BODY:for\.body]]: +; CHECK-MASKED: br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP1_2:\!.*]], +; +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4, !tbaa !2 + %1 = trunc i64 %indvars.iv to i32 + %mul = mul nsw i32 %0, %1 + %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %indvars.iv + %2 = load i32, i32* %arrayidx2, align 4, !tbaa !2 + %add = add nsw i32 %2, %mul + store i32 %add, i32* %arrayidx2, align 4, !tbaa !2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1027 + br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !7 +} + +attributes #0 = { "use-soft-float"="false" } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +; CHECK: [[LP1_255]] = !{!"branch_weights", i32 1, i32 255} +; CHECK: [[LP0_0]] = !{!"branch_weights", i32 0, i32 0} +; CHECK-MASKED: [[LP1_63]] = !{!"branch_weights", i32 1, i32 63} +; CHECK-MASKED: [[LP0_0]] = !{!"branch_weights", i32 0, i32 0} +; CHECK: [[LP1_2]] = !{!"branch_weights", i32 1, i32 2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project c292b5b5e059e6ce3e6449e6827ef7e1037c21c4)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C++ TBAA"} +!6 = !{!"branch_weights", i32 1, i32 1023} +!7 = !{!"branch_weights", i32 1, i32 1026} diff --git a/llvm/test/Transforms/LoopVectorize/tripcount.ll b/llvm/test/Transforms/LoopVectorize/tripcount.ll --- a/llvm/test/Transforms/LoopVectorize/tripcount.ll +++ b/llvm/test/Transforms/LoopVectorize/tripcount.ll @@ -61,8 +61,10 @@ ; but has a high trip count per invocation. Vectorize it. ; CHECK-LABEL: @foo_low_trip_count3( -; CHECK: vector.body: - +; CHECK: [[VECTOR_BODY:vector\.body]]: +; CHECK: br i1 [[TMP9:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP3:\!.*]], +; CHECK: [[FOR_BODY:for\.body]]: +; CHECK: br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP6:\!.*]], entry: br i1 %cond, label %for.preheader, label %for.end, !prof !2 @@ -205,6 +207,9 @@ ret i32 0 } +; CHECK: [[LP3]] = !{!"branch_weights", i32 10, i32 2490} +; CHECK: [[LP6]] = !{!"branch_weights", i32 10, i32 0} + !0 = !{!"function_entry_count", i64 100} !1 = !{!"branch_weights", i32 100, i32 0} !2 = !{!"branch_weights", i32 10, i32 90}