diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -357,6 +357,21 @@
 bool cannotBeMinInLoop(const SCEV *S, const Loop *L, ScalarEvolution &SE,
                        bool Signed);
 
+/// Update profile info for \p OrigLoop and \p UnrolledLoop so that the
+/// original number of iterations of \p OrigLoop (TC) is split as follows:
+/// \p OrigLoop, which acts as the remainder loop, gets TC % UF iterations,
+/// while the remaining iterations are executed by \p UnrolledLoop. Since
+/// \p UnrolledLoop executes blocks of \p UF original iterations, it performs
+/// TC / UF iterations in total.
+///
+/// This utility is useful for transformations such as loop unrolling and
+/// vectorization, for which this kind of loop splitting is typical.
+///
+/// Nothing is done if \p OrigLoop has no usable profile info, if either
+/// loop has no unique latch, or if \p UF is zero.
+void fixProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop,
+                                  uint64_t UF);
+
 } // end namespace llvm
 
 #endif // LLVM_TRANSFORMS_UTILS_LOOPUTILS_H
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -32,6 +32,7 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/ValueHandle.h"
@@ -1031,3 +1032,90 @@
          SE.isLoopEntryGuardedByCond(L, Predicate, S,
                                      SE.getConstant(Max));
 }
+
+/// Split the latch branch weights of \p OrigLoop between \p UnrolledLoop and
+/// \p OrigLoop, which is treated as the remainder loop from now on. Each
+/// iteration of \p UnrolledLoop accounts for \p UF original iterations.
+void llvm::fixProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop,
+                                        uint64_t UF) {
+  // A zero factor would divide by zero below, and a loop without a unique
+  // latch has no single branch to attach the weights to.
+  BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
+  BasicBlock *UnrolledLatch = UnrolledLoop->getLoopLatch();
+  if (UF == 0 || !OrigLatch || !UnrolledLatch)
+    return;
+
+  Instruction *OrigLatchBranch = OrigLatch->getTerminator();
+  Instruction *UnrolledLatchBranch = UnrolledLatch->getTerminator();
+
+  // The weights are read in (true, false) successor order. They are
+  // provisionally named (backedge, entry) and swapped below if the "true"
+  // successor turns out to leave the loop.
+  uint64_t OrigBackedgeTakenWeight = 0;
+  uint64_t OrigLoopEntryWeight = 0;
+  if (!OrigLatchBranch->extractProfMetadata(OrigBackedgeTakenWeight,
+                                            OrigLoopEntryWeight))
+    return;
+
+  // The backedge is the "true" edge iff the first successor of the latch
+  // stays inside the loop.
+  const bool IsTrueBackEdgeOrigLoop =
+      OrigLoop->contains(*succ_begin(OrigLatch));
+  const bool IsTrueBackEdgeUnrolledLoop =
+      UnrolledLoop->contains(*succ_begin(UnrolledLatch));
+
+  if (!IsTrueBackEdgeOrigLoop)
+    std::swap(OrigBackedgeTakenWeight, OrigLoopEntryWeight);
+
+  // A zero entry weight gives no trip count estimate (and would divide by
+  // zero below).
+  if (OrigLoopEntryWeight == 0)
+    return;
+
+  // Average number of iterations per entry into the original scalar loop.
+  const uint64_t OrigHeaderBlockWeight =
+      OrigBackedgeTakenWeight + OrigLoopEntryWeight;
+  const uint64_t OrigAverageTripCount =
+      OrigHeaderBlockWeight / OrigLoopEntryWeight;
+  // Each unrolled iteration covers UF original iterations ...
+  const uint64_t UnrolledAverageTripCount = OrigAverageTripCount / UF;
+  // ... and the leftover iterations run in the remainder loop.
+  const uint64_t RemainderAverageTripCount = OrigAverageTripCount % UF;
+
+  // A loop entered N times that runs TC iterations per entry takes its
+  // backedge (TC - 1) * N times and its exit edge N times. A loop that is
+  // estimated to never iterate keeps zero weights.
+  uint64_t UnrolledLoopBackedgeWeight = 0;
+  uint64_t UnrolledLoopEntryWeight = 0;
+  if (UnrolledAverageTripCount > 0) {
+    UnrolledLoopBackedgeWeight =
+        (UnrolledAverageTripCount - 1) * OrigLoopEntryWeight;
+    UnrolledLoopEntryWeight = OrigLoopEntryWeight;
+  }
+
+  uint64_t RemainderLoopBackedgeWeight = 0;
+  uint64_t RemainderLoopEntryWeight = 0;
+  if (RemainderAverageTripCount > 0) {
+    RemainderLoopBackedgeWeight =
+        (RemainderAverageTripCount - 1) * OrigLoopEntryWeight;
+    RemainderLoopEntryWeight = OrigLoopEntryWeight;
+  }
+
+  // Branch weights are stored in (true, false) successor order, so swap when
+  // the backedge is taken on a "false" condition.
+  if (!IsTrueBackEdgeUnrolledLoop)
+    std::swap(UnrolledLoopBackedgeWeight, UnrolledLoopEntryWeight);
+  if (!IsTrueBackEdgeOrigLoop)
+    std::swap(RemainderLoopBackedgeWeight, RemainderLoopEntryWeight);
+
+  // Set new profile metadata.
+  MDBuilder MDB(OrigLatchBranch->getContext());
+  UnrolledLatchBranch->setMetadata(
+      LLVMContext::MD_prof,
+      MDB.createBranchWeights(UnrolledLoopBackedgeWeight,
+                              UnrolledLoopEntryWeight));
+  OrigLatchBranch->setMetadata(
+      LLVMContext::MD_prof,
+      MDB.createBranchWeights(RemainderLoopBackedgeWeight,
+                              RemainderLoopEntryWeight));
+}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -115,6 +115,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
@@ -3452,6 +3453,13 @@
 
   // Remove redundant induction instructions.
   cse(LoopVectorBody);
+
+  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
+  // end up with a slightly imprecise result, but that should be OK since
+  // profile info is not inherently precise anyway. Note also that a possible
+  // bypass of the vector code by legality checks is treated as unlikely.
+  fixProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
+                               LI->getLoopFor(LoopVectorBody), VF * UF);
 }
 
 void InnerLoopVectorizer::fixCrossIterationPHIs() {
diff --git a/llvm/test/Transforms/LoopVectorize/check-prof-info.ll b/llvm/test/Transforms/LoopVectorize/check-prof-info.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/check-prof-info.ll
@@ -0,0 +1,105 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes="print,loop-vectorize" -force-vector-width=4 -force-vector-interleave=1 -S < %s | FileCheck %s
+; RUN: opt -passes="print,loop-vectorize" -force-vector-width=4 -force-vector-interleave=4 -S < %s | FileCheck %s -check-prefix=CHECK-MASKED
+; The first RUN line (prefix CHECK) vectorizes by 4 without interleaving; the
+; second one (prefix CHECK-MASKED) additionally interleaves by 4.
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@a = dso_local global [1024 x i32] zeroinitializer, align 16
+@b = dso_local global [1024 x i32] zeroinitializer, align 16
+
+; Check correctness of profile info for vectorization without epilog.
+; The loop runs 1024 iterations per entry (!6). Covering 4 iterations at a
+; time the vector loop gets 256 iterations (weights 1 and 255); covering 16 it
+; gets 64 (weights 1 and 63). Nothing is left for the scalar loop (0 and 0).
+; Function Attrs: nofree norecurse nounwind uwtable
+define dso_local void @_Z3foov() local_unnamed_addr #0 {
+; CHECK-LABEL: @_Z3foov(
+; CHECK: [[VECTOR_BODY:vector\.body]]:
+; CHECK: br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_255:\!.*]],
+; CHECK: [[FOR_BODY:for\.body]]:
+; CHECK: br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP0_0:\!.*]],
+; CHECK-MASKED: [[VECTOR_BODY:vector\.body]]:
+; CHECK-MASKED: br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_63:\!.*]],
+; CHECK-MASKED: [[FOR_BODY:for\.body]]:
+; CHECK-MASKED: br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP0_0:\!.*]],
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4, !tbaa !2
+  %1 = trunc i64 %indvars.iv to i32
+  %mul = mul nsw i32 %0, %1
+  %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx2, align 4, !tbaa !2
+  %add = add nsw i32 %2, %mul
+  store i32 %add, i32* %arrayidx2, align 4, !tbaa !2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !6
+}
+
+; Check correctness of profile info for vectorization with epilog.
+; The loop runs 1027 iterations per entry (!7), so in both configurations
+; 3 iterations are left for the scalar remainder loop (weights 1 and 2).
+; Function Attrs: nofree norecurse nounwind uwtable
+define dso_local void @_Z3foo2v() local_unnamed_addr #0 {
+; CHECK-LABEL: @_Z3foo2v(
+; CHECK: [[VECTOR_BODY:vector\.body]]:
+; CHECK: br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_255:\!.*]],
+; CHECK: [[FOR_BODY:for\.body]]:
+; CHECK: br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP1_2:\!.*]],
+; CHECK-MASKED: [[VECTOR_BODY:vector\.body]]:
+; CHECK-MASKED: br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_63:\!.*]],
+; CHECK-MASKED: [[FOR_BODY:for\.body]]:
+; CHECK-MASKED: br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP1_2:\!.*]],
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4, !tbaa !2
+  %1 = trunc i64 %indvars.iv to i32
+  %mul = mul nsw i32 %0, %1
+  %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx2, align 4, !tbaa !2
+  %add = add nsw i32 %2, %mul
+  store i32 %add, i32* %arrayidx2, align 4, !tbaa !2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1027
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !7
+}
+
+attributes #0 = { "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+; CHECK: [[LP1_255]] = !{!"branch_weights", i32 1, i32 255}
+; CHECK: [[LP0_0]] = !{!"branch_weights", i32 0, i32 0}
+; CHECK-MASKED: [[LP1_63]] = !{!"branch_weights", i32 1, i32 63}
+; CHECK-MASKED: [[LP0_0]] = !{!"branch_weights", i32 0, i32 0}
+; CHECK: [[LP1_2]] = !{!"branch_weights", i32 1, i32 2}
+; CHECK-MASKED: [[LP1_2]] = !{!"branch_weights", i32 1, i32 2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project c292b5b5e059e6ce3e6449e6827ef7e1037c21c4)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C++ TBAA"}
+!6 = !{!"branch_weights", i32 1, i32 1023}
+!7 = !{!"branch_weights", i32 1, i32 1026}
diff --git a/llvm/test/Transforms/LoopVectorize/tripcount.ll b/llvm/test/Transforms/LoopVectorize/tripcount.ll
--- a/llvm/test/Transforms/LoopVectorize/tripcount.ll
+++ b/llvm/test/Transforms/LoopVectorize/tripcount.ll
@@ -61,8 +61,10 @@
 ; but has a high trip count per invocation. Vectorize it.
 
 ; CHECK-LABEL: @foo_low_trip_count3(
-; CHECK: vector.body:
-
+; CHECK: [[VECTOR_BODY:vector\.body]]:
+; CHECK: br i1 [[TMP9:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP3:\!.*]],
+; CHECK: [[FOR_BODY:for\.body]]:
+; CHECK: br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP6:\!.*]],
 entry:
   br i1 %cond, label %for.preheader, label %for.end, !prof !2
 
@@ -205,6 +207,9 @@
   ret i32 0
 }
 
+; CHECK: [[LP3]] = !{!"branch_weights", i32 10, i32 2490}
+; CHECK: [[LP6]] = !{!"branch_weights", i32 10, i32 0}
+
 !0 = !{!"function_entry_count", i64 100}
 !1 = !{!"branch_weights", i32 100, i32 0}
 !2 = !{!"branch_weights", i32 10, i32 90}