diff --git a/llvm/lib/Transforms/Scalar/LoopSink.cpp b/llvm/lib/Transforms/Scalar/LoopSink.cpp
--- a/llvm/lib/Transforms/Scalar/LoopSink.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopSink.cpp
@@ -177,13 +177,28 @@
   SmallPtrSet<BasicBlock *, 2> BBs;
   for (auto &U : I.uses()) {
     Instruction *UI = cast<Instruction>(U.getUser());
-    // We cannot sink I to PHI-uses.
-    if (isa<PHINode>(UI))
-      return false;
+
     // We cannot sink I if it has uses outside of the loop.
     if (!L.contains(LI.getLoopFor(UI->getParent())))
       return false;
-    BBs.insert(UI->getParent());
+
+    // An ordinary user needs I to be available in the user's own block.
+    if (!isa<PHINode>(UI)) {
+      BBs.insert(UI->getParent());
+      continue;
+    }
+
+    // We cannot sink I into the PHI's block itself, so look through the PHI:
+    // this use only needs I to be available in the incoming block of U.
+    PHINode *PN = cast<PHINode>(UI);
+    BasicBlock *PhiBB = PN->getIncomingBlock(U);
+
+    // If the value flows in directly from the loop preheader, there is no
+    // place inside the loop to sink to, so bail out.
+    if (L.getLoopPreheader() == PhiBB)
+      return false;
+
+    BBs.insert(PhiBB);
   }
 
   // findBBsToSinkInto is O(BBs.size() * ColdLoopBBs.size()). We cap the max
diff --git a/llvm/test/Transforms/LICM/loopsink-phi.ll b/llvm/test/Transforms/LICM/loopsink-phi.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LICM/loopsink-phi.ll
@@ -0,0 +1,117 @@
+; RUN: opt -S -verify-memoryssa -aa-pipeline=basic-aa -passes=loop-sink < %s | FileCheck %s
+; Make sure that unprofitable LICM hoisting can be undone by loop sink, and that loop sink
+; can look through a PHI use and sink into the PHI's incoming blocks.
+
+
+; CHECK: .l.cold1:
+; CHECK-NEXT: {{.*}} = mul nsw i32 {{.*}}, {{.*}}
+; CHECK: .l.cold2:
+; CHECK-NEXT: {{.*}} = add nsw i32 {{.*}}, {{.*}}
+
+define dso_local i32 @_Z3fooii(i32 %0, i32 %1) local_unnamed_addr #0 !dbg !30 !prof !36 {
+  %3 = tail call i32 @_Z3bari(i32 %1), !dbg !37, !prof !38
+  %4 = icmp eq i32 %0, 0, !dbg !39
+  br i1 %4, label %.l.ret, label %.l.check.preheader, !dbg !42
+
+.l.check.preheader:                               ; preds = %2
+  %flag = icmp eq i32 %1, 5
+  %tmp2 = add nsw i32 %3, %3
+  %tmp1 = mul nsw i32 %3, %3
+  br label %.l.check, !dbg !42
+
+.l.ret.loopexit:                                  ; preds = %.l.iterate
+  %.lcssa = phi i32 [ %12, %.l.iterate ]
+  br label %.l.ret, !dbg !43
+
+.l.ret:                                           ; preds = %.l.ret.loopexit, %2
+  %5 = phi i32 [ 0, %2 ], [ %.lcssa, %.l.ret.loopexit ]
+  ret i32 %5, !dbg !43
+
+.l.check:                                         ; preds = %.l.iterate, %.l.check.preheader
+  %6 = phi i32 [ 0, %.l.check.preheader ], [ %13, %.l.iterate ]
+  %7 = phi i32 [ %0, %.l.check.preheader ], [ %12, %.l.iterate ]
+  %8 = icmp eq i32 %6, %1, !dbg !44
+  br i1 %8, label %.l.cold, label %.l.iterate, !dbg !46, !prof !47
+
+.l.cold:                                          ; preds = %.l.check
+  br i1 %flag, label %.l.cold1, label %.l.cold2
+
+.l.cold1:                                         ; preds = %.l.cold
+  br label %.l.cold3
+
+.l.cold2:                                         ; preds = %.l.cold
+  br label %.l.cold3
+
+.l.cold3:                                         ; preds = %.l.cold2, %.l.cold1
+  %9 = phi i32 [ %tmp1, %.l.cold1 ], [ %tmp2, %.l.cold2 ]
+  %10 = tail call i32 @_Z3bari(i32 %7), !dbg !48
+  %11 = add nsw i32 %10, %9, !dbg !49
+  br label %.l.iterate, !dbg !50
+
+.l.iterate:                                       ; preds = %.l.cold3, %.l.check
+  %12 = phi i32 [ %11, %.l.cold3 ], [ %7, %.l.check ]
+  %13 = add nuw nsw i32 %6, 1, !dbg !51
+  %14 = icmp eq i32 %13, %12, !dbg !39
+  br i1 %14, label %.l.ret.loopexit, label %.l.check, !dbg !42, !llvm.loop !52
+}
+
+declare dso_local i32 @_Z3bari(i32) local_unnamed_addr
+
+attributes #0 = { "use-sample-profile" }
+
+!llvm.module.flags = !{!0, !1}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{i32 1, !"ProfileSummary", !2}
+!2 = !{!3, !4, !5, !6, !7, !8, !9, !10, !11, !12}
+!3 = !{!"ProfileFormat", !"SampleProfile"}
+!4 = !{!"TotalCount", i64 403}
+!5 = !{!"MaxCount", i64 200}
+!6 = !{!"MaxInternalCount", i64 0}
+!7 = !{!"MaxFunctionCount", i64 1}
+!8 = !{!"NumCounts", i64 6}
+!9 = !{!"NumFunctions", i64 1}
+!10 = !{!"IsPartialProfile", i64 0}
+!11 = !{!"PartialProfileRatio", double 0.000000e+00}
+!12 = !{!"DetailedSummary", !13}
+!13 = !{!14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29}
+!14 = !{i32 10000, i64 200, i32 2}
+!15 = !{i32 100000, i64 200, i32 2}
+!16 = !{i32 200000, i64 200, i32 2}
+!17 = !{i32 300000, i64 200, i32 2}
+!18 = !{i32 400000, i64 200, i32 2}
+!19 = !{i32 500000, i64 200, i32 2}
+!20 = !{i32 600000, i64 200, i32 2}
+!21 = !{i32 700000, i64 200, i32 2}
+!22 = !{i32 800000, i64 200, i32 2}
+!23 = !{i32 900000, i64 200, i32 2}
+!24 = !{i32 950000, i64 200, i32 2}
+!25 = !{i32 990000, i64 200, i32 2}
+!26 = !{i32 999000, i64 1, i32 5}
+!27 = !{i32 999900, i64 1, i32 5}
+!28 = !{i32 999990, i64 1, i32 5}
+!29 = !{i32 999999, i64 1, i32 5}
+!30 = distinct !DISubprogram(name: "foo", linkageName: "_Z3fooii", scope: !31, file: !31, line: 2, type: !32, scopeLine: 2, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !35)
+!31 = !DIFile(filename: "foo.cpp", directory: "/tmp/gather_pgo")
+!32 = !DISubroutineType(types: !33)
+!33 = !{!34, !34, !34}
+!34 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!35 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !31, producer: "clang version 8.0.20181009 ", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, nameTableKind: None)
+!36 = !{!"function_entry_count", i64 2}
+!37 = !DILocation(line: 3, column: 14, scope: !30)
+!38 = !{!"branch_weights", i32 2}
+!39 = !DILocation(line: 4, column: 21, scope: !40)
+!40 = distinct !DILexicalBlock(scope: !41, file: !31, line: 4, column: 3)
+!41 = distinct !DILexicalBlock(scope: !30, file: !31, line: 4, column: 3)
+!42 = !DILocation(line: 4, column: 3, scope: !41)
+!43 = !DILocation(line: 7, column: 3, scope: !30)
+!44 = !DILocation(line: 5, column: 11, scope: !45)
+!45 = distinct !DILexicalBlock(scope: !40, file: !31, line: 5, column: 9)
+!46 = !DILocation(line: 5, column: 9, scope: !40)
+!47 = !{!"branch_weights", i32 1, i32 201}
+!48 = !DILocation(line: 6, column: 30, scope: !45)
+!49 = !DILocation(line: 6, column: 28, scope: !45)
+!50 = !DILocation(line: 6, column: 7, scope: !45)
+!51 = !DILocation(line: 4, column: 30, scope: !40)
+!52 = distinct !{!52, !42, !53}
+!53 = !DILocation(line: 6, column: 38, scope: !41)