diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -946,13 +946,23 @@
     /// matrix instructions.
     const MapVector<Value *, ColumnMatrixTy> &Inst2ColumnMatrix;
 
+    /// Mapping from values to the leaves of all expressions that the value is
+    /// part of.
+    const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared;
+
+    /// Leaf node of the expression to linearize.
+    Value *Leaf;
+
     /// Used to keep track of sub-expressions that get reused while linearizing
     /// the expression. Re-used sub-expressions are marked as (reused).
     SmallPtrSet<Value *, 8> ReusedExprs;
 
     ExprLinearizer(const DataLayout &DL,
-                   const MapVector<Value *, ColumnMatrixTy> &Inst2ColumnMatrix)
-        : Str(), Stream(Str), DL(DL), Inst2ColumnMatrix(Inst2ColumnMatrix) {}
+                   const MapVector<Value *, ColumnMatrixTy> &Inst2ColumnMatrix,
+                   const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,
+                   Value *Leaf)
+        : Str(), Stream(Str), DL(DL), Inst2ColumnMatrix(Inst2ColumnMatrix),
+          Shared(Shared), Leaf(Leaf) {}
 
     void indent(unsigned N) {
       LineLength += N;
@@ -1108,11 +1118,30 @@
     /// Linearize expression \p Expr starting at an indentation of \p Indent.
     /// Expressions that are re-used multiple times are prefixed with (reused)
     /// at the re-used root instruction.
-    void linearizeExpr(Value *Expr, unsigned Indent, bool ParentReused) {
+    void linearizeExpr(Value *Expr, unsigned Indent, bool ParentReused,
+                       bool ParentShared) {
       auto *I = cast<Instruction>(Expr);
       maybeIndent(Indent);
       SmallVector<Value *, 8> Ops;
 
+      // Is Expr shared with other expression leaves?
+      bool ExprShared = false;
+
+      // Deal with shared subtrees. Mark them as shared, if required.
+      if (!ParentShared) {
+        auto SI = Shared.find(Expr);
+        assert(SI != Shared.end() && SI->second.count(Leaf));
+
+        for (Value *S : SI->second) {
+          if (S == Leaf)
+            continue;
+          DebugLoc Loc = cast<Instruction>(S)->getDebugLoc();
+          write("shared with remark at line " + std::to_string(Loc.getLine()) +
+                " column " + std::to_string(Loc.getCol()) + " (");
+        }
+        ExprShared = SI->second.size() > 1;
+      }
+
       bool Reused = !ReusedExprs.insert(Expr).second;
       if (Reused && !ParentReused)
         write("(reused) ");
@@ -1144,7 +1173,7 @@
 
         maybeIndent(Indent + 1);
         if (isMatrix(Op))
-          linearizeExpr(Op, Indent + 1, Reused);
+          linearizeExpr(Op, Indent + 1, Reused, ExprShared);
         else
           write(Op);
         if (Op != Ops.back())
@@ -1171,7 +1200,6 @@
   ///
   /// TODO:
   ///  * Summarize number of vector instructions generated for each expression.
-  ///  * Account for shared sub-expressions.
   ///  * Propagate matrix remarks up the inlining chain.
   struct RemarkGenerator {
     const MapVector<Value *, ColumnMatrixTy> &Inst2ColumnMatrix;
@@ -1194,9 +1222,26 @@
       return Leaves;
     }
 
+    /// Recursively traverse expression \p V starting at \p Leaf and add \p Leaf
+    /// to all visited expressions in \p Shared.
+    void collectSharedInfo(Value *Leaf, Value *V,
+                           DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared) {
+      // Stop at values that have no entry in Inst2ColumnMatrix.
+      if (!Inst2ColumnMatrix.count(V))
+        return;
+
+      Shared[V].insert(Leaf);
+
+      for (Value *Op : cast<Instruction>(V)->operand_values())
+        collectSharedInfo(Leaf, Op, Shared);
+    }
+
     /// Calculate the number of exclusive and shared op counts for expression
-    /// starting at \p V. Expressions used multiple times are counted once.
-    OpInfoTy sumOpInfos(Value *Root, SmallPtrSetImpl<Value *> &ReusedExprs) {
+    /// starting at \p Root. Expressions used multiple times are counted once.
+    /// Returns the pair {exclusive counts, shared counts}.
+    std::pair<OpInfoTy, OpInfoTy>
+    sumOpInfos(Value *Root, SmallPtrSetImpl<Value *> &ReusedExprs,
+               DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared) {
       auto CM = Inst2ColumnMatrix.find(Root);
       if (CM == Inst2ColumnMatrix.end())
         return {};
@@ -1205,10 +1250,23 @@
       if (!ReusedExprs.insert(Root).second)
         return {};
 
-      OpInfoTy Count = CM->second.getOpInfo();
-      for (Value *Op : cast<Instruction>(Root)->operand_values())
-        Count += sumOpInfos(Op, ReusedExprs);
-      return Count;
+      OpInfoTy SharedCount;
+      OpInfoTy Count;
+
+      // A value reachable from exactly one leaf is exclusive to that leaf.
+      auto I = Shared.find(Root);
+      assert(I != Shared.end() && "Root must have an entry in Shared");
+      if (I->second.size() == 1)
+        Count = CM->second.getOpInfo();
+      else
+        SharedCount = CM->second.getOpInfo();
+
+      for (Value *Op : cast<Instruction>(Root)->operand_values()) {
+        auto C = sumOpInfos(Op, ReusedExprs, Shared);
+        Count += C.first;
+        SharedCount += C.second;
+      }
+      return {Count, SharedCount};
     }
 
     void emitRemarks() {
@@ -1218,26 +1276,49 @@
       // Find leafs of matrix expressions.
       auto Leaves = getExpressionLeaves();
 
+      DenseMap<Value *, SmallPtrSet<Value *, 2>> Shared;
+
+      for (Value *Leaf : Leaves)
+        collectSharedInfo(Leaf, Leaf, Shared);
+
       // Generate remarks for each leaf.
       for (auto *L : Leaves) {
         SmallPtrSet<Value *, 8> ReusedExprs;
-        auto Counts = sumOpInfos(L, ReusedExprs);
+        OpInfoTy Counts, SharedCounts;
+        std::tie(Counts, SharedCounts) = sumOpInfos(L, ReusedExprs, Shared);
+
         OptimizationRemark Rem(DEBUG_TYPE, "matrix-lowered",
                                cast<Instruction>(L)->getDebugLoc(),
                                cast<Instruction>(L)->getParent());
+
         Rem << "Lowered with ";
         Rem << ore::NV("NumStores", Counts.NumStores) << " stores, "
             << ore::NV("NumLoads", Counts.NumLoads) << " loads, "
             << ore::NV("NumComputeOps", Counts.NumComputeOps) << " compute ops";
 
-        Rem << ("\n" + linearize(L, DL));
+        // NOTE(review): the shared compute-op count is keyed "NumFPOps" while
+        // the exclusive one is "NumComputeOps"; the YAML test pins "NumFPOps".
+        if (SharedCounts.NumStores > 0 || SharedCounts.NumLoads > 0 ||
+            SharedCounts.NumComputeOps > 0) {
+          Rem << ",\nadditionally "
+              << ore::NV("NumStores", SharedCounts.NumStores) << " stores, "
+              << ore::NV("NumLoads", SharedCounts.NumLoads) << " loads, "
+              << ore::NV("NumFPOps", SharedCounts.NumComputeOps)
+              << " compute ops"
+              << " are shared with other expressions";
+        }
+
+        Rem << ("\n" + linearize(L, Shared, DL));
         ORE.emit(Rem);
       }
     }
 
-    std::string linearize(Value *L, const DataLayout &DL) {
-      ExprLinearizer Lin(DL, Inst2ColumnMatrix);
-      Lin.linearizeExpr(L, 0, false);
+    std::string
+    linearize(Value *L,
+              const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,
+              const DataLayout &DL) {
+      ExprLinearizer Lin(DL, Inst2ColumnMatrix, Shared, L);
+      Lin.linearizeExpr(L, 0, false, false);
       return Lin.getResult();
     }
   };
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-shared-subtrees.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-shared-subtrees.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-shared-subtrees.ll
@@ -0,0 +1,162 @@
+; REQUIRES: aarch64-registered-target
+
+; This test needs to be target specific due to the cost estimate in the output.
+ +; RUN: opt -lower-matrix-intrinsics -pass-remarks-output=%t -pass-remarks=lower-matrix-intrinsics -mtriple=arm64-apple-iphoneos %s 2>&1 -disable-output | FileCheck --check-prefix=STDERR %s +; RUN: FileCheck --input-file=%t --check-prefix=YAML %s + +; YAML-LABEL: --- !Passed +; YAML-NEXT: Pass: lower-matrix-intrinsics +; YAML-NEXT: Name: matrix-lowered +; YAML-NEXT: DebugLoc: { File: test.cpp, Line: 35, Column: 71 } +; YAML-NEXT: Function: test_2leafs +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'Lowered with ' +; YAML-NEXT: - NumStores: '4' +; YAML-NEXT: - String: ' stores, ' +; YAML-NEXT: - NumLoads: '0' +; YAML-NEXT: - String: ' loads, ' +; YAML-NEXT: - NumComputeOps: '0' +; YAML-NEXT: - String: ' compute ops' +; YAML-NEXT: - String: ', +; YAML-NEXT: additionally ' +; YAML-NEXT: - NumStores: '0' +; YAML-NEXT: - String: ' stores, ' +; YAML-NEXT: - NumLoads: '4' +; YAML-NEXT: - String: ' loads, ' +; YAML-NEXT: - NumFPOps: '16' +; YAML-NEXT: - String: ' compute ops' +; YAML-NEXT: - String: ' are shared with other expressions' +; YAML-NEXT: - String: | +; YAML: columnwise.store.4x2.double( +; YAML-NEXT: shared with remark at line 35 column 45 (transpose.2x4.double(columnwise.load.2x4.double(addr %arg1, +; YAML-NEXT: scalar)), +; YAML-NEXT: addr %arg3, +; YAML-NEXT: 10) + +; YAML-LABEL: --- !Passed +; YAML-NEXT: Pass: lower-matrix-intrinsics +; YAML-NEXT: Name: matrix-lowered +; YAML-NEXT: DebugLoc: { File: test.cpp, Line: 35, Column: 45 } +; YAML-NEXT: Function: test_2leafs +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'Lowered with ' +; YAML-NEXT: - NumStores: '30' +; YAML-NEXT: - String: ' stores, ' +; YAML-NEXT: - NumLoads: '45' +; YAML-NEXT: - String: ' loads, ' +; YAML-NEXT: - NumComputeOps: '120' +; YAML-NEXT: - String: ' compute ops' +; YAML-NEXT: - String: ', +; YAML-NEXT: additionally ' +; YAML-NEXT: - NumStores: '0' +; YAML-NEXT: - String: ' stores, ' +; YAML-NEXT: - NumLoads: '4' +; YAML-NEXT: - String: ' loads, ' +; YAML-NEXT: - NumFPOps: '16' +; 
YAML-NEXT: - String: ' compute ops' +; YAML-NEXT: - String: ' are shared with other expressions' +; YAML-NEXT: - String: | +; YAML: columnwise.store.4x15.double( +; YAML-NEXT: fsub( +; YAML-NEXT: columnwise.load.4x15.double(addr %arg2, 20), +; YAML-NEXT: multiply.4x2.2x15.double( +; YAML-NEXT: shared with remark at line 35 column 71 (transpose.2x4.double(columnwise.load.2x4.double(addr %arg1, +; YAML-NEXT: scalar)), +; YAML-NEXT: columnwise.load.2x15.double(addr %arg3, scalar))), +; YAML-NEXT: addr %arg2, +; YAML-NEXT: 10) + + +; STDERR-LABEL: remark: test.cpp:35:71: Lowered with 4 stores, 0 loads, 0 compute ops, +; STDERR-NEXT: additionally 0 stores, 4 loads, 16 compute ops are shared with other expressions +; STDERR-NEXT: columnwise.store.4x2.double( +; STDERR-NEXT: shared with remark at line 35 column 45 (transpose.2x4.double(columnwise.load.2x4.double(addr %arg1, +; STDERR-NEXT: scalar)), +; STDERR-NEXT: addr %arg3, +; STDERR-NEXT: 10) + +; STDERR-LABEL: remark: test.cpp:35:45: Lowered with 30 stores, 45 loads, 120 compute ops, +; STDERR-NEXT: additionally 0 stores, 4 loads, 16 compute ops are shared with other expressions +; STDERR-NEXT: columnwise.store.4x15.double( +; STDERR-NEXT: fsub( +; STDERR-NEXT: columnwise.load.4x15.double(addr %arg2, 20), +; STDERR-NEXT: multiply.4x2.2x15.double( +; STDERR-NEXT: shared with remark at line 35 column 71 (transpose.2x4.double(columnwise.load.2x4.double(addr %arg1, +; STDERR-NEXT: scalar)), +; STDERR-NEXT: columnwise.load.2x15.double(addr %arg3, scalar))), +; STDERR-NEXT: addr %arg2, +; STDERR-NEXT: 10) +define void @test_2leafs(double* %arg1, double* %arg2, double* %arg3, i32 %stride, i32 %offset) !dbg !8 { +bb: + %shared.load = tail call <8 x double> @llvm.matrix.columnwise.load.v8f64.p0f64(double* %arg1, i32 %stride, i32 2, i32 4), !dbg !10, !noalias !10 + %shared.load.2 = tail call <30 x double> @llvm.matrix.columnwise.load.v30f64.p0f64(double* %arg3, i32 %stride, i32 2, i32 15), !dbg !10, !noalias !10 + %tmp17 = 
tail call <8 x double> @llvm.matrix.transpose.v8f64(<8 x double> %shared.load, i32 2, i32 4), !dbg !10 + tail call void @llvm.matrix.columnwise.store.v8f64.p0f64(<8 x double> %tmp17, double* %arg3, i32 10, i32 4, i32 2), !dbg !10 + %tmp18 = tail call <60 x double> @llvm.matrix.columnwise.load.v60f64.p0f64(double* %arg2, i32 20, i32 4, i32 15), !dbg !11 + %tmp48 = tail call <60 x double> @llvm.matrix.multiply.v60f64.v8f64.v30f64(<8 x double> %tmp17, <30 x double> %shared.load.2, i32 4, i32 2, i32 15), !dbg !11 + %tmp49 = fsub <60 x double> %tmp18, %tmp48, !dbg !11 + tail call void @llvm.matrix.columnwise.store.v60f64.p0f64(<60 x double> %tmp49, double* %arg2, i32 10, i32 4, i32 15), !dbg !11 + ret void +} + +declare <8 x double> @llvm.matrix.transpose.v8f64(<8 x double>, i32 immarg, i32 immarg) +declare <8 x double> @llvm.matrix.columnwise.load.v8f64.p0f64(double*, i32, i32 immarg, i32 immarg) +declare <30 x double> @llvm.matrix.columnwise.load.v30f64.p0f64(double*, i32, i32 immarg, i32 immarg) +declare <60 x double> @llvm.matrix.columnwise.load.v60f64.p0f64(double*, i32, i32 immarg, i32 immarg) +declare void @llvm.matrix.columnwise.store.v60f64.p0f64(<60 x double>, double* writeonly, i32, i32 immarg, i32 immarg) +declare void @llvm.matrix.columnwise.store.v8f64.p0f64(<8 x double>, double* writeonly, i32, i32 immarg, i32 immarg) +declare <60 x double> @llvm.matrix.multiply.v60f64.v8f64.v30f64(<8 x double>, <30 x double>, i32 immarg, i32 immarg, i32 immarg) + +!llvm.module.flags = !{!0, !1, !2, !3} +!llvm.dbg.cu = !{!4} +!llvm.ident = !{!7} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 13, i32 0]} +!1 = !{i32 2, !"Debug Info Version", i32 3} +!2 = !{i32 1, !"wchar_size", i32 4} +!3 = !{i32 7, !"PIC Level", i32 2} +!4 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !5, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug, enums: !6, nameTableKind: GNU) +!5 = !DIFile(filename: "test.cpp", directory: "") +!6 = !{} +!7 = 
!{!"clang"} +!8 = distinct !DISubprogram(name: "test", scope: !5, file: !5, line: 26, type: !9, scopeLine: 27, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !4, retainedNodes: !6) +!9 = !DISubroutineType(types: !6) +!10 = distinct !DILocation(line: 35, column: 71, scope: !8) +!11 = distinct !DILocation(line: 35, column: 45, scope: !8) +!12 = !DILocation(line: 800, column: 17, scope: !13, inlinedAt: !15) +!13 = distinct !DISubprogram(name: "foo", scope: !14, file: !14, line: 789, type: !9, scopeLine: 790, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !4, retainedNodes: !6) +!14 = !DIFile(filename: "bar.h", directory: "bar") +!15 = distinct !DILocation(line: 1280, column: 5, scope: !16, inlinedAt: !18) +!16 = distinct !DISubprogram(name: "zar", scope: !17, file: !17, line: 1275, type: !9, scopeLine: 1278, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !4, retainedNodes: !6) +!17 = !DIFile(filename: "file1.h", directory: "dir1") +!18 = distinct !DILocation(line: 1278, column: 1, scope: !19, inlinedAt: !20) +!19 = distinct !DISubprogram(name: "yo", scope: !17, file: !17, line: 1275, type: !9, scopeLine: 1278, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !4, retainedNodes: !6) +!20 = distinct !DILocation(line: 2514, column: 26, scope: !21, inlinedAt: !22) +!21 = distinct !DISubprogram(name: "zzzz", scope: !14, file: !14, line: 2505, type: !9, scopeLine: 2506, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !4, retainedNodes: !6) +!22 = distinct !DILocation(line: 1263, column: 5, scope: !23, inlinedAt: !24) +!23 = distinct !DISubprogram(name: "ppppp", scope: !17, file: !17, line: 1258, type: !9, scopeLine: 1261, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !4, retainedNodes: !6) +!24 = distinct !DILocation(line: 1261, column: 1, scope: !25, inlinedAt: !26) +!25 = distinct 
!DISubprogram(name: "qqqq", scope: !17, file: !17, line: 1258, type: !9, scopeLine: 1261, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !4, retainedNodes: !6) +!26 = distinct !DILocation(line: 168, column: 7, scope: !27, inlinedAt: !29) +!27 = distinct !DISubprogram(name: "lll", scope: !28, file: !28, line: 166, type: !9, scopeLine: 169, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !4, retainedNodes: !6) +!28 = !DIFile(filename: "file2.h", directory: "dir2") +!29 = distinct !DILocation(line: 169, column: 1, scope: !30, inlinedAt: !31) +!30 = distinct !DISubprogram(name: "Expr1", scope: !28, file: !28, line: 166, type: !9, scopeLine: 169, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !4, retainedNodes: !6) +!31 = distinct !DILocation(line: 368, column: 12, scope: !32, inlinedAt: !33) +!32 = distinct !DISubprogram(name: "yyyyy", scope: !14, file: !14, line: 364, type: !9, scopeLine: 365, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !4, retainedNodes: !6) +!33 = distinct !DILocation(line: 1297, column: 34, scope: !34, inlinedAt: !35) +!34 = distinct !DISubprogram(name: "eeeee", scope: !14, file: !14, line: 1290, type: !9, scopeLine: 1291, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !4, retainedNodes: !6) +!35 = distinct !DILocation(line: 2306, column: 5, scope: !36, inlinedAt: !11) +!36 = distinct !DISubprogram(name: "aaaaa", scope: !37, file: !37, line: 2304, type: !9, scopeLine: 2305, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !4, retainedNodes: !6) +!37 = !DIFile(filename: "foo.c", directory: "/") +!38 = distinct !DISubprogram(name: "test2", scope: !5, file: !5, line: 90, type: !9, scopeLine: 27, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !4, retainedNodes: !6) +!39 = distinct !DILocation(line: 44, column: 44, scope: !38) 
+!40 = distinct !DILocation(line: 55, column: 55, scope: !38) +!41 = distinct !DILocation(line: 66, column: 66, scope: !38) +!42 = distinct !DISubprogram(name: "test2", scope: !5, file: !5, line: 90, type: !9, scopeLine: 27, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !4, retainedNodes: !6) +!43 = distinct !DILocation(line: 77, column: 77, scope: !42) +!44 = distinct !DILocation(line: 88, column: 88, scope: !42) +!45 = distinct !DISubprogram(name: "test2", scope: !5, file: !5, line: 90, type: !9, scopeLine: 27, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !4, retainedNodes: !6) +!46 = distinct !DILocation(line: 99, column: 99, scope: !45) +!47 = distinct !DILocation(line: 111, column: 111, scope: !45) diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll --- a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll @@ -1,5 +1,7 @@ ; REQUIRES: aarch64-registered-target +; This test needs to be target specific due to the cost estimate in the output. + ; RUN: opt -lower-matrix-intrinsics -pass-remarks=lower-matrix-intrinsics -mtriple=arm64-apple-iphoneos < %s 2>&1 | FileCheck %s ; CHECK-LABEL: remark: test.h:40:20: Lowered with 6 stores, 6 loads, 24 compute ops