diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -10,8 +10,6 @@
 //
 // TODO:
 //  * Implement multiply & add fusion
-//  * Add remark, summarizing the available matrix optimization opportunities
-//    (WIP).
 //
 //===----------------------------------------------------------------------===//
 
@@ -25,6 +23,7 @@
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
@@ -50,6 +49,14 @@
     cl::desc("Allow the use of FMAs if available and profitable. This may "
              "result in different results, due to less rounding error."));
 
+/// Helper function to either return \p Scope, if it is a subprogram, or the
+/// attached subprogram for a local scope.
+static DISubprogram *getSubprogram(DIScope *Scope) {
+  if (auto *Subprogram = dyn_cast<DISubprogram>(Scope))
+    return Subprogram;
+  return cast<DILocalScope>(Scope)->getSubprogram();
+}
+
 namespace {
 
 // Given an element poitner \p BasePtr to the start of a (sub) matrix, compute
@@ -574,7 +581,7 @@
       }
     }
 
-    RemarkGenerator RemarkGen(Inst2ColumnMatrix, ORE, DL);
+    RemarkGenerator RemarkGen(Inst2ColumnMatrix, ORE, Func);
     RemarkGen.emitRemarks();
 
     for (Instruction *Inst : reverse(ToRemove))
@@ -950,6 +957,9 @@
     /// part of.
    const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared;
 
+    /// Set of values to consider as matrix operations.
+    const SmallSetVector<Value *, 32> &ExprSet;
+
    /// Leaf node of the expression to linearize.
    Value *Leaf;
 
@@ -960,9 +970,9 @@
    ExprLinearizer(const DataLayout &DL,
                   const MapVector<Value *, ColumnMatrixTy> &Inst2ColumnMatrix,
                   const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,
-                   Value *Leaf)
+                   const SmallSetVector<Value *, 32> &ExprSet, Value *Leaf)
        : Str(), Stream(Str), DL(DL), Inst2ColumnMatrix(Inst2ColumnMatrix),
-          Shared(Shared), Leaf(Leaf) {}
+          Shared(Shared), ExprSet(ExprSet), Leaf(Leaf) {}
 
    void indent(unsigned N) {
      LineLength += N;
@@ -997,9 +1007,7 @@
    }
 
    /// Returns true if \p V is a matrix value.
-    bool isMatrix(Value *V) const {
-      return Inst2ColumnMatrix.find(V) != Inst2ColumnMatrix.end();
-    }
+    bool isMatrix(Value *V) const { return ExprSet.count(V); }
 
    /// If \p V is a matrix value, print its shape as as NumRows x NumColumns to
    /// \p SS.
@@ -1191,60 +1199,65 @@
  /// Generate remarks for matrix operations in a function. To generate remarks
  /// for matrix expressions, the following approach is used:
-  /// 1. Collect leafs of matrix expressions (done in
-  ///    RemarkGenerator::getExpressionLeaves). Leaves are lowered matrix
-  ///    instructions without other matrix users (like stores).
-  ///
-  /// 2. For each leaf, create a remark containing a linearizied version of the
+  /// 1. Use the inlined-at debug information to group matrix operations to the
+  ///    DISubprograms they are contained in.
+  /// 2. Collect leaves of matrix expressions (done in
+  ///    RemarkGenerator::getExpressionLeaves) for each subprogram - expression
+  ///    mapping. Leaves are lowered matrix instructions without other matrix
+  ///    users (like stores) in the current subprogram.
+  /// 3. For each leaf, create a remark containing a linearized version of the
  /// matrix expression.
-  ///
-  /// TODO:
-  /// * Summarize number of vector instructions generated for each expression.
-  /// * Propagate matrix remarks up the inlining chain.
 struct RemarkGenerator {
   const MapVector<Value *, ColumnMatrixTy> &Inst2ColumnMatrix;
   OptimizationRemarkEmitter &ORE;
+  Function &Func;
   const DataLayout &DL;
 
   RemarkGenerator(const MapVector<Value *, ColumnMatrixTy> &Inst2ColumnMatrix,
-                  OptimizationRemarkEmitter &ORE, const DataLayout &DL)
-      : Inst2ColumnMatrix(Inst2ColumnMatrix), ORE(ORE), DL(DL) {}
+                  OptimizationRemarkEmitter &ORE, Function &Func)
+      : Inst2ColumnMatrix(Inst2ColumnMatrix), ORE(ORE), Func(Func),
+        DL(Func.getParent()->getDataLayout()) {}
 
  /// Return all leafs of matrix expressions. Those are instructions in
  /// Inst2ColumnMatrix returing void. Currently that should only include
  /// stores.
-  SmallVector<Value *, 4> getExpressionLeaves() {
+  SmallVector<Value *, 4>
+  getExpressionLeaves(const SmallSetVector<Value *, 32> &Ops) {
    SmallVector<Value *, 4> Leaves;
-    for (auto &KV : Inst2ColumnMatrix)
-      if (KV.first->getType()->isVoidTy())
-        Leaves.push_back(KV.first);
+    for (auto *Op : Ops)
+      if (Op->getType()->isVoidTy() ||
+          !any_of(Op->users(), [&Ops](User *U) { return Ops.count(U); }))
+        Leaves.push_back(Op);
    return Leaves;
  }
 
  /// Recursively traverse expression \p V starting at \p Leaf and add \p Leaf
-  /// to all visited expressions in \p Shared.
+  /// to all visited expressions in \p Shared. Limit the matrix operations to
+  /// the ones in \p ExprSet.
  void collectSharedInfo(Value *Leaf, Value *V,
+                         const SmallSetVector<Value *, 32> &ExprSet,
                         DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared) {
 
-    if (Inst2ColumnMatrix.find(V) == Inst2ColumnMatrix.end())
+    if (!ExprSet.count(V))
      return;
 
    auto I = Shared.insert({V, {}});
    I.first->second.insert(Leaf);
 
    for (Value *Op : cast<Instruction>(V)->operand_values())
-      collectSharedInfo(Leaf, Op, Shared);
+      collectSharedInfo(Leaf, Op, ExprSet, Shared);
    return;
  }
 
  /// Calculate the number of exclusive and shared op counts for expression
  /// starting at \p V. Expressions used multiple times are counted once.
+  /// Limit the matrix operations to the ones in \p ExprSet.
  std::pair<OpInfoTy, OpInfoTy>
  sumOpInfos(Value *Root, SmallPtrSetImpl<Value *> &ReusedExprs,
-             DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared) {
-    auto CM = Inst2ColumnMatrix.find(Root);
-    if (CM == Inst2ColumnMatrix.end())
+             const SmallSetVector<Value *, 32> &ExprSet,
+             DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared) const {
+    if (!ExprSet.count(Root))
      return {};
 
    // Already counted this expression. Stop.
@@ -1255,13 +1268,14 @@
 
    OpInfoTy Count;
    auto I = Shared.find(Root);
+    auto CM = Inst2ColumnMatrix.find(Root);
    if (I->second.size() == 1)
      Count = CM->second.getOpInfo();
    else
      SharedCount = CM->second.getOpInfo();
 
    for (Value *Op : cast<Instruction>(Root)->operand_values()) {
-      auto C = sumOpInfos(Op, ReusedExprs, Shared);
+      auto C = sumOpInfos(Op, ReusedExprs, ExprSet, Shared);
      Count += C.first;
      SharedCount += C.second;
    }
@@ -1272,49 +1286,84 @@
    if (!ORE.allowExtraAnalysis(DEBUG_TYPE))
      return;
 
-    // Find leafs of matrix expressions.
-    auto Leaves = getExpressionLeaves();
-
-    DenseMap<Value *, SmallPtrSet<Value *, 2>> Shared;
-
-    for (Value *Leaf : Leaves)
-      collectSharedInfo(Leaf, Leaf, Shared);
-
-    // Generate remarks for each leaf.
-    for (auto *L : Leaves) {
-      SmallPtrSet<Value *, 8> ReusedExprs;
-      OpInfoTy Counts, SharedCounts;
-      std::tie(Counts, SharedCounts) = sumOpInfos(L, ReusedExprs, Shared);
-
-      OptimizationRemark Rem(DEBUG_TYPE, "matrix-lowered",
-                             cast<Instruction>(L)->getDebugLoc(),
-                             cast<Instruction>(L)->getParent());
-
-      Rem << "Lowered with ";
-      Rem << ore::NV("NumStores", Counts.NumStores) << " stores, "
-          << ore::NV("NumLoads", Counts.NumLoads) << " loads, "
-          << ore::NV("NumComputeOps", Counts.NumComputeOps) << " compute ops";
-
-      if (SharedCounts.NumStores > 0 || SharedCounts.NumLoads > 0 ||
-          SharedCounts.NumComputeOps > 0) {
-        Rem << ",\nadditionally "
-            << ore::NV("NumStores", SharedCounts.NumStores) << " stores, "
-            << ore::NV("NumLoads", SharedCounts.NumLoads) << " loads, "
-            << ore::NV("NumFPOps", SharedCounts.NumComputeOps)
-            << " compute ops"
-            << " are shared with other expressions";
+    // Map matrix operations to subprograms, by traversing the inlinedAt
+    // chain. If the function does not have a DISubprogram, we only map them
+    // to the containing function.
+    MapVector<DISubprogram *, SmallVector<Value *, 8>> Mapping;
+    for (auto &KV : Inst2ColumnMatrix) {
+      if (Func.getSubprogram()) {
+        auto *I = cast<Instruction>(KV.first);
+        DILocation *Context = I->getDebugLoc();
+        while (Context) {
+          auto I = Mapping.insert({getSubprogram(Context->getScope()), {}});
+          I.first->second.push_back(KV.first);
+          Context = DebugLoc(Context).getInlinedAt();
+        }
+      } else {
+        auto I = Mapping.insert({nullptr, {}});
+        I.first->second.push_back(KV.first);
      }
+    }
 
+    for (auto &KV : Mapping) {
+      // Group matrix operations together by finding the connected components
+      // of operations. We also return the debug location of a sink
+      // instruction (without any more matrix ops as user), to be used for the
+      // remark.
+      SmallSetVector<Value *, 32> ExprSet(KV.second.begin(), KV.second.end());
+      auto Leaves = getExpressionLeaves(ExprSet);
+
+      DenseMap<Value *, SmallPtrSet<Value *, 2>> Shared;
+
+      for (Value *Leaf : Leaves)
+        collectSharedInfo(Leaf, Leaf, ExprSet, Shared);
+
+      // Generate remarks for each leaf.
+      for (auto *L : Leaves) {
+
+        DebugLoc Loc = cast<Instruction>(L)->getDebugLoc();
+        DILocation *Context = cast<Instruction>(L)->getDebugLoc();
+        while (Context) {
+          if (getSubprogram(Context->getScope()) == KV.first) {
+            Loc = Context;
+            break;
+          }
+          Context = DebugLoc(Context).getInlinedAt();
+        }
 
-      Rem << ("\n" + linearize(L, Shared, DL));
-      ORE.emit(Rem);
+        SmallPtrSet<Value *, 8> ReusedExprs;
+        OpInfoTy Counts, SharedCounts;
+        std::tie(Counts, SharedCounts) =
+            sumOpInfos(L, ReusedExprs, ExprSet, Shared);
+
+        OptimizationRemark Rem(DEBUG_TYPE, "matrix-lowered", Loc,
+                               cast<Instruction>(L)->getParent());
+
+        Rem << "Lowered with ";
+        Rem << ore::NV("NumStores", Counts.NumStores) << " stores, "
+            << ore::NV("NumLoads", Counts.NumLoads) << " loads, "
+            << ore::NV("NumComputeOps", Counts.NumComputeOps)
+            << " compute ops";
+
+        if (SharedCounts.NumStores > 0 || SharedCounts.NumLoads > 0 ||
+            SharedCounts.NumComputeOps > 0) {
+          Rem << ",\nadditionally "
+              << ore::NV("NumStores", SharedCounts.NumStores) << " stores, "
+              << ore::NV("NumLoads", SharedCounts.NumLoads) << " loads, "
+              << ore::NV("NumFPOps", SharedCounts.NumComputeOps)
+              << " compute ops"
+              << " are shared with other expressions";
+        }
+
+        Rem << ("\n" + linearize(L, Shared, ExprSet, DL));
+        ORE.emit(Rem);
+      }
    }
  }
 
-  std::string
-  linearize(Value *L,
-            const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,
-            const DataLayout &DL) {
-    ExprLinearizer Lin(DL, Inst2ColumnMatrix, Shared, L);
+  std::string linearize(
+      Value *L, const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,
+      const SmallSetVector<Value *, 32> &ExprSet, const DataLayout &DL) {
+    ExprLinearizer Lin(DL, Inst2ColumnMatrix, Shared, ExprSet, L);
    Lin.linearizeExpr(L, 0, false, false);
    return Lin.getResult();
  }
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll
@@ -0,0 +1,166 @@
+; REQUIRES: aarch64-registered-target
+
+; This test needs to be target specific due to the cost estimate in the output.
+
+; RUN: opt -lower-matrix-intrinsics -pass-remarks=lower-matrix-intrinsics -mtriple=arm64-apple-iphoneos -S < %s 2>&1 | FileCheck %s
+
+; Test the propagation of matrix expressions along the inlined-at chain. The IR
+; in the test roughly corresponds to the C++ code below, with the IR containing
+; references to a few more functions.
+
+; matrix.h
+; template <typename Ty, unsigned R, unsigned C>
+; struct Matrix {
+;   using matrix_t = Ty __attribute__((matrix_type(R, C)));
+;
+;   matrix_t value;
+; };
+
+; add.h
+; template <typename Ty, unsigned R, unsigned C>
+; Matrix<Ty, R, C> add(Matrix<Ty, R, C> M1, Matrix<Ty, R, C> M2) {
+;   Matrix<Ty, R, C> Result;
+;   Result.value = __builtin_matrix_add(M1.value, M2.value);
+;   return Result;
+; }
+;
+; load.h:
+; template <typename Ty, unsigned R, unsigned C>
+; Matrix<Ty, R, C> load(Ty *Ptr) {
+;   Matrix<Ty, R, C> Result;
+;   Result.value = *reinterpret_cast<typename Matrix<Ty, R, C>::matrix_t *>(Ptr);
+;   return Result;
+; }
+;
+; store.h:
+; template <typename Ty, unsigned R, unsigned C>
+; void store(Matrix<Ty, R, C> M1, Ty *Ptr) {
+;   *reinterpret_cast<typename Matrix<Ty, R, C>::matrix_t *>(Ptr) = M1.value;
+; }
+;
+; toplevel.cpp
+; void test(double *A, double *B, double *C) {
+;   store(add(load(A), load(B)), C);
+; }
+;
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "aarch64-apple-ios"
+
+; CHECK-LABEL: remark: load.h:41:43: Lowered with 0 stores, 10 loads, 0 compute ops
+; CHECK-NEXT: load(addr %A)
+
+; CHECK-LABEL: remark: load.h:41:43: Lowered with 0 stores, 10 loads, 0 compute ops
+; CHECK-NEXT: columnwise.load.3x5.double(addr %B, 5)
+
+; CHECK-LABEL: remark: load.h:41:11: Lowered with 0 stores, 1 loads, 0 compute ops
+; CHECK-NEXT: load(addr %D)
+
+; CHECK-LABEL: remark: assign.h:32:43: Lowered with 0 stores, 10 loads, 0 compute ops
+; CHECK-NEXT: load(addr %A)
+
+; CHECK-LABEL: remark: assign.h:32:43: Lowered with 0 stores, 10 loads, 0 compute ops
+; CHECK-NEXT: columnwise.load.3x5.double(addr %B, 5)
+
+; CHECK-LABEL: remark: toplevel.c:410:0: Lowered with 10 stores, 20 loads, 10 compute ops
+; CHECK-NEXT: store(
+; CHECK-NEXT: fadd(
+; CHECK-NEXT: load(addr %A),
+; CHECK-NEXT: columnwise.load.3x5.double(addr %B, 5)),
+; CHECK-NEXT: addr %C)
+
+; CHECK-LABEL: remark: toplevel.c:510:0: Lowered with 1 stores, 1 loads, 8 compute ops
+; CHECK-NEXT: store(
+; CHECK-NEXT: transpose.1x2.float(transpose.2x1.float(load(addr %D))),
+; CHECK-NEXT: addr %D)
+
+; CHECK-LABEL: remark: add.h:66:11: Lowered with 0 stores, 0 loads, 10 compute ops
+; CHECK-NEXT: fadd(
+; CHECK-NEXT: addr %A,
+; CHECK-NEXT: scalar)
+
+; CHECK-LABEL: remark: store.h:10:11: Lowered with 10 stores, 0 loads, 0 compute ops
+; CHECK-NEXT: store(
+; CHECK-NEXT: scalar,
+; CHECK-NEXT: addr %C)
+
+; CHECK-LABEL: remark: store.h:66:11: Lowered with 1 stores, 0 loads, 0 compute ops
+; CHECK-NEXT: store(
+; CHECK-NEXT: scalar,
+; CHECK-NEXT: addr %D)
+
+; CHECK-LABEL: remark: transpose.h:13:11: Lowered with 0 stores, 0 loads, 8 compute ops
+; CHECK-NEXT: transpose.1x2.float(transpose.2x1.float(addr %D))
+
+define void @toplevel(<15 x double>* %A, <15 x double>* %B, <15 x double>* %C, <2 x float>* %D) !dbg !16 {
+entry:
+  %a = load <15 x double>, <15 x double> *%A, align 16, !dbg !3791
+  %b = call <15 x double> @llvm.matrix.columnwise.load(<15 x double>* %B, i32 5, i32 3, i32 5), !dbg !3793
+  %c = fadd <15 x double> %a, %b, !dbg !100
+  store <15 x double> %c, <15 x double> *%C, align 16, !dbg !102
+
+  %load = load <2 x float>, <2 x float>* %D, !dbg !104
+  %t1 = call <2 x float> @llvm.matrix.transpose(<2 x float> %load, i32 2, i32 1), !dbg !106
+  %t2 = call <2 x float> @llvm.matrix.transpose(<2 x float> %t1, i32 1, i32 2), !dbg !106
+  store <2 x float> %t2, <2 x float>* %D, !dbg !108
+  ret void
+}
+
+declare <15 x double> @llvm.matrix.columnwise.load(<15 x double>*, i32, i32, i32)
+declare <2 x float> @llvm.matrix.transpose(<2 x float>, i32, i32)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "load.h", directory: "/test")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = distinct !DISubprogram(name: "load_fn", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!17 = !DIFile(filename: "toplevel.c", directory: "/test")
+!16 = distinct !DISubprogram(name: "toplevel", scope: !1, file: !17, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!18 = !DIFile(filename: "assign.h", directory: "/test")
+!19 = distinct !DISubprogram(name: "assign", scope: !1, file: !18, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+
+!20 = !DIFile(filename: "add.h", directory: "/test")
+!21 = distinct !DISubprogram(name: "add_fn", scope: !1, file: !20, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+
+!22 = !DIFile(filename: "store.h", directory: "/test")
+!23 = distinct !DISubprogram(name: "store_fn", scope: !1, file: !22, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+
+!24 = !DIFile(filename: "transpose.h", directory: "/test")
+!25 = distinct !DISubprogram(name: "transpose", scope: !1, file: !24, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+
+
+!6 = !DISubroutineType(types: !7)
+!7 = !{null, !8, !8, !11}
+!8 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !9)
+!9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 32, align: 32)
+!10 = !DIBasicType(name: "float", size: 32, align: 32, encoding: DW_ATE_float)
+!11 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!12 = !{!13}
+!13 = !DILocalVariable(name: "a", arg: 1, scope: !5, file: !1, line: 1, type: !8)
+!14 = !DILocation(line: 1, column: 27, scope: !5)
+
+!3791 = !DILocation(line: 41, column: 43, scope: !5, inlinedAt: !3795)
+!3792 = !DILocation(line: 405, column: 3, scope: !16)
+!3793 = !DILocation(line: 41, column: 43, scope: !5, inlinedAt: !3796)
+!3794 = !DILocation(line: 406, column: 11, scope: !16)
+!3795 = !DILocation(line: 32, column: 43, scope: !19, inlinedAt: !3792)
+!3796 = !DILocation(line: 32, column: 43, scope: !19, inlinedAt: !3794)
+
+!100 = !DILocation(line: 66, column: 11, scope: !21, inlinedAt: !101)
+!101 = !DILocation(line: 410, column: 11, scope: !16)
+
+!102 = !DILocation(line: 10, column: 11, scope: !23, inlinedAt: !103)
+!103 = !DILocation(line: 410, column: 0, scope: !16)
+
+!104 = !DILocation(line: 41, column: 11, scope: !5, inlinedAt: !101)
+!105 = !DILocation(line: 500, column: 11, scope: !16)
+
+!106 = !DILocation(line: 13, column: 11, scope: !25, inlinedAt: !101)
+!107 = !DILocation(line: 510, column: 11, scope: !16)
+
+!108 = !DILocation(line: 66, column: 11, scope: !23, inlinedAt: !109)
+!109 = !DILocation(line: 510, column: 0, scope: !16)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll
@@ -71,8 +71,8 @@
 define void @binaryops(<9 x double>* %A, <9 x double>* %B) !dbg !31 {
   %A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !32
-  %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix
-  %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix
+  %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix, !dbg !32
+  %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix, !dbg !32
   call void @llvm.matrix.columnwise.store(<9 x double> %R2.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !32
   ret void
 }
@@ -95,8 +95,8 @@
 define void @multiple_expressions(<9 x double>* %A, <9 x double>* %B, <12 x double>* %C, <12 x double>* %D, <4 x double>* %E) !dbg !33 {
   %A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !34
-  %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix
-  %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix
+  %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix, !dbg !34
+  %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix, !dbg !34
   call void @llvm.matrix.columnwise.store(<9 x double> %R2.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !34
   %C.matrix = load <12 x double>, <12 x double>* %C, !dbg !34
@@ -119,8 +119,8 @@
 define void @stackaddresses(<9 x double>* %A) !dbg !35 {
   %B = alloca <9 x double>
   %A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !36
-  %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix
-  %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix
+  %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix, !dbg !36
+  %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix, !dbg !36
   call void @llvm.matrix.columnwise.store(<9 x double> %R2.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !36
   ret void
 }
@@ -140,7 +140,7 @@
   %s2 = bitcast <15 x double>* %s1 to i64*, !dbg !22
   %s3 = bitcast i64* %s2 to <15 x double>*, !dbg !22
 
-  %t = call <15 x double> @llvm.matrix.transpose.v15f64.v15f64(<15 x double> %av, i32 5, i32 3)
+  %t = call <15 x double> @llvm.matrix.transpose.v15f64.v15f64(<15 x double> %av, i32 5, i32 3), !dbg !22
   store <15 x double> %t, <15 x double>* %s3, !dbg !22
   ret void