diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -10,8 +10,6 @@
 //
 // TODO:
 //  * Implement multiply & add fusion
-//  * Add remark, summarizing the available matrix optimization opportunities
-//    (WIP).
 //
 //===----------------------------------------------------------------------===//
 
@@ -25,6 +23,7 @@
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
@@ -50,6 +49,14 @@
     cl::desc("Allow the use of FMAs if available and profitable. This may "
              "result in different results, due to less rounding error."));
 
+/// Helper function to either return \p Scope, if it is a subprogram, or the
+/// attached subprogram for a local scope.
+static DISubprogram *getSubprogram(DIScope *Scope) {
+  if (auto *Subprogram = dyn_cast<DISubprogram>(Scope))
+    return Subprogram;
+  return cast<DILocalScope>(Scope)->getSubprogram();
+}
+
 namespace {
 
 // Given an element poitner \p BasePtr to the start of a (sub) matrix, compute
@@ -574,7 +581,7 @@
       }
     }
 
-    RemarkGenerator RemarkGen(Inst2ColumnMatrix, ORE, DL);
+    RemarkGenerator RemarkGen(Inst2ColumnMatrix, ORE, Func);
     RemarkGen.emitRemarks();
 
     for (Instruction *Inst : reverse(ToRemove))
@@ -950,6 +957,9 @@
     /// part of.
    const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared;
 
+    /// Set of values to consider as matrix operations.
+    const SmallSetVector<Value *, 32> &ExprSet;
+
    /// Leaf node of the expression to linearize.
    Value *Leaf;
 
@@ -960,9 +970,9 @@
    ExprLinearizer(const DataLayout &DL,
                   const MapVector<Value *, ColumnMatrixTy> &Inst2ColumnMatrix,
                   const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,
-                   Value *Leaf)
+                   const SmallSetVector<Value *, 32> &ExprSet, Value *Leaf)
        : Str(), Stream(Str), DL(DL), Inst2ColumnMatrix(Inst2ColumnMatrix),
-          Shared(Shared), Leaf(Leaf) {}
+          Shared(Shared), ExprSet(ExprSet), Leaf(Leaf) {}
 
    void indent(unsigned N) {
      LineLength += N;
@@ -997,9 +1007,7 @@
    }
 
    /// Returns true if \p V is a matrix value.
-    bool isMatrix(Value *V) const {
-      return Inst2ColumnMatrix.find(V) != Inst2ColumnMatrix.end();
-    }
+    bool isMatrix(Value *V) const { return ExprSet.count(V); }
 
    /// If \p V is a matrix value, print its shape as as NumRows x NumColumns to
    /// \p SS.
@@ -1191,60 +1199,65 @@
  /// Generate remarks for matrix operations in a function. To generate remarks
  /// for matrix expressions, the following approach is used:
-  /// 1. Collect leafs of matrix expressions (done in
-  ///    RemarkGenerator::getExpressionLeaves). Leaves are lowered matrix
-  ///    instructions without other matrix users (like stores).
-  ///
-  /// 2. For each leaf, create a remark containing a linearizied version of the
+  /// 1. Use the inlined-at debug information to group matrix operations to the
+  ///    DISubprograms they are contained in.
+  /// 2. Collect leaves of matrix expressions (done in
+  ///    RemarkGenerator::getExpressionLeaves) for each subprogram - expression
+  ///    mapping. Leaves are lowered matrix instructions without other matrix
+  ///    users (like stores) in the current subprogram.
+  /// 3. For each leaf, create a remark containing a linearized version of the
  /// matrix expression.
-  ///
-  /// TODO:
-  /// * Summarize number of vector instructions generated for each expression.
-  /// * Propagate matrix remarks up the inlining chain.
 struct RemarkGenerator {
   const MapVector<Value *, ColumnMatrixTy> &Inst2ColumnMatrix;
   OptimizationRemarkEmitter &ORE;
+  Function &Func;
   const DataLayout &DL;
 
   RemarkGenerator(const MapVector<Value *, ColumnMatrixTy> &Inst2ColumnMatrix,
-                  OptimizationRemarkEmitter &ORE, const DataLayout &DL)
-      : Inst2ColumnMatrix(Inst2ColumnMatrix), ORE(ORE), DL(DL) {}
+                  OptimizationRemarkEmitter &ORE, Function &Func)
+      : Inst2ColumnMatrix(Inst2ColumnMatrix), ORE(ORE), Func(Func),
+        DL(Func.getParent()->getDataLayout()) {}
 
  /// Return all leafs of matrix expressions. Those are instructions in
  /// Inst2ColumnMatrix returing void. Currently that should only include
  /// stores.
-  SmallVector<Value *, 4> getExpressionLeaves() {
+  SmallVector<Value *, 4>
+  getExpressionLeaves(const SmallSetVector<Value *, 32> &Ops) {
    SmallVector<Value *, 4> Leaves;
-    for (auto &KV : Inst2ColumnMatrix)
-      if (KV.first->getType()->isVoidTy())
-        Leaves.push_back(KV.first);
+    for (auto *Op : Ops)
+      if (Op->getType()->isVoidTy() ||
+          !any_of(Op->users(), [&Ops](User *U) { return Ops.count(U); }))
+        Leaves.push_back(Op);
    return Leaves;
  }
 
  /// Recursively traverse expression \p V starting at \p Leaf and add \p Leaf
-  /// to all visited expressions in \p Shared.
+  /// to all visited expressions in \p Shared. Limit the matrix operations to
+  /// the ones in \p ExprSet.
  void collectSharedInfo(Value *Leaf, Value *V,
+                         const SmallSetVector<Value *, 32> &ExprSet,
                         DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared) {
 
-    if (Inst2ColumnMatrix.find(V) == Inst2ColumnMatrix.end())
+    if (!ExprSet.count(V))
      return;
 
    auto I = Shared.insert({V, {}});
    I.first->second.insert(Leaf);
 
    for (Value *Op : cast<Instruction>(V)->operand_values())
-      collectSharedInfo(Leaf, Op, Shared);
+      collectSharedInfo(Leaf, Op, ExprSet, Shared);
    return;
  }
 
  /// Calculate the number of exclusive and shared op counts for expression
  /// starting at \p V. Expressions used multiple times are counted once.
+  /// Limit the matrix operations to the ones in \p ExprSet.
  std::pair<OpInfoTy, OpInfoTy>
  sumOpInfos(Value *Root, SmallPtrSetImpl<Value *> &ReusedExprs,
-             DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared) {
-    auto CM = Inst2ColumnMatrix.find(Root);
-    if (CM == Inst2ColumnMatrix.end())
+             const SmallSetVector<Value *, 32> &ExprSet,
+             DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared) const {
+    if (!ExprSet.count(Root))
      return {};
 
    // Already counted this expression. Stop.
@@ -1255,13 +1268,14 @@
 
    OpInfoTy Count;
    auto I = Shared.find(Root);
+    auto CM = Inst2ColumnMatrix.find(Root);
    if (I->second.size() == 1)
      Count = CM->second.getOpInfo();
    else
      SharedCount = CM->second.getOpInfo();
 
    for (Value *Op : cast<Instruction>(Root)->operand_values()) {
-      auto C = sumOpInfos(Op, ReusedExprs, Shared);
+      auto C = sumOpInfos(Op, ReusedExprs, ExprSet, Shared);
      Count += C.first;
      SharedCount += C.second;
    }
@@ -1272,49 +1286,84 @@
    if (!ORE.allowExtraAnalysis(DEBUG_TYPE))
      return;
 
-    // Find leafs of matrix expressions.
-    auto Leaves = getExpressionLeaves();
-
-    DenseMap<Value *, SmallPtrSet<Value *, 2>> Shared;
-
-    for (Value *Leaf : Leaves)
-      collectSharedInfo(Leaf, Leaf, Shared);
-
-    // Generate remarks for each leaf.
-    for (auto *L : Leaves) {
-      SmallPtrSet<Value *, 8> ReusedExprs;
-      OpInfoTy Counts, SharedCounts;
-      std::tie(Counts, SharedCounts) = sumOpInfos(L, ReusedExprs, Shared);
-
-      OptimizationRemark Rem(DEBUG_TYPE, "matrix-lowered",
-                             cast<Instruction>(L)->getDebugLoc(),
-                             cast<Instruction>(L)->getParent());
-
-      Rem << "Lowered with ";
-      Rem << ore::NV("NumStores", Counts.NumStores) << " stores, "
-          << ore::NV("NumLoads", Counts.NumLoads) << " loads, "
-          << ore::NV("NumComputeOps", Counts.NumComputeOps) << " compute ops";
-
-      if (SharedCounts.NumStores > 0 || SharedCounts.NumLoads > 0 ||
-          SharedCounts.NumComputeOps > 0) {
-        Rem << ",\nadditionally "
-            << ore::NV("NumStores", SharedCounts.NumStores) << " stores, "
-            << ore::NV("NumLoads", SharedCounts.NumLoads) << " loads, "
-            << ore::NV("NumFPOps", SharedCounts.NumComputeOps)
-            << " compute ops"
-            << " are shared with other expressions";
+    // Map matrix operations to subprograms, by traversing the inlinedAt
+    // chain. If the function does not have a DISubprogram, we only map them
+    // to the containing function.
+    MapVector<DISubprogram *, SmallVector<Value *, 8>> Mapping;
+    for (auto &KV : Inst2ColumnMatrix) {
+      if (Func.getSubprogram()) {
+        auto *I = cast<Instruction>(KV.first);
+        DILocation *Context = I->getDebugLoc();
+        while (Context) {
+          auto I = Mapping.insert({getSubprogram(Context->getScope()), {}});
+          I.first->second.push_back(KV.first);
+          Context = DebugLoc(Context).getInlinedAt();
+        }
+      } else {
+        auto I = Mapping.insert({nullptr, {}});
+        I.first->second.push_back(KV.first);
      }
+    }
 
+    for (auto &KV : Mapping) {
+      // Group matrix operations together by finding the connected components
+      // of operations. We also return the debug location of a sink
+      // instruction (without any more matrix ops as user), to be used for the
+      // remark.
+      SmallSetVector<Value *, 32> ExprSet(KV.second.begin(), KV.second.end());
+      auto Leaves = getExpressionLeaves(ExprSet);
+
+      DenseMap<Value *, SmallPtrSet<Value *, 2>> Shared;
+
+      for (Value *Leaf : Leaves)
+        collectSharedInfo(Leaf, Leaf, ExprSet, Shared);
+
+      // Generate remarks for each leaf.
+      for (auto *L : Leaves) {
+
+        DebugLoc Loc = cast<Instruction>(L)->getDebugLoc();
+        DILocation *Context = cast<Instruction>(L)->getDebugLoc();
+        while (Context) {
+          if (getSubprogram(Context->getScope()) == KV.first) {
+            Loc = Context;
+            break;
+          }
+          Context = DebugLoc(Context).getInlinedAt();
+        }
 
-      Rem << ("\n" + linearize(L, Shared, DL));
-      ORE.emit(Rem);
+        SmallPtrSet<Value *, 8> ReusedExprs;
+        OpInfoTy Counts, SharedCounts;
+        std::tie(Counts, SharedCounts) =
+            sumOpInfos(L, ReusedExprs, ExprSet, Shared);
+
+        OptimizationRemark Rem(DEBUG_TYPE, "matrix-lowered", Loc,
+                               cast<Instruction>(L)->getParent());
+
+        Rem << "Lowered with ";
+        Rem << ore::NV("NumStores", Counts.NumStores) << " stores, "
+            << ore::NV("NumLoads", Counts.NumLoads) << " loads, "
+            << ore::NV("NumComputeOps", Counts.NumComputeOps)
+            << " compute ops";
+
+        if (SharedCounts.NumStores > 0 || SharedCounts.NumLoads > 0 ||
+            SharedCounts.NumComputeOps > 0) {
+          Rem << ",\nadditionally "
+              << ore::NV("NumStores", SharedCounts.NumStores) << " stores, "
+              << ore::NV("NumLoads", SharedCounts.NumLoads) << " loads, "
+              << ore::NV("NumFPOps", SharedCounts.NumComputeOps)
+              << " compute ops"
+              << " are shared with other expressions";
+        }
+
+        Rem << ("\n" + linearize(L, Shared, ExprSet, DL));
+        ORE.emit(Rem);
+      }
    }
  }
 
-  std::string
-  linearize(Value *L,
-            const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,
-            const DataLayout &DL) {
-    ExprLinearizer Lin(DL, Inst2ColumnMatrix, Shared, L);
+  std::string linearize(
+      Value *L, const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,
+      const SmallSetVector<Value *, 32> &ExprSet, const DataLayout &DL) {
+    ExprLinearizer Lin(DL, Inst2ColumnMatrix, Shared, ExprSet, L);
    Lin.linearizeExpr(L, 0, false, false);
    return Lin.getResult();
  }
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll
@@ -0,0 +1,166 @@
+; REQUIRES: aarch64-registered-target
+
+; This test needs to be target specific due to the cost estimate in the output.
+
+; RUN: opt -lower-matrix-intrinsics -pass-remarks=lower-matrix-intrinsics -mtriple=arm64-apple-iphoneos -S < %s 2>&1 | FileCheck %s
+
+; Test the propagation of matrix expressions along the inlined-at chain. The IR
+; in the test roughly corresponds to the C++ code below, with the IR containing
+; references to a few more functions.
+
+; matrix.h
+; template <typename Ty, unsigned R, unsigned C>
+; struct Matrix {
+;   using matrix_t = Ty __attribute__((matrix_type(R, C)));
+;
+;   matrix_t value;
+; };
+
+; add.h
+; template <typename Ty, unsigned R, unsigned C>
+; Matrix<Ty, R, C> add(Matrix<Ty, R, C> M1, Matrix<Ty, R, C> M2) {
+;   Matrix<Ty, R, C> Result;
+;   Result.value = __builtin_matrix_add(M1.value, M2.value);
+;   return Result;
+; }
+;
+; load.h:
+; template <typename Ty, unsigned R, unsigned C>
+; Matrix<Ty, R, C> load(Ty *Ptr) {
+;   Matrix<Ty, R, C> Result;
+;   Result.value = *reinterpret_cast<typename Matrix<Ty, R, C>::matrix_t *>(Ptr);
+;   return Result;
+; }
+;
+; store.h:
+; template <typename Ty, unsigned R, unsigned C>
+; void store(Matrix<Ty, R, C> M1, Ty *Ptr) {
+;   *reinterpret_cast<typename Matrix<Ty, R, C>::matrix_t *>(Ptr) = M1.value;
+; }
+;
+; toplevel.cpp
+; void test(double *A, double *B, double *C) {
+;   store(add(load(A), load(B)), C);
+; }
+;
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "aarch64-apple-ios"
+
+; CHECK-LABEL: remark: load.h:41:43: Lowered with 0 stores, 10 loads, 0 compute ops
+; CHECK-NEXT: load(addr %A)
+
+; CHECK-LABEL: remark: load.h:41:43: Lowered with 0 stores, 10 loads, 0 compute ops
+; CHECK-NEXT: columnwise.load.3x5.double(addr %B, 5)
+
+; CHECK-LABEL: remark: load.h:41:11: Lowered with 0 stores, 1 loads, 0 compute ops
+; CHECK-NEXT: load(addr %D)
+
+; CHECK-LABEL: remark: assign.h:32:43: Lowered with 0 stores, 10 loads, 0 compute ops
+; CHECK-NEXT: load(addr %A)
+
+; CHECK-LABEL: remark: assign.h:32:43: Lowered with 0 stores, 10 loads, 0 compute ops
+; CHECK-NEXT: columnwise.load.3x5.double(addr %B, 5)
+
+; CHECK-LABEL: remark: toplevel.c:410:0: Lowered with 10 stores, 20 loads, 10 compute ops
+; CHECK-NEXT: store(
+; CHECK-NEXT: fadd(
+; CHECK-NEXT: load(addr %A),
+; CHECK-NEXT: columnwise.load.3x5.double(addr %B, 5)),
+; CHECK-NEXT: addr %C)
+
+; CHECK-LABEL: remark: toplevel.c:510:0: Lowered with 1 stores, 1 loads, 8 compute ops
+; CHECK-NEXT: store(
+; CHECK-NEXT: transpose.1x2.float(transpose.2x1.float(load(addr %D))),
+; CHECK-NEXT: addr %D)
+
+; CHECK-LABEL: remark: add.h:66:11: Lowered with 0 stores, 0 loads, 10 compute ops
+; CHECK-NEXT: fadd(
+; CHECK-NEXT: addr %A,
+; CHECK-NEXT: scalar)
+
+; CHECK-LABEL: remark: store.h:10:11: Lowered with 10 stores, 0 loads, 0 compute ops
+; CHECK-NEXT: store(
+; CHECK-NEXT: scalar,
+; CHECK-NEXT: addr %C)
+
+; CHECK-LABEL: remark: store.h:66:11: Lowered with 1 stores, 0 loads, 0 compute ops
+; CHECK-NEXT: store(
+; CHECK-NEXT: scalar,
+; CHECK-NEXT: addr %D)
+
+; CHECK-LABEL: remark: transpose.h:13:11: Lowered with 0 stores, 0 loads, 8 compute ops
+; CHECK-NEXT: transpose.1x2.float(transpose.2x1.float(addr %D))
+
+define void @toplevel(<15 x double>* %A, <15 x double>* %B, <15 x double>* %C, <2 x float>* %D) !dbg !16 {
+entry:
+  %a = load <15 x double>, <15 x double> *%A, align 16, !dbg !3791
+  %b = call <15 x double> @llvm.matrix.columnwise.load(<15 x double>* %B, i32 5, i32 3, i32 5), !dbg !3793
+  %c = fadd <15 x double> %a, %b, !dbg !100
+  store <15 x double> %c, <15 x double> *%C, align 16, !dbg !102
+
+  %load = load <2 x float>, <2 x float>* %D, !dbg !104
+  %t1 = call <2 x float> @llvm.matrix.transpose(<2 x float> %load, i32 2, i32 1), !dbg !106
+  %t2 = call <2 x float> @llvm.matrix.transpose(<2 x float> %t1, i32 1, i32 2), !dbg !106
+  store <2 x float> %t2, <2 x float>* %D, !dbg !108
+  ret void
+}
+
+declare <15 x double> @llvm.matrix.columnwise.load(<15 x double>*, i32, i32, i32)
+declare <2 x float> @llvm.matrix.transpose(<2 x float>, i32, i32)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "load.h", directory: "/test")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = distinct !DISubprogram(name: "load_fn", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!17 = !DIFile(filename: "toplevel.c", directory: "/test")
+!16 = distinct !DISubprogram(name: "toplevel", scope: !1, file: !17, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!18 = !DIFile(filename: "assign.h", directory: "/test")
+!19 = distinct !DISubprogram(name: "assign", scope: !1, file: !18, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+
+!20 = !DIFile(filename: "add.h", directory: "/test")
+!21 = distinct !DISubprogram(name: "add_fn", scope: !1, file: !20, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+
+!22 = !DIFile(filename: "store.h", directory: "/test")
+!23 = distinct !DISubprogram(name: "store_fn", scope: !1, file: !22, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+
+!24 = !DIFile(filename: "transpose.h", directory: "/test")
+!25 = distinct !DISubprogram(name: "transpose", scope: !1, file: !24, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+
+
+!6 = !DISubroutineType(types: !7)
+!7 = !{null, !8, !8, !11}
+!8 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !9)
+!9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 32, align: 32)
+!10 = !DIBasicType(name: "float", size: 32, align: 32, encoding: DW_ATE_float)
+!11 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!12 = !{!13}
+!13 = !DILocalVariable(name: "a", arg: 1, scope: !5, file: !1, line: 1, type: !8)
+!14 = !DILocation(line: 1, column: 27, scope: !5)
+
+!3791 = !DILocation(line: 41, column: 43, scope: !5, inlinedAt: !3795)
+!3792 = !DILocation(line: 405, column: 3, scope: !16)
+!3793 = !DILocation(line: 41, column: 43, scope: !5, inlinedAt: !3796)
+!3794 = !DILocation(line: 406, column: 11, scope: !16)
+!3795 = !DILocation(line: 32, column: 43, scope: !19, inlinedAt: !3792)
+!3796 = !DILocation(line: 32, column: 43, scope: !19, inlinedAt: !3794)
+
+!100 = !DILocation(line: 66, column: 11, scope: !21, inlinedAt: !101)
+!101 = !DILocation(line: 410, column: 11, scope: !16)
+
+!102 = !DILocation(line: 10, column: 11, scope: !23, inlinedAt: !103)
+!103 = !DILocation(line: 410, column: 0, scope: !16)
+
+!104 = !DILocation(line: 41, column: 11, scope: !5, inlinedAt: !101)
+!105 = !DILocation(line: 500, column: 11, scope: !16)
+
+!106 = !DILocation(line: 13, column: 11, scope: !25, inlinedAt: !101)
+!107 = !DILocation(line: 510, column: 11, scope: !16)
+
+!108 = !DILocation(line: 66, column: 11, scope: !23, inlinedAt: !109)
+!109 = !DILocation(line: 510, column: 0, scope: !16)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll
@@ -71,8 +71,8 @@
 define void @binaryops(<9 x double>* %A, <9 x double>* %B) !dbg !31 {
   %A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !32
-  %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix
-  %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix
+  %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix, !dbg !32
+  %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix, !dbg !32
   call void @llvm.matrix.columnwise.store(<9 x double> %R2.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !32
   ret void
 }
@@ -95,8 +95,8 @@
 define void @multiple_expressions(<9 x double>* %A, <9 x double>* %B, <12 x double>* %C, <12 x double>* %D, <4 x double>* %E) !dbg !33 {
   %A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !34
-  %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix
-  %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix
+  %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix, !dbg !34
+  %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix, !dbg !34
   call void @llvm.matrix.columnwise.store(<9 x double> %R2.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !34
   %C.matrix = load <12 x double>, <12 x double>* %C, !dbg !34
@@ -119,8 +119,8 @@
 define void @stackaddresses(<9 x double>* %A) !dbg !35 {
   %B = alloca <9 x double>
   %A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !36
-  %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix
-  %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix
+  %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix, !dbg !36
+  %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix, !dbg !36
   call void @llvm.matrix.columnwise.store(<9 x double> %R2.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !36
   ret void
 }
@@ -140,7 +140,7 @@
   %s2 = bitcast <15 x double>* %s1 to i64*, !dbg !22
   %s3 = bitcast i64* %s2 to <15 x double>*, !dbg !22
 
-  %t = call <15 x double> @llvm.matrix.transpose.v15f64.v15f64(<15 x double> %av, i32 5, i32 3)
+  %t = call <15 x double> @llvm.matrix.transpose.v15f64.v15f64(<15 x double> %av, i32 5, i32 3), !dbg !22
   store <15 x double> %t, <15 x double>* %s3, !dbg !22
   ret void