diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1830,6 +1830,11 @@
     break;
   }
   case Intrinsic::matrix_multiply: {
+    // Optimize negation in matrix multiplication.
+    // If one operand is negated and has more elements than the other operand
+    // or the result, move the negation to whichever of those is smallest, so
+    // that the fneg is applied to fewer elements.
+
     // -A * -B -> A * B
     Value *A, *B;
     if (match(II->getArgOperand(0), m_FNeg(m_Value(A))) &&
@@ -1838,6 +1843,53 @@
       replaceOperand(*II, 1, B);
       return II;
     }
+
+    Value *Op0 = II->getOperand(0);
+    Value *Op1 = II->getOperand(1);
+    Value *OpNotNeg, *NegatedOperand; // OpNotNeg is the value under the fneg.
+    unsigned NegatedOperandArg, OtherOperandArg;
+    if (match(Op0, m_FNeg(m_Value(OpNotNeg)))) {
+      NegatedOperand = Op0;
+      NegatedOperandArg = 0;
+      OtherOperandArg = 1;
+    } else if (match(Op1, m_FNeg(m_Value(OpNotNeg)))) {
+      NegatedOperand = Op1;
+      NegatedOperandArg = 1;
+      OtherOperandArg = 0;
+    } else
+      // Neither operand is an fneg, so there is no negation to move.
+      break;
+    
+    // Bail out unless this multiply is the only user of the fneg.
+    if (!NegatedOperand->hasOneUse())
+      break;
+
+    Value *OtherOperand = II->getOperand(OtherOperandArg);
+    VectorType *RetType = cast<VectorType>(II->getType());
+    VectorType *NegatedOperandType =
+        cast<VectorType>(NegatedOperand->getType());
+    VectorType *OtherOperandType = cast<VectorType>(OtherOperand->getType());
+    // (-A) * B -> A * (-B): B has fewer elements than both A and the result.
+    if (ElementCount::isKnownGT(NegatedOperandType->getElementCount(),
+                                OtherOperandType->getElementCount()) &&
+        ElementCount::isKnownLT(OtherOperandType->getElementCount(),
+                                RetType->getElementCount())) {
+      Value *InverseOtherOperand = Builder.CreateFNeg(OtherOperand);
+      replaceOperand(*II, NegatedOperandArg, OpNotNeg);
+      replaceOperand(*II, OtherOperandArg, InverseOtherOperand);
+      return II;
+    }
+    // (-A) * B -> -(A * B): the result has fewer elements than A.
+    if (ElementCount::isKnownGT(NegatedOperandType->getElementCount(),
+                                RetType->getElementCount())) {
+      replaceOperand(*II, NegatedOperandArg, OpNotNeg);
+      IRBuilderBase::InsertPointGuard Guard(Builder);
+      Builder.SetInsertPoint(II->getNextNode()); // Emit right after II.
+      Instruction *FNegInst = cast<Instruction>(Builder.CreateFNeg(II));
+      replaceInstUsesWith(*II, FNegInst); // Rewrites FNegInst's operand too,
+      FNegInst->setOperand(0, II); // so point it back at the multiply.
+      return II;
+    }
     break;
   }
   case Intrinsic::fmuladd: {
diff --git a/llvm/test/Transforms/InstCombine/matrix-multiplication-negation.ll b/llvm/test/Transforms/InstCombine/matrix-multiplication-negation.ll
--- a/llvm/test/Transforms/InstCombine/matrix-multiplication-negation.ll
+++ b/llvm/test/Transforms/InstCombine/matrix-multiplication-negation.ll
@@ -4,9 +4,9 @@
 ; The result has the fewest vector elements between the result and the two operands so the negation can be moved there
 define <2 x double> @test_negation_move_to_result(<6 x double> %a, <3 x double> %b) {
 ; CHECK-LABEL: @test_negation_move_to_result(
-; CHECK-NEXT:    [[A_NEG:%.*]] = fneg <6 x double> [[A:%.*]]
-; CHECK-NEXT:    [[RES:%.*]] = tail call <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double> [[A_NEG]], <3 x double> [[B:%.*]], i32 2, i32 3, i32 1)
-; CHECK-NEXT:    ret <2 x double> [[RES]]
+; CHECK-NEXT:    [[RES:%.*]] = tail call <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double> [[A:%.*]], <3 x double> [[B:%.*]], i32 2, i32 3, i32 1)
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg <2 x double> [[RES]]
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
 ;
   %a.neg = fneg <6 x double> %a
   %res = tail call <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double> %a.neg, <3 x double> %b, i32 2, i32 3, i32 1)
@@ -17,9 +17,9 @@
 ; Fast flag should be preserved
 define <2 x double> @test_negation_move_to_result_with_fastflags(<6 x double> %a, <3 x double> %b) {
 ; CHECK-LABEL: @test_negation_move_to_result_with_fastflags(
-; CHECK-NEXT:    [[A_NEG:%.*]] = fneg <6 x double> [[A:%.*]]
-; CHECK-NEXT:    [[RES:%.*]] = tail call fast <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double> [[A_NEG]], <3 x double> [[B:%.*]], i32 2, i32 3, i32 1)
-; CHECK-NEXT:    ret <2 x double> [[RES]]
+; CHECK-NEXT:    [[RES:%.*]] = tail call fast <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double> [[A:%.*]], <3 x double> [[B:%.*]], i32 2, i32 3, i32 1)
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg <2 x double> [[RES]]
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
 ;
   %a.neg = fneg <6 x double> %a
   %res = tail call fast <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double> %a.neg, <3 x double> %b, i32 2, i32 3, i32 1)
@@ -29,8 +29,8 @@
 ; %b has the fewest vector elements between the result and the two operands so the negation can be moved there
 define <9 x double> @test_move_negation_to_second_operand(<27 x double> %a, <3 x double> %b) {
 ; CHECK-LABEL: @test_move_negation_to_second_operand(
-; CHECK-NEXT:    [[A_NEG:%.*]] = fneg <27 x double> [[A:%.*]]
-; CHECK-NEXT:    [[RES:%.*]] = tail call <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double> [[A_NEG]], <3 x double> [[B:%.*]], i32 9, i32 3, i32 1)
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg <3 x double> [[B:%.*]]
+; CHECK-NEXT:    [[RES:%.*]] = tail call <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double> [[A:%.*]], <3 x double> [[TMP1]], i32 9, i32 3, i32 1)
 ; CHECK-NEXT:    ret <9 x double> [[RES]]
 ;
   %a.neg = fneg <27 x double> %a
@@ -42,8 +42,8 @@
 ; Fast flag should be preserved
 define <9 x double> @test_move_negation_to_second_operand_with_fast_flags(<27 x double> %a, <3 x double> %b) {
 ; CHECK-LABEL: @test_move_negation_to_second_operand_with_fast_flags(
-; CHECK-NEXT:    [[A_NEG:%.*]] = fneg <27 x double> [[A:%.*]]
-; CHECK-NEXT:    [[RES:%.*]] = tail call fast <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double> [[A_NEG]], <3 x double> [[B:%.*]], i32 9, i32 3, i32 1)
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg <3 x double> [[B:%.*]]
+; CHECK-NEXT:    [[RES:%.*]] = tail call fast <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double> [[A:%.*]], <3 x double> [[TMP1]], i32 9, i32 3, i32 1)
 ; CHECK-NEXT:    ret <9 x double> [[RES]]
 ;
   %a.neg = fneg <27 x double> %a
@@ -54,9 +54,9 @@
 ; The result has the fewest vector elements between the result and the two operands so the negation can be moved there
 define <2 x double> @test_negation_move_to_result_from_second_operand(<3 x double> %a, <6 x double> %b){
 ; CHECK-LABEL: @test_negation_move_to_result_from_second_operand(
-; CHECK-NEXT:    [[B_NEG:%.*]] = fneg <6 x double> [[B:%.*]]
-; CHECK-NEXT:    [[RES:%.*]] = tail call <2 x double> @llvm.matrix.multiply.v2f64.v3f64.v6f64(<3 x double> [[A:%.*]], <6 x double> [[B_NEG]], i32 1, i32 3, i32 2)
-; CHECK-NEXT:    ret <2 x double> [[RES]]
+; CHECK-NEXT:    [[RES:%.*]] = tail call <2 x double> @llvm.matrix.multiply.v2f64.v3f64.v6f64(<3 x double> [[A:%.*]], <6 x double> [[B:%.*]], i32 1, i32 3, i32 2)
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg <2 x double> [[RES]]
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
 ;
   %b.neg = fneg <6 x double> %b
   %res = tail call <2 x double> @llvm.matrix.multiply.v2f64.v3f64.v6f64(<3 x double> %a, <6 x double> %b.neg, i32 1, i32 3, i32 2)
@@ -66,8 +66,8 @@
 ; %a has the fewest vector elements between the result and the two operands so the negation can be moved there
 define <9 x double> @test_move_negation_to_first_operand(<3 x double> %a, <27 x double> %b) {
 ; CHECK-LABEL: @test_move_negation_to_first_operand(
-; CHECK-NEXT:    [[B_NEG:%.*]] = fneg <27 x double> [[B:%.*]]
-; CHECK-NEXT:    [[RES:%.*]] = tail call <9 x double> @llvm.matrix.multiply.v9f64.v3f64.v27f64(<3 x double> [[A:%.*]], <27 x double> [[B_NEG]], i32 1, i32 3, i32 9)
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg <3 x double> [[A:%.*]]
+; CHECK-NEXT:    [[RES:%.*]] = tail call <9 x double> @llvm.matrix.multiply.v9f64.v3f64.v27f64(<3 x double> [[TMP1]], <27 x double> [[B:%.*]], i32 1, i32 3, i32 9)
 ; CHECK-NEXT:    ret <9 x double> [[RES]]
 ;
   %b.neg = fneg <27 x double> %b
@@ -234,8 +234,8 @@
 ; negation should be moved to the second operand given it has the smallest operand count
 define <72 x double> @chain_of_matrix_mutliplies(<27 x double> %a, <3 x double> %b, <8 x double> %c) {
 ; CHECK-LABEL: @chain_of_matrix_mutliplies(
-; CHECK-NEXT:    [[A_NEG:%.*]] = fneg <27 x double> [[A:%.*]]
-; CHECK-NEXT:    [[RES:%.*]] = tail call <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double> [[A_NEG]], <3 x double> [[B:%.*]], i32 9, i32 3, i32 1)
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg <3 x double> [[B:%.*]]
+; CHECK-NEXT:    [[RES:%.*]] = tail call <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double> [[A:%.*]], <3 x double> [[TMP1]], i32 9, i32 3, i32 1)
 ; CHECK-NEXT:    [[RES_2:%.*]] = tail call <72 x double> @llvm.matrix.multiply.v72f64.v9f64.v8f64(<9 x double> [[RES]], <8 x double> [[C:%.*]], i32 9, i32 1, i32 8)
 ; CHECK-NEXT:    ret <72 x double> [[RES_2]]
 ;
@@ -249,11 +249,11 @@
 ; second negation should be moved to the result of the second multipication
 define <6 x double> @chain_of_matrix_mutliplies_with_two_negations(<3 x double> %a, <5 x double> %b, <10 x double> %c) {
 ; CHECK-LABEL: @chain_of_matrix_mutliplies_with_two_negations(
-; CHECK-NEXT:    [[B_NEG:%.*]] = fneg <5 x double> [[B:%.*]]
-; CHECK-NEXT:    [[RES:%.*]] = tail call <15 x double> @llvm.matrix.multiply.v15f64.v3f64.v5f64(<3 x double> [[A:%.*]], <5 x double> [[B_NEG]], i32 3, i32 1, i32 5)
-; CHECK-NEXT:    [[RES_NEG:%.*]] = fneg <15 x double> [[RES]]
-; CHECK-NEXT:    [[RES_2:%.*]] = tail call <6 x double> @llvm.matrix.multiply.v6f64.v15f64.v10f64(<15 x double> [[RES_NEG]], <10 x double> [[C:%.*]], i32 3, i32 5, i32 2)
-; CHECK-NEXT:    ret <6 x double> [[RES_2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg <3 x double> [[A:%.*]]
+; CHECK-NEXT:    [[RES:%.*]] = tail call <15 x double> @llvm.matrix.multiply.v15f64.v3f64.v5f64(<3 x double> [[TMP1]], <5 x double> [[B:%.*]], i32 3, i32 1, i32 5)
+; CHECK-NEXT:    [[RES_2:%.*]] = tail call <6 x double> @llvm.matrix.multiply.v6f64.v15f64.v10f64(<15 x double> [[RES]], <10 x double> [[C:%.*]], i32 3, i32 5, i32 2)
+; CHECK-NEXT:    [[TMP2:%.*]] = fneg <6 x double> [[RES_2]]
+; CHECK-NEXT:    ret <6 x double> [[TMP2]]
 ;
   %b.neg = fneg <5 x double> %b
   %res = tail call <15 x double> @llvm.matrix.multiply.v15f64.v3f64.v5f64(<3 x double> %a, <5 x double> %b.neg, i32 3, i32 1, i32 5)
@@ -265,10 +265,10 @@
 ; negation should be propagated to the result of the second matrix multiplication
 define <6 x double> @chain_of_matrix_mutliplies_propagation(<15 x double> %a, <20 x double> %b, <8 x double> %c){
 ; CHECK-LABEL: @chain_of_matrix_mutliplies_propagation(
-; CHECK-NEXT:    [[A_NEG:%.*]] = fneg <15 x double> [[A:%.*]]
-; CHECK-NEXT:    [[RES:%.*]] = tail call <12 x double> @llvm.matrix.multiply.v12f64.v15f64.v20f64(<15 x double> [[A_NEG]], <20 x double> [[B:%.*]], i32 3, i32 5, i32 4)
+; CHECK-NEXT:    [[RES:%.*]] = tail call <12 x double> @llvm.matrix.multiply.v12f64.v15f64.v20f64(<15 x double> [[A:%.*]], <20 x double> [[B:%.*]], i32 3, i32 5, i32 4)
 ; CHECK-NEXT:    [[RES_2:%.*]] = tail call <6 x double> @llvm.matrix.multiply.v6f64.v12f64.v8f64(<12 x double> [[RES]], <8 x double> [[C:%.*]], i32 3, i32 4, i32 2)
-; CHECK-NEXT:    ret <6 x double> [[RES_2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg <6 x double> [[RES_2]]
+; CHECK-NEXT:    ret <6 x double> [[TMP1]]
 ;
   %a.neg = fneg <15 x double> %a
   %res = tail call <12 x double> @llvm.matrix.multiply.v12f64.v15f64.v20f64(<15 x double> %a.neg, <20 x double> %b, i32 3, i32 5, i32 4)