diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1852,6 +1852,64 @@
 
     [[fallthrough]];
   }
+  case Intrinsic::matrix_multiply: {
+    // Sink a negation of one multiplicand to wherever it touches the fewest
+    // elements. With -A the negated operand and B the other operand:
+    //   Case 1: B has fewer elements than both A and the result:
+    //           (-A) * B -> A * (-B)
+    //   Case 2: the result has fewer elements than A:
+    //           (-A) * B -> -(A * B)
+    // The same rewrites apply when the second argument is the negated one.
+    Value *Op0 = II->getArgOperand(0);
+    Value *Op1 = II->getArgOperand(1);
+
+    // m_FNeg also matches the `fsub -0.0, X` spelling, so the un-negated
+    // value must come from the matcher, never from operand 0 of the match.
+    Value *OpNotNeg, *NegatedOp, *OtherOp;
+    unsigned NegatedOpArg, OtherOpArg;
+    if (match(Op0, m_FNeg(m_Value(OpNotNeg)))) {
+      NegatedOp = Op0;
+      NegatedOpArg = 0;
+      OtherOp = Op1;
+      OtherOpArg = 1;
+    } else if (match(Op1, m_FNeg(m_Value(OpNotNeg)))) {
+      NegatedOp = Op1;
+      NegatedOpArg = 1;
+      OtherOp = Op0;
+      OtherOpArg = 0;
+    } else {
+      break;
+    }
+    // A negation with other users stays alive, so moving it would only add
+    // an extra fneg instead of shrinking one.
+    if (!NegatedOp->hasOneUse())
+      break;
+
+    // Operands and result of the matrix intrinsic are expected to be vector
+    // typed; cast<> asserts that invariant.
+    ElementCount NegatedCount =
+        cast<VectorType>(NegatedOp->getType())->getElementCount();
+    ElementCount OtherCount =
+        cast<VectorType>(OtherOp->getType())->getElementCount();
+    ElementCount RetCount = cast<VectorType>(II->getType())->getElementCount();
+    if (ElementCount::isKnownGT(NegatedCount, OtherCount) &&
+        ElementCount::isKnownLT(OtherCount, RetCount)) {
+      // Case 1: negate the smaller operand and rewrite the call in place.
+      Value *InverseOtherOp = Builder.CreateFNeg(OtherOp);
+      replaceOperand(*II, NegatedOpArg, OpNotNeg);
+      replaceOperand(*II, OtherOpArg, InverseOtherOp);
+      return II;
+    }
+    if (ElementCount::isKnownGT(NegatedCount, RetCount)) {
+      // Case 2: emit the un-negated product, then negate the smaller result.
+      Instruction *NewMul = II->clone();
+      NewMul->setOperand(NegatedOpArg, OpNotNeg);
+      Builder.Insert(NewMul);
+      NewMul->takeName(II);
+      return replaceInstUsesWith(*II, Builder.CreateFNeg(NewMul));
+    }
+    break;
+  }
   case Intrinsic::fma: {
     // fma fneg(x), fneg(y), z -> fma x, y, z
     Value *Src0 = II->getArgOperand(0);
diff --git a/llvm/test/Transforms/InstCombine/matrix-multiplication-negation.ll b/llvm/test/Transforms/InstCombine/matrix-multiplication-negation.ll
--- a/llvm/test/Transforms/InstCombine/matrix-multiplication-negation.ll
+++ b/llvm/test/Transforms/InstCombine/matrix-multiplication-negation.ll
@@ -3,9 +3,9 @@
 
 define <3 x double> @matrix_multiply_v9f64_v3f64(<9 x double> %a, <3 x double> %b) {
 ; CHECK-LABEL: @matrix_multiply_v9f64_v3f64(
-; CHECK-NEXT:    [[A_NEG:%.*]] = fneg <9 x double> [[A:%.*]]
-; CHECK-NEXT:    [[RES:%.*]] = tail call <3 x double> @llvm.matrix.multiply.v3f64.v9f64.v3f64(<9 x double> [[A_NEG]], <3 x double> [[B:%.*]], i32 3, i32 3, i32 1)
-; CHECK-NEXT:    ret <3 x double> [[RES]]
+; CHECK-NEXT:    [[RES:%.*]] = tail call <3 x double> @llvm.matrix.multiply.v3f64.v9f64.v3f64(<9 x double> [[A:%.*]], <3 x double> [[B:%.*]], i32 3, i32 3, i32 1)
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg <3 x double> [[RES]]
+; CHECK-NEXT:    ret <3 x double> [[TMP1]]
 ;
   %a.neg = fneg <9 x double> %a
   %res = tail call <3 x double> @llvm.matrix.multiply.v3f64.v9f64.v3f64(<9 x double> %a.neg, <3 x double> %b, i32 3, i32 3, i32 1)
@@ -15,9 +15,9 @@
 
 define <9 x double> @matrix_multiply_v27f64_v3f64(<27 x double> %a, <3 x double> %b) {
 ; CHECK-LABEL: @matrix_multiply_v27f64_v3f64(
-; CHECK-NEXT:    [[A_NEG:%.*]] = fneg <27 x double> [[A:%.*]]
-; CHECK-NEXT:    [[RES:%.*]] = tail call <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double> [[A_NEG]], <3 x double> [[B:%.*]], i32 9, i32 3, i32 1)
-; CHECK-NEXT:    ret <9 x double> [[RES]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg <3 x double> [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double> [[A:%.*]], <3 x double> [[TMP1]], i32 9, i32 3, i32 1)
+; CHECK-NEXT:    ret <9 x double> [[TMP2]]
 ;
   %a.neg = fneg <27 x double> %a
   %res = tail call <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double> %a.neg, <3 x double> %b, i32 9, i32 3, i32 1)
@@ -27,9 +27,9 @@
 
 define <12 x double> @matrix_multiply_v15f64_v20f64(<15 x double> %a, <20 x double> %b) {
 ; CHECK-LABEL: @matrix_multiply_v15f64_v20f64(
-; CHECK-NEXT:    [[A_NEG:%.*]] = fneg <15 x double> [[A:%.*]]
-; CHECK-NEXT:    [[RES:%.*]] = tail call <12 x double> @llvm.matrix.multiply.v12f64.v15f64.v20f64(<15 x double> [[A_NEG]], <20 x double> [[B:%.*]], i32 3, i32 5, i32 4)
-; CHECK-NEXT:    ret <12 x double> [[RES]]
+; CHECK-NEXT:    [[RES:%.*]] = tail call <12 x double> @llvm.matrix.multiply.v12f64.v15f64.v20f64(<15 x double> [[A:%.*]], <20 x double> [[B:%.*]], i32 3, i32 5, i32 4)
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg <12 x double> [[RES]]
+; CHECK-NEXT:    ret <12 x double> [[TMP1]]
 ;
   %a.neg = fneg <15 x double> %a
   %res = tail call <12 x double> @llvm.matrix.multiply.v12f64.v15f64.v20f64(<15 x double> %a.neg, <20 x double> %b, i32 3, i32 5, i32 4)