diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -15195,6 +15195,7 @@
 ::

       declare vectorty @llvm.matrix.multiply.*(vectorty %A, vectorty %B, i32 <M>, i32 <N>, i32 <K>)
+      declare vectorty @llvm.matrix.multiply.*(vectorty %A, vectorty %B, i32 <M>, i32 <N>, i32 <K>, i1 <HasNUW>, i1 <HasNSW>)

 Overview:
 """""""""
@@ -15210,6 +15211,10 @@
 must have <M> * <N> elements, %B must have <N> * <K> elements and the returned
 vector must have <M> * <K> elements.

+Optionally, two additional i1 arguments can be provided when operating on
+integer matrices. They indicate whether the NUW and NSW flags, respectively,
+should be added to the generated instructions.
+
 '``llvm.matrix.columnwise.load.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1384,7 +1384,8 @@
                 llvm_anyvector_ty,
                 llvm_i32_ty,
                 llvm_i32_ty,
-                llvm_i32_ty],
+                llvm_i32_ty,
+                llvm_vararg_ty],
                [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<2>, ImmArg<3>,
                 ImmArg<4>]>;
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -4941,6 +4941,27 @@
     NumRows = cast<ConstantInt>(Call.getArgOperand(2));
     NumColumns = cast<ConstantInt>(Call.getArgOperand(4));
     TypeToCheck = cast<VectorType>(Call.getType());
+
+    Assert(Call.getNumArgOperands() == 5 || Call.getNumArgOperands() == 7,
+           "llvm.matrix.multiply takes either 5 or 7 arguments");
+    // Verify variadic arguments (HasNUW, HasNSW)
+    if (Call.getNumArgOperands() == 7) {
+      Assert(TypeToCheck->getElementType()->isIntegerTy(),
+             "llvm.matrix.multiply with 7 arguments must operate on integer "
+             "matrices");
+      Value *Arg5 = Call.getArgOperand(5);
+      Type *Arg5Ty = Arg5->getType();
+      Assert(Arg5Ty->isIntegerTy() && Arg5Ty->getIntegerBitWidth() == 1 &&
+                 isa<ConstantInt>(Arg5),
+             "sixth argument of llvm.matrix.multiply must be an i1 integer "
+             "constant");
+      Value *Arg6 = Call.getArgOperand(6);
+      Type *Arg6Ty = Arg6->getType();
+      Assert(Arg6Ty->isIntegerTy() && Arg6Ty->getIntegerBitWidth() == 1 &&
+                 isa<ConstantInt>(Arg6),
+             "seventh argument of llvm.matrix.multiply must be an i1 integer "
+             "constant");
+    }
     break;
   case Intrinsic::matrix_transpose:
     NumRows = cast<ConstantInt>(Call.getArgOperand(1));
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -185,6 +185,27 @@
   LoopInfo &LI;
   OptimizationRemarkEmitter &ORE;

+  /// Contains a set of options for generating code for llvm.matrix.multiply.
+  struct MultiplyOptions {
+    /// Is floating-point contraction allowed?
+    bool AllowContract;
+    /// Should the NUW flag be added?
+    bool HasNUW;
+    /// Should the NSW flag be added?
+    bool HasNSW;
+
+    MultiplyOptions(CallInst *MatMul) {
+      AllowContract = AllowContractEnabled || (isa<FPMathOperator>(MatMul) &&
+                                               MatMul->hasAllowContract());
+      HasNUW = MatMul->getNumArgOperands() == 7
+                   ? cast<ConstantInt>(MatMul->getArgOperand(5))->getZExtValue()
+                   : false;
+      HasNSW = MatMul->getNumArgOperands() == 7
+                   ? cast<ConstantInt>(MatMul->getArgOperand(6))->getZExtValue()
+                   : false;
+    }
+  };
+
   /// Contains estimates of the number of operations (loads, stores, compute)
   /// required to lower a matrix operation.
   struct OpInfoTy {
     /// Number of stores emitted to generate this matrix.
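(To make the LangRef addition concrete: a 2x2 i32 multiply whose caller can rule
out both unsigned and signed wrap would be written as follows, in the same form
the new multiply-i32-nuw-nsw.ll test below uses:

  %c = call <4 x i32> (<4 x i32>, <4 x i32>, i32, i32, i32, ...) @llvm.matrix.multiply.v4i32.v4i32.v4i32(<4 x i32> %a, <4 x i32> %b, i32 2, i32 2, i32 2, i1 true, i1 true)

The lowering pass reads the two i1 immediates via MultiplyOptions above and tags
every generated mul/add with nuw/nsw accordingly.)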
@@ -932,14 +953,15 @@
 }

 Value *createMulAdd(Value *Sum, Value *A, Value *B, bool UseFPOp,
-                    IRBuilder<> &Builder, bool AllowContraction,
+                    IRBuilder<> &Builder, MultiplyOptions Opts,
                     unsigned &NumComputeOps) {
   NumComputeOps += getNumOps(A->getType());
   if (!Sum)
-    return UseFPOp ? Builder.CreateFMul(A, B) : Builder.CreateMul(A, B);
+    return UseFPOp ? Builder.CreateFMul(A, B)
+                   : Builder.CreateMul(A, B, "", Opts.HasNUW, Opts.HasNSW);

   if (UseFPOp) {
-    if (AllowContraction) {
+    if (Opts.AllowContract) {
       // Use fmuladd for floating point operations and let the backend decide
       // if that's profitable.
       Function *FMulAdd = Intrinsic::getDeclaration(
@@ -952,8 +974,8 @@
   }

   NumComputeOps += getNumOps(A->getType());
-  Value *Mul = Builder.CreateMul(A, B);
-  return Builder.CreateAdd(Sum, Mul);
+  Value *Mul = Builder.CreateMul(A, B, "", Opts.HasNUW, Opts.HasNSW);
+  return Builder.CreateAdd(Sum, Mul, "", Opts.HasNUW, Opts.HasNSW);
 }

 /// Cache \p Matrix as result of \p Inst and update the uses of \p Inst. For
@@ -980,7 +1002,7 @@
   /// Compute \p Result += \p A * \p B for input matrices with left-associating
   /// addition.
   void emitMatrixMultiply(MatrixTy &Result, const MatrixTy &A,
-                          const MatrixTy &B, bool AllowContraction,
+                          const MatrixTy &B, MultiplyOptions Opts,
                           IRBuilder<> &Builder, bool isTiled) {
     const unsigned VF = std::max<unsigned>(
         TTI.getRegisterBitWidth(true) /
@@ -1017,7 +1039,7 @@
           Value *Splat = Builder.CreateVectorSplat(BlockSize, RH, "splat");
           Sum = createMulAdd(isSumZero && K == 0 ? nullptr : Sum, L, Splat,
                              Result.getElementType()->isFloatingPointTy(),
-                             Builder, AllowContraction, NumComputeOps);
+                             Builder, Opts, NumComputeOps);
         }
         Result.setVector(J,
                          insertVector(Result.getVector(J), I, Sum, Builder));
@@ -1041,7 +1063,7 @@
           Value *LH = Builder.CreateExtractElement(A.getVector(I), K);
           Value *Splat = Builder.CreateVectorSplat(BlockSize, LH, "splat");
           Sum = createMulAdd(isSumZero && K == 0 ? nullptr : Sum, Splat, R,
-                             IsFP, Builder, AllowContraction, NumComputeOps);
+                             IsFP, Builder, Opts, NumComputeOps);
         }
         Result.setVector(I,
                          insertVector(Result.getVector(I), J, Sum, Builder));
@@ -1196,8 +1218,6 @@
     Value *BPtr = getNonAliasingPointer(LoadOp1, Store, MatMul);
     Value *CPtr = Store->getPointerOperand();

-    bool AllowContract = AllowContractEnabled || (isa<FPMathOperator>(MatMul) &&
-                                                  MatMul->hasAllowContract());
     IRBuilder<> Builder(Store);
     for (unsigned J = 0; J < C; J += TileSize)
       for (unsigned I = 0; I < R; I += TileSize) {
@@ -1213,7 +1233,7 @@
           MatrixTy B = loadMatrix(BPtr, RShape, Builder.getInt32(K),
                                   Builder.getInt32(J), {TileM, TileC}, EltType,
                                   Builder);
-          emitMatrixMultiply(Res, A, B, AllowContract, Builder, true);
+          emitMatrixMultiply(Res, A, B, MultiplyOptions(MatMul), Builder, true);
         }
       storeMatrix(Res, CPtr, {R, M}, Builder.getInt32(I), Builder.getInt32(J),
                   EltType, Builder);
@@ -1280,9 +1300,8 @@
     // Initialize the output
     MatrixTy Result(R, C, EltType);

-    bool AllowContract = AllowContractEnabled || (isa<FPMathOperator>(MatMul) &&
-                                                  MatMul->hasAllowContract());
-    emitMatrixMultiply(Result, Lhs, Rhs, AllowContract, Builder, false);
+    emitMatrixMultiply(Result, Lhs, Rhs, MultiplyOptions(MatMul), Builder,
+                       false);
     finalizeLowering(MatMul, Result, Builder);
   }
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/bigger-expressions-double.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/bigger-expressions-double.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/bigger-expressions-double.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/bigger-expressions-double.ll
@@ -240,13 +240,13 @@
   %a = load <9 x double>, <9 x double>* %A.Ptr
   %b = load <9 x double>, <9 x double>* %B.Ptr
   %a.trans = call <9 x double> @llvm.matrix.transpose(<9 x double> %a, i32 3, i32 3)
-  %c = call <9 x double> @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double> %a.trans, <9 x double> %b, i32 3, i32 3, i32 3)
+  %c = call <9 x double> (<9 x double>, <9 x double>, i32, i32, i32, ...) @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double> %a.trans, <9 x double> %b, i32 3, i32 3, i32 3)
   store <9 x double> %c, <9 x double>* %C.Ptr
   ret void
 }

 declare <9 x double> @llvm.matrix.transpose(<9 x double>, i32, i32)
-declare <9 x double> @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double>, <9 x double>, i32, i32, i32)
+declare <9 x double> @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double>, <9 x double>, i32, i32, i32, ...)

 define void @transpose_multiply_add(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, <9 x double>* %C.Ptr) {
 ; CHECK-LABEL: @transpose_multiply_add(
@@ -504,7 +504,7 @@
   %a = load <9 x double>, <9 x double>* %A.Ptr
   %b = load <9 x double>, <9 x double>* %B.Ptr
   %a.trans = call <9 x double> @llvm.matrix.transpose(<9 x double> %a, i32 3, i32 3)
-  %mult = call <9 x double> @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double> %a.trans, <9 x double> %b, i32 3, i32 3, i32 3)
+  %mult = call <9 x double> (<9 x double>, <9 x double>, i32, i32, i32, ...) @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double> %a.trans, <9 x double> %b, i32 3, i32 3, i32 3)
   %c = load <9 x double>, <9 x double>* %C.Ptr
   %res = fadd <9 x double> %c, %mult
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/const-gep.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/const-gep.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/const-gep.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/const-gep.ll
@@ -77,9 +77,9 @@
   store i32 %r, i32* %r.addr, align 4
   store i32 %c, i32* %c.addr, align 4
   %0 = load <4 x double>, <4 x double>* getelementptr inbounds ([5 x <4 x double>], [5 x <4 x double>]* @foo, i64 0, i64 0), align 16
-  %mul = call <4 x double> @llvm.matrix.multiply(<4 x double> %0, <4 x double> %0, i32 2, i32 2, i32 2)
+  %mul = call <4 x double> (<4 x double>, <4 x double>, i32, i32, i32, ...) @llvm.matrix.multiply(<4 x double> %0, <4 x double> %0, i32 2, i32 2, i32 2)
   store <4 x double> %0, <4 x double>* getelementptr inbounds ([5 x <4 x double>], [5 x <4 x double>]* @foo, i64 0, i64 2), align 16
   ret void
 }

-declare <4 x double> @llvm.matrix.multiply(<4 x double>, <4 x double>, i32, i32, i32)
+declare <4 x double> @llvm.matrix.multiply(<4 x double>, <4 x double>, i32, i32, i32, ...)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-add-sub-double-row-major.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-add-sub-double-row-major.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-add-sub-double-row-major.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-add-sub-double-row-major.ll
@@ -142,11 +142,11 @@
   store <6 x double> %add, <6 x double>* %a.ptr
   %sub = fsub <6 x double> %b,
   store <6 x double> %sub, <6 x double>* %b.ptr
-  %mul = call <4 x double> @llvm.matrix.multiply.v4f64.v6f64.v6f64(<6 x double> %add, <6 x double> %sub, i32 2, i32 3, i32 2)
+  %mul = call <4 x double> (<6 x double>, <6 x double>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f64.v6f64.v6f64(<6 x double> %add, <6 x double> %sub, i32 2, i32 3, i32 2)
   %c = load <4 x double>, <4 x double>* %c.ptr
   %res = fsub <4 x double> %c, %mul
   store <4 x double> %res, <4 x double>* %c.ptr
   ret void
 }

-declare <4 x double> @llvm.matrix.multiply.v4f64.v6f64.v6f64(<6 x double>, <6 x double>, i32, i32, i32)
+declare <4 x double> @llvm.matrix.multiply.v4f64.v6f64.v6f64(<6 x double>, <6 x double>, i32, i32, i32, ...)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-contraction-fmf.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-contraction-fmf.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-contraction-fmf.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-contraction-fmf.ll
@@ -62,8 +62,8 @@
 ; CHECK-NEXT:    ret <4 x double> [[TMP24]]
 ;
 entry:
-  %c = call contract <4 x double> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
+  %c = call contract <4 x double> (<4 x double>, <4 x double>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
   ret <4 x double> %c
 }

-declare <4 x double> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double>, <4 x double>, i32, i32, i32)
+declare <4 x double> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double>, <4 x double>, i32, i32, i32, ...)
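(The test churn in this patch is mechanical: once the intrinsic is variadic,
textual IR requires the explicit function type at every call site, even when no
extra arguments are passed. Before/after, taken from the updates here:

  %c = call <4 x double> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)

becomes

  %c = call <4 x double> (<4 x double>, <4 x double>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)

with each declare gaining a trailing ", ...".)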
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-contraction.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-contraction.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-contraction.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-contraction.ll
@@ -62,8 +62,8 @@
 ; CHECK-NEXT:    ret <4 x double> [[TMP24]]
 ;
 entry:
-  %c = call <4 x double> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
+  %c = call <4 x double> (<4 x double>, <4 x double>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
   ret <4 x double> %c
 }

-declare <4 x double> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double>, <4 x double>, i32, i32, i32)
+declare <4 x double> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double>, <4 x double>, i32, i32, i32, ...)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-row-major.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-row-major.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-row-major.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-row-major.ll
@@ -67,11 +67,11 @@
 ; RM-NEXT:    ret <4 x double> [[TMP28]]
 ;
 entry:
-  %c = call <4 x double> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
+  %c = call <4 x double> (<4 x double>, <4 x double>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
   ret <4 x double> %c
 }

-declare <4 x double> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double>, <4 x double>, i32, i32, i32)
+declare <4 x double> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double>, <4 x double>, i32, i32, i32, ...)

 define <4 x double> @multiply_1x2(<2 x double> %a, <2 x double> %b) {
@@ -112,11 +112,11 @@
 ; RM-NEXT:    ret <4 x double> [[TMP16]]
 ;
 entry:
-  %c = call <4 x double> @llvm.matrix.multiply.v4f64.v2f64.v2f64(<2 x double> %a, <2 x double> %b, i32 2, i32 1, i32 2)
+  %c = call <4 x double> (<2 x double>, <2 x double>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f64.v2f64.v2f64(<2 x double> %a, <2 x double> %b, i32 2, i32 1, i32 2)
   ret <4 x double> %c
 }

-declare <4 x double> @llvm.matrix.multiply.v4f64.v2f64.v2f64(<2 x double>, <2 x double>, i32, i32, i32)
+declare <4 x double> @llvm.matrix.multiply.v4f64.v2f64.v2f64(<2 x double>, <2 x double>, i32, i32, i32, ...)

 define <9 x double> @multiply_2x3(<6 x double> %a, <6 x double> %b) {
 ; RM-LABEL: @multiply_2x3(
@@ -249,8 +249,8 @@
 ; RM-NEXT:    ret <9 x double> [[TMP65]]
 ;
 entry:
-  %c = call <9 x double> @llvm.matrix.multiply.v6f64.v6f64.v6f64(<6 x double> %a, <6 x double> %b, i32 3, i32 2, i32 3)
+  %c = call <9 x double> (<6 x double>, <6 x double>, i32, i32, i32, ...) @llvm.matrix.multiply.v6f64.v6f64.v6f64(<6 x double> %a, <6 x double> %b, i32 3, i32 2, i32 3)
   ret <9 x double> %c
 }

-declare <9 x double> @llvm.matrix.multiply.v6f64.v6f64.v6f64(<6 x double>, <6 x double>, i32, i32, i32)
+declare <9 x double> @llvm.matrix.multiply.v6f64.v6f64.v6f64(<6 x double>, <6 x double>, i32, i32, i32, ...)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double.ll
@@ -66,11 +66,11 @@
 ; CHECK-NEXT:    ret <4 x double> [[TMP28]]
 ;
 entry:
-  %c = call <4 x double> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
+  %c = call <4 x double> (<4 x double>, <4 x double>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
   ret <4 x double> %c
 }

-declare <4 x double> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double>, <4 x double>, i32, i32, i32)
+declare <4 x double> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double>, <4 x double>, i32, i32, i32, ...)

 define <4 x double> @multiply_1x2(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: @multiply_1x2(
@@ -110,11 +110,11 @@
 ; CHECK-NEXT:    ret <4 x double> [[TMP16]]
 ;
 entry:
-  %c = call <4 x double> @llvm.matrix.multiply.v4f64.v2f64.v2f64(<2 x double> %a, <2 x double> %b, i32 2, i32 1, i32 2)
+  %c = call <4 x double> (<2 x double>, <2 x double>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f64.v2f64.v2f64(<2 x double> %a, <2 x double> %b, i32 2, i32 1, i32 2)
   ret <4 x double> %c
 }

-declare <4 x double> @llvm.matrix.multiply.v4f64.v2f64.v2f64(<2 x double>, <2 x double>, i32, i32, i32)
+declare <4 x double> @llvm.matrix.multiply.v4f64.v2f64.v2f64(<2 x double>, <2 x double>, i32, i32, i32, ...)

 define <9 x double> @multiply_2x3(<6 x double> %a, <6 x double> %b) {
 ; CHECK-LABEL: @multiply_2x3(
@@ -247,8 +247,8 @@
 ; CHECK-NEXT:    ret <9 x double> [[TMP65]]
 ;
 entry:
-  %c = call <9 x double> @llvm.matrix.multiply.v6f64.v6f64.v6f64(<6 x double> %a, <6 x double> %b, i32 3, i32 2, i32 3)
+  %c = call <9 x double> (<6 x double>, <6 x double>, i32, i32, i32, ...) @llvm.matrix.multiply.v6f64.v6f64.v6f64(<6 x double> %a, <6 x double> %b, i32 3, i32 2, i32 3)
   ret <9 x double> %c
 }

-declare <9 x double> @llvm.matrix.multiply.v6f64.v6f64.v6f64(<6 x double>, <6 x double>, i32, i32, i32)
+declare <9 x double> @llvm.matrix.multiply.v6f64.v6f64.v6f64(<6 x double>, <6 x double>, i32, i32, i32, ...)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float-contraction-fmf.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float-contraction-fmf.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float-contraction-fmf.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float-contraction-fmf.ll
@@ -62,8 +62,8 @@
 ; CHECK-NEXT:    ret <4 x float> [[TMP24]]
 ;
 entry:
-  %c = call contract <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %a, <4 x float> %b, i32 2, i32 2, i32 2)
+  %c = call contract <4 x float> (<4 x float>, <4 x float>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %a, <4 x float> %b, i32 2, i32 2, i32 2)
   ret <4 x float> %c
 }

-declare <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float>, <4 x float>, i32, i32, i32)
+declare <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float>, <4 x float>, i32, i32, i32, ...)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float-contraction.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float-contraction.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float-contraction.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float-contraction.ll
@@ -62,8 +62,8 @@
 ; CHECK-NEXT:    ret <4 x float> [[TMP24]]
 ;
 entry:
-  %c = call contract <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %a, <4 x float> %b, i32 2, i32 2, i32 2)
+  %c = call contract <4 x float> (<4 x float>, <4 x float>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %a, <4 x float> %b, i32 2, i32 2, i32 2)
   ret <4 x float> %c
 }

-declare <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float>, <4 x float>, i32, i32, i32)
+declare <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float>, <4 x float>, i32, i32, i32, ...)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float.ll
@@ -66,11 +66,11 @@
 ; CHECK-NEXT:    ret <4 x float> [[TMP28]]
 ;
 entry:
-  %c = call <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %a, <4 x float> %b, i32 2, i32 2, i32 2)
+  %c = call <4 x float> (<4 x float>, <4 x float>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %a, <4 x float> %b, i32 2, i32 2, i32 2)
   ret <4 x float> %c
 }

-declare <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float>, <4 x float>, i32, i32, i32)
+declare <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float>, <4 x float>, i32, i32, i32, ...)

 define <4 x float> @multiply_1x2(<2 x float> %a, <2 x float> %b) {
 ; CHECK-LABEL: @multiply_1x2(
@@ -110,11 +110,11 @@
 ; CHECK-NEXT:    ret <4 x float> [[TMP16]]
 ;
 entry:
-  %c = call <4 x float> @llvm.matrix.multiply.v4f32.v2f32.v2f32(<2 x float> %a, <2 x float> %b, i32 2, i32 1, i32 2)
+  %c = call <4 x float> (<2 x float>, <2 x float>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f32.v2f32.v2f32(<2 x float> %a, <2 x float> %b, i32 2, i32 1, i32 2)
   ret <4 x float> %c
 }

-declare <4 x float> @llvm.matrix.multiply.v4f32.v2f32.v2f32(<2 x float>, <2 x float>, i32, i32, i32)
+declare <4 x float> @llvm.matrix.multiply.v4f32.v2f32.v2f32(<2 x float>, <2 x float>, i32, i32, i32, ...)

 define <9 x float> @multiply_2x3(<6 x float> %a, <6 x float> %b) {
 ; CHECK-LABEL: @multiply_2x3(
@@ -247,8 +247,8 @@
 ; CHECK-NEXT:    ret <9 x float> [[TMP65]]
 ;
 entry:
-  %c = call <9 x float> @llvm.matrix.multiply.v6f32.v6f32.v6f32(<6 x float> %a, <6 x float> %b, i32 3, i32 2, i32 3)
+  %c = call <9 x float> (<6 x float>, <6 x float>, i32, i32, i32, ...) @llvm.matrix.multiply.v6f32.v6f32.v6f32(<6 x float> %a, <6 x float> %b, i32 3, i32 2, i32 3)
   ret <9 x float> %c
 }

-declare <9 x float> @llvm.matrix.multiply.v6f32.v6f32.v6f32(<6 x float>, <6 x float>, i32, i32, i32)
+declare <9 x float> @llvm.matrix.multiply.v6f32.v6f32.v6f32(<6 x float>, <6 x float>, i32, i32, i32, ...)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-multiple-blocks.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-multiple-blocks.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-multiple-blocks.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-multiple-blocks.ll
@@ -277,7 +277,7 @@
 entry:
   %a = load <6 x double>, <6 x double>* %A, align 16
   %b = load <6 x double>, <6 x double>* %B, align 16
-  %c = call <9 x double> @llvm.matrix.multiply(<6 x double> %a, <6 x double> %b, i32 3, i32 2, i32 3)
+  %c = call <9 x double> (<6 x double>, <6 x double>, i32, i32, i32, ...) @llvm.matrix.multiply(<6 x double> %a, <6 x double> %b, i32 3, i32 2, i32 3)
   store <9 x double> %c, <9 x double>* %C, align 16
   br i1 %cond, label %true, label %false
@@ -295,9 +295,9 @@
 end:
   %a.2 = load <6 x double>, <6 x double>* %A, align 16
   %b.2 = load <6 x double>, <6 x double>* %B, align 16
-  %c.2 = call <9 x double> @llvm.matrix.multiply(<6 x double> %a.2, <6 x double> %b.2, i32 3, i32 2, i32 3)
+  %c.2 = call <9 x double> (<6 x double>, <6 x double>, i32, i32, i32, ...) @llvm.matrix.multiply(<6 x double> %a.2, <6 x double> %b.2, i32 3, i32 2, i32 3)
   store <9 x double> %c.2, <9 x double>* %C, align 16
   ret void
 }

-declare <9 x double> @llvm.matrix.multiply(<6 x double>, <6 x double>, i32, i32, i32)
+declare <9 x double> @llvm.matrix.multiply(<6 x double>, <6 x double>, i32, i32, i32, ...)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll
@@ -264,10 +264,10 @@

   %a = load <16 x double>, <16 x double>* %A, align 16
   %b = load <16 x double>, <16 x double>* %B, align 16
-  %c = call <16 x double> @llvm.matrix.multiply(<16 x double> %a, <16 x double> %b, i32 4, i32 4, i32 4)
+  %c = call <16 x double> (<16 x double>, <16 x double>, i32, i32, i32, ...) @llvm.matrix.multiply(<16 x double> %a, <16 x double> %b, i32 4, i32 4, i32 4)
   store <16 x double> %c, <16 x double>* %C, align 16
   ret void
 }

-declare <16 x double> @llvm.matrix.multiply(<16 x double>, <16 x double>, i32, i32, i32)
+declare <16 x double> @llvm.matrix.multiply(<16 x double>, <16 x double>, i32, i32, i32, ...)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32-nuw-nsw.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32-nuw-nsw.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32-nuw-nsw.ll
@@ -0,0 +1,207 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -lower-matrix-intrinsics -S < %s | FileCheck %s
+; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s
+
+declare <4 x i32> @llvm.matrix.multiply.v4i32.v4i32.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, ...)
+
+define <4 x i32> @multiply_2x2_nuw(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @multiply_2x2_nuw(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SPLIT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[SPLIT1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[SPLIT2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[SPLIT3:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x i32> undef, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw <1 x i32> [[BLOCK]], [[SPLAT_SPLAT]]
+; CHECK-NEXT:    [[BLOCK4:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT5:%.*]] = insertelement <1 x i32> undef, i32 [[TMP2]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT6:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT5]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw <1 x i32> [[BLOCK4]], [[SPLAT_SPLAT6]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add nuw <1 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <1 x i32> [[TMP4]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> undef, <2 x i32> [[TMP5]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK7:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT8:%.*]] = insertelement <1 x i32> undef, i32 [[TMP7]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT9:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT8]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = mul nuw <1 x i32> [[BLOCK7]], [[SPLAT_SPLAT9]]
+; CHECK-NEXT:    [[BLOCK10:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT11:%.*]] = insertelement <1 x i32> undef, i32 [[TMP9]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT12:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT11]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw <1 x i32> [[BLOCK10]], [[SPLAT_SPLAT12]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add nuw <1 x i32> [[TMP8]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <1 x i32> [[TMP11]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP12]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[BLOCK13:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT14:%.*]] = insertelement <1 x i32> undef, i32 [[TMP14]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT15:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT14]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = mul nuw <1 x i32> [[BLOCK13]], [[SPLAT_SPLAT15]]
+; CHECK-NEXT:    [[BLOCK16:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT17:%.*]] = insertelement <1 x i32> undef, i32 [[TMP16]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT18:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT17]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = mul nuw <1 x i32> [[BLOCK16]], [[SPLAT_SPLAT18]]
+; CHECK-NEXT:    [[TMP18:%.*]] = add nuw <1 x i32> [[TMP15]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <1 x i32> [[TMP18]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x i32> undef, <2 x i32> [[TMP19]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK19:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT20:%.*]] = insertelement <1 x i32> undef, i32 [[TMP21]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT21:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT20]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = mul nuw <1 x i32> [[BLOCK19]], [[SPLAT_SPLAT21]]
+; CHECK-NEXT:    [[BLOCK22:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x i32> undef, i32 [[TMP23]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT23]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP24:%.*]] = mul nuw <1 x i32> [[BLOCK22]], [[SPLAT_SPLAT24]]
+; CHECK-NEXT:    [[TMP25:%.*]] = add nuw <1 x i32> [[TMP22]], [[TMP24]]
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <1 x i32> [[TMP25]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <2 x i32> [[TMP20]], <2 x i32> [[TMP26]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> [[TMP27]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x i32> [[TMP28]]
+;
+entry:
+  %c = call <4 x i32> (<4 x i32>, <4 x i32>, i32, i32, i32, ...) @llvm.matrix.multiply.v4i32.v4i32.v4i32(<4 x i32> %a, <4 x i32> %b, i32 2, i32 2, i32 2, i1 true, i1 false)
+  ret <4 x i32> %c
+}
+
+
+define <4 x i32> @multiply_2x2_nsw(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @multiply_2x2_nsw(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SPLIT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[SPLIT1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[SPLIT2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[SPLIT3:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x i32> undef, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nsw <1 x i32> [[BLOCK]], [[SPLAT_SPLAT]]
+; CHECK-NEXT:    [[BLOCK4:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT5:%.*]] = insertelement <1 x i32> undef, i32 [[TMP2]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT6:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT5]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nsw <1 x i32> [[BLOCK4]], [[SPLAT_SPLAT6]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <1 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <1 x i32> [[TMP4]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> undef, <2 x i32> [[TMP5]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK7:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT8:%.*]] = insertelement <1 x i32> undef, i32 [[TMP7]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT9:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT8]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = mul nsw <1 x i32> [[BLOCK7]], [[SPLAT_SPLAT9]]
+; CHECK-NEXT:    [[BLOCK10:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT11:%.*]] = insertelement <1 x i32> undef, i32 [[TMP9]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT12:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT11]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nsw <1 x i32> [[BLOCK10]], [[SPLAT_SPLAT12]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add nsw <1 x i32> [[TMP8]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <1 x i32> [[TMP11]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP12]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[BLOCK13:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT14:%.*]] = insertelement <1 x i32> undef, i32 [[TMP14]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT15:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT14]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = mul nsw <1 x i32> [[BLOCK13]], [[SPLAT_SPLAT15]]
+; CHECK-NEXT:    [[BLOCK16:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT17:%.*]] = insertelement <1 x i32> undef, i32 [[TMP16]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT18:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT17]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = mul nsw <1 x i32> [[BLOCK16]], [[SPLAT_SPLAT18]]
+; CHECK-NEXT:    [[TMP18:%.*]] = add nsw <1 x i32> [[TMP15]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <1 x i32> [[TMP18]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x i32> undef, <2 x i32> [[TMP19]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK19:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT20:%.*]] = insertelement <1 x i32> undef, i32 [[TMP21]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT21:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT20]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = mul nsw <1 x i32> [[BLOCK19]], [[SPLAT_SPLAT21]]
+; CHECK-NEXT:    [[BLOCK22:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x i32> undef, i32 [[TMP23]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT23]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP24:%.*]] = mul nsw <1 x i32> [[BLOCK22]], [[SPLAT_SPLAT24]]
+; CHECK-NEXT:    [[TMP25:%.*]] = add nsw <1 x i32> [[TMP22]], [[TMP24]]
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <1 x i32> [[TMP25]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <2 x i32> [[TMP20]], <2 x i32> [[TMP26]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> [[TMP27]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x i32> [[TMP28]]
+;
+entry:
+  %c = call <4 x i32> (<4 x i32>, <4 x i32>, i32, i32, i32, ...) @llvm.matrix.multiply.v4i32.v4i32.v4i32(<4 x i32> %a, <4 x i32> %b, i32 2, i32 2, i32 2, i1 false, i1 true)
+  ret <4 x i32> %c
+}
+
+define <4 x i32> @multiply_2x2_nuw_nsw(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @multiply_2x2_nuw_nsw(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SPLIT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[SPLIT1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[SPLIT2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[SPLIT3:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x i32> undef, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw nsw <1 x i32> [[BLOCK]], [[SPLAT_SPLAT]]
+; CHECK-NEXT:    [[BLOCK4:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT5:%.*]] = insertelement <1 x i32> undef, i32 [[TMP2]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT6:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT5]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw nsw <1 x i32> [[BLOCK4]], [[SPLAT_SPLAT6]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add nuw nsw <1 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <1 x i32> [[TMP4]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> undef, <2 x i32> [[TMP5]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK7:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT8:%.*]] = insertelement <1 x i32> undef, i32 [[TMP7]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT9:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT8]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = mul nuw nsw <1 x i32> [[BLOCK7]], [[SPLAT_SPLAT9]]
+; CHECK-NEXT:    [[BLOCK10:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT11:%.*]] = insertelement <1 x i32> undef, i32 [[TMP9]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT12:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT11]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw nsw <1 x i32> [[BLOCK10]], [[SPLAT_SPLAT12]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add nuw nsw <1 x i32> [[TMP8]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <1 x i32> [[TMP11]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP12]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[BLOCK13:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT14:%.*]] = insertelement <1 x i32> undef, i32 [[TMP14]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT15:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT14]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = mul nuw nsw <1 x i32> [[BLOCK13]], [[SPLAT_SPLAT15]]
+; CHECK-NEXT:    [[BLOCK16:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT17:%.*]] = insertelement <1 x i32> undef, i32 [[TMP16]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT18:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT17]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = mul nuw nsw <1 x i32> [[BLOCK16]], [[SPLAT_SPLAT18]]
+; CHECK-NEXT:    [[TMP18:%.*]] = add nuw nsw <1 x i32> [[TMP15]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <1 x i32> [[TMP18]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x i32> undef, <2 x i32> [[TMP19]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK19:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT20:%.*]] = insertelement <1 x i32> undef, i32 [[TMP21]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT21:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT20]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = mul nuw nsw <1 x i32> [[BLOCK19]], [[SPLAT_SPLAT21]]
+; CHECK-NEXT:    [[BLOCK22:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x i32> undef, i32 [[TMP23]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT23]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP24:%.*]] = mul nuw nsw <1 x i32> [[BLOCK22]], [[SPLAT_SPLAT24]]
+; CHECK-NEXT:    [[TMP25:%.*]] = add nuw nsw <1 x i32> [[TMP22]], [[TMP24]]
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <1 x i32> [[TMP25]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <2 x i32> [[TMP20]], <2 x i32> [[TMP26]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> [[TMP27]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x i32> [[TMP28]]
+;
+entry:
+  %c = call <4 x i32> (<4 x i32>, <4 x i32>, i32, i32, i32, ...) @llvm.matrix.multiply.v4i32.v4i32.v4i32(<4 x i32> %a, <4 x i32> %b, i32 2, i32 2, i32 2, i1 true, i1 true)
+  ret <4 x i32> %c
+}
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32-row-major.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32-row-major.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32-row-major.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32-row-major.ll
@@ -67,11 +67,11 @@
 ; RM-NEXT:    ret <4 x i32> [[TMP28]]
 ;
 entry:
-  %c = call <4 x i32> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x i32> %a, <4 x i32> %b, i32 2, i32 2, i32 2)
+  %c = call <4 x i32> (<4 x i32>, <4 x i32>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x i32> %a, <4 x i32> %b, i32 2, i32 2, i32 2)
   ret <4 x i32> %c
 }

-declare <4 x i32> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x i32>, <4 x i32>, i32, i32, i32)
+declare <4 x i32> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x i32>, <4 x i32>, i32, i32, i32, ...)

 define <4 x i32> @multiply_1x2(<2 x i32> %a, <2 x i32> %b) {
@@ -112,11 +112,11 @@
 ; RM-NEXT:    ret <4 x i32> [[TMP16]]
 ;
 entry:
-  %c = call <4 x i32> @llvm.matrix.multiply.v4f64.v2f64.v2f64(<2 x i32> %a, <2 x i32> %b, i32 2, i32 1, i32 2)
+  %c = call <4 x i32> (<2 x i32>, <2 x i32>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f64.v2f64.v2f64(<2 x i32> %a, <2 x i32> %b, i32 2, i32 1, i32 2)
   ret <4 x i32> %c
 }

-declare <4 x i32> @llvm.matrix.multiply.v4f64.v2f64.v2f64(<2 x i32>, <2 x i32>, i32, i32, i32)
+declare <4 x i32> @llvm.matrix.multiply.v4f64.v2f64.v2f64(<2 x i32>, <2 x i32>, i32, i32, i32, ...)

 define <9 x i32> @multiply_2x3(<6 x i32> %a, <6 x i32> %b) {
 ; RM-LABEL: @multiply_2x3(
@@ -249,8 +249,8 @@
 ; RM-NEXT:    ret <9 x i32> [[TMP65]]
 ;
 entry:
-  %c = call <9 x i32> @llvm.matrix.multiply.v6f64.v6f64.v6f64(<6 x i32> %a, <6 x i32> %b, i32 3, i32 2, i32 3)
+  %c = call <9 x i32> (<6 x i32>, <6 x i32>, i32, i32, i32, ...) @llvm.matrix.multiply.v6f64.v6f64.v6f64(<6 x i32> %a, <6 x i32> %b, i32 3, i32 2, i32 3)
   ret <9 x i32> %c
 }

-declare <9 x i32> @llvm.matrix.multiply.v6f64.v6f64.v6f64(<6 x i32>, <6 x i32>, i32, i32, i32)
+declare <9 x i32> @llvm.matrix.multiply.v6f64.v6f64.v6f64(<6 x i32>, <6 x i32>, i32, i32, i32, ...)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32.ll
@@ -66,11 +66,11 @@
 ; CHECK-NEXT:    ret <4 x i32> [[TMP28]]
 ;
 entry:
-  %c = call <4 x i32> @llvm.matrix.multiply.v4i32.v4i32.v4i32(<4 x i32> %a, <4 x i32> %b, i32 2, i32 2, i32 2)
+  %c = call <4 x i32> (<4 x i32>, <4 x i32>, i32, i32, i32, ...) @llvm.matrix.multiply.v4i32.v4i32.v4i32(<4 x i32> %a, <4 x i32> %b, i32 2, i32 2, i32 2)
   ret <4 x i32> %c
 }

-declare <4 x i32> @llvm.matrix.multiply.v4i32.v4i32.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32)
+declare <4 x i32> @llvm.matrix.multiply.v4i32.v4i32.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, ...)

 define <4 x i32> @multiply_1x2(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: @multiply_1x2(
@@ -110,11 +110,11 @@
 ; CHECK-NEXT:    ret <4 x i32> [[TMP16]]
 ;
 entry:
-  %c = call <4 x i32> @llvm.matrix.multiply.v4i32.v2i32.v2i32(<2 x i32> %a, <2 x i32> %b, i32 2, i32 1, i32 2)
+  %c = call <4 x i32> (<2 x i32>, <2 x i32>, i32, i32, i32, ...) @llvm.matrix.multiply.v4i32.v2i32.v2i32(<2 x i32> %a, <2 x i32> %b, i32 2, i32 1, i32 2)
   ret <4 x i32> %c
 }

-declare <4 x i32> @llvm.matrix.multiply.v4i32.v2i32.v2i32(<2 x i32>, <2 x i32>, i32, i32, i32)
+declare <4 x i32> @llvm.matrix.multiply.v4i32.v2i32.v2i32(<2 x i32>, <2 x i32>, i32, i32, i32, ...)

 define <9 x i32> @multiply_2x3(<6 x i32> %a, <6 x i32> %b) {
 ; CHECK-LABEL: @multiply_2x3(
@@ -247,8 +247,8 @@
 ; CHECK-NEXT:    ret <9 x i32> [[TMP65]]
 ;
 entry:
-  %c = call <9 x i32> @llvm.matrix.multiply.v6i32.v6i32.v6i32(<6 x i32> %a, <6 x i32> %b, i32 3, i32 2, i32 3)
+  %c = call <9 x i32> (<6 x i32>, <6 x i32>, i32, i32, i32, ...) @llvm.matrix.multiply.v6i32.v6i32.v6i32(<6 x i32> %a, <6 x i32> %b, i32 3, i32 2, i32 3)
   ret <9 x i32> %c
 }

-declare <9 x i32> @llvm.matrix.multiply.v6i32.v6i32.v6i32(<6 x i32>, <6 x i32>, i32, i32, i32)
+declare <9 x i32> @llvm.matrix.multiply.v6i32.v6i32.v6i32(<6 x i32>, <6 x i32>, i32, i32, i32, ...)
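(For contrast with multiply-i32-nuw-nsw.ll above: these 5-argument integer calls
are unchanged in behavior and still lower to plain mul/add. Only when the
optional i1 arguments are passed does each generated block carry the flags; a
sketch of one accumulation step, with illustrative names in place of the
autogenerated TMP/BLOCK values the CHECK lines use:

  %mul = mul nuw nsw <1 x i32> %block, %splat
  %sum.next = add nuw nsw <1 x i32> %sum, %mul
)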
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backwards-unsupported.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backwards-unsupported.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backwards-unsupported.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backwards-unsupported.ll
@@ -85,7 +85,7 @@
 if.end:                                           ; preds = %if.then, %if.else
   %merge = phi <9 x double> [ %A.trans, %if.then], [ %B.trans, %if.else ]
-  %res = tail call <9 x double> @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double> %C, <9 x double> %merge, i32 3, i32 3, i32 3)
+  %res = tail call <9 x double> (<9 x double>, <9 x double>, i32, i32, i32, ...) @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double> %C, <9 x double> %merge, i32 3, i32 3, i32 3)
   ret <9 x double> %res
 }
@@ -126,10 +126,10 @@
 ;
   %A.trans = tail call <9 x double> @llvm.matrix.transpose.v9f64(<9 x double> %A, i32 3, i32 3)
   %A.foo = call <9 x double> @foo(<9 x double> %A.trans)
-  %res = tail call <9 x double> @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double> %B, <9 x double> %A.foo, i32 3, i32 3, i32 3)
+  %res = tail call <9 x double> (<9 x double>, <9 x double>, i32, i32, i32, ...) @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double> %B, <9 x double> %A.foo, i32 3, i32 3, i32 3)
   ret <9 x double> %res
 }

-declare <9 x double> @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double>, <9 x double>, i32 immarg, i32 immarg, i32 immarg)
+declare <9 x double> @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double>, <9 x double>, i32 immarg, i32 immarg, i32 immarg, ...)
 declare <9 x double> @llvm.matrix.transpose.v9f64(<9 x double>, i32 immarg, i32 immarg)
 declare <9 x double> @foo(<9 x double>)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-shared-subtrees.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-shared-subtrees.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-shared-subtrees.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-shared-subtrees.ll
@@ -94,7 +94,7 @@
   %tmp17 = tail call <8 x double> @llvm.matrix.transpose.v8f64(<8 x double> %shared.load, i32 2, i32 4), !dbg !10
   tail call void @llvm.matrix.columnwise.store.v8f64.p0f64(<8 x double> %tmp17, double* %arg3, i32 10, i32 4, i32 2), !dbg !10
   %tmp18 = tail call <60 x double> @llvm.matrix.columnwise.load.v60f64.p0f64(double* %arg2, i32 20, i32 4, i32 15), !dbg !11
-  %tmp48 = tail call <60 x double> @llvm.matrix.multiply.v60f64.v8f64.v30f64(<8 x double> %tmp17, <30 x double> %shared.load.2, i32 4, i32 2, i32 15), !dbg !11
+  %tmp48 = tail call <60 x double> (<8 x double>, <30 x double>, i32, i32, i32, ...) @llvm.matrix.multiply.v60f64.v8f64.v30f64(<8 x double> %tmp17, <30 x double> %shared.load.2, i32 4, i32 2, i32 15), !dbg !11
   %tmp49 = fsub <60 x double> %tmp18, %tmp48, !dbg !11
   tail call void @llvm.matrix.columnwise.store.v60f64.p0f64(<60 x double> %tmp49, double* %arg2, i32 10, i32 4, i32 15), !dbg !11
   ret void
@@ -106,7 +106,7 @@
 declare <60 x double> @llvm.matrix.columnwise.load.v60f64.p0f64(double*, i32, i32 immarg, i32 immarg)
 declare void @llvm.matrix.columnwise.store.v60f64.p0f64(<60 x double>, double* writeonly, i32, i32 immarg, i32 immarg)
 declare void @llvm.matrix.columnwise.store.v8f64.p0f64(<8 x double>, double* writeonly, i32, i32 immarg, i32 immarg)
-declare <60 x double> @llvm.matrix.multiply.v60f64.v8f64.v30f64(<8 x double>, <30 x double>, i32 immarg, i32 immarg, i32 immarg)
+declare <60 x double> @llvm.matrix.multiply.v60f64.v8f64.v30f64(<8 x double>, <30 x double>, i32 immarg, i32 immarg, i32 immarg, ...)

 !llvm.module.flags = !{!0, !1, !2, !3}
 !llvm.dbg.cu = !{!4}
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll
@@ -27,12 +27,12 @@
 define void @multiply(<12 x double>* %A, <12 x double>* %B, <4 x double>* %C) !dbg !25 {
   %A.matrix = load <12 x double>, <12 x double>* %A, !dbg !26
   %B.matrix = load <12 x double>, <12 x double>* %B, !dbg !26
-  %t = call <4 x double> @llvm.matrix.multiply(<12 x double> %A.matrix, <12 x double> %B.matrix, i32 2, i32 6, i32 2), !dbg !26
+  %t = call <4 x double> (<12 x double>, <12 x double>, i32, i32, i32, ...) @llvm.matrix.multiply(<12 x double> %A.matrix, <12 x double> %B.matrix, i32 2, i32 6, i32 2), !dbg !26
   store <4 x double> %t, <4 x double>* %C, !dbg !26
   ret void
 }

-declare <4 x double> @llvm.matrix.multiply(<12 x double>, <12 x double>, i32, i32, i32)
+declare <4 x double> @llvm.matrix.multiply(<12 x double>, <12 x double>, i32, i32, i32, ...)

 ; CHECK-LABEL: remark: test.h:60:20: Lowered with 6 stores, 6 loads, 0 compute ops
 ; CHECK-NEXT:  store(
@@ -101,7 +101,7 @@
   %C.matrix = load <12 x double>, <12 x double>* %C, !dbg !34
   %D.matrix = load <12 x double>, <12 x double>* %D, !dbg !34
-  %Mult.matrix = call <4 x double> @llvm.matrix.multiply(<12 x double> %C.matrix, <12 x double> %D.matrix, i32 2, i32 6, i32 2), !dbg !34
+  %Mult.matrix = call <4 x double> (<12 x double>, <12 x double>, i32, i32, i32, ...) @llvm.matrix.multiply(<12 x double> %C.matrix, <12 x double> %D.matrix, i32 2, i32 6, i32 2), !dbg !34
   store <4 x double> %Mult.matrix, <4 x double>* %E, !dbg !34
   ret void
diff --git a/llvm/test/Verifier/matrix-intrinsics.ll b/llvm/test/Verifier/matrix-intrinsics.ll
--- a/llvm/test/Verifier/matrix-intrinsics.ll
+++ b/llvm/test/Verifier/matrix-intrinsics.ll
@@ -10,12 +10,34 @@
   ret <4 x float> %result.2
 }

-declare <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float>, <4 x float>, i32, i32, i32)
+declare <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float>, <4 x float>, i32, i32, i32, ...)
 define <4 x float> @multiply(<4 x float> %m) {
 ; CHECK-NEXT: result of a matrix operation does not fit in the returned vector
 ; CHECK-NEXT: result of a matrix operation does not fit in the returned vector
-  %result.1 = call <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %m, <4 x float> %m, i32 3, i32 2, i32 2)
-  %result.2 = call <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %result.1, <4 x float> %m, i32 2, i32 2, i32 1)
+  %result.1 = call <4 x float> (<4 x float>, <4 x float>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %m, <4 x float> %m, i32 3, i32 2, i32 2)
+  %result.2 = call <4 x float> (<4 x float>, <4 x float>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %result.1, <4 x float> %m, i32 2, i32 2, i32 1)
+  ret <4 x float> %result.2
+}
+
+declare <6 x i32> @llvm.matrix.multiply.v6i32.v6i32.v6i32(<6 x i32>, <6 x i32>, i32, i32, i32, ...)
+define <4 x float> @multiply_num_args(<4 x float> %m, <6 x i32> %ia, i1 %c) {
+; CHECK-NEXT: llvm.matrix.multiply takes either 5 or 7 arguments
+; CHECK-NEXT: llvm.matrix.multiply takes either 5 or 7 arguments
+  %result.1 = call <4 x float> (<4 x float>, <4 x float>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %m, <4 x float> %m, i32 3, i32 2, i32 2, i32 1)
+  %result.2 = call <4 x float> (<4 x float>, <4 x float>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %result.1, <4 x float> %m, i32 2, i32 2, i32 1, i1 0, i1 1, i1 0)
+
+; CHECK-NEXT: sixth argument of llvm.matrix.multiply must be an i1 integer constant
+; CHECK-NEXT: seventh argument of llvm.matrix.multiply must be an i1 integer constant
+  %result.3 = call <6 x i32> (<6 x i32>, <6 x i32>, i32, i32, i32, ...) @llvm.matrix.multiply.v6i32.v6i32.v6i32(<6 x i32> %ia, <6 x i32> %ia, i32 2, i32 3, i32 2, i32 0, i1 1)
+  %result.4 = call <6 x i32> (<6 x i32>, <6 x i32>, i32, i32, i32, ...) @llvm.matrix.multiply.v6i32.v6i32.v6i32(<6 x i32> %ia, <6 x i32> %ia, i32 2, i32 3, i32 2, i1 0, i8* null)
+
+; CHECK-NEXT: llvm.matrix.multiply takes either 5 or 7 arguments
+  %result.6 = call <6 x i32> (<6 x i32>, <6 x i32>, i32, i32, i32, ...) @llvm.matrix.multiply.v6i32.v6i32.v6i32(<6 x i32> %ia, <6 x i32> %ia, i32 2, i32 3, i32 2, i1 0, i1 0, i32 12)
+
+; CHECK-NEXT: sixth argument of llvm.matrix.multiply must be an i1 integer constant
+; CHECK-NEXT: seventh argument of llvm.matrix.multiply must be an i1 integer constant
+  %result.8 = call <6 x i32> (<6 x i32>, <6 x i32>, i32, i32, i32, ...) @llvm.matrix.multiply.v6i32.v6i32.v6i32(<6 x i32> %ia, <6 x i32> %ia, i32 2, i32 3, i32 2, i1 %c, i1 0)
+  %result.7 = call <6 x i32> (<6 x i32>, <6 x i32>, i32, i32, i32, ...) @llvm.matrix.multiply.v6i32.v6i32.v6i32(<6 x i32> %ia, <6 x i32> %ia, i32 2, i32 3, i32 2, i1 0, i1 %c)
+  ret <4 x float> %result.2
+}