diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -187,6 +187,29 @@
   LoopInfo *LI;
   OptimizationRemarkEmitter *ORE;
 
+  /// Code-generation options derived from one llvm.matrix.multiply call.
+  struct MultiplyOptions {
+    /// Is floating-point contraction (emitting llvm.fmuladd) allowed?
+    bool AllowContract;
+    /// Should the NUW flag be added? Set by an "nuw"(i1 true) operand bundle.
+    bool HasNUW;
+    /// Should the NSW flag be added? Set by an "nsw"(i1 true) operand bundle.
+    bool HasNSW;
+
+    explicit MultiplyOptions(CallInst *MatMul) {
+      AllowContract = AllowContractEnabled || (isa<FPMathOperator>(MatMul) &&
+                                               MatMul->hasAllowContract());
+      // Malformed bundles (no inputs / non-constant) just leave the flag off.
+      auto IsTrue = [](const auto &Bundle) {
+        return Bundle && !Bundle->Inputs.empty() &&
+               isa<ConstantInt>(Bundle->Inputs.front()) &&
+               cast<ConstantInt>(Bundle->Inputs.front())->isOne();
+      };
+      HasNUW = IsTrue(MatMul->getOperandBundle(LLVMContext::OB_nuw));
+      HasNSW = IsTrue(MatMul->getOperandBundle(LLVMContext::OB_nsw));
+    }
+  };
+
   /// Contains estimates of the number of operations (loads, stores, compute) required to lower a matrix operation.
   struct OpInfoTy {
     /// Number of stores emitted to generate this matrix.
@@ -960,14 +983,15 @@
   }
 
   Value *createMulAdd(Value *Sum, Value *A, Value *B, bool UseFPOp,
-                      IRBuilder<> &Builder, bool AllowContraction,
+                      IRBuilder<> &Builder, MultiplyOptions Opts,
                       unsigned &NumComputeOps) {
     NumComputeOps += getNumOps(A->getType());
     if (!Sum)
-      return UseFPOp ? Builder.CreateFMul(A, B) : Builder.CreateMul(A, B);
+      return UseFPOp ? Builder.CreateFMul(A, B)
+                     : Builder.CreateMul(A, B, "", Opts.HasNUW, Opts.HasNSW);
 
     if (UseFPOp) {
-      if (AllowContraction) {
+      if (Opts.AllowContract) {
         // Use fmuladd for floating point operations and let the backend decide
         // if that's profitable.
         Function *FMulAdd = Intrinsic::getDeclaration(
@@ -980,8 +1004,8 @@
     }
 
     NumComputeOps += getNumOps(A->getType());
-    Value *Mul = Builder.CreateMul(A, B);
-    return Builder.CreateAdd(Sum, Mul);
+    Value *Mul = Builder.CreateMul(A, B, "", Opts.HasNUW, Opts.HasNSW);
+    return Builder.CreateAdd(Sum, Mul, "", Opts.HasNUW, Opts.HasNSW);
   }
 
   /// Cache \p Matrix as result of \p Inst and update the uses of \p Inst. For
@@ -1008,7 +1032,7 @@
   /// Compute \p Result += \p A * \p B for input matrices with left-associating
   /// addition.
   void emitMatrixMultiply(MatrixTy &Result, const MatrixTy &A,
-                          const MatrixTy &B, bool AllowContraction,
+                          const MatrixTy &B, MultiplyOptions Opts,
                           IRBuilder<> &Builder, bool isTiled) {
     const unsigned VF = std::max<unsigned>(
         TTI.getRegisterBitWidth(true) /
@@ -1045,7 +1069,7 @@
             Value *Splat = Builder.CreateVectorSplat(BlockSize, RH, "splat");
             Sum = createMulAdd(isSumZero && K == 0 ? nullptr : Sum, L, Splat,
                                Result.getElementType()->isFloatingPointTy(),
-                               Builder, AllowContraction, NumComputeOps);
+                               Builder, Opts, NumComputeOps);
           }
           Result.setVector(J,
                            insertVector(Result.getVector(J), I, Sum, Builder));
@@ -1069,7 +1093,7 @@
             Value *LH = Builder.CreateExtractElement(A.getVector(I), K);
             Value *Splat = Builder.CreateVectorSplat(BlockSize, LH, "splat");
             Sum = createMulAdd(isSumZero && K == 0 ? nullptr : Sum, Splat, R,
-                               IsFP, Builder, AllowContraction, NumComputeOps);
+                               IsFP, Builder, Opts, NumComputeOps);
           }
           Result.setVector(I,
                            insertVector(Result.getVector(I), J, Sum, Builder));
@@ -1224,8 +1248,6 @@
     Value *BPtr = getNonAliasingPointer(LoadOp1, Store, MatMul);
     Value *CPtr = Store->getPointerOperand();
 
-    bool AllowContract = AllowContractEnabled || (isa<FPMathOperator>(MatMul) &&
-                                                  MatMul->hasAllowContract());
     IRBuilder<> Builder(Store);
     for (unsigned J = 0; J < C; J += TileSize)
       for (unsigned I = 0; I < R; I += TileSize) {
@@ -1243,7 +1265,7 @@
               loadMatrix(BPtr, LoadOp1->getAlign(), LoadOp1->isVolatile(),
                          RShape, Builder.getInt64(K), Builder.getInt64(J),
                          {TileM, TileC}, EltType, Builder);
-          emitMatrixMultiply(Res, A, B, AllowContract, Builder, true);
+          emitMatrixMultiply(Res, A, B, MultiplyOptions(MatMul), Builder, true);
         }
         storeMatrix(Res, CPtr, Store->getAlign(), Store->isVolatile(), {R, M},
                     Builder.getInt64(I), Builder.getInt64(J), EltType, Builder);
@@ -1310,9 +1332,8 @@
     // Initialize the output
     MatrixTy Result(R, C, EltType);
 
-    bool AllowContract = AllowContractEnabled || (isa<FPMathOperator>(MatMul) &&
-                                                  MatMul->hasAllowContract());
-    emitMatrixMultiply(Result, Lhs, Rhs, AllowContract, Builder, false);
+    emitMatrixMultiply(Result, Lhs, Rhs, MultiplyOptions(MatMul), Builder,
+                       false);
     finalizeLowering(MatMul, Result, Builder);
   }
 
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32-nuw-nsw.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32-nuw-nsw.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32-nuw-nsw.ll
@@ -0,0 +1,207 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -lower-matrix-intrinsics -S < %s | FileCheck %s
+; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s
+
+declare <4 x i32> @llvm.matrix.multiply.v4i32.v4i32.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32)
+
+define <4 x i32> @multiply_2x2_nuw(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @multiply_2x2_nuw(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SPLIT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[SPLIT1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[SPLIT2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[SPLIT3:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x i32> undef, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw <1 x i32> [[BLOCK]], [[SPLAT_SPLAT]]
+; CHECK-NEXT:    [[BLOCK4:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT5:%.*]] = insertelement <1 x i32> undef, i32 [[TMP2]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT6:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT5]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw <1 x i32> [[BLOCK4]], [[SPLAT_SPLAT6]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add nuw <1 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <1 x i32> [[TMP4]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> undef, <2 x i32> [[TMP5]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK7:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT8:%.*]] = insertelement <1 x i32> undef, i32 [[TMP7]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT9:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT8]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = mul nuw <1 x i32> [[BLOCK7]], [[SPLAT_SPLAT9]]
+; CHECK-NEXT:    [[BLOCK10:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT11:%.*]] = insertelement <1 x i32> undef, i32 [[TMP9]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT12:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT11]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw <1 x i32> [[BLOCK10]], [[SPLAT_SPLAT12]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add nuw <1 x i32> [[TMP8]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <1 x i32> [[TMP11]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP12]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[BLOCK13:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT14:%.*]] = insertelement <1 x i32> undef, i32 [[TMP14]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT15:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT14]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = mul nuw <1 x i32> [[BLOCK13]], [[SPLAT_SPLAT15]]
+; CHECK-NEXT:    [[BLOCK16:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT17:%.*]] = insertelement <1 x i32> undef, i32 [[TMP16]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT18:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT17]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = mul nuw <1 x i32> [[BLOCK16]], [[SPLAT_SPLAT18]]
+; CHECK-NEXT:    [[TMP18:%.*]] = add nuw <1 x i32> [[TMP15]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <1 x i32> [[TMP18]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x i32> undef, <2 x i32> [[TMP19]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK19:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT20:%.*]] = insertelement <1 x i32> undef, i32 [[TMP21]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT21:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT20]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = mul nuw <1 x i32> [[BLOCK19]], [[SPLAT_SPLAT21]]
+; CHECK-NEXT:    [[BLOCK22:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x i32> undef, i32 [[TMP23]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT23]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP24:%.*]] = mul nuw <1 x i32> [[BLOCK22]], [[SPLAT_SPLAT24]]
+; CHECK-NEXT:    [[TMP25:%.*]] = add nuw <1 x i32> [[TMP22]], [[TMP24]]
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <1 x i32> [[TMP25]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <2 x i32> [[TMP20]], <2 x i32> [[TMP26]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> [[TMP27]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x i32> [[TMP28]]
+;
+entry:
+  %c = call <4 x i32> @llvm.matrix.multiply.v4i32.v4i32.v4i32(<4 x i32> %a, <4 x i32> %b, i32 2, i32 2, i32 2) [ "nuw"(i1 true), "nsw"(i1 false) ]
+  ret <4 x i32> %c
+}
+
+
+define <4 x i32> @multiply_2x2_nsw(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @multiply_2x2_nsw(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SPLIT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[SPLIT1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[SPLIT2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[SPLIT3:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x i32> undef, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nsw <1 x i32> [[BLOCK]], [[SPLAT_SPLAT]]
+; CHECK-NEXT:    [[BLOCK4:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT5:%.*]] = insertelement <1 x i32> undef, i32 [[TMP2]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT6:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT5]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nsw <1 x i32> [[BLOCK4]], [[SPLAT_SPLAT6]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <1 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <1 x i32> [[TMP4]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> undef, <2 x i32> [[TMP5]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK7:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT8:%.*]] = insertelement <1 x i32> undef, i32 [[TMP7]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT9:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT8]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = mul nsw <1 x i32> [[BLOCK7]], [[SPLAT_SPLAT9]]
+; CHECK-NEXT:    [[BLOCK10:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT11:%.*]] = insertelement <1 x i32> undef, i32 [[TMP9]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT12:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT11]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nsw <1 x i32> [[BLOCK10]], [[SPLAT_SPLAT12]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add nsw <1 x i32> [[TMP8]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <1 x i32> [[TMP11]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP12]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[BLOCK13:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT14:%.*]] = insertelement <1 x i32> undef, i32 [[TMP14]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT15:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT14]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = mul nsw <1 x i32> [[BLOCK13]], [[SPLAT_SPLAT15]]
+; CHECK-NEXT:    [[BLOCK16:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT17:%.*]] = insertelement <1 x i32> undef, i32 [[TMP16]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT18:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT17]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = mul nsw <1 x i32> [[BLOCK16]], [[SPLAT_SPLAT18]]
+; CHECK-NEXT:    [[TMP18:%.*]] = add nsw <1 x i32> [[TMP15]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <1 x i32> [[TMP18]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x i32> undef, <2 x i32> [[TMP19]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK19:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT20:%.*]] = insertelement <1 x i32> undef, i32 [[TMP21]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT21:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT20]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = mul nsw <1 x i32> [[BLOCK19]], [[SPLAT_SPLAT21]]
+; CHECK-NEXT:    [[BLOCK22:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x i32> undef, i32 [[TMP23]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT23]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP24:%.*]] = mul nsw <1 x i32> [[BLOCK22]], [[SPLAT_SPLAT24]]
+; CHECK-NEXT:    [[TMP25:%.*]] = add nsw <1 x i32> [[TMP22]], [[TMP24]]
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <1 x i32> [[TMP25]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <2 x i32> [[TMP20]], <2 x i32> [[TMP26]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> [[TMP27]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x i32> [[TMP28]]
+;
+entry:
+  %c = call <4 x i32> @llvm.matrix.multiply.v4i32.v4i32.v4i32(<4 x i32> %a, <4 x i32> %b, i32 2, i32 2, i32 2) [ "nuw"(i1 false), "nsw"(i1 true) ]
+  ret <4 x i32> %c
+}
+
+define <4 x i32> @multiply_2x2_nuw_nsw(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @multiply_2x2_nuw_nsw(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SPLIT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[SPLIT1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[SPLIT2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[SPLIT3:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x i32> undef, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw nsw <1 x i32> [[BLOCK]], [[SPLAT_SPLAT]]
+; CHECK-NEXT:    [[BLOCK4:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT5:%.*]] = insertelement <1 x i32> undef, i32 [[TMP2]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT6:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT5]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw nsw <1 x i32> [[BLOCK4]], [[SPLAT_SPLAT6]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add nuw nsw <1 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <1 x i32> [[TMP4]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> undef, <2 x i32> [[TMP5]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK7:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT8:%.*]] = insertelement <1 x i32> undef, i32 [[TMP7]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT9:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT8]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = mul nuw nsw <1 x i32> [[BLOCK7]], [[SPLAT_SPLAT9]]
+; CHECK-NEXT:    [[BLOCK10:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT11:%.*]] = insertelement <1 x i32> undef, i32 [[TMP9]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT12:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT11]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw nsw <1 x i32> [[BLOCK10]], [[SPLAT_SPLAT12]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add nuw nsw <1 x i32> [[TMP8]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <1 x i32> [[TMP11]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP12]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[BLOCK13:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT14:%.*]] = insertelement <1 x i32> undef, i32 [[TMP14]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT15:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT14]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = mul nuw nsw <1 x i32> [[BLOCK13]], [[SPLAT_SPLAT15]]
+; CHECK-NEXT:    [[BLOCK16:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT17:%.*]] = insertelement <1 x i32> undef, i32 [[TMP16]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT18:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT17]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = mul nuw nsw <1 x i32> [[BLOCK16]], [[SPLAT_SPLAT18]]
+; CHECK-NEXT:    [[TMP18:%.*]] = add nuw nsw <1 x i32> [[TMP15]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <1 x i32> [[TMP18]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x i32> undef, <2 x i32> [[TMP19]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK19:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT20:%.*]] = insertelement <1 x i32> undef, i32 [[TMP21]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT21:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT20]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = mul nuw nsw <1 x i32> [[BLOCK19]], [[SPLAT_SPLAT21]]
+; CHECK-NEXT:    [[BLOCK22:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x i32> undef, i32 [[TMP23]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT23]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP24:%.*]] = mul nuw nsw <1 x i32> [[BLOCK22]], [[SPLAT_SPLAT24]]
+; CHECK-NEXT:    [[TMP25:%.*]] = add nuw nsw <1 x i32> [[TMP22]], [[TMP24]]
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <1 x i32> [[TMP25]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <2 x i32> [[TMP20]], <2 x i32> [[TMP26]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> [[TMP27]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x i32> [[TMP28]]
+;
+entry:
+  %c = call <4 x i32> @llvm.matrix.multiply.v4i32.v4i32.v4i32(<4 x i32> %a, <4 x i32> %b, i32 2, i32 2, i32 2) [ "nuw"(i1 true), "nsw"(i1 true) ]
+  ret <4 x i32> %c
+}