diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -403,6 +403,19 @@
   /// Map from instructions to their produced column matrix.
   MapVector<Value *, MatrixTy> Inst2ColumnMatrix;
 
+private:
+  static FastMathFlags getFastMathFlags(Instruction *Inst) {
+    FastMathFlags FMF;
+
+    if (isa<FPMathOperator>(*Inst)) {
+      FMF = Inst->getFastMathFlags();
+    }
+
+    FMF.setAllowContract(AllowContractEnabled || FMF.allowContract());
+
+    return FMF;
+  }
+
 public:
   LowerMatrixIntrinsics(Function &F, TargetTransformInfo &TTI,
                         AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI,
@@ -1149,9 +1162,9 @@
   /// column-major.  If \p IsScalarMatrixTransposed we assume the appropriate
   /// operand is transposed.
   void emitMatrixMultiply(MatrixTy &Result, const MatrixTy &A,
-                          const MatrixTy &B, bool AllowContraction,
-                          IRBuilder<> &Builder, bool IsTiled,
-                          bool IsScalarMatrixTransposed) {
+                          const MatrixTy &B, IRBuilder<> &Builder, bool IsTiled,
+                          bool IsScalarMatrixTransposed,
+                          FastMathFlags FMF) {
     const unsigned VF = std::max<unsigned>(
         TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
                 .getFixedSize() /
@@ -1166,6 +1179,9 @@
            Result.isColumnMajor() == A.isColumnMajor() &&
            "operands must agree on matrix layout");
     unsigned NumComputeOps = 0;
+
+    Builder.setFastMathFlags(FMF);
+
     if (A.isColumnMajor()) {
       // Multiply columns from the first operand with scalars from the second
       // operand. Then move along the K axes and accumulate the columns.  With
@@ -1189,8 +1205,8 @@
                 IsScalarMatrixTransposed ? J : K);
             Value *Splat = Builder.CreateVectorSplat(BlockSize, RH, "splat");
             Sum = createMulAdd(isSumZero && K == 0 ? nullptr : Sum, L, Splat,
-                               Result.getElementType()->isFloatingPointTy(),
-                               Builder, AllowContraction, NumComputeOps);
+                               IsFP, Builder, FMF.allowContract(),
+                               NumComputeOps);
           }
           Result.setVector(J,
                            insertVector(Result.getVector(J), I, Sum, Builder));
@@ -1216,7 +1232,8 @@
                 IsScalarMatrixTransposed ? I : K);
             Value *Splat = Builder.CreateVectorSplat(BlockSize, LH, "splat");
             Sum = createMulAdd(isSumZero && K == 0 ? nullptr : Sum, Splat, R,
-                               IsFP, Builder, AllowContraction, NumComputeOps);
+                               IsFP, Builder, FMF.allowContract(),
+                               NumComputeOps);
           }
           Result.setVector(I,
                            insertVector(Result.getVector(I), J, Sum, Builder));
@@ -1354,8 +1371,7 @@
   }
 
   void createTiledLoops(CallInst *MatMul, Value *LPtr, ShapeInfo LShape,
-                        Value *RPtr, ShapeInfo RShape, StoreInst *Store,
-                        bool AllowContract) {
+                        Value *RPtr, ShapeInfo RShape, StoreInst *Store) {
     auto *EltType = cast<VectorType>(MatMul->getType())->getElementType();
 
     // Create the main tiling loop nest.
@@ -1391,7 +1407,8 @@
                             {TileSize, TileSize}, EltType, Builder);
     MatrixTy B = loadMatrix(RPtr, {}, false, RShape, TI.CurrentK, TI.CurrentCol,
                             {TileSize, TileSize}, EltType, Builder);
-    emitMatrixMultiply(TileResult, A, B, AllowContract, Builder, true, false);
+    emitMatrixMultiply(TileResult, A, B, Builder, true, false,
+                       getFastMathFlags(MatMul));
     // Store result after the inner loop is done.
     Builder.SetInsertPoint(TI.RowLoopLatch->getTerminator());
     storeMatrix(TileResult, Store->getPointerOperand(), Store->getAlign(),
@@ -1430,11 +1447,8 @@
     Value *BPtr = getNonAliasingPointer(LoadOp1, Store, MatMul);
     Value *CPtr = Store->getPointerOperand();
 
-    bool AllowContract = AllowContractEnabled || (isa<FPMathOperator>(MatMul) &&
-                                                  MatMul->hasAllowContract());
     if (TileUseLoops && (R % TileSize == 0 && C % TileSize == 0))
-      createTiledLoops(MatMul, APtr, LShape, BPtr, RShape, Store,
-                       AllowContract);
+      createTiledLoops(MatMul, APtr, LShape, BPtr, RShape, Store);
     else {
       IRBuilder<> Builder(Store);
       for (unsigned J = 0; J < C; J += TileSize)
@@ -1453,7 +1467,8 @@
                 loadMatrix(BPtr, LoadOp1->getAlign(), LoadOp1->isVolatile(),
                            RShape, Builder.getInt64(K), Builder.getInt64(J),
                            {TileM, TileC}, EltType, Builder);
-            emitMatrixMultiply(Res, A, B, AllowContract, Builder, true, false);
+            emitMatrixMultiply(Res, A, B, Builder, true, false,
+                               getFastMathFlags(MatMul));
           }
           storeMatrix(Res, CPtr, Store->getAlign(), Store->isVolatile(), {R, M},
                       Builder.getInt64(I), Builder.getInt64(J), EltType,
@@ -1520,10 +1535,8 @@
       // Initialize the output
       MatrixTy Result(R, C, EltType);
 
-      bool AllowContract =
-          AllowContractEnabled ||
-          (isa<FPMathOperator>(MatMul) && MatMul->hasAllowContract());
-      emitMatrixMultiply(Result, MA, MB, AllowContract, Builder, false, true);
+      emitMatrixMultiply(Result, MA, MB, Builder, false, true,
+                         getFastMathFlags(MatMul));
 
       FusedInsts.insert(MatMul);
       FusedInsts.insert(cast<Instruction>(Transpose));
@@ -1578,9 +1591,8 @@
     assert(Lhs.getElementType() == Result.getElementType() &&
            "Matrix multiply result element type does not match arguments.");
 
-    bool AllowContract = AllowContractEnabled || (isa<FPMathOperator>(MatMul) &&
-                                                  MatMul->hasAllowContract());
-    emitMatrixMultiply(Result, Lhs, Rhs, AllowContract, Builder, false, false);
+    emitMatrixMultiply(Result, Lhs, Rhs, Builder, false, false,
+                       getFastMathFlags(MatMul));
     finalizeLowering(MatMul, Result, Builder);
   }
 
@@ -1665,6 +1677,8 @@
            Result.isColumnMajor() == A.isColumnMajor() &&
            "operands must agree on matrix layout");
 
+    Builder.setFastMathFlags(getFastMathFlags(Inst));
+
     // Helper to perform binary op on vectors.
     auto BuildVectorOp = [&Builder, Inst](Value *LHS, Value *RHS) {
       switch (Inst->getOpcode()) {
@@ -1709,6 +1723,8 @@
     MatrixTy Result;
     MatrixTy M = getMatrix(Op, Shape, Builder);
 
+    Builder.setFastMathFlags(getFastMathFlags(Inst));
+
     // Helper to perform unary op on vectors.
     auto BuildVectorOp = [&Builder, Inst](Value *Op) {
       switch (Inst->getOpcode()) {
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-contraction-fmf.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-contraction-fmf.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-contraction-fmf.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-contraction-fmf.ll
@@ -14,48 +14,48 @@
 ; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x double> [[SPLIT2]], i64 0
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> poison, double [[TMP0]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul <1 x double> [[BLOCK]], [[SPLAT_SPLAT]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul contract <1 x double> [[BLOCK]], [[SPLAT_SPLAT]]
 ; CHECK-NEXT:    [[BLOCK4:%.*]] = shufflevector <2 x double> [[SPLIT1]], <2 x double> poison, <1 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[SPLIT2]], i64 1
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT5:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT6:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT5]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK4]], <1 x double> [[SPLAT_SPLAT6]], <1 x double> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK4]], <1 x double> [[SPLAT_SPLAT6]], <1 x double> [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> poison, <2 x i32> <i32 0, i32 undef>
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> undef, <2 x double> [[TMP4]], <2 x i32> <i32 2, i32 1>
 ; CHECK-NEXT:    [[BLOCK7:%.*]] = shufflevector <2 x double> [[SPLIT]], <2 x double> poison, <1 x i32> <i32 1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[SPLIT2]], i64 0
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT8:%.*]] = insertelement <1 x double> poison, double [[TMP6]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT9:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT8]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = fmul <1 x double> [[BLOCK7]], [[SPLAT_SPLAT9]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul contract <1 x double> [[BLOCK7]], [[SPLAT_SPLAT9]]
 ; CHECK-NEXT:    [[BLOCK10:%.*]] = shufflevector <2 x double> [[SPLIT1]], <2 x double> poison, <1 x i32> <i32 1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[SPLIT2]], i64 1
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT11:%.*]] = insertelement <1 x double> poison, double [[TMP8]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT12:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT11]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP9:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK10]], <1 x double> [[SPLAT_SPLAT12]], <1 x double> [[TMP7]])
+; CHECK-NEXT:    [[TMP9:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK10]], <1 x double> [[SPLAT_SPLAT12]], <1 x double> [[TMP7]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <1 x double> [[TMP9]], <1 x double> poison, <2 x i32> <i32 0, i32 undef>
 ; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP10]], <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[BLOCK13:%.*]] = shufflevector <2 x double> [[SPLIT]], <2 x double> poison, <1 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[SPLIT3]], i64 0
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT14:%.*]] = insertelement <1 x double> poison, double [[TMP12]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT15:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT14]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = fmul <1 x double> [[BLOCK13]], [[SPLAT_SPLAT15]]
+; CHECK-NEXT:    [[TMP13:%.*]] = fmul contract <1 x double> [[BLOCK13]], [[SPLAT_SPLAT15]]
 ; CHECK-NEXT:    [[BLOCK16:%.*]] = shufflevector <2 x double> [[SPLIT1]], <2 x double> poison, <1 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x double> [[SPLIT3]], i64 1
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT17:%.*]] = insertelement <1 x double> poison, double [[TMP14]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT18:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT17]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP15:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK16]], <1 x double> [[SPLAT_SPLAT18]], <1 x double> [[TMP13]])
+; CHECK-NEXT:    [[TMP15:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK16]], <1 x double> [[SPLAT_SPLAT18]], <1 x double> [[TMP13]])
 ; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <1 x double> [[TMP15]], <1 x double> poison, <2 x i32> <i32 0, i32 undef>
 ; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <2 x double> undef, <2 x double> [[TMP16]], <2 x i32> <i32 2, i32 1>
 ; CHECK-NEXT:    [[BLOCK19:%.*]] = shufflevector <2 x double> [[SPLIT]], <2 x double> poison, <1 x i32> <i32 1>
 ; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <2 x double> [[SPLIT3]], i64 0
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT20:%.*]] = insertelement <1 x double> poison, double [[TMP18]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT21:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT20]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP19:%.*]] = fmul <1 x double> [[BLOCK19]], [[SPLAT_SPLAT21]]
+; CHECK-NEXT:    [[TMP19:%.*]] = fmul contract <1 x double> [[BLOCK19]], [[SPLAT_SPLAT21]]
 ; CHECK-NEXT:    [[BLOCK22:%.*]] = shufflevector <2 x double> [[SPLIT1]], <2 x double> poison, <1 x i32> <i32 1>
 ; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x double> [[SPLIT3]], i64 1
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x double> poison, double [[TMP20]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT23]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP21:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK22]], <1 x double> [[SPLAT_SPLAT24]], <1 x double> [[TMP19]])
+; CHECK-NEXT:    [[TMP21:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK22]], <1 x double> [[SPLAT_SPLAT24]], <1 x double> [[TMP19]])
 ; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <1 x double> [[TMP21]], <1 x double> poison, <2 x i32> <i32 0, i32 undef>
 ; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <2 x double> [[TMP17]], <2 x double> [[TMP22]], <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> [[TMP23]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-contraction.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-contraction.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-contraction.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-contraction.ll
@@ -14,48 +14,48 @@
 ; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x double> [[SPLIT2]], i64 0
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> poison, double [[TMP0]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul <1 x double> [[BLOCK]], [[SPLAT_SPLAT]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul contract <1 x double> [[BLOCK]], [[SPLAT_SPLAT]]
 ; CHECK-NEXT:    [[BLOCK4:%.*]] = shufflevector <2 x double> [[SPLIT1]], <2 x double> poison, <1 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[SPLIT2]], i64 1
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT5:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT6:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT5]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK4]], <1 x double> [[SPLAT_SPLAT6]], <1 x double> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK4]], <1 x double> [[SPLAT_SPLAT6]], <1 x double> [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> poison, <2 x i32> <i32 0, i32 undef>
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> undef, <2 x double> [[TMP4]], <2 x i32> <i32 2, i32 1>
 ; CHECK-NEXT:    [[BLOCK7:%.*]] = shufflevector <2 x double> [[SPLIT]], <2 x double> poison, <1 x i32> <i32 1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[SPLIT2]], i64 0
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT8:%.*]] = insertelement <1 x double> poison, double [[TMP6]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT9:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT8]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = fmul <1 x double> [[BLOCK7]], [[SPLAT_SPLAT9]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul contract <1 x double> [[BLOCK7]], [[SPLAT_SPLAT9]]
 ; CHECK-NEXT:    [[BLOCK10:%.*]] = shufflevector <2 x double> [[SPLIT1]], <2 x double> poison, <1 x i32> <i32 1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[SPLIT2]], i64 1
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT11:%.*]] = insertelement <1 x double> poison, double [[TMP8]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT12:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT11]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP9:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK10]], <1 x double> [[SPLAT_SPLAT12]], <1 x double> [[TMP7]])
+; CHECK-NEXT:    [[TMP9:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK10]], <1 x double> [[SPLAT_SPLAT12]], <1 x double> [[TMP7]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <1 x double> [[TMP9]], <1 x double> poison, <2 x i32> <i32 0, i32 undef>
 ; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP10]], <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[BLOCK13:%.*]] = shufflevector <2 x double> [[SPLIT]], <2 x double> poison, <1 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[SPLIT3]], i64 0
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT14:%.*]] = insertelement <1 x double> poison, double [[TMP12]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT15:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT14]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = fmul <1 x double> [[BLOCK13]], [[SPLAT_SPLAT15]]
+; CHECK-NEXT:    [[TMP13:%.*]] = fmul contract <1 x double> [[BLOCK13]], [[SPLAT_SPLAT15]]
 ; CHECK-NEXT:    [[BLOCK16:%.*]] = shufflevector <2 x double> [[SPLIT1]], <2 x double> poison, <1 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x double> [[SPLIT3]], i64 1
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT17:%.*]] = insertelement <1 x double> poison, double [[TMP14]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT18:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT17]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP15:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK16]], <1 x double> [[SPLAT_SPLAT18]], <1 x double> [[TMP13]])
+; CHECK-NEXT:    [[TMP15:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK16]], <1 x double> [[SPLAT_SPLAT18]], <1 x double> [[TMP13]])
 ; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <1 x double> [[TMP15]], <1 x double> poison, <2 x i32> <i32 0, i32 undef>
 ; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <2 x double> undef, <2 x double> [[TMP16]], <2 x i32> <i32 2, i32 1>
 ; CHECK-NEXT:    [[BLOCK19:%.*]] = shufflevector <2 x double> [[SPLIT]], <2 x double> poison, <1 x i32> <i32 1>
 ; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <2 x double> [[SPLIT3]], i64 0
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT20:%.*]] = insertelement <1 x double> poison, double [[TMP18]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT21:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT20]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP19:%.*]] = fmul <1 x double> [[BLOCK19]], [[SPLAT_SPLAT21]]
+; CHECK-NEXT:    [[TMP19:%.*]] = fmul contract <1 x double> [[BLOCK19]], [[SPLAT_SPLAT21]]
 ; CHECK-NEXT:    [[BLOCK22:%.*]] = shufflevector <2 x double> [[SPLIT1]], <2 x double> poison, <1 x i32> <i32 1>
 ; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x double> [[SPLIT3]], i64 1
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x double> poison, double [[TMP20]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT23]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP21:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK22]], <1 x double> [[SPLAT_SPLAT24]], <1 x double> [[TMP19]])
+; CHECK-NEXT:    [[TMP21:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK22]], <1 x double> [[SPLAT_SPLAT24]], <1 x double> [[TMP19]])
 ; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <1 x double> [[TMP21]], <1 x double> poison, <2 x i32> <i32 0, i32 undef>
 ; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <2 x double> [[TMP17]], <2 x double> [[TMP22]], <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> [[TMP23]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float-contraction-fmf.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float-contraction-fmf.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float-contraction-fmf.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float-contraction-fmf.ll
@@ -14,48 +14,48 @@
 ; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x float> [[SPLIT2]], i64 0
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x float> poison, float [[TMP0]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul <1 x float> [[BLOCK]], [[SPLAT_SPLAT]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul contract <1 x float> [[BLOCK]], [[SPLAT_SPLAT]]
 ; CHECK-NEXT:    [[BLOCK4:%.*]] = shufflevector <2 x float> [[SPLIT1]], <2 x float> poison, <1 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[SPLIT2]], i64 1
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT5:%.*]] = insertelement <1 x float> poison, float [[TMP2]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT6:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT5]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = call <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK4]], <1 x float> [[SPLAT_SPLAT6]], <1 x float> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call contract <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK4]], <1 x float> [[SPLAT_SPLAT6]], <1 x float> [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <1 x float> [[TMP3]], <1 x float> poison, <2 x i32> <i32 0, i32 undef>
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> undef, <2 x float> [[TMP4]], <2 x i32> <i32 2, i32 1>
 ; CHECK-NEXT:    [[BLOCK7:%.*]] = shufflevector <2 x float> [[SPLIT]], <2 x float> poison, <1 x i32> <i32 1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[SPLIT2]], i64 0
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT8:%.*]] = insertelement <1 x float> poison, float [[TMP6]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT9:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT8]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = fmul <1 x float> [[BLOCK7]], [[SPLAT_SPLAT9]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul contract <1 x float> [[BLOCK7]], [[SPLAT_SPLAT9]]
 ; CHECK-NEXT:    [[BLOCK10:%.*]] = shufflevector <2 x float> [[SPLIT1]], <2 x float> poison, <1 x i32> <i32 1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x float> [[SPLIT2]], i64 1
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT11:%.*]] = insertelement <1 x float> poison, float [[TMP8]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT12:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT11]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP9:%.*]] = call <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK10]], <1 x float> [[SPLAT_SPLAT12]], <1 x float> [[TMP7]])
+; CHECK-NEXT:    [[TMP9:%.*]] = call contract <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK10]], <1 x float> [[SPLAT_SPLAT12]], <1 x float> [[TMP7]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <1 x float> [[TMP9]], <1 x float> poison, <2 x i32> <i32 0, i32 undef>
 ; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP10]], <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[BLOCK13:%.*]] = shufflevector <2 x float> [[SPLIT]], <2 x float> poison, <1 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x float> [[SPLIT3]], i64 0
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT14:%.*]] = insertelement <1 x float> poison, float [[TMP12]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT15:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT14]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = fmul <1 x float> [[BLOCK13]], [[SPLAT_SPLAT15]]
+; CHECK-NEXT:    [[TMP13:%.*]] = fmul contract <1 x float> [[BLOCK13]], [[SPLAT_SPLAT15]]
 ; CHECK-NEXT:    [[BLOCK16:%.*]] = shufflevector <2 x float> [[SPLIT1]], <2 x float> poison, <1 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x float> [[SPLIT3]], i64 1
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT17:%.*]] = insertelement <1 x float> poison, float [[TMP14]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT18:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT17]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP15:%.*]] = call <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK16]], <1 x float> [[SPLAT_SPLAT18]], <1 x float> [[TMP13]])
+; CHECK-NEXT:    [[TMP15:%.*]] = call contract <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK16]], <1 x float> [[SPLAT_SPLAT18]], <1 x float> [[TMP13]])
 ; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <1 x float> [[TMP15]], <1 x float> poison, <2 x i32> <i32 0, i32 undef>
 ; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <2 x float> undef, <2 x float> [[TMP16]], <2 x i32> <i32 2, i32 1>
 ; CHECK-NEXT:    [[BLOCK19:%.*]] = shufflevector <2 x float> [[SPLIT]], <2 x float> poison, <1 x i32> <i32 1>
 ; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <2 x float> [[SPLIT3]], i64 0
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT20:%.*]] = insertelement <1 x float> poison, float [[TMP18]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT21:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT20]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP19:%.*]] = fmul <1 x float> [[BLOCK19]], [[SPLAT_SPLAT21]]
+; CHECK-NEXT:    [[TMP19:%.*]] = fmul contract <1 x float> [[BLOCK19]], [[SPLAT_SPLAT21]]
 ; CHECK-NEXT:    [[BLOCK22:%.*]] = shufflevector <2 x float> [[SPLIT1]], <2 x float> poison, <1 x i32> <i32 1>
 ; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x float> [[SPLIT3]], i64 1
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x float> poison, float [[TMP20]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT23]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP21:%.*]] = call <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK22]], <1 x float> [[SPLAT_SPLAT24]], <1 x float> [[TMP19]])
+; CHECK-NEXT:    [[TMP21:%.*]] = call contract <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK22]], <1 x float> [[SPLAT_SPLAT24]], <1 x float> [[TMP19]])
 ; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <1 x float> [[TMP21]], <1 x float> poison, <2 x i32> <i32 0, i32 undef>
 ; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <2 x float> [[TMP17]], <2 x float> [[TMP22]], <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> [[TMP23]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float-contraction.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float-contraction.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float-contraction.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float-contraction.ll
@@ -14,48 +14,48 @@
 ; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x float> [[SPLIT2]], i64 0
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x float> poison, float [[TMP0]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul <1 x float> [[BLOCK]], [[SPLAT_SPLAT]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul contract <1 x float> [[BLOCK]], [[SPLAT_SPLAT]]
 ; CHECK-NEXT:    [[BLOCK4:%.*]] = shufflevector <2 x float> [[SPLIT1]], <2 x float> poison, <1 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[SPLIT2]], i64 1
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT5:%.*]] = insertelement <1 x float> poison, float [[TMP2]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT6:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT5]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = call <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK4]], <1 x float> [[SPLAT_SPLAT6]], <1 x float> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call contract <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK4]], <1 x float> [[SPLAT_SPLAT6]], <1 x float> [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <1 x float> [[TMP3]], <1 x float> poison, <2 x i32> <i32 0, i32 undef>
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> undef, <2 x float> [[TMP4]], <2 x i32> <i32 2, i32 1>
 ; CHECK-NEXT:    [[BLOCK7:%.*]] = shufflevector <2 x float> [[SPLIT]], <2 x float> poison, <1 x i32> <i32 1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[SPLIT2]], i64 0
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT8:%.*]] = insertelement <1 x float> poison, float [[TMP6]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT9:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT8]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = fmul <1 x float> [[BLOCK7]], [[SPLAT_SPLAT9]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul contract <1 x float> [[BLOCK7]], [[SPLAT_SPLAT9]]
 ; CHECK-NEXT:    [[BLOCK10:%.*]] = shufflevector <2 x float> [[SPLIT1]], <2 x float> poison, <1 x i32> <i32 1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x float> [[SPLIT2]], i64 1
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT11:%.*]] = insertelement <1 x float> poison, float [[TMP8]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT12:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT11]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP9:%.*]] = call <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK10]], <1 x float> [[SPLAT_SPLAT12]], <1 x float> [[TMP7]])
+; CHECK-NEXT:    [[TMP9:%.*]] = call contract <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK10]], <1 x float> [[SPLAT_SPLAT12]], <1 x float> [[TMP7]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <1 x float> [[TMP9]], <1 x float> poison, <2 x i32> <i32 0, i32 undef>
 ; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP10]], <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[BLOCK13:%.*]] = shufflevector <2 x float> [[SPLIT]], <2 x float> poison, <1 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x float> [[SPLIT3]], i64 0
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT14:%.*]] = insertelement <1 x float> poison, float [[TMP12]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT15:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT14]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = fmul <1 x float> [[BLOCK13]], [[SPLAT_SPLAT15]]
+; CHECK-NEXT:    [[TMP13:%.*]] = fmul contract <1 x float> [[BLOCK13]], [[SPLAT_SPLAT15]]
 ; CHECK-NEXT:    [[BLOCK16:%.*]] = shufflevector <2 x float> [[SPLIT1]], <2 x float> poison, <1 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x float> [[SPLIT3]], i64 1
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT17:%.*]] = insertelement <1 x float> poison, float [[TMP14]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT18:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT17]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP15:%.*]] = call <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK16]], <1 x float> [[SPLAT_SPLAT18]], <1 x float> [[TMP13]])
+; CHECK-NEXT:    [[TMP15:%.*]] = call contract <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK16]], <1 x float> [[SPLAT_SPLAT18]], <1 x float> [[TMP13]])
 ; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <1 x float> [[TMP15]], <1 x float> poison, <2 x i32> <i32 0, i32 undef>
 ; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <2 x float> undef, <2 x float> [[TMP16]], <2 x i32> <i32 2, i32 1>
 ; CHECK-NEXT:    [[BLOCK19:%.*]] = shufflevector <2 x float> [[SPLIT]], <2 x float> poison, <1 x i32> <i32 1>
 ; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <2 x float> [[SPLIT3]], i64 0
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT20:%.*]] = insertelement <1 x float> poison, float [[TMP18]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT21:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT20]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP19:%.*]] = fmul <1 x float> [[BLOCK19]], [[SPLAT_SPLAT21]]
+; CHECK-NEXT:    [[TMP19:%.*]] = fmul contract <1 x float> [[BLOCK19]], [[SPLAT_SPLAT21]]
 ; CHECK-NEXT:    [[BLOCK22:%.*]] = shufflevector <2 x float> [[SPLIT1]], <2 x float> poison, <1 x i32> <i32 1>
 ; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x float> [[SPLIT3]], i64 1
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x float> poison, float [[TMP20]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT23]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP21:%.*]] = call <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK22]], <1 x float> [[SPLAT_SPLAT24]], <1 x float> [[TMP19]])
+; CHECK-NEXT:    [[TMP21:%.*]] = call contract <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK22]], <1 x float> [[SPLAT_SPLAT24]], <1 x float> [[TMP19]])
 ; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <1 x float> [[TMP21]], <1 x float> poison, <2 x i32> <i32 0, i32 undef>
 ; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <2 x float> [[TMP17]], <2 x float> [[TMP22]], <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> [[TMP23]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops.ll
@@ -43,18 +43,18 @@
 ; CHECK-NEXT:    [[VEC_CAST7:%.*]] = bitcast double* [[VEC_GEP6]] to <2 x double>*
 ; CHECK-NEXT:    [[COL_LOAD8:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST7]], align 8
 ; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD5]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD]], <2 x double> [[SPLAT_SPLAT]], <2 x double> [[RESULT_VEC_0]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD]], <2 x double> [[SPLAT_SPLAT]], <2 x double> [[RESULT_VEC_0]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT12:%.*]] = shufflevector <2 x double> [[COL_LOAD5]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP7]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD2]], <2 x double> [[SPLAT_SPLAT12]], <2 x double> [[TMP6]])
+; CHECK-NEXT:    [[TMP7]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD2]], <2 x double> [[SPLAT_SPLAT12]], <2 x double> [[TMP6]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT16:%.*]] = shufflevector <2 x double> [[COL_LOAD8]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP8:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD]], <2 x double> [[SPLAT_SPLAT16]], <2 x double> [[RESULT_VEC_1]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD]], <2 x double> [[SPLAT_SPLAT16]], <2 x double> [[RESULT_VEC_1]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT19:%.*]] = shufflevector <2 x double> [[COL_LOAD8]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP9]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD2]], <2 x double> [[SPLAT_SPLAT19]], <2 x double> [[TMP8]])
+; CHECK-NEXT:    [[TMP9]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD2]], <2 x double> [[SPLAT_SPLAT19]], <2 x double> [[TMP8]])
 ; CHECK-NEXT:    br label [[INNER_LATCH]]
 ; CHECK:       inner.latch:
 ; CHECK-NEXT:    [[INNER_STEP]] = add i64 [[INNER_IV]], 2
 ; CHECK-NEXT:    [[INNER_COND_NOT:%.*]] = icmp eq i64 [[INNER_STEP]], 4
-; CHECK-NEXT:    br i1 [[INNER_COND_NOT]], label [[ROWS_LATCH]], label [[INNER_HEADER]], [[LOOP0:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[INNER_COND_NOT]], label [[ROWS_LATCH]], label [[INNER_HEADER]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       rows.latch:
 ; CHECK-NEXT:    [[ROWS_STEP]] = add i64 [[ROWS_IV]], 2
 ; CHECK-NEXT:    [[ROWS_COND_NOT:%.*]] = icmp eq i64 [[ROWS_STEP]], 4
@@ -140,7 +140,7 @@
 ; CHECK:       inner.latch:
 ; CHECK-NEXT:    [[INNER_STEP]] = add i64 [[INNER_IV]], 2
 ; CHECK-NEXT:    [[INNER_COND_NOT:%.*]] = icmp eq i64 [[INNER_STEP]], 4
-; CHECK-NEXT:    br i1 [[INNER_COND_NOT]], label [[ROWS_LATCH]], label [[INNER_HEADER]], [[LOOP2:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[INNER_COND_NOT]], label [[ROWS_LATCH]], label [[INNER_HEADER]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       rows.latch:
 ; CHECK-NEXT:    [[ROWS_STEP]] = add i64 [[ROWS_IV]], 2
 ; CHECK-NEXT:    [[ROWS_COND_NOT:%.*]] = icmp eq i64 [[ROWS_IV]], 0
@@ -232,7 +232,7 @@
 ; CHECK:       inner.latch:
 ; CHECK-NEXT:    [[INNER_STEP]] = add i64 [[INNER_IV]], 2
 ; CHECK-NEXT:    [[INNER_COND_NOT:%.*]] = icmp eq i64 [[INNER_IV]], 0
-; CHECK-NEXT:    br i1 [[INNER_COND_NOT]], label [[ROWS_LATCH]], label [[INNER_HEADER]], [[LOOP3:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[INNER_COND_NOT]], label [[ROWS_LATCH]], label [[INNER_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       rows.latch:
 ; CHECK-NEXT:    [[ROWS_STEP]] = add i64 [[ROWS_IV]], 2
 ; CHECK-NEXT:    [[ROWS_COND_NOT:%.*]] = icmp eq i64 [[ROWS_STEP]], 4
@@ -344,18 +344,18 @@
 ; CHECK-NEXT:    [[VEC_CAST14:%.*]] = bitcast float* [[VEC_GEP13]] to <2 x float>*
 ; CHECK-NEXT:    [[COL_LOAD15:%.*]] = load <2 x float>, <2 x float>* [[VEC_CAST14]], align 4
 ; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x float> [[COL_LOAD12]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP18:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[COL_LOAD]], <2 x float> [[SPLAT_SPLAT]], <2 x float> [[RESULT_VEC_0]])
+; CHECK-NEXT:    [[TMP18:%.*]] = call contract <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[COL_LOAD]], <2 x float> [[SPLAT_SPLAT]], <2 x float> [[RESULT_VEC_0]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT19:%.*]] = shufflevector <2 x float> [[COL_LOAD12]], <2 x float> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP19]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[COL_LOAD9]], <2 x float> [[SPLAT_SPLAT19]], <2 x float> [[TMP18]])
+; CHECK-NEXT:    [[TMP19]] = call contract <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[COL_LOAD9]], <2 x float> [[SPLAT_SPLAT19]], <2 x float> [[TMP18]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT23:%.*]] = shufflevector <2 x float> [[COL_LOAD15]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP20:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[COL_LOAD]], <2 x float> [[SPLAT_SPLAT23]], <2 x float> [[RESULT_VEC_1]])
+; CHECK-NEXT:    [[TMP20:%.*]] = call contract <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[COL_LOAD]], <2 x float> [[SPLAT_SPLAT23]], <2 x float> [[RESULT_VEC_1]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT26:%.*]] = shufflevector <2 x float> [[COL_LOAD15]], <2 x float> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP21]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[COL_LOAD9]], <2 x float> [[SPLAT_SPLAT26]], <2 x float> [[TMP20]])
+; CHECK-NEXT:    [[TMP21]] = call contract <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[COL_LOAD9]], <2 x float> [[SPLAT_SPLAT26]], <2 x float> [[TMP20]])
 ; CHECK-NEXT:    br label [[INNER_LATCH]]
 ; CHECK:       inner.latch:
 ; CHECK-NEXT:    [[INNER_STEP]] = add i64 [[INNER_IV]], 2
 ; CHECK-NEXT:    [[INNER_COND_NOT:%.*]] = icmp eq i64 [[INNER_IV]], 0
-; CHECK-NEXT:    br i1 [[INNER_COND_NOT]], label [[ROWS_LATCH]], label [[INNER_HEADER]], [[LOOP5:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[INNER_COND_NOT]], label [[ROWS_LATCH]], label [[INNER_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       rows.latch:
 ; CHECK-NEXT:    [[ROWS_STEP]] = add i64 [[ROWS_IV]], 2
 ; CHECK-NEXT:    [[ROWS_COND_NOT:%.*]] = icmp eq i64 [[ROWS_IV]], 0
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-multiple-blocks.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-multiple-blocks.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-multiple-blocks.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-multiple-blocks.ll
@@ -67,13 +67,13 @@
 ; CHECK-NEXT:    [[VEC_CAST14:%.*]] = bitcast double* [[VEC_GEP13]] to <2 x double>*
 ; CHECK-NEXT:    [[COL_LOAD15:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST14]], align 8
 ; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD12]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP12:%.*]] = fmul <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
+; CHECK-NEXT:    [[TMP12:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
 ; CHECK-NEXT:    [[SPLAT_SPLAT18:%.*]] = shufflevector <2 x double> [[COL_LOAD12]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP13:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD9]], <2 x double> [[SPLAT_SPLAT18]], <2 x double> [[TMP12]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD9]], <2 x double> [[SPLAT_SPLAT18]], <2 x double> [[TMP12]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT21:%.*]] = shufflevector <2 x double> [[COL_LOAD15]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP14:%.*]] = fmul <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT21]]
+; CHECK-NEXT:    [[TMP14:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT21]]
 ; CHECK-NEXT:    [[SPLAT_SPLAT24:%.*]] = shufflevector <2 x double> [[COL_LOAD15]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP15:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD9]], <2 x double> [[SPLAT_SPLAT24]], <2 x double> [[TMP14]])
+; CHECK-NEXT:    [[TMP15:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD9]], <2 x double> [[SPLAT_SPLAT24]], <2 x double> [[TMP14]])
 ; CHECK-NEXT:    [[VEC_CAST26:%.*]] = bitcast <9 x double>* [[C]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP13]], <2 x double>* [[VEC_CAST26]], align 8
 ; CHECK-NEXT:    [[VEC_GEP27:%.*]] = getelementptr <9 x double>, <9 x double>* [[C]], i64 0, i64 3
@@ -91,13 +91,13 @@
 ; CHECK-NEXT:    [[VEC_CAST39:%.*]] = bitcast double* [[VEC_GEP38]] to <2 x double>*
 ; CHECK-NEXT:    [[COL_LOAD40:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST39]], align 8
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT42:%.*]] = shufflevector <2 x double> [[COL_LOAD37]], <2 x double> undef, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP17:%.*]] = fmul <1 x double> [[COL_LOAD31]], [[SPLAT_SPLATINSERT42]]
+; CHECK-NEXT:    [[TMP17:%.*]] = fmul contract <1 x double> [[COL_LOAD31]], [[SPLAT_SPLATINSERT42]]
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT45:%.*]] = shufflevector <2 x double> [[COL_LOAD37]], <2 x double> undef, <1 x i32> <i32 1>
-; CHECK-NEXT:    [[TMP18:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD34]], <1 x double> [[SPLAT_SPLATINSERT45]], <1 x double> [[TMP17]])
+; CHECK-NEXT:    [[TMP18:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD34]], <1 x double> [[SPLAT_SPLATINSERT45]], <1 x double> [[TMP17]])
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT48:%.*]] = shufflevector <2 x double> [[COL_LOAD40]], <2 x double> undef, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP19:%.*]] = fmul <1 x double> [[COL_LOAD31]], [[SPLAT_SPLATINSERT48]]
+; CHECK-NEXT:    [[TMP19:%.*]] = fmul contract <1 x double> [[COL_LOAD31]], [[SPLAT_SPLATINSERT48]]
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT51:%.*]] = shufflevector <2 x double> [[COL_LOAD40]], <2 x double> undef, <1 x i32> <i32 1>
-; CHECK-NEXT:    [[TMP20:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD34]], <1 x double> [[SPLAT_SPLATINSERT51]], <1 x double> [[TMP19]])
+; CHECK-NEXT:    [[TMP20:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD34]], <1 x double> [[SPLAT_SPLATINSERT51]], <1 x double> [[TMP19]])
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr <9 x double>, <9 x double>* [[C]], i64 0, i64 2
 ; CHECK-NEXT:    [[VEC_CAST54:%.*]] = bitcast double* [[TMP21]] to <1 x double>*
 ; CHECK-NEXT:    store <1 x double> [[TMP18]], <1 x double>* [[VEC_CAST54]], align 8
@@ -113,9 +113,9 @@
 ; CHECK-NEXT:    [[VEC_CAST64:%.*]] = bitcast double* [[TMP22]] to <2 x double>*
 ; CHECK-NEXT:    [[COL_LOAD65:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST64]], align 8
 ; CHECK-NEXT:    [[SPLAT_SPLAT68:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP23:%.*]] = fmul <2 x double> [[COL_LOAD59]], [[SPLAT_SPLAT68]]
+; CHECK-NEXT:    [[TMP23:%.*]] = fmul contract <2 x double> [[COL_LOAD59]], [[SPLAT_SPLAT68]]
 ; CHECK-NEXT:    [[SPLAT_SPLAT71:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP24:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT71]], <2 x double> [[TMP23]])
+; CHECK-NEXT:    [[TMP24:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT71]], <2 x double> [[TMP23]])
 ; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr <9 x double>, <9 x double>* [[C]], i64 0, i64 6
 ; CHECK-NEXT:    [[VEC_CAST73:%.*]] = bitcast double* [[TMP25]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP24]], <2 x double>* [[VEC_CAST73]], align 8
@@ -129,16 +129,16 @@
 ; CHECK-NEXT:    [[VEC_CAST81:%.*]] = bitcast double* [[TMP27]] to <2 x double>*
 ; CHECK-NEXT:    [[COL_LOAD82:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST81]], align 8
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT84:%.*]] = shufflevector <2 x double> [[COL_LOAD82]], <2 x double> undef, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP28:%.*]] = fmul <1 x double> [[COL_LOAD76]], [[SPLAT_SPLATINSERT84]]
+; CHECK-NEXT:    [[TMP28:%.*]] = fmul contract <1 x double> [[COL_LOAD76]], [[SPLAT_SPLATINSERT84]]
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT87:%.*]] = shufflevector <2 x double> [[COL_LOAD82]], <2 x double> undef, <1 x i32> <i32 1>
-; CHECK-NEXT:    [[TMP29:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD79]], <1 x double> [[SPLAT_SPLATINSERT87]], <1 x double> [[TMP28]])
+; CHECK-NEXT:    [[TMP29:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD79]], <1 x double> [[SPLAT_SPLATINSERT87]], <1 x double> [[TMP28]])
 ; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr <9 x double>, <9 x double>* [[C]], i64 0, i64 8
 ; CHECK-NEXT:    [[VEC_CAST90:%.*]] = bitcast double* [[TMP30]] to <1 x double>*
 ; CHECK-NEXT:    store <1 x double> [[TMP29]], <1 x double>* [[VEC_CAST90]], align 8
 ; CHECK-NEXT:    br i1 [[COND:%.*]], label [[TRUE:%.*]], label [[FALSE:%.*]]
 ; CHECK:       true:
-; CHECK-NEXT:    [[TMP31:%.*]] = fadd <3 x double> [[COL_LOAD196]], [[COL_LOAD196]]
-; CHECK-NEXT:    [[TMP32:%.*]] = fadd <3 x double> [[COL_LOAD199]], [[COL_LOAD199]]
+; CHECK-NEXT:    [[TMP31:%.*]] = fadd contract <3 x double> [[COL_LOAD196]], [[COL_LOAD196]]
+; CHECK-NEXT:    [[TMP32:%.*]] = fadd contract <3 x double> [[COL_LOAD199]], [[COL_LOAD199]]
 ; CHECK-NEXT:    [[VEC_CAST213:%.*]] = bitcast <6 x double>* [[A]] to <3 x double>*
 ; CHECK-NEXT:    store <3 x double> [[TMP31]], <3 x double>* [[VEC_CAST213]], align 8
 ; CHECK-NEXT:    [[VEC_GEP214:%.*]] = getelementptr <6 x double>, <6 x double>* [[A]], i64 0, i64 3
@@ -146,9 +146,9 @@
 ; CHECK-NEXT:    store <3 x double> [[TMP32]], <3 x double>* [[VEC_CAST215]], align 8
 ; CHECK-NEXT:    br label [[END:%.*]]
 ; CHECK:       false:
-; CHECK-NEXT:    [[TMP33:%.*]] = fadd <2 x double> [[COL_LOAD201]], [[COL_LOAD201]]
-; CHECK-NEXT:    [[TMP34:%.*]] = fadd <2 x double> [[COL_LOAD204]], [[COL_LOAD204]]
-; CHECK-NEXT:    [[TMP35:%.*]] = fadd <2 x double> [[COL_LOAD207]], [[COL_LOAD207]]
+; CHECK-NEXT:    [[TMP33:%.*]] = fadd contract <2 x double> [[COL_LOAD201]], [[COL_LOAD201]]
+; CHECK-NEXT:    [[TMP34:%.*]] = fadd contract <2 x double> [[COL_LOAD204]], [[COL_LOAD204]]
+; CHECK-NEXT:    [[TMP35:%.*]] = fadd contract <2 x double> [[COL_LOAD207]], [[COL_LOAD207]]
 ; CHECK-NEXT:    [[VEC_CAST208:%.*]] = bitcast <6 x double>* [[B]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP33]], <2 x double>* [[VEC_CAST208]], align 8
 ; CHECK-NEXT:    [[VEC_GEP209:%.*]] = getelementptr <6 x double>, <6 x double>* [[B]], i64 0, i64 2
@@ -204,13 +204,13 @@
 ; CHECK-NEXT:    [[VEC_CAST115:%.*]] = bitcast double* [[VEC_GEP114]] to <2 x double>*
 ; CHECK-NEXT:    [[COL_LOAD116:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST115]], align 8
 ; CHECK-NEXT:    [[SPLAT_SPLAT119:%.*]] = shufflevector <2 x double> [[COL_LOAD113]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP48:%.*]] = fmul <2 x double> [[COL_LOAD107]], [[SPLAT_SPLAT119]]
+; CHECK-NEXT:    [[TMP48:%.*]] = fmul contract <2 x double> [[COL_LOAD107]], [[SPLAT_SPLAT119]]
 ; CHECK-NEXT:    [[SPLAT_SPLAT122:%.*]] = shufflevector <2 x double> [[COL_LOAD113]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP49:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD110]], <2 x double> [[SPLAT_SPLAT122]], <2 x double> [[TMP48]])
+; CHECK-NEXT:    [[TMP49:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD110]], <2 x double> [[SPLAT_SPLAT122]], <2 x double> [[TMP48]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT125:%.*]] = shufflevector <2 x double> [[COL_LOAD116]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP50:%.*]] = fmul <2 x double> [[COL_LOAD107]], [[SPLAT_SPLAT125]]
+; CHECK-NEXT:    [[TMP50:%.*]] = fmul contract <2 x double> [[COL_LOAD107]], [[SPLAT_SPLAT125]]
 ; CHECK-NEXT:    [[SPLAT_SPLAT128:%.*]] = shufflevector <2 x double> [[COL_LOAD116]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP51:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD110]], <2 x double> [[SPLAT_SPLAT128]], <2 x double> [[TMP50]])
+; CHECK-NEXT:    [[TMP51:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD110]], <2 x double> [[SPLAT_SPLAT128]], <2 x double> [[TMP50]])
 ; CHECK-NEXT:    [[VEC_CAST130:%.*]] = bitcast <9 x double>* [[C]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP49]], <2 x double>* [[VEC_CAST130]], align 8
 ; CHECK-NEXT:    [[VEC_GEP131:%.*]] = getelementptr <9 x double>, <9 x double>* [[C]], i64 0, i64 3
@@ -228,13 +228,13 @@
 ; CHECK-NEXT:    [[VEC_CAST143:%.*]] = bitcast double* [[VEC_GEP142]] to <2 x double>*
 ; CHECK-NEXT:    [[COL_LOAD144:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST143]], align 8
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT146:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> undef, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP53:%.*]] = fmul <1 x double> [[COL_LOAD135]], [[SPLAT_SPLATINSERT146]]
+; CHECK-NEXT:    [[TMP53:%.*]] = fmul contract <1 x double> [[COL_LOAD135]], [[SPLAT_SPLATINSERT146]]
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT149:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> undef, <1 x i32> <i32 1>
-; CHECK-NEXT:    [[TMP54:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD138]], <1 x double> [[SPLAT_SPLATINSERT149]], <1 x double> [[TMP53]])
+; CHECK-NEXT:    [[TMP54:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD138]], <1 x double> [[SPLAT_SPLATINSERT149]], <1 x double> [[TMP53]])
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT152:%.*]] = shufflevector <2 x double> [[COL_LOAD144]], <2 x double> undef, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP55:%.*]] = fmul <1 x double> [[COL_LOAD135]], [[SPLAT_SPLATINSERT152]]
+; CHECK-NEXT:    [[TMP55:%.*]] = fmul contract <1 x double> [[COL_LOAD135]], [[SPLAT_SPLATINSERT152]]
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT155:%.*]] = shufflevector <2 x double> [[COL_LOAD144]], <2 x double> undef, <1 x i32> <i32 1>
-; CHECK-NEXT:    [[TMP56:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD138]], <1 x double> [[SPLAT_SPLATINSERT155]], <1 x double> [[TMP55]])
+; CHECK-NEXT:    [[TMP56:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD138]], <1 x double> [[SPLAT_SPLATINSERT155]], <1 x double> [[TMP55]])
 ; CHECK-NEXT:    [[TMP57:%.*]] = getelementptr <9 x double>, <9 x double>* [[C]], i64 0, i64 2
 ; CHECK-NEXT:    [[VEC_CAST158:%.*]] = bitcast double* [[TMP57]] to <1 x double>*
 ; CHECK-NEXT:    store <1 x double> [[TMP54]], <1 x double>* [[VEC_CAST158]], align 8
@@ -250,9 +250,9 @@
 ; CHECK-NEXT:    [[VEC_CAST168:%.*]] = bitcast double* [[TMP58]] to <2 x double>*
 ; CHECK-NEXT:    [[COL_LOAD169:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST168]], align 8
 ; CHECK-NEXT:    [[SPLAT_SPLAT172:%.*]] = shufflevector <2 x double> [[COL_LOAD169]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP59:%.*]] = fmul <2 x double> [[COL_LOAD163]], [[SPLAT_SPLAT172]]
+; CHECK-NEXT:    [[TMP59:%.*]] = fmul contract <2 x double> [[COL_LOAD163]], [[SPLAT_SPLAT172]]
 ; CHECK-NEXT:    [[SPLAT_SPLAT175:%.*]] = shufflevector <2 x double> [[COL_LOAD169]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP60:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD166]], <2 x double> [[SPLAT_SPLAT175]], <2 x double> [[TMP59]])
+; CHECK-NEXT:    [[TMP60:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD166]], <2 x double> [[SPLAT_SPLAT175]], <2 x double> [[TMP59]])
 ; CHECK-NEXT:    [[TMP61:%.*]] = getelementptr <9 x double>, <9 x double>* [[C]], i64 0, i64 6
 ; CHECK-NEXT:    [[VEC_CAST177:%.*]] = bitcast double* [[TMP61]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP60]], <2 x double>* [[VEC_CAST177]], align 8
@@ -266,9 +266,9 @@
 ; CHECK-NEXT:    [[VEC_CAST185:%.*]] = bitcast double* [[TMP63]] to <2 x double>*
 ; CHECK-NEXT:    [[COL_LOAD186:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST185]], align 8
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT188:%.*]] = shufflevector <2 x double> [[COL_LOAD186]], <2 x double> undef, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP64:%.*]] = fmul <1 x double> [[COL_LOAD180]], [[SPLAT_SPLATINSERT188]]
+; CHECK-NEXT:    [[TMP64:%.*]] = fmul contract <1 x double> [[COL_LOAD180]], [[SPLAT_SPLATINSERT188]]
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT191:%.*]] = shufflevector <2 x double> [[COL_LOAD186]], <2 x double> undef, <1 x i32> <i32 1>
-; CHECK-NEXT:    [[TMP65:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD183]], <1 x double> [[SPLAT_SPLATINSERT191]], <1 x double> [[TMP64]])
+; CHECK-NEXT:    [[TMP65:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD183]], <1 x double> [[SPLAT_SPLATINSERT191]], <1 x double> [[TMP64]])
 ; CHECK-NEXT:    [[TMP66:%.*]] = getelementptr <9 x double>, <9 x double>* [[C]], i64 0, i64 8
 ; CHECK-NEXT:    [[VEC_CAST194:%.*]] = bitcast double* [[TMP66]] to <1 x double>*
 ; CHECK-NEXT:    store <1 x double> [[TMP65]], <1 x double>* [[VEC_CAST194]], align 8
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-volatile.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-volatile.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-volatile.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-volatile.ll
@@ -53,12 +53,12 @@
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[COL_LOAD5]], i64 0
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP10]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK9]], <2 x double> [[SPLAT_SPLAT]], <2 x double> [[BLOCK]])
+; CHECK-NEXT:    [[TMP11:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK9]], <2 x double> [[SPLAT_SPLAT]], <2 x double> [[BLOCK]])
 ; CHECK-NEXT:    [[BLOCK10:%.*]] = shufflevector <2 x double> [[COL_LOAD2]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[COL_LOAD5]], i64 1
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT11:%.*]] = insertelement <2 x double> poison, double [[TMP12]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT12:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT11]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK10]], <2 x double> [[SPLAT_SPLAT12]], <2 x double> [[TMP11]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK10]], <2 x double> [[SPLAT_SPLAT12]], <2 x double> [[TMP11]])
 ; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <2 x double> [[TMP13]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP15]] = shufflevector <2 x double> [[RESULT_VEC_0]], <2 x double> [[TMP14]], <2 x i32> <i32 2, i32 3>
 ; CHECK-NEXT:    [[BLOCK13:%.*]] = shufflevector <2 x double> [[RESULT_VEC_1]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
@@ -66,19 +66,19 @@
 ; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x double> [[COL_LOAD8]], i64 0
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT15:%.*]] = insertelement <2 x double> poison, double [[TMP16]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT16:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT15]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP17:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK14]], <2 x double> [[SPLAT_SPLAT16]], <2 x double> [[BLOCK13]])
+; CHECK-NEXT:    [[TMP17:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK14]], <2 x double> [[SPLAT_SPLAT16]], <2 x double> [[BLOCK13]])
 ; CHECK-NEXT:    [[BLOCK17:%.*]] = shufflevector <2 x double> [[COL_LOAD2]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <2 x double> [[COL_LOAD8]], i64 1
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT18:%.*]] = insertelement <2 x double> poison, double [[TMP18]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT19:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT18]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP19:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK17]], <2 x double> [[SPLAT_SPLAT19]], <2 x double> [[TMP17]])
+; CHECK-NEXT:    [[TMP19:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK17]], <2 x double> [[SPLAT_SPLAT19]], <2 x double> [[TMP17]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP21]] = shufflevector <2 x double> [[RESULT_VEC_1]], <2 x double> [[TMP20]], <2 x i32> <i32 2, i32 3>
 ; CHECK-NEXT:    br label [[INNER_LATCH]]
 ; CHECK:       inner.latch:
 ; CHECK-NEXT:    [[INNER_STEP]] = add i64 [[INNER_IV]], 2
 ; CHECK-NEXT:    [[INNER_COND:%.*]] = icmp ne i64 [[INNER_STEP]], 2
-; CHECK-NEXT:    br i1 [[INNER_COND]], label [[INNER_HEADER]], label [[ROWS_LATCH]], [[LOOP0:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[INNER_COND]], label [[INNER_HEADER]], label [[ROWS_LATCH]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       rows.latch:
 ; CHECK-NEXT:    [[ROWS_STEP]] = add i64 [[ROWS_IV]], 2
 ; CHECK-NEXT:    [[ROWS_COND:%.*]] = icmp ne i64 [[ROWS_STEP]], 2
@@ -161,12 +161,12 @@
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[COL_LOAD5]], i64 0
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP10]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK9]], <2 x double> [[SPLAT_SPLAT]], <2 x double> [[BLOCK]])
+; CHECK-NEXT:    [[TMP11:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK9]], <2 x double> [[SPLAT_SPLAT]], <2 x double> [[BLOCK]])
 ; CHECK-NEXT:    [[BLOCK10:%.*]] = shufflevector <2 x double> [[COL_LOAD2]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[COL_LOAD5]], i64 1
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT11:%.*]] = insertelement <2 x double> poison, double [[TMP12]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT12:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT11]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK10]], <2 x double> [[SPLAT_SPLAT12]], <2 x double> [[TMP11]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK10]], <2 x double> [[SPLAT_SPLAT12]], <2 x double> [[TMP11]])
 ; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <2 x double> [[TMP13]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP15]] = shufflevector <2 x double> [[RESULT_VEC_0]], <2 x double> [[TMP14]], <2 x i32> <i32 2, i32 3>
 ; CHECK-NEXT:    [[BLOCK13:%.*]] = shufflevector <2 x double> [[RESULT_VEC_1]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
@@ -174,19 +174,19 @@
 ; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x double> [[COL_LOAD8]], i64 0
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT15:%.*]] = insertelement <2 x double> poison, double [[TMP16]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT16:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT15]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP17:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK14]], <2 x double> [[SPLAT_SPLAT16]], <2 x double> [[BLOCK13]])
+; CHECK-NEXT:    [[TMP17:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK14]], <2 x double> [[SPLAT_SPLAT16]], <2 x double> [[BLOCK13]])
 ; CHECK-NEXT:    [[BLOCK17:%.*]] = shufflevector <2 x double> [[COL_LOAD2]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <2 x double> [[COL_LOAD8]], i64 1
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT18:%.*]] = insertelement <2 x double> poison, double [[TMP18]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT19:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT18]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP19:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK17]], <2 x double> [[SPLAT_SPLAT19]], <2 x double> [[TMP17]])
+; CHECK-NEXT:    [[TMP19:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK17]], <2 x double> [[SPLAT_SPLAT19]], <2 x double> [[TMP17]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP21]] = shufflevector <2 x double> [[RESULT_VEC_1]], <2 x double> [[TMP20]], <2 x i32> <i32 2, i32 3>
 ; CHECK-NEXT:    br label [[INNER_LATCH]]
 ; CHECK:       inner.latch:
 ; CHECK-NEXT:    [[INNER_STEP]] = add i64 [[INNER_IV]], 2
 ; CHECK-NEXT:    [[INNER_COND:%.*]] = icmp ne i64 [[INNER_STEP]], 2
-; CHECK-NEXT:    br i1 [[INNER_COND]], label [[INNER_HEADER]], label [[ROWS_LATCH]], [[LOOP2:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[INNER_COND]], label [[INNER_HEADER]], label [[ROWS_LATCH]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       rows.latch:
 ; CHECK-NEXT:    [[ROWS_STEP]] = add i64 [[ROWS_IV]], 2
 ; CHECK-NEXT:    [[ROWS_COND:%.*]] = icmp ne i64 [[ROWS_STEP]], 2
@@ -268,12 +268,12 @@
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[COL_LOAD5]], i64 0
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP10]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK9]], <2 x double> [[SPLAT_SPLAT]], <2 x double> [[BLOCK]])
+; CHECK-NEXT:    [[TMP11:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK9]], <2 x double> [[SPLAT_SPLAT]], <2 x double> [[BLOCK]])
 ; CHECK-NEXT:    [[BLOCK10:%.*]] = shufflevector <2 x double> [[COL_LOAD2]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[COL_LOAD5]], i64 1
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT11:%.*]] = insertelement <2 x double> poison, double [[TMP12]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT12:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT11]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK10]], <2 x double> [[SPLAT_SPLAT12]], <2 x double> [[TMP11]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK10]], <2 x double> [[SPLAT_SPLAT12]], <2 x double> [[TMP11]])
 ; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <2 x double> [[TMP13]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP15]] = shufflevector <2 x double> [[RESULT_VEC_0]], <2 x double> [[TMP14]], <2 x i32> <i32 2, i32 3>
 ; CHECK-NEXT:    [[BLOCK13:%.*]] = shufflevector <2 x double> [[RESULT_VEC_1]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
@@ -281,19 +281,19 @@
 ; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x double> [[COL_LOAD8]], i64 0
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT15:%.*]] = insertelement <2 x double> poison, double [[TMP16]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT16:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT15]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP17:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK14]], <2 x double> [[SPLAT_SPLAT16]], <2 x double> [[BLOCK13]])
+; CHECK-NEXT:    [[TMP17:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK14]], <2 x double> [[SPLAT_SPLAT16]], <2 x double> [[BLOCK13]])
 ; CHECK-NEXT:    [[BLOCK17:%.*]] = shufflevector <2 x double> [[COL_LOAD2]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <2 x double> [[COL_LOAD8]], i64 1
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT18:%.*]] = insertelement <2 x double> poison, double [[TMP18]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT19:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT18]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP19:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK17]], <2 x double> [[SPLAT_SPLAT19]], <2 x double> [[TMP17]])
+; CHECK-NEXT:    [[TMP19:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK17]], <2 x double> [[SPLAT_SPLAT19]], <2 x double> [[TMP17]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP21]] = shufflevector <2 x double> [[RESULT_VEC_1]], <2 x double> [[TMP20]], <2 x i32> <i32 2, i32 3>
 ; CHECK-NEXT:    br label [[INNER_LATCH]]
 ; CHECK:       inner.latch:
 ; CHECK-NEXT:    [[INNER_STEP]] = add i64 [[INNER_IV]], 2
 ; CHECK-NEXT:    [[INNER_COND:%.*]] = icmp ne i64 [[INNER_STEP]], 2
-; CHECK-NEXT:    br i1 [[INNER_COND]], label [[INNER_HEADER]], label [[ROWS_LATCH]], [[LOOP3:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[INNER_COND]], label [[INNER_HEADER]], label [[ROWS_LATCH]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       rows.latch:
 ; CHECK-NEXT:    [[ROWS_STEP]] = add i64 [[ROWS_IV]], 2
 ; CHECK-NEXT:    [[ROWS_COND:%.*]] = icmp ne i64 [[ROWS_STEP]], 2
@@ -375,12 +375,12 @@
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[COL_LOAD5]], i64 0
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP10]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK9]], <2 x double> [[SPLAT_SPLAT]], <2 x double> [[BLOCK]])
+; CHECK-NEXT:    [[TMP11:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK9]], <2 x double> [[SPLAT_SPLAT]], <2 x double> [[BLOCK]])
 ; CHECK-NEXT:    [[BLOCK10:%.*]] = shufflevector <2 x double> [[COL_LOAD2]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[COL_LOAD5]], i64 1
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT11:%.*]] = insertelement <2 x double> poison, double [[TMP12]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT12:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT11]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK10]], <2 x double> [[SPLAT_SPLAT12]], <2 x double> [[TMP11]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK10]], <2 x double> [[SPLAT_SPLAT12]], <2 x double> [[TMP11]])
 ; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <2 x double> [[TMP13]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP15]] = shufflevector <2 x double> [[RESULT_VEC_0]], <2 x double> [[TMP14]], <2 x i32> <i32 2, i32 3>
 ; CHECK-NEXT:    [[BLOCK13:%.*]] = shufflevector <2 x double> [[RESULT_VEC_1]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
@@ -388,19 +388,19 @@
 ; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x double> [[COL_LOAD8]], i64 0
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT15:%.*]] = insertelement <2 x double> poison, double [[TMP16]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT16:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT15]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP17:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK14]], <2 x double> [[SPLAT_SPLAT16]], <2 x double> [[BLOCK13]])
+; CHECK-NEXT:    [[TMP17:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK14]], <2 x double> [[SPLAT_SPLAT16]], <2 x double> [[BLOCK13]])
 ; CHECK-NEXT:    [[BLOCK17:%.*]] = shufflevector <2 x double> [[COL_LOAD2]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <2 x double> [[COL_LOAD8]], i64 1
 ; CHECK-NEXT:    [[SPLAT_SPLATINSERT18:%.*]] = insertelement <2 x double> poison, double [[TMP18]], i32 0
 ; CHECK-NEXT:    [[SPLAT_SPLAT19:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT18]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP19:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK17]], <2 x double> [[SPLAT_SPLAT19]], <2 x double> [[TMP17]])
+; CHECK-NEXT:    [[TMP19:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK17]], <2 x double> [[SPLAT_SPLAT19]], <2 x double> [[TMP17]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP21]] = shufflevector <2 x double> [[RESULT_VEC_1]], <2 x double> [[TMP20]], <2 x i32> <i32 2, i32 3>
 ; CHECK-NEXT:    br label [[INNER_LATCH]]
 ; CHECK:       inner.latch:
 ; CHECK-NEXT:    [[INNER_STEP]] = add i64 [[INNER_IV]], 2
 ; CHECK-NEXT:    [[INNER_COND:%.*]] = icmp ne i64 [[INNER_STEP]], 2
-; CHECK-NEXT:    br i1 [[INNER_COND]], label [[INNER_HEADER]], label [[ROWS_LATCH]], [[LOOP4:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[INNER_COND]], label [[INNER_HEADER]], label [[ROWS_LATCH]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       rows.latch:
 ; CHECK-NEXT:    [[ROWS_STEP]] = add i64 [[ROWS_IV]], 2
 ; CHECK-NEXT:    [[ROWS_COND:%.*]] = icmp ne i64 [[ROWS_STEP]], 2
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll
@@ -57,13 +57,13 @@
 ; CHECK-NEXT:    [[VEC_CAST14:%.*]] = bitcast double* [[VEC_GEP13]] to <2 x double>*
 ; CHECK-NEXT:    [[COL_LOAD15:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST14]], align 8
 ; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD12]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP12:%.*]] = fmul <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
+; CHECK-NEXT:    [[TMP12:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
 ; CHECK-NEXT:    [[SPLAT_SPLAT18:%.*]] = shufflevector <2 x double> [[COL_LOAD12]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP13:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD9]], <2 x double> [[SPLAT_SPLAT18]], <2 x double> [[TMP12]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD9]], <2 x double> [[SPLAT_SPLAT18]], <2 x double> [[TMP12]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT21:%.*]] = shufflevector <2 x double> [[COL_LOAD15]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP14:%.*]] = fmul <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT21]]
+; CHECK-NEXT:    [[TMP14:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT21]]
 ; CHECK-NEXT:    [[SPLAT_SPLAT24:%.*]] = shufflevector <2 x double> [[COL_LOAD15]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP15:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD9]], <2 x double> [[SPLAT_SPLAT24]], <2 x double> [[TMP14]])
+; CHECK-NEXT:    [[TMP15:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD9]], <2 x double> [[SPLAT_SPLAT24]], <2 x double> [[TMP14]])
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 8
 ; CHECK-NEXT:    [[VEC_CAST26:%.*]] = bitcast double* [[TMP16]] to <2 x double>*
 ; CHECK-NEXT:    [[COL_LOAD27:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST26]], align 8
@@ -77,13 +77,13 @@
 ; CHECK-NEXT:    [[VEC_CAST35:%.*]] = bitcast double* [[VEC_GEP34]] to <2 x double>*
 ; CHECK-NEXT:    [[COL_LOAD36:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST35]], align 8
 ; CHECK-NEXT:    [[SPLAT_SPLAT40:%.*]] = shufflevector <2 x double> [[COL_LOAD33]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP18:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD27]], <2 x double> [[SPLAT_SPLAT40]], <2 x double> [[TMP13]])
+; CHECK-NEXT:    [[TMP18:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD27]], <2 x double> [[SPLAT_SPLAT40]], <2 x double> [[TMP13]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT43:%.*]] = shufflevector <2 x double> [[COL_LOAD33]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP19:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD30]], <2 x double> [[SPLAT_SPLAT43]], <2 x double> [[TMP18]])
+; CHECK-NEXT:    [[TMP19:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD30]], <2 x double> [[SPLAT_SPLAT43]], <2 x double> [[TMP18]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT47:%.*]] = shufflevector <2 x double> [[COL_LOAD36]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP20:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD27]], <2 x double> [[SPLAT_SPLAT47]], <2 x double> [[TMP15]])
+; CHECK-NEXT:    [[TMP20:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD27]], <2 x double> [[SPLAT_SPLAT47]], <2 x double> [[TMP15]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT50:%.*]] = shufflevector <2 x double> [[COL_LOAD36]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP21:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD30]], <2 x double> [[SPLAT_SPLAT50]], <2 x double> [[TMP20]])
+; CHECK-NEXT:    [[TMP21:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD30]], <2 x double> [[SPLAT_SPLAT50]], <2 x double> [[TMP20]])
 ; CHECK-NEXT:    [[VEC_CAST52:%.*]] = bitcast <16 x double>* [[C]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP19]], <2 x double>* [[VEC_CAST52]], align 8
 ; CHECK-NEXT:    [[VEC_GEP53:%.*]] = getelementptr <16 x double>, <16 x double>* [[C]], i64 0, i64 4
@@ -101,13 +101,13 @@
 ; CHECK-NEXT:    [[VEC_CAST65:%.*]] = bitcast double* [[VEC_GEP64]] to <2 x double>*
 ; CHECK-NEXT:    [[COL_LOAD66:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST65]], align 8
 ; CHECK-NEXT:    [[SPLAT_SPLAT69:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP23:%.*]] = fmul <2 x double> [[COL_LOAD57]], [[SPLAT_SPLAT69]]
+; CHECK-NEXT:    [[TMP23:%.*]] = fmul contract <2 x double> [[COL_LOAD57]], [[SPLAT_SPLAT69]]
 ; CHECK-NEXT:    [[SPLAT_SPLAT72:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP24:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT72]], <2 x double> [[TMP23]])
+; CHECK-NEXT:    [[TMP24:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT72]], <2 x double> [[TMP23]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT75:%.*]] = shufflevector <2 x double> [[COL_LOAD66]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP25:%.*]] = fmul <2 x double> [[COL_LOAD57]], [[SPLAT_SPLAT75]]
+; CHECK-NEXT:    [[TMP25:%.*]] = fmul contract <2 x double> [[COL_LOAD57]], [[SPLAT_SPLAT75]]
 ; CHECK-NEXT:    [[SPLAT_SPLAT78:%.*]] = shufflevector <2 x double> [[COL_LOAD66]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP26:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT78]], <2 x double> [[TMP25]])
+; CHECK-NEXT:    [[TMP26:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT78]], <2 x double> [[TMP25]])
 ; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 10
 ; CHECK-NEXT:    [[VEC_CAST80:%.*]] = bitcast double* [[TMP27]] to <2 x double>*
 ; CHECK-NEXT:    [[COL_LOAD81:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST80]], align 8
@@ -121,13 +121,13 @@
 ; CHECK-NEXT:    [[VEC_CAST89:%.*]] = bitcast double* [[VEC_GEP88]] to <2 x double>*
 ; CHECK-NEXT:    [[COL_LOAD90:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST89]], align 8
 ; CHECK-NEXT:    [[SPLAT_SPLAT94:%.*]] = shufflevector <2 x double> [[COL_LOAD87]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP29:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD81]], <2 x double> [[SPLAT_SPLAT94]], <2 x double> [[TMP24]])
+; CHECK-NEXT:    [[TMP29:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD81]], <2 x double> [[SPLAT_SPLAT94]], <2 x double> [[TMP24]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT97:%.*]] = shufflevector <2 x double> [[COL_LOAD87]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP30:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD84]], <2 x double> [[SPLAT_SPLAT97]], <2 x double> [[TMP29]])
+; CHECK-NEXT:    [[TMP30:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD84]], <2 x double> [[SPLAT_SPLAT97]], <2 x double> [[TMP29]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT101:%.*]] = shufflevector <2 x double> [[COL_LOAD90]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP31:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD81]], <2 x double> [[SPLAT_SPLAT101]], <2 x double> [[TMP26]])
+; CHECK-NEXT:    [[TMP31:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD81]], <2 x double> [[SPLAT_SPLAT101]], <2 x double> [[TMP26]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT104:%.*]] = shufflevector <2 x double> [[COL_LOAD90]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP32:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD84]], <2 x double> [[SPLAT_SPLAT104]], <2 x double> [[TMP31]])
+; CHECK-NEXT:    [[TMP32:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD84]], <2 x double> [[SPLAT_SPLAT104]], <2 x double> [[TMP31]])
 ; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr <16 x double>, <16 x double>* [[C]], i64 0, i64 2
 ; CHECK-NEXT:    [[VEC_CAST106:%.*]] = bitcast double* [[TMP33]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP30]], <2 x double>* [[VEC_CAST106]], align 8
@@ -146,13 +146,13 @@
 ; CHECK-NEXT:    [[VEC_CAST119:%.*]] = bitcast double* [[VEC_GEP118]] to <2 x double>*
 ; CHECK-NEXT:    [[COL_LOAD120:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST119]], align 8
 ; CHECK-NEXT:    [[SPLAT_SPLAT123:%.*]] = shufflevector <2 x double> [[COL_LOAD117]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP35:%.*]] = fmul <2 x double> [[COL_LOAD111]], [[SPLAT_SPLAT123]]
+; CHECK-NEXT:    [[TMP35:%.*]] = fmul contract <2 x double> [[COL_LOAD111]], [[SPLAT_SPLAT123]]
 ; CHECK-NEXT:    [[SPLAT_SPLAT126:%.*]] = shufflevector <2 x double> [[COL_LOAD117]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP36:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD114]], <2 x double> [[SPLAT_SPLAT126]], <2 x double> [[TMP35]])
+; CHECK-NEXT:    [[TMP36:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD114]], <2 x double> [[SPLAT_SPLAT126]], <2 x double> [[TMP35]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT129:%.*]] = shufflevector <2 x double> [[COL_LOAD120]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP37:%.*]] = fmul <2 x double> [[COL_LOAD111]], [[SPLAT_SPLAT129]]
+; CHECK-NEXT:    [[TMP37:%.*]] = fmul contract <2 x double> [[COL_LOAD111]], [[SPLAT_SPLAT129]]
 ; CHECK-NEXT:    [[SPLAT_SPLAT132:%.*]] = shufflevector <2 x double> [[COL_LOAD120]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP38:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD114]], <2 x double> [[SPLAT_SPLAT132]], <2 x double> [[TMP37]])
+; CHECK-NEXT:    [[TMP38:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD114]], <2 x double> [[SPLAT_SPLAT132]], <2 x double> [[TMP37]])
 ; CHECK-NEXT:    [[TMP39:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 8
 ; CHECK-NEXT:    [[VEC_CAST134:%.*]] = bitcast double* [[TMP39]] to <2 x double>*
 ; CHECK-NEXT:    [[COL_LOAD135:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST134]], align 8
@@ -166,13 +166,13 @@
 ; CHECK-NEXT:    [[VEC_CAST143:%.*]] = bitcast double* [[VEC_GEP142]] to <2 x double>*
 ; CHECK-NEXT:    [[COL_LOAD144:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST143]], align 8
 ; CHECK-NEXT:    [[SPLAT_SPLAT148:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP41:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD135]], <2 x double> [[SPLAT_SPLAT148]], <2 x double> [[TMP36]])
+; CHECK-NEXT:    [[TMP41:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD135]], <2 x double> [[SPLAT_SPLAT148]], <2 x double> [[TMP36]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT151:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP42:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT151]], <2 x double> [[TMP41]])
+; CHECK-NEXT:    [[TMP42:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT151]], <2 x double> [[TMP41]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT155:%.*]] = shufflevector <2 x double> [[COL_LOAD144]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP43:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD135]], <2 x double> [[SPLAT_SPLAT155]], <2 x double> [[TMP38]])
+; CHECK-NEXT:    [[TMP43:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD135]], <2 x double> [[SPLAT_SPLAT155]], <2 x double> [[TMP38]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT158:%.*]] = shufflevector <2 x double> [[COL_LOAD144]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP44:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT158]], <2 x double> [[TMP43]])
+; CHECK-NEXT:    [[TMP44:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT158]], <2 x double> [[TMP43]])
 ; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr <16 x double>, <16 x double>* [[C]], i64 0, i64 8
 ; CHECK-NEXT:    [[VEC_CAST160:%.*]] = bitcast double* [[TMP45]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP42]], <2 x double>* [[VEC_CAST160]], align 8
@@ -192,13 +192,13 @@
 ; CHECK-NEXT:    [[VEC_CAST173:%.*]] = bitcast double* [[VEC_GEP172]] to <2 x double>*
 ; CHECK-NEXT:    [[COL_LOAD174:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST173]], align 8
 ; CHECK-NEXT:    [[SPLAT_SPLAT177:%.*]] = shufflevector <2 x double> [[COL_LOAD171]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP48:%.*]] = fmul <2 x double> [[COL_LOAD165]], [[SPLAT_SPLAT177]]
+; CHECK-NEXT:    [[TMP48:%.*]] = fmul contract <2 x double> [[COL_LOAD165]], [[SPLAT_SPLAT177]]
 ; CHECK-NEXT:    [[SPLAT_SPLAT180:%.*]] = shufflevector <2 x double> [[COL_LOAD171]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP49:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD168]], <2 x double> [[SPLAT_SPLAT180]], <2 x double> [[TMP48]])
+; CHECK-NEXT:    [[TMP49:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD168]], <2 x double> [[SPLAT_SPLAT180]], <2 x double> [[TMP48]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT183:%.*]] = shufflevector <2 x double> [[COL_LOAD174]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP50:%.*]] = fmul <2 x double> [[COL_LOAD165]], [[SPLAT_SPLAT183]]
+; CHECK-NEXT:    [[TMP50:%.*]] = fmul contract <2 x double> [[COL_LOAD165]], [[SPLAT_SPLAT183]]
 ; CHECK-NEXT:    [[SPLAT_SPLAT186:%.*]] = shufflevector <2 x double> [[COL_LOAD174]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP51:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD168]], <2 x double> [[SPLAT_SPLAT186]], <2 x double> [[TMP50]])
+; CHECK-NEXT:    [[TMP51:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD168]], <2 x double> [[SPLAT_SPLAT186]], <2 x double> [[TMP50]])
 ; CHECK-NEXT:    [[TMP52:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 10
 ; CHECK-NEXT:    [[VEC_CAST188:%.*]] = bitcast double* [[TMP52]] to <2 x double>*
 ; CHECK-NEXT:    [[COL_LOAD189:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST188]], align 8
@@ -212,13 +212,13 @@
 ; CHECK-NEXT:    [[VEC_CAST197:%.*]] = bitcast double* [[VEC_GEP196]] to <2 x double>*
 ; CHECK-NEXT:    [[COL_LOAD198:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST197]], align 8
 ; CHECK-NEXT:    [[SPLAT_SPLAT202:%.*]] = shufflevector <2 x double> [[COL_LOAD195]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP54:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD189]], <2 x double> [[SPLAT_SPLAT202]], <2 x double> [[TMP49]])
+; CHECK-NEXT:    [[TMP54:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD189]], <2 x double> [[SPLAT_SPLAT202]], <2 x double> [[TMP49]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT205:%.*]] = shufflevector <2 x double> [[COL_LOAD195]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP55:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD192]], <2 x double> [[SPLAT_SPLAT205]], <2 x double> [[TMP54]])
+; CHECK-NEXT:    [[TMP55:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD192]], <2 x double> [[SPLAT_SPLAT205]], <2 x double> [[TMP54]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT209:%.*]] = shufflevector <2 x double> [[COL_LOAD198]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP56:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD189]], <2 x double> [[SPLAT_SPLAT209]], <2 x double> [[TMP51]])
+; CHECK-NEXT:    [[TMP56:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD189]], <2 x double> [[SPLAT_SPLAT209]], <2 x double> [[TMP51]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT212:%.*]] = shufflevector <2 x double> [[COL_LOAD198]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP57:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD192]], <2 x double> [[SPLAT_SPLAT212]], <2 x double> [[TMP56]])
+; CHECK-NEXT:    [[TMP57:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD192]], <2 x double> [[SPLAT_SPLAT212]], <2 x double> [[TMP56]])
 ; CHECK-NEXT:    [[TMP58:%.*]] = getelementptr <16 x double>, <16 x double>* [[C]], i64 0, i64 10
 ; CHECK-NEXT:    [[VEC_CAST214:%.*]] = bitcast double* [[TMP58]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP55]], <2 x double>* [[VEC_CAST214]], align 8
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-minimal.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-minimal.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-minimal.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-minimal.ll
@@ -31,21 +31,21 @@
 ; CHECK-NEXT:    [[VEC_CAST12:%.*]] = bitcast double* [[VEC_GEP11]] to <4 x double>*
 ; CHECK-NEXT:    [[COL_LOAD13:%.*]] = load <4 x double>, <4 x double>* [[VEC_CAST12]], align 8
 ; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <4 x double> [[COL_LOAD10]], <4 x double> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP0:%.*]] = fmul <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
+; CHECK-NEXT:    [[TMP0:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
 ; CHECK-NEXT:    [[SPLAT_SPLAT16:%.*]] = shufflevector <4 x double> [[COL_LOAD10]], <4 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD2]], <2 x double> [[SPLAT_SPLAT16]], <2 x double> [[TMP0]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD2]], <2 x double> [[SPLAT_SPLAT16]], <2 x double> [[TMP0]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT19:%.*]] = shufflevector <4 x double> [[COL_LOAD10]], <4 x double> undef, <2 x i32> <i32 2, i32 2>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD5]], <2 x double> [[SPLAT_SPLAT19]], <2 x double> [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD5]], <2 x double> [[SPLAT_SPLAT19]], <2 x double> [[TMP1]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT22:%.*]] = shufflevector <4 x double> [[COL_LOAD10]], <4 x double> undef, <2 x i32> <i32 3, i32 3>
-; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT22]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT22]], <2 x double> [[TMP2]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT25:%.*]] = shufflevector <4 x double> [[COL_LOAD13]], <4 x double> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT25]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT25]]
 ; CHECK-NEXT:    [[SPLAT_SPLAT28:%.*]] = shufflevector <4 x double> [[COL_LOAD13]], <4 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD2]], <2 x double> [[SPLAT_SPLAT28]], <2 x double> [[TMP4]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD2]], <2 x double> [[SPLAT_SPLAT28]], <2 x double> [[TMP4]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT31:%.*]] = shufflevector <4 x double> [[COL_LOAD13]], <4 x double> undef, <2 x i32> <i32 2, i32 2>
-; CHECK-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD5]], <2 x double> [[SPLAT_SPLAT31]], <2 x double> [[TMP5]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD5]], <2 x double> [[SPLAT_SPLAT31]], <2 x double> [[TMP5]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT34:%.*]] = shufflevector <4 x double> [[COL_LOAD13]], <4 x double> undef, <2 x i32> <i32 3, i32 3>
-; CHECK-NEXT:    [[TMP7:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT34]], <2 x double> [[TMP6]])
+; CHECK-NEXT:    [[TMP7:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT34]], <2 x double> [[TMP6]])
 ; CHECK-NEXT:    [[VEC_CAST35:%.*]] = bitcast <4 x double>* [[C:%.*]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP3]], <2 x double>* [[VEC_CAST35]], align 8
 ; CHECK-NEXT:    [[VEC_GEP36:%.*]] = getelementptr <4 x double>, <4 x double>* [[C]], i64 0, i64 2
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/preserve-existing-fast-math-flags.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/preserve-existing-fast-math-flags.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/preserve-existing-fast-math-flags.ll
@@ -0,0 +1,183 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -lower-matrix-intrinsics -S < %s | FileCheck %s
+
+; Function Attrs: nofree nounwind uwtable willreturn mustprogress
+define <4 x float> @preserve_fmf(<4 x float> %m, <4 x float> %n, float %x, float %y) {
+; CHECK-LABEL: @preserve_fmf(
+; CHECK-NEXT:    [[SPLIT:%.*]] = shufflevector <4 x float> [[M:%.*]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[SPLIT1:%.*]] = shufflevector <4 x float> [[M]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[SPLIT2:%.*]] = shufflevector <4 x float> [[N:%.*]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[SPLIT3:%.*]] = shufflevector <4 x float> [[N]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <2 x float> [[SPLIT]], <2 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[SPLIT2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x float> poison, float [[TMP1]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT]], <1 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <1 x float> [[BLOCK]], [[SPLAT_SPLAT]]
+; CHECK-NEXT:    [[BLOCK4:%.*]] = shufflevector <2 x float> [[SPLIT1]], <2 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[SPLIT2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT5:%.*]] = insertelement <1 x float> poison, float [[TMP3]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT6:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT5]], <1 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = call fast <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK4]], <1 x float> [[SPLAT_SPLAT6]], <1 x float> [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <1 x float> [[TMP4]], <1 x float> poison, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> undef, <2 x float> [[TMP5]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK7:%.*]] = shufflevector <2 x float> [[SPLIT]], <2 x float> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x float> [[SPLIT2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT8:%.*]] = insertelement <1 x float> poison, float [[TMP7]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT9:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT8]], <1 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast <1 x float> [[BLOCK7]], [[SPLAT_SPLAT9]]
+; CHECK-NEXT:    [[BLOCK10:%.*]] = shufflevector <2 x float> [[SPLIT1]], <2 x float> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[SPLIT2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT11:%.*]] = insertelement <1 x float> poison, float [[TMP9]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT12:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT11]], <1 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = call fast <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK10]], <1 x float> [[SPLAT_SPLAT12]], <1 x float> [[TMP8]])
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <1 x float> [[TMP10]], <1 x float> poison, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP11]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[BLOCK13:%.*]] = shufflevector <2 x float> [[SPLIT]], <2 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x float> [[SPLIT3]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT14:%.*]] = insertelement <1 x float> poison, float [[TMP13]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT15:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT14]], <1 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = fmul fast <1 x float> [[BLOCK13]], [[SPLAT_SPLAT15]]
+; CHECK-NEXT:    [[BLOCK16:%.*]] = shufflevector <2 x float> [[SPLIT1]], <2 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x float> [[SPLIT3]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT17:%.*]] = insertelement <1 x float> poison, float [[TMP15]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT18:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT17]], <1 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = call fast <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK16]], <1 x float> [[SPLAT_SPLAT18]], <1 x float> [[TMP14]])
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <1 x float> [[TMP16]], <1 x float> poison, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <2 x float> undef, <2 x float> [[TMP17]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK19:%.*]] = shufflevector <2 x float> [[SPLIT]], <2 x float> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x float> [[SPLIT3]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT20:%.*]] = insertelement <1 x float> poison, float [[TMP19]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT21:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT20]], <1 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = fmul fast <1 x float> [[BLOCK19]], [[SPLAT_SPLAT21]]
+; CHECK-NEXT:    [[BLOCK22:%.*]] = shufflevector <2 x float> [[SPLIT1]], <2 x float> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <2 x float> [[SPLIT3]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x float> poison, float [[TMP21]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT23]], <1 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = call fast <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK22]], <1 x float> [[SPLAT_SPLAT24]], <1 x float> [[TMP20]])
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <1 x float> [[TMP22]], <1 x float> poison, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <2 x float> [[TMP18]], <2 x float> [[TMP23]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[SPLIT25:%.*]] = shufflevector <4 x float> [[M]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[SPLIT26:%.*]] = shufflevector <4 x float> [[M]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP25:%.*]] = fadd fast <2 x float> [[TMP12]], [[SPLIT25]]
+; CHECK-NEXT:    [[TMP26:%.*]] = fadd fast <2 x float> [[TMP24]], [[SPLIT26]]
+; CHECK-NEXT:    [[SPLIT27:%.*]] = shufflevector <4 x float> [[N]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[SPLIT28:%.*]] = shufflevector <4 x float> [[N]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP27:%.*]] = fsub reassoc <2 x float> [[TMP25]], [[SPLIT27]]
+; CHECK-NEXT:    [[TMP28:%.*]] = fsub reassoc <2 x float> [[TMP26]], [[SPLIT28]]
+; CHECK-NEXT:    [[TMP29:%.*]] = fneg contract <2 x float> [[TMP27]]
+; CHECK-NEXT:    [[TMP30:%.*]] = fneg contract <2 x float> [[TMP28]]
+; CHECK-NEXT:    [[TMP31:%.*]] = fmul reassoc contract <2 x float> [[TMP27]], [[TMP29]]
+; CHECK-NEXT:    [[TMP32:%.*]] = fmul reassoc contract <2 x float> [[TMP28]], [[TMP30]]
+; CHECK-NEXT:    [[BLOCK29:%.*]] = shufflevector <2 x float> [[TMP29]], <2 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <2 x float> [[TMP31]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT30:%.*]] = insertelement <1 x float> poison, float [[TMP33]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT31:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT30]], <1 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP34:%.*]] = fmul reassoc <1 x float> [[BLOCK29]], [[SPLAT_SPLAT31]]
+; CHECK-NEXT:    [[BLOCK32:%.*]] = shufflevector <2 x float> [[TMP30]], <2 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <2 x float> [[TMP31]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT33:%.*]] = insertelement <1 x float> poison, float [[TMP35]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT34:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT33]], <1 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP36:%.*]] = fmul reassoc <1 x float> [[BLOCK32]], [[SPLAT_SPLAT34]]
+; CHECK-NEXT:    [[TMP37:%.*]] = fadd reassoc <1 x float> [[TMP34]], [[TMP36]]
+; CHECK-NEXT:    [[TMP38:%.*]] = shufflevector <1 x float> [[TMP37]], <1 x float> poison, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP39:%.*]] = shufflevector <2 x float> undef, <2 x float> [[TMP38]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK35:%.*]] = shufflevector <2 x float> [[TMP29]], <2 x float> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <2 x float> [[TMP31]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT36:%.*]] = insertelement <1 x float> poison, float [[TMP40]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT37:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT36]], <1 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP41:%.*]] = fmul reassoc <1 x float> [[BLOCK35]], [[SPLAT_SPLAT37]]
+; CHECK-NEXT:    [[BLOCK38:%.*]] = shufflevector <2 x float> [[TMP30]], <2 x float> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x float> [[TMP31]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT39:%.*]] = insertelement <1 x float> poison, float [[TMP42]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT40:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT39]], <1 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP43:%.*]] = fmul reassoc <1 x float> [[BLOCK38]], [[SPLAT_SPLAT40]]
+; CHECK-NEXT:    [[TMP44:%.*]] = fadd reassoc <1 x float> [[TMP41]], [[TMP43]]
+; CHECK-NEXT:    [[TMP45:%.*]] = shufflevector <1 x float> [[TMP44]], <1 x float> poison, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP46:%.*]] = shufflevector <2 x float> [[TMP39]], <2 x float> [[TMP45]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[BLOCK41:%.*]] = shufflevector <2 x float> [[TMP29]], <2 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <2 x float> [[TMP32]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT42:%.*]] = insertelement <1 x float> poison, float [[TMP47]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT43:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT42]], <1 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP48:%.*]] = fmul reassoc <1 x float> [[BLOCK41]], [[SPLAT_SPLAT43]]
+; CHECK-NEXT:    [[BLOCK44:%.*]] = shufflevector <2 x float> [[TMP30]], <2 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <2 x float> [[TMP32]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT45:%.*]] = insertelement <1 x float> poison, float [[TMP49]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT46:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT45]], <1 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP50:%.*]] = fmul reassoc <1 x float> [[BLOCK44]], [[SPLAT_SPLAT46]]
+; CHECK-NEXT:    [[TMP51:%.*]] = fadd reassoc <1 x float> [[TMP48]], [[TMP50]]
+; CHECK-NEXT:    [[TMP52:%.*]] = shufflevector <1 x float> [[TMP51]], <1 x float> poison, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP53:%.*]] = shufflevector <2 x float> undef, <2 x float> [[TMP52]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK47:%.*]] = shufflevector <2 x float> [[TMP29]], <2 x float> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <2 x float> [[TMP32]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT48:%.*]] = insertelement <1 x float> poison, float [[TMP54]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT49:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT48]], <1 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP55:%.*]] = fmul reassoc <1 x float> [[BLOCK47]], [[SPLAT_SPLAT49]]
+; CHECK-NEXT:    [[BLOCK50:%.*]] = shufflevector <2 x float> [[TMP30]], <2 x float> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP56:%.*]] = extractelement <2 x float> [[TMP32]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT51:%.*]] = insertelement <1 x float> poison, float [[TMP56]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT52:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT51]], <1 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP57:%.*]] = fmul reassoc <1 x float> [[BLOCK50]], [[SPLAT_SPLAT52]]
+; CHECK-NEXT:    [[TMP58:%.*]] = fadd reassoc <1 x float> [[TMP55]], [[TMP57]]
+; CHECK-NEXT:    [[TMP59:%.*]] = shufflevector <1 x float> [[TMP58]], <1 x float> poison, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP60:%.*]] = shufflevector <2 x float> [[TMP53]], <2 x float> [[TMP59]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[BLOCK53:%.*]] = shufflevector <2 x float> [[TMP31]], <2 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <2 x float> [[TMP46]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT54:%.*]] = insertelement <1 x float> poison, float [[TMP61]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT55:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT54]], <1 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP62:%.*]] = fmul reassoc contract <1 x float> [[BLOCK53]], [[SPLAT_SPLAT55]]
+; CHECK-NEXT:    [[BLOCK56:%.*]] = shufflevector <2 x float> [[TMP32]], <2 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP63:%.*]] = extractelement <2 x float> [[TMP46]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT57:%.*]] = insertelement <1 x float> poison, float [[TMP63]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT58:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT57]], <1 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP64:%.*]] = call reassoc contract <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK56]], <1 x float> [[SPLAT_SPLAT58]], <1 x float> [[TMP62]])
+; CHECK-NEXT:    [[TMP65:%.*]] = shufflevector <1 x float> [[TMP64]], <1 x float> poison, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP66:%.*]] = shufflevector <2 x float> undef, <2 x float> [[TMP65]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK59:%.*]] = shufflevector <2 x float> [[TMP31]], <2 x float> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP67:%.*]] = extractelement <2 x float> [[TMP46]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT60:%.*]] = insertelement <1 x float> poison, float [[TMP67]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT61:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT60]], <1 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP68:%.*]] = fmul reassoc contract <1 x float> [[BLOCK59]], [[SPLAT_SPLAT61]]
+; CHECK-NEXT:    [[BLOCK62:%.*]] = shufflevector <2 x float> [[TMP32]], <2 x float> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP69:%.*]] = extractelement <2 x float> [[TMP46]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT63:%.*]] = insertelement <1 x float> poison, float [[TMP69]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT64:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT63]], <1 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP70:%.*]] = call reassoc contract <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK62]], <1 x float> [[SPLAT_SPLAT64]], <1 x float> [[TMP68]])
+; CHECK-NEXT:    [[TMP71:%.*]] = shufflevector <1 x float> [[TMP70]], <1 x float> poison, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP72:%.*]] = shufflevector <2 x float> [[TMP66]], <2 x float> [[TMP71]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[BLOCK65:%.*]] = shufflevector <2 x float> [[TMP31]], <2 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <2 x float> [[TMP60]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT66:%.*]] = insertelement <1 x float> poison, float [[TMP73]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT67:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT66]], <1 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP74:%.*]] = fmul reassoc contract <1 x float> [[BLOCK65]], [[SPLAT_SPLAT67]]
+; CHECK-NEXT:    [[BLOCK68:%.*]] = shufflevector <2 x float> [[TMP32]], <2 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP75:%.*]] = extractelement <2 x float> [[TMP60]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT69:%.*]] = insertelement <1 x float> poison, float [[TMP75]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT70:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT69]], <1 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP76:%.*]] = call reassoc contract <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK68]], <1 x float> [[SPLAT_SPLAT70]], <1 x float> [[TMP74]])
+; CHECK-NEXT:    [[TMP77:%.*]] = shufflevector <1 x float> [[TMP76]], <1 x float> poison, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP78:%.*]] = shufflevector <2 x float> undef, <2 x float> [[TMP77]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK71:%.*]] = shufflevector <2 x float> [[TMP31]], <2 x float> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP79:%.*]] = extractelement <2 x float> [[TMP60]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT72:%.*]] = insertelement <1 x float> poison, float [[TMP79]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT73:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT72]], <1 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP80:%.*]] = fmul reassoc contract <1 x float> [[BLOCK71]], [[SPLAT_SPLAT73]]
+; CHECK-NEXT:    [[BLOCK74:%.*]] = shufflevector <2 x float> [[TMP32]], <2 x float> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x float> [[TMP60]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT75:%.*]] = insertelement <1 x float> poison, float [[TMP81]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT76:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT75]], <1 x float> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP82:%.*]] = call reassoc contract <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK74]], <1 x float> [[SPLAT_SPLAT76]], <1 x float> [[TMP80]])
+; CHECK-NEXT:    [[TMP83:%.*]] = shufflevector <1 x float> [[TMP82]], <1 x float> poison, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP84:%.*]] = shufflevector <2 x float> [[TMP78]], <2 x float> [[TMP83]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP85:%.*]] = shufflevector <2 x float> [[TMP72]], <2 x float> [[TMP84]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x float> [[TMP85]]
+  %res = tail call fast <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %m, <4 x float> %n, i32 2, i32 2, i32 2)
+  %res.2 = fadd fast <4 x float> %res, %m
+  %res.3 = fsub reassoc <4 x float> %res.2, %n
+  %res.4 = fneg contract <4 x float> %res.3
+  %res.5 = fmul reassoc contract <4 x float> %res.3, %res.4
+  %res.6 = tail call reassoc <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %res.4, <4 x float> %res.5, i32 2, i32 2, i32 2)
+  %res.7 = tail call contract reassoc <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %res.5, <4 x float> %res.6, i32 2, i32 2, i32 2)
+  ret <4 x float> %res.7
+}
+
+declare <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float>, <4 x float>, i32, i32, i32)