Index: llvm/docs/LangRef.rst
===================================================================
--- llvm/docs/LangRef.rst
+++ llvm/docs/LangRef.rst
@@ -15497,16 +15497,17 @@
 
 Operations on matrixes requiring shape information (like number of rows/columns
 or the memory layout) can be expressed using the matrix intrinsics. Matrixes are
-embedded in a flat vector and the intrinsics take the dimensions as arguments.
+linearized in a vector and the intrinsics take the dimensions as arguments.
 Currently column-major layout is assumed. The intrinsics support both integer
 and floating point matrixes.
 
 
 '``llvm.matrix.transpose.*``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
 """""""
+This is an overloaded intrinsic.
 
 ::
 
@@ -15515,21 +15516,22 @@
 Overview:
 """""""""
 
-The '``llvm.matrix.transpose.*``' intrinsic treats %In as containing a matrix
-with <Rows> rows and <Cols> columns and returns the transposed matrix embedded in
-the result vector.
+The '``llvm.matrix.transpose.*``' intrinsics treat %In as a <Rows> x <Cols> matrix
+and return the transposed matrix linearized in the result vector.
 
 Arguments:
 """"""""""
 
-The <Rows> and <Cols> arguments must be constant integers. The vector argument
-%In and the returned vector must have <Rows> * <Cols> elements.
+The <Rows> and <Cols> arguments must be positive, constant integers. The vector
+argument %In and the returned vector must have <Rows> * <Cols> elements, and
+have the same float or integer element type.
 
 '``llvm.matrix.multiply.*``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
 """""""
+This is an overloaded intrinsic.
 
 ::
 
@@ -15538,18 +15540,18 @@
 Overview:
 """""""""
 
-The '``llvm.matrix.multiply.*``' intrinsic treats %A as a matrix with <OuterRows>
-rows and <Inner> columns, %B as a matrix with <Inner> rows and <OuterColumns>
-columns and multiplies them. The result matrix is returned embedded in the
-result vector.
+The '``llvm.matrix.multiply.*``' intrinsics treat %A as an <OuterRows> x
+<Inner> matrix, %B as an <Inner> x <OuterColumns> matrix, and multiply them. The
+result matrix is linearized and returned in the result vector.
 
 Arguments:
 """"""""""
 
-The <OuterRows>, <Inner> and <OuterColumns> arguments must be constant
+The <OuterRows>, <Inner> and <OuterColumns> arguments must be positive, constant
 integers. The vector argument %A must have <OuterRows> * <Inner> elements, %B
 must have <Inner> * <OuterColumns> elements and the returned vector must have
-<OuterRows> * <OuterColumns> elements.
+<OuterRows> * <OuterColumns> elements. Vectors %A, %B, and the returned vector all
+have the same float or integer element type.
 
 
 '``llvm.matrix.column.major.load.*``' Intrinsic
@@ -15557,6 +15559,7 @@
 
 Syntax:
 """""""
+This is an overloaded intrinsic.
 
 ::
 
@@ -15566,22 +15569,25 @@
 Overview:
 """""""""
 
-The '``llvm.matrix.column.major.load.*``' intrinsic loads a matrix with <Rows>
-rows and <Cols> columns, using a stride of %Stride between columns. For two
-consecutive columns A and B, %Stride refers to the distance (the number of
-elements) between the start of column A and the start of column B. The result
-matrix is returned embedded in the result vector. This allows for convenient
-loading of sub matrixes.  If <IsVolatile> is true, the intrinsic is considered
-a :ref:`volatile memory access <volatile>`.
+The '``llvm.matrix.column.major.load.*``' intrinsics load a <Rows> x <Cols>
+matrix, using a stride of %Stride between columns. For two consecutive columns
+A and B, %Stride refers to the distance (the number of elements) between the
+start of column A and the start of column B. The result matrix is linearized
+and returned in the result vector. This allows for convenient loading of sub
+matrixes. If <IsVolatile> is true, the intrinsic is considered a :ref:`volatile
+memory access <volatile>`.
 
 If the %Ptr argument is known to be aligned to some boundary, this can be
-specified as an attribute on the argument.
+specified as an attribute on the argument. %Ptr must be a pointer to the
+returned vector type.
 
 Arguments:
 """"""""""
 
-The <IsVolatile>, <Rows> and <Cols> arguments must be constant integers. The
-returned vector must have <Rows> * <Cols> elements. %Stride must be >= <Rows>.
+Argument <IsVolatile> is a boolean value. The second and third arguments,
+<Rows> and <Cols>, must be positive, constant integers. The returned vector
+must have <Rows> * <Cols> elements. %Stride is a positive, constant integer,
+and %Stride >= <Rows>.
 
 The :ref:`align <attr_align>` parameter attribute can be provided
 for the %Ptr arguments.
@@ -15592,6 +15598,7 @@
 
 Syntax:
 """""""
+This is an overloaded intrinsic.
 
 ::
 
@@ -15601,12 +15608,12 @@
 Overview:
 """""""""
 
-The '``llvm.matrix.column.major.store.*``' intrinsic stores the matrix with
-<Rows> rows and <Cols> columns embedded in %In, using a stride of %Stride
-between columns. For two consecutive columns A and B, %Stride refers to the
-distance (the number of elements) between the start of column A and the start
-of column B. If <IsVolatile> is true, the intrinsic is considered a
-:ref:`volatile memory access <volatile>`.
+The '``llvm.matrix.column.major.store.*``' intrinsics store the <Rows> x <Cols>
+matrix linearized in %In, using a stride of %Stride between columns. For two
+consecutive columns A and B, %Stride refers to the distance (the number of
+elements) between the start of column A and the start of column B. If
+<IsVolatile> is true, the intrinsic is considered a :ref:`volatile memory
+access <volatile>`. %Ptr must be a pointer to the vector type of %In.
 
 If the %Ptr argument is known to be aligned to some boundary, this can be
 specified as an attribute on the argument.
@@ -15614,8 +15621,9 @@
 Arguments:
 """"""""""
 
-The <IsVolatile>, <Rows>, <Cols> arguments must be constant integers. The
-vector argument %In must have <Rows> * <Cols> elements. %Stride must be >= <Rows>.
+Argument <IsVolatile> is a boolean value. Arguments <Rows> and <Cols> must be
+positive, constant integers. The vector argument %In must have <Rows> * <Cols>
+elements. %Stride is a positive, constant integer, and %Stride >= <Rows>.
 
 The :ref:`align <attr_align>` parameter attribute can be provided
 for the %Ptr arguments.
Index: llvm/lib/IR/Verifier.cpp
===================================================================
--- llvm/lib/IR/Verifier.cpp
+++ llvm/lib/IR/Verifier.cpp
@@ -5008,32 +5008,62 @@
   case Intrinsic::matrix_column_major_store: {
     ConstantInt *NumRows;
     ConstantInt *NumColumns;
-    VectorType *TypeToCheck;
+    VectorType *ResultTy;
+    Type *Op0ElemTy = nullptr;
+    Type *Op1ElemTy = nullptr;
     switch (ID) {
     case Intrinsic::matrix_multiply:
       NumRows = cast<ConstantInt>(Call.getArgOperand(2));
       NumColumns = cast<ConstantInt>(Call.getArgOperand(4));
-      TypeToCheck = cast<VectorType>(Call.getType());
+      ResultTy = cast<VectorType>(Call.getType());
+      Op0ElemTy =
+          cast<VectorType>(Call.getArgOperand(0)->getType())->getElementType();
+      Op1ElemTy =
+          cast<VectorType>(Call.getArgOperand(1)->getType())->getElementType();
       break;
     case Intrinsic::matrix_transpose:
       NumRows = cast<ConstantInt>(Call.getArgOperand(1));
       NumColumns = cast<ConstantInt>(Call.getArgOperand(2));
-      TypeToCheck = cast<VectorType>(Call.getType());
+      ResultTy = cast<VectorType>(Call.getType());
+      Op0ElemTy =
+          cast<VectorType>(Call.getArgOperand(0)->getType())->getElementType();
       break;
-    case Intrinsic::matrix_column_major_load:
+    case Intrinsic::matrix_column_major_load: {
       NumRows = cast<ConstantInt>(Call.getArgOperand(3));
       NumColumns = cast<ConstantInt>(Call.getArgOperand(4));
-      TypeToCheck = cast<VectorType>(Call.getType());
+      ResultTy = cast<VectorType>(Call.getType());
+      auto *VecTy = cast<VectorType>(
+          cast<PointerType>(Call.getArgOperand(0)->getType())->getElementType());
+      Op0ElemTy = VecTy->getElementType();
+      }
       break;
-    case Intrinsic::matrix_column_major_store:
+    case Intrinsic::matrix_column_major_store: {
       NumRows = cast<ConstantInt>(Call.getArgOperand(4));
       NumColumns = cast<ConstantInt>(Call.getArgOperand(5));
-      TypeToCheck = cast<VectorType>(Call.getArgOperand(0)->getType());
+      ResultTy = cast<VectorType>(Call.getArgOperand(0)->getType());
+      Op0ElemTy =
+          cast<VectorType>(Call.getArgOperand(0)->getType())->getElementType();
+      auto *VecTy = cast<VectorType>(
+          cast<PointerType>(Call.getArgOperand(1)->getType())->getElementType());
+      Op1ElemTy = VecTy->getElementType();
+      }
       break;
     default:
       llvm_unreachable("unexpected intrinsic");
     }
-    Assert(TypeToCheck->getNumElements() ==
+
+    Function *IF = Call.getCalledFunction();
+    Assert(ResultTy->getElementType()->isIntegerTy() ||
+           ResultTy->getElementType()->isFloatingPointTy(),
+           "Result type must be an integer or floating-point type!", IF);
+    Assert(ResultTy->getElementType() == Op0ElemTy,
+           "Vector element type mismatch of the result and first operand "
+           "vector!", IF);
+    if (Op1ElemTy) {
+      Assert(ResultTy->getElementType() == Op1ElemTy,
+             "Type mismatch of the result and second operand vector!", IF);
+    }
+    Assert(ResultTy->getNumElements() ==
                NumRows->getZExtValue() * NumColumns->getZExtValue(),
            "result of a matrix operation does not fit in the returned vector");
     break;
Index: llvm/test/Verifier/matrix-intrinsics.ll
===================================================================
--- llvm/test/Verifier/matrix-intrinsics.ll
+++ llvm/test/Verifier/matrix-intrinsics.ll
@@ -64,3 +64,82 @@
   call void @llvm.matrix.column.major.store.v6f32.p0v6f32(<6 x float> zeroinitializer, <6 x float>* %n, i64 %arg, i1 false, i32 3, i32 3)
   ret void
 }
+
+declare <4 x float> @llvm.matrix.transpose.v4f32.v4i32(<4 x i32>, i32, i32)
+declare <4 x i32> @llvm.matrix.transpose.v4i32.v4f32(<4 x float>, i32, i32)
+
+define <4 x float> @transpose_mixed_types(<4 x float> %fvec, <4 x i32> %ivec, i32 %arg) {
+;
+; CHECK-NEXT: Intrinsic has incorrect argument type!
+; CHECK-NEXT: <4 x float> (<4 x i32>, i32, i32)* @llvm.matrix.transpose.v4f32.v4i32
+; CHECK-NEXT: Intrinsic has incorrect argument type!
+; CHECK-NEXT: <4 x i32> (<4 x float>, i32, i32)* @llvm.matrix.transpose.v4i32.v4f32
+;
+  %result.0 = call <4 x float> @llvm.matrix.transpose.v4f32.v4i32(<4 x i32> %ivec, i32 0, i32 0)
+  %result.1 = call <4 x i32> @llvm.matrix.transpose.v4i32.v4f32(<4 x float> %result.0, i32 3, i32 2)
+  ret <4 x float> %result.0
+}
+
+declare <4 x i32>   @llvm.matrix.multiply.v4i32.v4f32.v4f32(<4 x float>, <4 x float>, i32, i32, i32)
+declare <4 x float> @llvm.matrix.multiply.v4f32.v4i32.v4f32(<4 x i32>, <4 x float>, i32, i32, i32)
+declare <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4i32(<4 x float>, <4 x i32>, i32, i32, i32)
+declare <4 x float> @llvm.matrix.multiply.v4f32.v4i32.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32)
+
+define <4 x float> @multiply_mixed_types(<4 x i32> %ivec, <4 x float> %fvec, i32 %arg) {
+;
+; CHECK-NEXT: Vector element type mismatch of the result and first operand vector!
+; CHECK-NEXT: <4 x i32> (<4 x float>, <4 x float>, i32, i32, i32)* @llvm.matrix.multiply.v4i32.v4f32.v4f32
+; CHECK-NEXT: Vector element type mismatch of the result and first operand vector!
+; CHECK-NEXT: <4 x float> (<4 x i32>, <4 x float>, i32, i32, i32)* @llvm.matrix.multiply.v4f32.v4i32.v4f32
+; CHECK-NEXT: Type mismatch of the result and second operand vector!
+; CHECK-NEXT: <4 x float> (<4 x float>, <4 x i32>, i32, i32, i32)* @llvm.matrix.multiply.v4f32.v4f32.v4i32
+; CHECK-NEXT: Vector element type mismatch of the result and first operand vector!
+; CHECK-NEXT: <4 x float> (<4 x i32>, <4 x i32>, i32, i32, i32)* @llvm.matrix.multiply.v4f32.v4i32.v4i32
+;
+  %result.0 = call <4 x i32> @llvm.matrix.multiply.v4i32.v4f32.v4f32(<4 x float> %fvec, <4 x float> %fvec, i32 2, i32 2, i32 2)
+  %result.1 = call <4 x float> @llvm.matrix.multiply.v4f32.v4i32.v4f32(<4 x i32> %result.0, <4 x float> %fvec, i32 2, i32 2, i32 2)
+  %result.2 = call <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4i32(<4 x float> %fvec, <4 x i32> %ivec, i32 2, i32 2, i32 2)
+  %result.3 = call <4 x float> @llvm.matrix.multiply.v4f32.v4i32.v4i32(<4 x i32> %ivec, <4 x i32> %ivec, i32 2, i32 2, i32 2)
+  ret <4 x float> %result.3
+}
+
+declare <4 x float> @llvm.matrix.column.major.load.v4f32.p0v4i32(<4 x i32>*, i64, i1, i32, i32)
+declare <4 x i32> @llvm.matrix.column.major.load.v4i32.p0v4f32(<4 x float>*, i64, i1, i32, i32)
+
+define <4 x float> @column.major_load_mixed_types(<4 x i32>* %m, <4 x float>* %n, i32 %arg) {
+;
+; CHECK-NEXT: Vector element type mismatch of the result and first operand vector!
+; CHECK-NEXT: <4 x float> (<4 x i32>*, i64, i1, i32, i32)* @llvm.matrix.column.major.load.v4f32.p0v4i32
+; CHECK-NEXT: Vector element type mismatch of the result and first operand vector!
+; CHECK-NEXT: <4 x i32> (<4 x float>*, i64, i1, i32, i32)* @llvm.matrix.column.major.load.v4i32.p0v4f32
+;
+  %result.0 = call <4 x float> @llvm.matrix.column.major.load.v4f32.p0v4i32(<4 x i32>* %m, i64 2, i1 false, i32 2, i32 2)
+  %result.1 = call <4 x i32> @llvm.matrix.column.major.load.v4i32.p0v4f32(<4 x float>* %n, i64 2, i1 false, i32 2, i32 2)
+  ret <4 x float> %result.0
+}
+
+declare void @llvm.matrix.column.major.store.v4i32.p0v4f32(<4 x i32>, <4 x float>*, i64, i1, i32, i32)
+declare void @llvm.matrix.column.major.store.v4f32.p0v4i32(<4 x float>, <4 x i32>*, i64, i1, i32, i32)
+
+define void @column.major_store_mixed_types(<4 x float>* %m, <4 x i32>* %n, i64 %arg) {
+;
+; CHECK-NEXT: Type mismatch of the result and second operand vector!
+; CHECK-NEXT: void (<4 x i32>, <4 x float>*, i64, i1, i32, i32)* @llvm.matrix.column.major.store.v4i32.p0v4f32
+; CHECK-NEXT: Type mismatch of the result and second operand vector!
+; CHECK-NEXT: void (<4 x float>, <4 x i32>*, i64, i1, i32, i32)* @llvm.matrix.column.major.store.v4f32.p0v4i32
+;
+  call void @llvm.matrix.column.major.store.v4i32.p0v4f32(<4 x i32> zeroinitializer, <4 x float>* %m, i64 2, i1 false, i32 2, i32 2)
+  call void @llvm.matrix.column.major.store.v4f32.p0v4i32(<4 x float> zeroinitializer, <4 x i32>* %n, i64 2, i1 false, i32 2, i32 2)
+  ret void
+}
+
+declare void @llvm.matrix.column.major.store.v4f32p0.p0v4f32(<4 x float*>, <4 x float>*, i64, i1, i32, i32)
+
+define void @column.major_store_non_int_float_type(<4 x float>* %m, <4 x float>* %n, i64 %arg) {
+;
+; CHECK-NEXT: Result type must be an integer or floating-point type!
+; CHECK-NEXT: void (<4 x float*>, <4 x float>*, i64, i1, i32, i32)* @llvm.matrix.column.major.store.v4p0f32.p0v4f32
+;
+  call void @llvm.matrix.column.major.store.v4f32p0.p0v4f32(<4 x float*> zeroinitializer, <4 x float>* %n, i64 2, i1 false, i32 2, i32 2)
+  ret void
+}