diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -15466,7 +15466,7 @@
 * elements.

-'``llvm.matrix.columnwise.load.*``' Intrinsic
+'``llvm.matrix.column.major.load.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 Syntax:
@@ -15474,25 +15474,34 @@

 ::

-      declare vectorty @llvm.matrix.columnwise.load.*(ptrty %Ptr, i32 %Stride, i32 <Rows>, i32 <Cols>)
+      declare vectorty @llvm.matrix.column.major.load.*(
+          ptrty %Ptr, i64 %Stride, i1 <IsVolatile>, i32 <Rows>, i32 <Cols>)

 Overview:
 """""""""

-The '``llvm.matrix.columnwise.load.*``' intrinsic loads a matrix with <Rows>
+The '``llvm.matrix.column.major.load.*``' intrinsic loads a matrix with <Rows>
 rows and <Cols> columns, using a stride of %Stride between columns. For two
 consecutive columns A and B, %Stride refers to the distance (the number of
 elements) between the start of column A and the start of column B. The result
 matrix is returned embedded in the result vector. This allows for convenient
-loading of sub matrixes.
+loading of sub matrixes. If <IsVolatile> is true, the intrinsic is considered
+a :ref:`volatile memory access <volatile>`.
+
+If the %Ptr argument is known to be aligned to some boundary, this can be
+specified as an attribute on the argument.

 Arguments:
 """"""""""

-The <Rows> and <Cols> arguments must be constant integers. The returned vector
-must have <Rows> * <Cols> elements. %Stride must be >= <Rows>.
+The <IsVolatile>, <Rows> and <Cols> arguments must be constant integers. The
+returned vector must have <Rows> * <Cols> elements. %Stride must be >= <Rows>.
+
+The :ref:`align <attr_align>` parameter attribute can be provided
+for the %Ptr arguments.

-'``llvm.matrix.columnwise.store.*``' Intrinsic
+
+'``llvm.matrix.column.major.store.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 Syntax:
@@ -15500,22 +15509,31 @@

 ::

-      declare void @llvm.matrix.columnwise.store.*(vectorty %In, ptrty %Ptr, i32 %Stride, i32 <Rows>, i32 <Cols>)
+      declare void @llvm.matrix.column.major.store.*(
+          vectorty %In, ptrty %Ptr, i64 %Stride, i1 <IsVolatile>, i32 <Rows>, i32 <Cols>)

 Overview:
 """""""""

-The '``llvm.matrix.columnwise.store.*``' intrinsic stores the matrix with
+The '``llvm.matrix.column.major.store.*``' intrinsic stores the matrix with
 <Rows> rows and <Cols> columns embedded in %In, using a stride of %Stride
 between columns. For two consecutive columns A and B, %Stride refers to the
 distance (the number of elements) between the start of column A and the start
-of column B.
+of column B. If <IsVolatile> is true, the intrinsic is considered a
+:ref:`volatile memory access <volatile>`.
+
+If the %Ptr argument is known to be aligned to some boundary, this can be
+specified as an attribute on the argument.

 Arguments:
 """"""""""

-The <Rows> and <Cols> arguments must be constant integers. The vector argument
-%In must have <Rows> * <Cols> elements. %Stride must be >= <Rows>.
+The <IsVolatile>, <Rows>, <Cols> arguments must be constant integers. The
+vector argument %In must have <Rows> * <Cols> elements. %Stride must be >= <Rows>.
+
+The :ref:`align <attr_align>` parameter attribute can be provided
+for the %Ptr arguments.
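(Reading aid, not part of the diff: a minimal sketch of calls to the updated
intrinsics under the signatures documented above. The ``.v9f64.p0f64`` overload
suffix, the function name ``@copy_3x3``, and the ``align 8`` call-site
attribute are illustrative assumptions; a 3 x 3 double sub-matrix is loaded
from %Src with a stride of 5 elements between columns and stored back to %Dst,
with ``i1 false`` marking both accesses as non-volatile.)::

      define void @copy_3x3(double* %Src, double* %Dst) {
        ; i64 stride, i1 volatile flag, then the immediate <Rows> and <Cols>.
        %m = call <9 x double> @llvm.matrix.column.major.load.v9f64.p0f64(double* align 8 %Src, i64 5, i1 false, i32 3, i32 3)
        call void @llvm.matrix.column.major.store.v9f64.p0f64(<9 x double> %m, double* align 8 %Dst, i64 5, i1 false, i32 3, i32 3)
        ret void
      }

      declare <9 x double> @llvm.matrix.column.major.load.v9f64.p0f64(double*, i64, i1 immarg, i32 immarg, i32 immarg)
      declare void @llvm.matrix.column.major.store.v9f64.p0f64(<9 x double>, double*, i64, i1 immarg, i32 immarg, i32 immarg)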
+
 Half Precision Floating-Point Intrinsics
 ----------------------------------------
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1449,21 +1449,21 @@
               [IntrNoSync, IntrWillReturn, IntrNoMem, IntrSpeculatable,
                ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;

-def int_matrix_columnwise_load
+def int_matrix_column_major_load
   : Intrinsic<[llvm_anyvector_ty],
-              [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty, llvm_i32_ty,
-               llvm_i32_ty],
+              [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i64_ty, llvm_i1_ty,
+               llvm_i32_ty, llvm_i32_ty],
               [IntrNoSync, IntrWillReturn, IntrArgMemOnly, IntrReadMem,
-               NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<2>>,
-               ImmArg<ArgIndex<3>>]>;
+               NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>,
+               ImmArg<ArgIndex<4>>]>;

-def int_matrix_columnwise_store
+def int_matrix_column_major_store
   : Intrinsic<[],
               [llvm_anyvector_ty, LLVMAnyPointerType<LLVMMatchType<0>>,
-               llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+               llvm_i64_ty, llvm_i1_ty, llvm_i32_ty, llvm_i32_ty],
               [IntrNoSync, IntrWillReturn, IntrArgMemOnly, IntrWriteMem,
                WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
-               ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
+               ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;

 //===---------- Intrinsics to control hardware supported loops ----------===//
diff --git a/llvm/include/llvm/IR/MatrixBuilder.h b/llvm/include/llvm/IR/MatrixBuilder.h
--- a/llvm/include/llvm/IR/MatrixBuilder.h
+++ b/llvm/include/llvm/IR/MatrixBuilder.h
@@ -22,6 +22,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
+#include "llvm/Support/Alignment.h"

 namespace llvm {

@@ -51,14 +52,14 @@
 public:
   MatrixBuilder(IRBuilderTy &Builder) : B(Builder) {}

-  /// Create a columnwise, strided matrix load.
+  /// Create a column major, strided matrix load.
   /// \p DataPtr - Start address of the matrix read
   /// \p Rows    - Number of rows in matrix (must be a constant)
   /// \p Columns - Number of columns in matrix (must be a constant)
   /// \p Stride  - Space between columns
-  CallInst *CreateMatrixColumnwiseLoad(Value *DataPtr, unsigned Rows,
-                                       unsigned Columns, Value *Stride,
-                                       const Twine &Name = "") {
+  CallInst *CreateColumnMajorLoad(Value *DataPtr, Align Alignment,
+                                  Value *Stride, bool IsVolatile, unsigned Rows,
+                                  unsigned Columns, const Twine &Name = "") {

     // Deal with the pointer
     PointerType *PtrTy = cast<PointerType>(DataPtr->getType());
@@ -66,30 +67,41 @@

     auto *RetType = FixedVectorType::get(EltTy, Rows * Columns);

-    Value *Ops[] = {DataPtr, Stride, B.getInt32(Rows), B.getInt32(Columns)};
+    Value *Ops[] = {DataPtr, Stride, B.getInt1(IsVolatile), B.getInt32(Rows),
+                    B.getInt32(Columns)};
     Type *OverloadedTypes[] = {RetType, PtrTy};

     Function *TheFn = Intrinsic::getDeclaration(
-        getModule(), Intrinsic::matrix_columnwise_load, OverloadedTypes);
+        getModule(), Intrinsic::matrix_column_major_load, OverloadedTypes);

-    return B.CreateCall(TheFn->getFunctionType(), TheFn, Ops, Name);
+    CallInst *Call = B.CreateCall(TheFn->getFunctionType(), TheFn, Ops, Name);
+    Attribute AlignAttr =
+        Attribute::getWithAlignment(Call->getContext(), Alignment);
+    Call->addAttribute(1, AlignAttr);
+    return Call;
   }

-  /// Create a columnwise, strided matrix store.
+  /// Create a column major, strided matrix store.
   /// \p Matrix  - Matrix to store
   /// \p Ptr     - Pointer to write back to
   /// \p Stride  - Space between columns
-  CallInst *CreateMatrixColumnwiseStore(Value *Matrix, Value *Ptr,
-                                        Value *Stride, unsigned Rows,
-                                        unsigned Columns,
-                                        const Twine &Name = "") {
-    Value *Ops[] = {Matrix, Ptr, Stride, B.getInt32(Rows), B.getInt32(Columns)};
+  CallInst *CreateColumnMajorStore(Value *Matrix, Value *Ptr, Align Alignment,
+                                   Value *Stride, bool IsVolatile,
+                                   unsigned Rows, unsigned Columns,
+                                   const Twine &Name = "") {
+    Value *Ops[] = {Matrix,           Ptr,
+                    Stride,           B.getInt1(IsVolatile),
+                    B.getInt32(Rows), B.getInt32(Columns)};
     Type *OverloadedTypes[] = {Matrix->getType(), Ptr->getType()};

     Function *TheFn = Intrinsic::getDeclaration(
-        getModule(), Intrinsic::matrix_columnwise_store, OverloadedTypes);
+        getModule(), Intrinsic::matrix_column_major_store, OverloadedTypes);

-    return B.CreateCall(TheFn->getFunctionType(), TheFn, Ops, Name);
+    CallInst *Call = B.CreateCall(TheFn->getFunctionType(), TheFn, Ops, Name);
+    Attribute AlignAttr =
+        Attribute::getWithAlignment(Call->getContext(), Alignment);
+    Call->addAttribute(2, AlignAttr);
+    return Call;
   }

   /// Create a llvm.matrix.transpose call, transposing \p Matrix with \p Rows
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -4992,8 +4992,8 @@
   }
   case Intrinsic::matrix_multiply:
   case Intrinsic::matrix_transpose:
-  case Intrinsic::matrix_columnwise_load:
-  case Intrinsic::matrix_columnwise_store: {
+  case Intrinsic::matrix_column_major_load:
+  case Intrinsic::matrix_column_major_store: {
     ConstantInt *NumRows;
     ConstantInt *NumColumns;
     VectorType *TypeToCheck;
@@ -5008,14 +5008,14 @@
       NumColumns = cast<ConstantInt>(Call.getArgOperand(2));
       TypeToCheck = cast<VectorType>(Call.getType());
       break;
-    case Intrinsic::matrix_columnwise_load:
-      NumRows = cast<ConstantInt>(Call.getArgOperand(2));
-      NumColumns = cast<ConstantInt>(Call.getArgOperand(3));
-      TypeToCheck = cast<VectorType>(Call.getType());
-      break;
-    case Intrinsic::matrix_columnwise_store:
+    case Intrinsic::matrix_column_major_load:
       NumRows = cast<ConstantInt>(Call.getArgOperand(3));
       NumColumns = cast<ConstantInt>(Call.getArgOperand(4));
+      TypeToCheck = cast<VectorType>(Call.getType());
+      break;
+    case Intrinsic::matrix_column_major_store:
+      NumRows = cast<ConstantInt>(Call.getArgOperand(4));
+      NumColumns = cast<ConstantInt>(Call.getArgOperand(5));
       TypeToCheck = cast<VectorType>(Call.getArgOperand(0)->getType());
       break;
     default:
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -164,10 +164,10 @@
 ///       definition of an argument, use the produced column vectors directly.
 ///       If not, split the operand vector containing an embedded matrix into
 ///       a set of column vectors,
-/// 2.2. Lower the instruction in terms of columnwise operations, which yields
-///      a set of column vectors containing result matrix. Note that we lower
-///      all instructions that have shape information. Besides the intrinsics,
-///      this includes stores for example.
+/// 2.2. Lower the instruction in terms of column major operations, which
+///      yields a set of column vectors containing result matrix. Note that we
+///      lower all instructions that have shape information. Besides the
+///      intrinsics, this includes stores for example.
 /// 2.3. Update uses of the lowered instruction. If we have shape information
 ///      for a user, there is nothing to do, as we will look up the result
 ///      column matrix when lowering the user. For other uses, we embed the
@@ -376,7 +376,7 @@
   /// Maps instructions to their shape information. The shape information
   /// describes the shape to be used while lowering. This matches the shape of
   /// the result value of the instruction, with the only exceptions being store
-  /// instructions and the matrix_columnwise_store intrinsics. For those, the
+  /// instructions and the matrix_column_major_store intrinsics. For those, the
   /// shape information indicates that those instructions should be lowered
   /// using shape information as well.
   DenseMap<Value *, ShapeInfo> ShapeMap;
@@ -502,8 +502,8 @@
     switch (II->getIntrinsicID()) {
     case Intrinsic::matrix_multiply:
     case Intrinsic::matrix_transpose:
-    case Intrinsic::matrix_columnwise_load:
-    case Intrinsic::matrix_columnwise_store:
+    case Intrinsic::matrix_column_major_load:
+    case Intrinsic::matrix_column_major_store:
       return true;
     default:
       return false;
@@ -542,13 +542,13 @@
                          m_Value(MatrixA), m_Value(M), m_Value(N)))) {
       // Flip dimensions.
       Propagate = setShapeInfo(Inst, {N, M});
-    } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_columnwise_store>(
+    } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_column_major_store>(
                          m_Value(MatrixA), m_Value(), m_Value(),
-                         m_Value(M), m_Value(N)))) {
+                         m_Value(), m_Value(M), m_Value(N)))) {
       Propagate = setShapeInfo(Inst, {N, M});
-    } else if (match(Inst,
-                     m_Intrinsic<Intrinsic::matrix_columnwise_load>(
-                         m_Value(), m_Value(), m_Value(M), m_Value(N)))) {
+    } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_column_major_load>(
+                         m_Value(), m_Value(), m_Value(), m_Value(M),
+                         m_Value(N)))) {
       Propagate = setShapeInfo(Inst, {M, N});
     } else if (match(Inst, m_Store(m_Value(MatrixA), m_Value()))) {
       auto OpShape = ShapeMap.find(MatrixA);
@@ -620,14 +620,14 @@
       // Flip dimensions.
       if (setShapeInfo(MatrixA, {M, N}))
         pushInstruction(MatrixA, WorkList);
-    } else if (match(V, m_Intrinsic<Intrinsic::matrix_columnwise_store>(
-                        m_Value(MatrixA), m_Value(), m_Value(),
+    } else if (match(V, m_Intrinsic<Intrinsic::matrix_column_major_store>(
+                        m_Value(MatrixA), m_Value(), m_Value(), m_Value(),
                         m_Value(M), m_Value(N)))) {
       if (setShapeInfo(MatrixA, {M, N})) {
         pushInstruction(MatrixA, WorkList);
       }
     } else if (isa<LoadInst>(V) ||
-               match(V, m_Intrinsic<Intrinsic::matrix_columnwise_load>())) {
+               match(V, m_Intrinsic<Intrinsic::matrix_column_major_load>())) {
       // Nothing to do, no matrix input.
     } else if (isa<StoreInst>(V)) {
       // Nothing to do.
We forward-propagated to this so we would just @@ -666,8 +666,8 @@ switch (II->getIntrinsicID()) { case Intrinsic::matrix_multiply: case Intrinsic::matrix_transpose: - case Intrinsic::matrix_columnwise_load: - case Intrinsic::matrix_columnwise_store: + case Intrinsic::matrix_column_major_load: + case Intrinsic::matrix_column_major_store: WorkList.push_back(&Inst); break; default: @@ -763,11 +763,11 @@ case Intrinsic::matrix_transpose: LowerTranspose(Inst); break; - case Intrinsic::matrix_columnwise_load: - LowerColumnwiseLoad(Inst); + case Intrinsic::matrix_column_major_load: + LowerColumnMajorLoad(Inst); break; - case Intrinsic::matrix_columnwise_store: - LowerColumnwiseStore(Inst); + case Intrinsic::matrix_column_major_store: + LowerColumnMajorStore(Inst); break; default: return false; @@ -783,7 +783,7 @@ Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder); MatrixTy Result; for (unsigned I = 0, E = Shape.getNumVectors(); I < E; ++I) { - Value *GEP = computeVectorAddr(EltPtr, Builder.getInt32(I), Stride, + Value *GEP = computeVectorAddr(EltPtr, Builder.getInt64(I), Stride, Shape.getStride(), VType->getElementType(), Builder); Value *Vector = createVectorLoad(GEP, VType->getElementType(), Builder); @@ -800,7 +800,7 @@ IRBuilder<> &Builder) { Value *Offset = Builder.CreateAdd( - Builder.CreateMul(J, Builder.getInt32(MatrixShape.getStride())), I); + Builder.CreateMul(J, Builder.getInt64(MatrixShape.getStride())), I); unsigned AS = cast(MatrixPtr->getType())->getAddressSpace(); Value *EltPtr = @@ -813,7 +813,7 @@ Builder.CreatePointerCast(TileStart, TilePtrTy, "col.cast"); return loadMatrix(TileTy, TilePtr, - Builder.getInt32(MatrixShape.getStride()), ResultShape, + Builder.getInt64(MatrixShape.getStride()), ResultShape, Builder); } @@ -826,16 +826,16 @@ Builder); } - /// Lowers llvm.matrix.columnwise.load. + /// Lowers llvm.matrix.column.major.load. /// /// The intrinsic loads a matrix from memory using a stride between columns. 
- void LowerColumnwiseLoad(CallInst *Inst) { + void LowerColumnMajorLoad(CallInst *Inst) { assert(MatrixLayout == MatrixLayoutTy::ColumnMajor && "Intrinsic only supports column-major layout!"); Value *Ptr = Inst->getArgOperand(0); Value *Stride = Inst->getArgOperand(1); LowerLoad(Inst, Ptr, Stride, - {Inst->getArgOperand(2), Inst->getArgOperand(3)}); + {Inst->getArgOperand(3), Inst->getArgOperand(4)}); } /// Stores a sub-matrix \p StoreVal into the \p R x \p C matrix starting at \p @@ -844,7 +844,7 @@ ShapeInfo MatrixShape, Value *I, Value *J, Type *EltTy, IRBuilder<> &Builder) { Value *Offset = Builder.CreateAdd( - Builder.CreateMul(J, Builder.getInt32(MatrixShape.getStride())), I); + Builder.CreateMul(J, Builder.getInt64(MatrixShape.getStride())), I); unsigned AS = cast(MatrixPtr->getType())->getAddressSpace(); Value *EltPtr = @@ -857,7 +857,7 @@ Builder.CreatePointerCast(TileStart, TilePtrTy, "col.cast"); storeMatrix(TileTy, StoreVal, TilePtr, - Builder.getInt32(MatrixShape.getStride()), Builder); + Builder.getInt64(MatrixShape.getStride()), Builder); } /// Store matrix \p StoreVal starting at \p Ptr and using \p Stride between @@ -867,7 +867,7 @@ auto VType = cast(Ty); Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder); for (auto Vec : enumerate(StoreVal.vectors())) { - Value *GEP = computeVectorAddr(EltPtr, Builder.getInt32(Vec.index()), + Value *GEP = computeVectorAddr(EltPtr, Builder.getInt64(Vec.index()), Stride, StoreVal.getStride(), VType->getElementType(), Builder); createVectorStore(Vec.value(), GEP, VType->getElementType(), Builder); @@ -886,17 +886,17 @@ Builder); } - /// Lowers llvm.matrix.columnwise.store. + /// Lowers llvm.matrix.column.major.store. /// /// The intrinsic store a matrix back memory using a stride between columns. 
- void LowerColumnwiseStore(CallInst *Inst) { + void LowerColumnMajorStore(CallInst *Inst) { assert(MatrixLayout == MatrixLayoutTy::ColumnMajor && "Intrinsic only supports column-major layout!"); Value *Matrix = Inst->getArgOperand(0); Value *Ptr = Inst->getArgOperand(1); Value *Stride = Inst->getArgOperand(2); LowerStore(Inst, Matrix, Ptr, Stride, - {Inst->getArgOperand(3), Inst->getArgOperand(4)}); + {Inst->getArgOperand(4), Inst->getArgOperand(5)}); } // Set elements I..I+NumElts-1 to Block @@ -1208,14 +1208,14 @@ for (unsigned K = 0; K < M; K += TileSize) { const unsigned TileM = std::min(M - K, unsigned(TileSize)); MatrixTy A = - loadMatrix(APtr, LShape, Builder.getInt32(I), Builder.getInt32(K), + loadMatrix(APtr, LShape, Builder.getInt64(I), Builder.getInt64(K), {TileR, TileM}, EltType, Builder); MatrixTy B = - loadMatrix(BPtr, RShape, Builder.getInt32(K), Builder.getInt32(J), + loadMatrix(BPtr, RShape, Builder.getInt64(K), Builder.getInt64(J), {TileM, TileC}, EltType, Builder); emitMatrixMultiply(Res, A, B, AllowContract, Builder, true); } - storeMatrix(Res, CPtr, {R, M}, Builder.getInt32(I), Builder.getInt32(J), + storeMatrix(Res, CPtr, {R, M}, Builder.getInt64(I), Builder.getInt64(J), EltType, Builder); } @@ -1329,7 +1329,7 @@ if (I == ShapeMap.end()) return false; - LowerLoad(Inst, Ptr, Builder.getInt32(I->second.getStride()), I->second); + LowerLoad(Inst, Ptr, Builder.getInt64(I->second.getStride()), I->second); return true; } @@ -1339,7 +1339,7 @@ if (I == ShapeMap.end()) return false; - LowerStore(Inst, StoredVal, Ptr, Builder.getInt32(I->second.getStride()), + LowerStore(Inst, StoredVal, Ptr, Builder.getInt64(I->second.getStride()), I->second); return true; } @@ -1507,11 +1507,11 @@ prettyPrintMatrixType(II->getOperand(0), SS); SS << "." << *II->getType()->getScalarType(); break; - case Intrinsic::matrix_columnwise_load: + case Intrinsic::matrix_column_major_load: prettyPrintMatrixType(II, SS); SS << "." << *II->getType()->getScalarType(); break; - case Intrinsic::matrix_columnwise_store: + case Intrinsic::matrix_column_major_store: prettyPrintMatrixType(II->getOperand(0), SS); SS << "." 
<< *II->getOperand(0)->getType()->getScalarType(); break; @@ -1529,9 +1529,10 @@ case Intrinsic::matrix_multiply: return 3; case Intrinsic::matrix_transpose: - case Intrinsic::matrix_columnwise_load: - case Intrinsic::matrix_columnwise_store: return 2; + case Intrinsic::matrix_column_major_load: + case Intrinsic::matrix_column_major_store: + return 3; default: return 0; } @@ -1626,7 +1627,7 @@ write(std::string("(")); unsigned NumOpsToBreak = 1; - if (match(Expr, m_Intrinsic())) + if (match(Expr, m_Intrinsic())) NumOpsToBreak = 2; for (Value *Op : Ops) { diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/bigger-expressions-double.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/bigger-expressions-double.ll --- a/llvm/test/Transforms/LowerMatrixIntrinsics/bigger-expressions-double.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/bigger-expressions-double.ll @@ -11,19 +11,19 @@ ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <9 x double>* [[A_PTR:%.*]] to double* ; CHECK-NEXT: [[COL_CAST:%.*]] = bitcast double* [[TMP0]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST]], align 8 -; CHECK-NEXT: [[COL_GEP:%.*]] = getelementptr double, double* [[TMP0]], i32 3 +; CHECK-NEXT: [[COL_GEP:%.*]] = getelementptr double, double* [[TMP0]], i64 3 ; CHECK-NEXT: [[COL_CAST1:%.*]] = bitcast double* [[COL_GEP]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST1]], align 8 -; CHECK-NEXT: [[COL_GEP3:%.*]] = getelementptr double, double* [[TMP0]], i32 6 +; CHECK-NEXT: [[COL_GEP3:%.*]] = getelementptr double, double* [[TMP0]], i64 6 ; CHECK-NEXT: [[COL_CAST4:%.*]] = bitcast double* [[COL_GEP3]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD5:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST4]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <9 x double>* [[B_PTR:%.*]] to double* ; CHECK-NEXT: [[COL_CAST6:%.*]] = bitcast double* [[TMP1]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD7:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST6]], align 8 -; CHECK-NEXT: [[COL_GEP8:%.*]] = getelementptr double, double* [[TMP1]], i32 3 +; CHECK-NEXT: [[COL_GEP8:%.*]] = getelementptr double, double* [[TMP1]], i64 3 ; CHECK-NEXT: [[COL_CAST9:%.*]] = bitcast double* [[COL_GEP8]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD10:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST9]], align 8 -; CHECK-NEXT: [[COL_GEP11:%.*]] = getelementptr double, double* [[TMP1]], i32 6 +; CHECK-NEXT: [[COL_GEP11:%.*]] = getelementptr double, double* [[TMP1]], i64 6 ; CHECK-NEXT: [[COL_CAST12:%.*]] = bitcast double* [[COL_GEP11]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD13:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST12]], align 8 @@ -227,10 +227,10 @@ ; CHECK-NEXT: [[TMP108:%.*]] = bitcast <9 x double>* [[C_PTR:%.*]] to double* ; CHECK-NEXT: [[TMP109:%.*]] = bitcast double* [[TMP108]] to <3 x double>* ; CHECK-NEXT: store <3 x double> [[TMP47]], <3 x double>* [[TMP109]], align 8 -; CHECK-NEXT: [[TMP110:%.*]] = getelementptr double, double* [[TMP108]], i32 3 +; CHECK-NEXT: [[TMP110:%.*]] = getelementptr double, double* [[TMP108]], i64 3 ; CHECK-NEXT: [[TMP111:%.*]] = bitcast double* [[TMP110]] to <3 x double>* ; CHECK-NEXT: store <3 x double> [[TMP77]], <3 x double>* [[TMP111]], align 8 -; CHECK-NEXT: [[TMP112:%.*]] = getelementptr double, double* [[TMP108]], i32 6 +; CHECK-NEXT: [[TMP112:%.*]] = getelementptr double, double* [[TMP108]], i64 6 ; CHECK-NEXT: [[TMP113:%.*]] = bitcast double* [[TMP112]] to <3 x double>* ; CHECK-NEXT: store <3 x double> [[TMP107]], <3 x 
double>* [[TMP113]], align 8 ; CHECK-NEXT: ret void @@ -255,19 +255,19 @@ ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <9 x double>* [[A_PTR:%.*]] to double* ; CHECK-NEXT: [[COL_CAST:%.*]] = bitcast double* [[TMP0]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST]], align 8 -; CHECK-NEXT: [[COL_GEP:%.*]] = getelementptr double, double* [[TMP0]], i32 3 +; CHECK-NEXT: [[COL_GEP:%.*]] = getelementptr double, double* [[TMP0]], i64 3 ; CHECK-NEXT: [[COL_CAST1:%.*]] = bitcast double* [[COL_GEP]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST1]], align 8 -; CHECK-NEXT: [[COL_GEP3:%.*]] = getelementptr double, double* [[TMP0]], i32 6 +; CHECK-NEXT: [[COL_GEP3:%.*]] = getelementptr double, double* [[TMP0]], i64 6 ; CHECK-NEXT: [[COL_CAST4:%.*]] = bitcast double* [[COL_GEP3]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD5:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST4]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <9 x double>* [[B_PTR:%.*]] to double* ; CHECK-NEXT: [[COL_CAST6:%.*]] = bitcast double* [[TMP1]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD7:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST6]], align 8 -; CHECK-NEXT: [[COL_GEP8:%.*]] = getelementptr double, double* [[TMP1]], i32 3 +; CHECK-NEXT: [[COL_GEP8:%.*]] = getelementptr double, double* [[TMP1]], i64 3 ; CHECK-NEXT: [[COL_CAST9:%.*]] = bitcast double* [[COL_GEP8]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD10:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST9]], align 8 -; CHECK-NEXT: [[COL_GEP11:%.*]] = getelementptr double, double* [[TMP1]], i32 6 +; CHECK-NEXT: [[COL_GEP11:%.*]] = getelementptr double, double* [[TMP1]], i64 6 ; CHECK-NEXT: [[COL_CAST12:%.*]] = bitcast double* [[COL_GEP11]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD13:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST12]], align 8 @@ -474,10 +474,10 @@ ; CHECK-NEXT: [[TMP110:%.*]] = bitcast <9 x double>* [[C_PTR:%.*]] to double* ; CHECK-NEXT: [[COL_CAST92:%.*]] = bitcast double* [[TMP110]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD93:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST92]], align 8 -; CHECK-NEXT: [[COL_GEP94:%.*]] = getelementptr double, double* [[TMP110]], i32 3 +; CHECK-NEXT: [[COL_GEP94:%.*]] = getelementptr double, double* [[TMP110]], i64 3 ; CHECK-NEXT: [[COL_CAST95:%.*]] = bitcast double* [[COL_GEP94]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD96:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST95]], align 8 -; CHECK-NEXT: [[COL_GEP97:%.*]] = getelementptr double, double* [[TMP110]], i32 6 +; CHECK-NEXT: [[COL_GEP97:%.*]] = getelementptr double, double* [[TMP110]], i64 6 ; CHECK-NEXT: [[COL_CAST98:%.*]] = bitcast double* [[COL_GEP97]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD99:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST98]], align 8 @@ -492,10 +492,10 @@ ; CHECK-NEXT: [[TMP111:%.*]] = bitcast <9 x double>* [[C_PTR]] to double* ; CHECK-NEXT: [[TMP112:%.*]] = bitcast double* [[TMP111]] to <3 x double>* ; CHECK-NEXT: store <3 x double> [[TMP108]], <3 x double>* [[TMP112]], align 8 -; CHECK-NEXT: [[TMP113:%.*]] = getelementptr double, double* [[TMP111]], i32 3 +; CHECK-NEXT: [[TMP113:%.*]] = getelementptr double, double* [[TMP111]], i64 3 ; CHECK-NEXT: [[TMP114:%.*]] = bitcast double* [[TMP113]] to <3 x double>* ; CHECK-NEXT: store <3 x double> [[TMP109]], <3 x double>* [[TMP114]], align 8 -; CHECK-NEXT: [[TMP115:%.*]] = getelementptr double, double* [[TMP111]], i32 6 +; CHECK-NEXT: [[TMP115:%.*]] = getelementptr double, double* [[TMP111]], i64 6 
; CHECK-NEXT: [[TMP116:%.*]] = bitcast double* [[TMP115]] to <3 x double>* ; CHECK-NEXT: store <3 x double> [[TMP110]], <3 x double>* [[TMP116]], align 8 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/const-gep.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/const-gep.ll --- a/llvm/test/Transforms/LowerMatrixIntrinsics/const-gep.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/const-gep.ll @@ -14,7 +14,7 @@ ; CHECK-NEXT: store i32 [[R:%.*]], i32* [[R_ADDR]], align 4 ; CHECK-NEXT: store i32 [[C:%.*]], i32* [[C_ADDR]], align 4 ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, <2 x double>* bitcast ([5 x <4 x double>]* @foo to <2 x double>*), align 8 -; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr ([5 x <4 x double>], [5 x <4 x double>]* @foo, i32 0, i32 0, i32 2) to <2 x double>*), align 8 +; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr ([5 x <4 x double>], [5 x <4 x double>]* @foo, i32 0, i32 0, i64 2) to <2 x double>*), align 8 ; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x double> [[COL_LOAD]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> undef, double [[TMP0]], i32 0 @@ -68,7 +68,7 @@ ; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <1 x double> [[TMP25]], <1 x double> undef, <2 x i32> ; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <2 x double> [[TMP20]], <2 x double> [[TMP26]], <2 x i32> ; CHECK-NEXT: store <2 x double> [[COL_LOAD]], <2 x double>* bitcast (double* getelementptr inbounds ([5 x <4 x double>], [5 x <4 x double>]* @foo, i64 0, i64 2, i32 0) to <2 x double>*), align 8 -; CHECK-NEXT: store <2 x double> [[COL_LOAD1]], <2 x double>* bitcast (double* getelementptr ([5 x <4 x double>], [5 x <4 x double>]* @foo, i64 0, i64 2, i32 2) to <2 x double>*), align 8 +; CHECK-NEXT: store <2 x double> [[COL_LOAD1]], <2 x double>* bitcast (double* getelementptr ([5 x <4 x double>], [5 x <4 x double>]* @foo, i64 0, i64 2, i64 2) to <2 x double>*), align 8 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-add-sub-double-row-major.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-add-sub-double-row-major.ll --- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-add-sub-double-row-major.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-add-sub-double-row-major.ll @@ -11,16 +11,16 @@ ; RM-NEXT: [[TMP0:%.*]] = bitcast <6 x double>* [[A_PTR:%.*]] to double* ; RM-NEXT: [[VEC_CAST:%.*]] = bitcast double* [[TMP0]] to <3 x double>* ; RM-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST]], align 8 -; RM-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP0]], i32 3 +; RM-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP0]], i64 3 ; RM-NEXT: [[VEC_CAST1:%.*]] = bitcast double* [[VEC_GEP]] to <3 x double>* ; RM-NEXT: [[COL_LOAD2:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST1]], align 8 ; RM-NEXT: [[TMP1:%.*]] = bitcast <6 x double>* [[B_PTR:%.*]] to double* ; RM-NEXT: [[VEC_CAST3:%.*]] = bitcast double* [[TMP1]] to <2 x double>* ; RM-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST3]], align 8 -; RM-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, double* [[TMP1]], i32 2 +; RM-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, double* [[TMP1]], i64 2 ; RM-NEXT: [[VEC_CAST6:%.*]] = bitcast double* 
[[VEC_GEP5]] to <2 x double>* ; RM-NEXT: [[COL_LOAD7:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST6]], align 8 -; RM-NEXT: [[VEC_GEP8:%.*]] = getelementptr double, double* [[TMP1]], i32 4 +; RM-NEXT: [[VEC_GEP8:%.*]] = getelementptr double, double* [[TMP1]], i64 4 ; RM-NEXT: [[VEC_CAST9:%.*]] = bitcast double* [[VEC_GEP8]] to <2 x double>* ; RM-NEXT: [[COL_LOAD10:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST9]], align 8 ; RM-NEXT: [[TMP2:%.*]] = fadd <3 x double> [[COL_LOAD]], [[COL_LOAD]] @@ -28,7 +28,7 @@ ; RM-NEXT: [[TMP4:%.*]] = bitcast <6 x double>* [[A_PTR]] to double* ; RM-NEXT: [[VEC_CAST11:%.*]] = bitcast double* [[TMP4]] to <3 x double>* ; RM-NEXT: store <3 x double> [[TMP2]], <3 x double>* [[VEC_CAST11]], align 8 -; RM-NEXT: [[VEC_GEP12:%.*]] = getelementptr double, double* [[TMP4]], i32 3 +; RM-NEXT: [[VEC_GEP12:%.*]] = getelementptr double, double* [[TMP4]], i64 3 ; RM-NEXT: [[VEC_CAST13:%.*]] = bitcast double* [[VEC_GEP12]] to <3 x double>* ; RM-NEXT: store <3 x double> [[TMP3]], <3 x double>* [[VEC_CAST13]], align 8 ; RM-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[COL_LOAD4]], @@ -37,10 +37,10 @@ ; RM-NEXT: [[TMP8:%.*]] = bitcast <6 x double>* [[B_PTR]] to double* ; RM-NEXT: [[VEC_CAST14:%.*]] = bitcast double* [[TMP8]] to <2 x double>* ; RM-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[VEC_CAST14]], align 8 -; RM-NEXT: [[VEC_GEP15:%.*]] = getelementptr double, double* [[TMP8]], i32 2 +; RM-NEXT: [[VEC_GEP15:%.*]] = getelementptr double, double* [[TMP8]], i64 2 ; RM-NEXT: [[VEC_CAST16:%.*]] = bitcast double* [[VEC_GEP15]] to <2 x double>* ; RM-NEXT: store <2 x double> [[TMP6]], <2 x double>* [[VEC_CAST16]], align 8 -; RM-NEXT: [[VEC_GEP17:%.*]] = getelementptr double, double* [[TMP8]], i32 4 +; RM-NEXT: [[VEC_GEP17:%.*]] = getelementptr double, double* [[TMP8]], i64 4 ; RM-NEXT: [[VEC_CAST18:%.*]] = bitcast double* [[VEC_GEP17]] to <2 x double>* ; RM-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[VEC_CAST18]], align 8 ; RM-NEXT: [[BLOCK:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> undef, <1 x i32> zeroinitializer @@ -122,7 +122,7 @@ ; RM-NEXT: [[TMP49:%.*]] = bitcast <4 x double>* [[C_PTR:%.*]] to double* ; RM-NEXT: [[VEC_CAST52:%.*]] = bitcast double* [[TMP49]] to <2 x double>* ; RM-NEXT: [[COL_LOAD53:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST52]], align 8 -; RM-NEXT: [[VEC_GEP54:%.*]] = getelementptr double, double* [[TMP49]], i32 2 +; RM-NEXT: [[VEC_GEP54:%.*]] = getelementptr double, double* [[TMP49]], i64 2 ; RM-NEXT: [[VEC_CAST55:%.*]] = bitcast double* [[VEC_GEP54]] to <2 x double>* ; RM-NEXT: [[COL_LOAD56:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST55]], align 8 ; RM-NEXT: [[TMP50:%.*]] = fsub <2 x double> [[COL_LOAD53]], [[TMP28]] @@ -130,7 +130,7 @@ ; RM-NEXT: [[TMP52:%.*]] = bitcast <4 x double>* [[C_PTR]] to double* ; RM-NEXT: [[VEC_CAST57:%.*]] = bitcast double* [[TMP52]] to <2 x double>* ; RM-NEXT: store <2 x double> [[TMP50]], <2 x double>* [[VEC_CAST57]], align 8 -; RM-NEXT: [[VEC_GEP58:%.*]] = getelementptr double, double* [[TMP52]], i32 2 +; RM-NEXT: [[VEC_GEP58:%.*]] = getelementptr double, double* [[TMP52]], i64 2 ; RM-NEXT: [[VEC_CAST59:%.*]] = bitcast double* [[VEC_GEP58]] to <2 x double>* ; RM-NEXT: store <2 x double> [[TMP51]], <2 x double>* [[VEC_CAST59]], align 8 ; RM-NEXT: ret void diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backward.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backward.ll --- a/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backward.ll +++ 
b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backward.ll @@ -48,13 +48,13 @@ ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x double>* [[A_PTR:%.*]] to double* ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast double* [[TMP0]] to <2 x double>* ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST]], align 8 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP0]], i32 2 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP0]], i64 2 ; CHECK-NEXT: [[VEC_CAST1:%.*]] = bitcast double* [[VEC_GEP]] to <2 x double>* ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST1]], align 8 -; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, double* [[TMP0]], i32 4 +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, double* [[TMP0]], i64 4 ; CHECK-NEXT: [[VEC_CAST4:%.*]] = bitcast double* [[VEC_GEP3]] to <2 x double>* ; CHECK-NEXT: [[COL_LOAD5:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST4]], align 8 -; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr double, double* [[TMP0]], i32 6 +; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr double, double* [[TMP0]], i64 6 ; CHECK-NEXT: [[VEC_CAST7:%.*]] = bitcast double* [[VEC_GEP6]] to <2 x double>* ; CHECK-NEXT: [[COL_LOAD8:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST7]], align 8 ; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> undef, <2 x i32> diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-forward.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-forward.ll --- a/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-forward.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-forward.ll @@ -30,7 +30,7 @@ ; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x double>* [[PTR:%.*]] to double* ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast double* [[TMP16]] to <4 x double>* ; CHECK-NEXT: store <4 x double> [[TMP7]], <4 x double>* [[VEC_CAST]], align 8 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP16]], i32 4 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP16]], i64 4 ; CHECK-NEXT: [[VEC_CAST4:%.*]] = bitcast double* [[VEC_GEP]] to <4 x double>* ; CHECK-NEXT: store <4 x double> [[TMP15]], <4 x double>* [[VEC_CAST4]], align 8 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-mixed-users.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-mixed-users.ll --- a/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-mixed-users.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-mixed-users.ll @@ -30,13 +30,13 @@ ; CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x double>* [[PTR:%.*]] to double* ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast double* [[TMP20]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[VEC_CAST]], align 8 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP20]], i32 2 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP20]], i64 2 ; CHECK-NEXT: [[VEC_CAST2:%.*]] = bitcast double* [[VEC_GEP]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP8]], <2 x double>* [[VEC_CAST2]], align 8 -; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, double* [[TMP20]], i32 4 +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, double* [[TMP20]], i64 4 ; CHECK-NEXT: [[VEC_CAST4:%.*]] = bitcast double* [[VEC_GEP3]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP12]], <2 x double>* [[VEC_CAST4]], align 8 -; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, double* [[TMP20]], i32 
6 +; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, double* [[TMP20]], i64 6 ; CHECK-NEXT: [[VEC_CAST6:%.*]] = bitcast double* [[VEC_GEP5]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP16]], <2 x double>* [[VEC_CAST6]], align 8 ; CHECK-NEXT: call void @foo(<8 x double> [[TMP19]]) diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-multiple-iterations.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-multiple-iterations.ll --- a/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-multiple-iterations.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-multiple-iterations.ll @@ -11,13 +11,13 @@ ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x double>* [[A_PTR:%.*]] to double* ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast double* [[TMP1]] to <4 x double>* ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <4 x double>, <4 x double>* [[VEC_CAST]], align 8 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP1]], i32 4 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP1]], i64 4 ; CHECK-NEXT: [[VEC_CAST1:%.*]] = bitcast double* [[VEC_GEP]] to <4 x double>* ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <4 x double>, <4 x double>* [[VEC_CAST1]], align 8 -; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, double* [[TMP1]], i32 8 +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, double* [[TMP1]], i64 8 ; CHECK-NEXT: [[VEC_CAST4:%.*]] = bitcast double* [[VEC_GEP3]] to <4 x double>* ; CHECK-NEXT: [[COL_LOAD5:%.*]] = load <4 x double>, <4 x double>* [[VEC_CAST4]], align 8 -; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr double, double* [[TMP1]], i32 12 +; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr double, double* [[TMP1]], i64 12 ; CHECK-NEXT: [[VEC_CAST7:%.*]] = bitcast double* [[VEC_GEP6]] to <4 x double>* ; CHECK-NEXT: [[COL_LOAD8:%.*]] = load <4 x double>, <4 x double>* [[VEC_CAST7]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x double> [[COL_LOAD]], i64 0 @@ -55,13 +55,13 @@ ; CHECK-NEXT: [[TMP34:%.*]] = bitcast <16 x double>* [[B_PTR:%.*]] to double* ; CHECK-NEXT: [[VEC_CAST9:%.*]] = bitcast double* [[TMP34]] to <4 x double>* ; CHECK-NEXT: [[COL_LOAD10:%.*]] = load <4 x double>, <4 x double>* [[VEC_CAST9]], align 8 -; CHECK-NEXT: [[VEC_GEP11:%.*]] = getelementptr double, double* [[TMP34]], i32 4 +; CHECK-NEXT: [[VEC_GEP11:%.*]] = getelementptr double, double* [[TMP34]], i64 4 ; CHECK-NEXT: [[VEC_CAST12:%.*]] = bitcast double* [[VEC_GEP11]] to <4 x double>* ; CHECK-NEXT: [[COL_LOAD13:%.*]] = load <4 x double>, <4 x double>* [[VEC_CAST12]], align 8 -; CHECK-NEXT: [[VEC_GEP14:%.*]] = getelementptr double, double* [[TMP34]], i32 8 +; CHECK-NEXT: [[VEC_GEP14:%.*]] = getelementptr double, double* [[TMP34]], i64 8 ; CHECK-NEXT: [[VEC_CAST15:%.*]] = bitcast double* [[VEC_GEP14]] to <4 x double>* ; CHECK-NEXT: [[COL_LOAD16:%.*]] = load <4 x double>, <4 x double>* [[VEC_CAST15]], align 8 -; CHECK-NEXT: [[VEC_GEP17:%.*]] = getelementptr double, double* [[TMP34]], i32 12 +; CHECK-NEXT: [[VEC_GEP17:%.*]] = getelementptr double, double* [[TMP34]], i64 12 ; CHECK-NEXT: [[VEC_CAST18:%.*]] = bitcast double* [[VEC_GEP17]] to <4 x double>* ; CHECK-NEXT: [[COL_LOAD19:%.*]] = load <4 x double>, <4 x double>* [[VEC_CAST18]], align 8 ; CHECK-NEXT: [[TMP35:%.*]] = fmul <4 x double> [[COL_LOAD]], [[COL_LOAD10]] diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll --- a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll +++ 
b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll @@ -51,7 +51,7 @@ ; CHECK-NEXT: load(addr %A) ; CHECK-LABEL: remark: load.h:41:43: Lowered with 0 stores, 10 loads, 0 compute ops -; CHECK-NEXT: columnwise.load.3x5.double(addr %B, 5) +; CHECK-NEXT: column.major.load.3x5.double(addr %B, 5) ; CHECK-LABEL: remark: load.h:41:11: Lowered with 0 stores, 1 loads, 0 compute ops ; CHECK-NEXT: load(addr %D) @@ -60,13 +60,13 @@ ; CHECK-NEXT: load(addr %A) ; CHECK-LABEL: remark: assign.h:32:43: Lowered with 0 stores, 10 loads, 0 compute ops -; CHECK-NEXT: columnwise.load.3x5.double(addr %B, 5) +; CHECK-NEXT: column.major.load.3x5.double(addr %B, 5) ; CHECK-LABEL: remark: toplevel.c:410:0: Lowered with 10 stores, 20 loads, 10 compute ops ; CHECK-NEXT: store( ; CHECK-NEXT: fadd( ; CHECK-NEXT: load(addr %A), -; CHECK-NEXT: columnwise.load.3x5.double(addr %B, 5)), +; CHECK-NEXT: column.major.load.3x5.double(addr %B, 5)), ; CHECK-NEXT: addr %C) ; CHECK-LABEL: remark: toplevel.c:510:0: Lowered with 1 stores, 1 loads, 8 compute ops @@ -95,7 +95,7 @@ define void @toplevel(<15 x double>* %A, <15 x double>* %B, <15 x double>* %C, <2 x float>* %D) !dbg !16 { entry: %a = load <15 x double>, <15 x double> *%A, align 16, !dbg !3791 - %b = call <15 x double> @llvm.matrix.columnwise.load(<15 x double>* %B, i32 5, i32 3, i32 5), !dbg !3793 + %b = call <15 x double> @llvm.matrix.column.major.load(<15 x double>* %B, i64 5, i1 false, i32 3, i32 5), !dbg !3793 %c = fadd <15 x double> %a, %b, !dbg !100 store <15 x double> %c, <15 x double> *%C, align 16, !dbg !102 @@ -106,7 +106,7 @@ ret void } -declare <15 x double> @llvm.matrix.columnwise.load(<15 x double>*, i32, i32, i32) +declare <15 x double> @llvm.matrix.column.major.load(<15 x double>*, i64, i1, i32, i32) declare <2 x float> @llvm.matrix.transpose(<2 x float>, i32, i32) !llvm.dbg.cu = !{!0} diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-shared-subtrees.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-shared-subtrees.ll --- a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-shared-subtrees.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-shared-subtrees.ll @@ -28,8 +28,8 @@ ; YAML-NEXT: - String: ' compute ops' ; YAML-NEXT: - String: ' are shared with other expressions' ; YAML-NEXT: - String: | -; YAML: columnwise.store.4x2.double( -; YAML-NEXT: shared with remark at line 35 column 45 (transpose.2x4.double(columnwise.load.2x4.double(addr %arg1, +; YAML: column.major.store.4x2.double( +; YAML-NEXT: shared with remark at line 35 column 45 (transpose.2x4.double(column.major.load.2x4.double(addr %arg1, ; YAML-NEXT: scalar)), ; YAML-NEXT: addr %arg3, ; YAML-NEXT: 10) @@ -57,55 +57,55 @@ ; YAML-NEXT: - String: ' compute ops' ; YAML-NEXT: - String: ' are shared with other expressions' ; YAML-NEXT: - String: | -; YAML: columnwise.store.4x15.double( +; YAML: column.major.store.4x15.double( ; YAML-NEXT: fsub( -; YAML-NEXT: columnwise.load.4x15.double(addr %arg2, 20), +; YAML-NEXT: column.major.load.4x15.double(addr %arg2, 20), ; YAML-NEXT: multiply.4x2.2x15.double( -; YAML-NEXT: shared with remark at line 35 column 71 (transpose.2x4.double(columnwise.load.2x4.double(addr %arg1, +; YAML-NEXT: shared with remark at line 35 column 71 (transpose.2x4.double(column.major.load.2x4.double(addr %arg1, ; YAML-NEXT: scalar)), -; YAML-NEXT: columnwise.load.2x15.double(addr %arg3, scalar))), +; YAML-NEXT: column.major.load.2x15.double(addr %arg3, scalar))), ; YAML-NEXT: addr %arg2, ; YAML-NEXT: 10) ; STDERR-LABEL: remark: test.cpp:35:71: 
Lowered with 4 stores, 0 loads, 0 compute ops, ; STDERR-NEXT: additionally 0 stores, 4 loads, 16 compute ops are shared with other expressions -; STDERR-NEXT: columnwise.store.4x2.double( -; STDERR-NEXT: shared with remark at line 35 column 45 (transpose.2x4.double(columnwise.load.2x4.double(addr %arg1, +; STDERR-NEXT: column.major.store.4x2.double( +; STDERR-NEXT: shared with remark at line 35 column 45 (transpose.2x4.double(column.major.load.2x4.double(addr %arg1, ; STDERR-NEXT: scalar)), ; STDERR-NEXT: addr %arg3, ; STDERR-NEXT: 10) ; STDERR-LABEL: remark: test.cpp:35:45: Lowered with 30 stores, 45 loads, 120 compute ops, ; STDERR-NEXT: additionally 0 stores, 4 loads, 16 compute ops are shared with other expressions -; STDERR-NEXT: columnwise.store.4x15.double( +; STDERR-NEXT: column.major.store.4x15.double( ; STDERR-NEXT: fsub( -; STDERR-NEXT: columnwise.load.4x15.double(addr %arg2, 20), +; STDERR-NEXT: column.major.load.4x15.double(addr %arg2, 20), ; STDERR-NEXT: multiply.4x2.2x15.double( -; STDERR-NEXT: shared with remark at line 35 column 71 (transpose.2x4.double(columnwise.load.2x4.double(addr %arg1, +; STDERR-NEXT: shared with remark at line 35 column 71 (transpose.2x4.double(column.major.load.2x4.double(addr %arg1, ; STDERR-NEXT: scalar)), -; STDERR-NEXT: columnwise.load.2x15.double(addr %arg3, scalar))), +; STDERR-NEXT: column.major.load.2x15.double(addr %arg3, scalar))), ; STDERR-NEXT: addr %arg2, ; STDERR-NEXT: 10) -define void @test_2leafs(double* %arg1, double* %arg2, double* %arg3, i32 %stride, i32 %offset) !dbg !8 { +define void @test_2leafs(double* %arg1, double* %arg2, double* %arg3, i64 %stride) !dbg !8 { bb: - %shared.load = tail call <8 x double> @llvm.matrix.columnwise.load.v8f64.p0f64(double* %arg1, i32 %stride, i32 2, i32 4), !dbg !10, !noalias !10 - %shared.load.2 = tail call <30 x double> @llvm.matrix.columnwise.load.v30f64.p0f64(double* %arg3, i32 %stride, i32 2, i32 15), !dbg !10, !noalias !10 + %shared.load = tail call <8 x double> @llvm.matrix.column.major.load.v8f64.p0f64(double* %arg1, i64 %stride, i1 false, i32 2, i32 4), !dbg !10, !noalias !10 + %shared.load.2 = tail call <30 x double> @llvm.matrix.column.major.load.v30f64.p0f64(double* %arg3, i64 %stride, i1 false, i32 2, i32 15), !dbg !10, !noalias !10 %tmp17 = tail call <8 x double> @llvm.matrix.transpose.v8f64(<8 x double> %shared.load, i32 2, i32 4), !dbg !10 - tail call void @llvm.matrix.columnwise.store.v8f64.p0f64(<8 x double> %tmp17, double* %arg3, i32 10, i32 4, i32 2), !dbg !10 - %tmp18 = tail call <60 x double> @llvm.matrix.columnwise.load.v60f64.p0f64(double* %arg2, i32 20, i32 4, i32 15), !dbg !11 + tail call void @llvm.matrix.column.major.store.v8f64.p0f64(<8 x double> %tmp17, double* %arg3, i64 10, i1 false, i32 4, i32 2), !dbg !10 + %tmp18 = tail call <60 x double> @llvm.matrix.column.major.load.v60f64.p0f64(double* %arg2, i64 20, i1 false, i32 4, i32 15), !dbg !11 %tmp48 = tail call <60 x double> @llvm.matrix.multiply.v60f64.v8f64.v30f64(<8 x double> %tmp17, <30 x double> %shared.load.2, i32 4, i32 2, i32 15), !dbg !11 %tmp49 = fsub <60 x double> %tmp18, %tmp48, !dbg !11 - tail call void @llvm.matrix.columnwise.store.v60f64.p0f64(<60 x double> %tmp49, double* %arg2, i32 10, i32 4, i32 15), !dbg !11 + tail call void @llvm.matrix.column.major.store.v60f64.p0f64(<60 x double> %tmp49, double* %arg2, i64 10, i1 false, i32 4, i32 15), !dbg !11 ret void } declare <8 x double> @llvm.matrix.transpose.v8f64(<8 x double>, i32 immarg, i32 immarg) -declare <8 x double> 
@llvm.matrix.columnwise.load.v8f64.p0f64(double*, i32, i32 immarg, i32 immarg) -declare <30 x double> @llvm.matrix.columnwise.load.v30f64.p0f64(double*, i32, i32 immarg, i32 immarg) -declare <60 x double> @llvm.matrix.columnwise.load.v60f64.p0f64(double*, i32, i32 immarg, i32 immarg) -declare void @llvm.matrix.columnwise.store.v60f64.p0f64(<60 x double>, double* writeonly, i32, i32 immarg, i32 immarg) -declare void @llvm.matrix.columnwise.store.v8f64.p0f64(<8 x double>, double* writeonly, i32, i32 immarg, i32 immarg) +declare <8 x double> @llvm.matrix.column.major.load.v8f64.p0f64(double*, i64, i1 immarg, i32 immarg, i32 immarg) +declare <30 x double> @llvm.matrix.column.major.load.v30f64.p0f64(double*, i64, i1 immarg, i32 immarg, i32 immarg) +declare <60 x double> @llvm.matrix.column.major.load.v60f64.p0f64(double*, i64, i1 immarg, i32 immarg, i32 immarg) +declare void @llvm.matrix.column.major.store.v60f64.p0f64(<60 x double>, double* writeonly, i64, i1 immarg, i32 immarg, i32 immarg) +declare void @llvm.matrix.column.major.store.v8f64.p0f64(<8 x double>, double* writeonly, i64, i1 immarg, i32 immarg, i32 immarg) declare <60 x double> @llvm.matrix.multiply.v60f64.v8f64.v30f64(<8 x double>, <30 x double>, i32 immarg, i32 immarg, i32 immarg) !llvm.module.flags = !{!0, !1, !2, !3} diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll --- a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll @@ -36,54 +36,54 @@ ; CHECK-LABEL: remark: test.h:60:20: Lowered with 6 stores, 6 loads, 0 compute ops ; CHECK-NEXT: store( -; CHECK-NEXT: columnwise.load.3x3.double(addr %A, 5), +; CHECK-NEXT: column.major.load.3x3.double(addr %A, 5), ; CHECK-NEXT: addr %B) -define void @columnwise.load(<9 x double>* %A, <9 x double>* %B) !dbg !27 { - %A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !28 +define void @column.major.load(<9 x double>* %A, <9 x double>* %B) !dbg !27 { + %A.matrix = call <9 x double> @llvm.matrix.column.major.load(<9 x double>* %A, i64 5, i1 false, i32 3, i32 3), !dbg !28 store <9 x double> %A.matrix, <9 x double>* %B, !dbg !28 ret void } -declare <9 x double> @llvm.matrix.columnwise.load(<9 x double>*, i32, i32, i32) +declare <9 x double> @llvm.matrix.column.major.load(<9 x double>*, i64, i1, i32, i32) ; CHECK-LABEL: remark: test.h:70:20: Lowered with 6 stores, 6 loads, 0 compute ops -; CHECK-NEXT: columnwise.store.3x3.double( -; CHECK-NEXT: columnwise.load.3x3.double(addr %A, 5), +; CHECK-NEXT: column.major.store.3x3.double( +; CHECK-NEXT: column.major.load.3x3.double(addr %A, 5), ; CHECK-NEXT: addr %B, ; CHECK-NEXT: 10) -define void @columnwise.store(<9 x double>* %A, <9 x double>* %B) !dbg !29 { - %A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !30 - call void @llvm.matrix.columnwise.store(<9 x double> %A.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !30 +define void @column.major.store(<9 x double>* %A, <9 x double>* %B) !dbg !29 { + %A.matrix = call <9 x double> @llvm.matrix.column.major.load(<9 x double>* %A, i64 5, i1 false, i32 3, i32 3), !dbg !30 + call void @llvm.matrix.column.major.store(<9 x double> %A.matrix, <9 x double>* %B, i64 10, i1 false, i32 3, i32 3), !dbg !30 ret void } -declare void @llvm.matrix.columnwise.store(<9 x double>, <9 x double>*, i32, i32, i32) +declare void @llvm.matrix.column.major.store(<9 x double>, <9 x 
double>*, i64, i1, i32, i32) ; CHECK-LABEL: remark: test.h:80:20: Lowered with 6 stores, 6 loads, 12 compute ops -; CHECK-NEXT: columnwise.store.3x3.double( +; CHECK-NEXT: column.major.store.3x3.double( ; CHECK-NEXT: fmul( ; CHECK-NEXT: fadd( -; CHECK-NEXT: columnwise.load.3x3.double(addr %A, 5) -; CHECK-NEXT: (reused) columnwise.load.3x3.double(addr %A, 5)), -; CHECK-NEXT: (reused) columnwise.load.3x3.double(addr %A, 5)), +; CHECK-NEXT: column.major.load.3x3.double(addr %A, 5) +; CHECK-NEXT: (reused) column.major.load.3x3.double(addr %A, 5)), +; CHECK-NEXT: (reused) column.major.load.3x3.double(addr %A, 5)), ; CHECK-NEXT: addr %B, ; CHECK-NEXT: 10) define void @binaryops(<9 x double>* %A, <9 x double>* %B) !dbg !31 { - %A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !32 + %A.matrix = call <9 x double> @llvm.matrix.column.major.load(<9 x double>* %A, i64 5, i1 false, i32 3, i32 3), !dbg !32 %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix, !dbg !32 %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix, !dbg !32 - call void @llvm.matrix.columnwise.store(<9 x double> %R2.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !32 + call void @llvm.matrix.column.major.store(<9 x double> %R2.matrix, <9 x double>* %B, i64 10, i1 false, i32 3, i32 3), !dbg !32 ret void } ; CHECK-LABEL: remark: test.h:90:20: Lowered with 6 stores, 6 loads, 12 compute ops -; CHECK-NEXT: columnwise.store.3x3.double( +; CHECK-NEXT: column.major.store.3x3.double( ; CHECK-NEXT: fmul( ; CHECK-NEXT: fadd( -; CHECK-NEXT: columnwise.load.3x3.double(addr %A, 5) -; CHECK-NEXT: (reused) columnwise.load.3x3.double(addr %A, 5)), -; CHECK-NEXT: (reused) columnwise.load.3x3.double(addr %A, 5)), +; CHECK-NEXT: column.major.load.3x3.double(addr %A, 5) +; CHECK-NEXT: (reused) column.major.load.3x3.double(addr %A, 5)), +; CHECK-NEXT: (reused) column.major.load.3x3.double(addr %A, 5)), ; CHECK-NEXT: addr %B, ; CHECK-NEXT: 10) ; CHECK-NEXT: remark: test.h:90:20: Lowered with 2 stores, 12 loads, 22 compute ops @@ -94,10 +94,10 @@ ; CHECK-NEXT: addr %E) define void @multiple_expressions(<9 x double>* %A, <9 x double>* %B, <12 x double>* %C, <12 x double>* %D, <4 x double>* %E) !dbg !33 { - %A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !34 + %A.matrix = call <9 x double> @llvm.matrix.column.major.load(<9 x double>* %A, i64 5, i1 false, i32 3, i32 3), !dbg !34 %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix, !dbg !34 %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix, !dbg !34 - call void @llvm.matrix.columnwise.store(<9 x double> %R2.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !34 + call void @llvm.matrix.column.major.store(<9 x double> %R2.matrix, <9 x double>* %B, i64 10, i1 false, i32 3, i32 3), !dbg !34 %C.matrix = load <12 x double>, <12 x double>* %C, !dbg !34 %D.matrix = load <12 x double>, <12 x double>* %D, !dbg !34 @@ -108,20 +108,20 @@ } ; CHECK-LABEL: remark: test.h:100:20: Lowered with 6 stores, 6 loads, 12 compute ops -; CHECK-NEXT: columnwise.store.3x3.double( +; CHECK-NEXT: column.major.store.3x3.double( ; CHECK-NEXT: fmul( ; CHECK-NEXT: fadd( -; CHECK-NEXT: columnwise.load.3x3.double(addr %A, 5) -; CHECK-NEXT: (reused) columnwise.load.3x3.double(addr %A, 5)), -; CHECK-NEXT: (reused) columnwise.load.3x3.double(addr %A, 5)), +; CHECK-NEXT: column.major.load.3x3.double(addr %A, 5) +; CHECK-NEXT: (reused) column.major.load.3x3.double(addr %A, 5)), +; CHECK-NEXT: (reused) 
column.major.load.3x3.double(addr %A, 5)), ; CHECK-NEXT: stack addr %B, ; CHECK-NEXT: 10) define void @stackaddresses(<9 x double>* %A) !dbg !35 { %B = alloca <9 x double> - %A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !36 + %A.matrix = call <9 x double> @llvm.matrix.column.major.load(<9 x double>* %A, i64 5, i1 false, i32 3, i32 3), !dbg !36 %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix, !dbg !36 %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix, !dbg !36 - call void @llvm.matrix.columnwise.store(<9 x double> %R2.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !36 + call void @llvm.matrix.column.major.store(<9 x double> %R2.matrix, <9 x double>* %B, i64 10, i1 false, i32 3, i32 3), !dbg !36 ret void } diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll --- a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll @@ -2,20 +2,20 @@ ; RUN: opt -lower-matrix-intrinsics -S < %s | FileCheck %s ; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s -define <9 x double> @strided_load_3x3(<9 x double>* %in, i32 %stride) { +define <9 x double> @strided_load_3x3(<9 x double>* %in, i64 %stride) { ; CHECK-LABEL: @strided_load_3x3( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <9 x double>* [[IN:%.*]] to double* -; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]] -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP0]], i32 [[VEC_START]] +; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP0]], i64 [[VEC_START]] ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast double* [[VEC_GEP]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST]], align 8 -; CHECK-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE]] -; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, double* [[TMP0]], i32 [[VEC_START1]] +; CHECK-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE]] +; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, double* [[TMP0]], i64 [[VEC_START1]] ; CHECK-NEXT: [[VEC_CAST3:%.*]] = bitcast double* [[VEC_GEP2]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST3]], align 8 -; CHECK-NEXT: [[VEC_START5:%.*]] = mul i32 2, [[STRIDE]] -; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr double, double* [[TMP0]], i32 [[VEC_START5]] +; CHECK-NEXT: [[VEC_START5:%.*]] = mul i64 2, [[STRIDE]] +; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr double, double* [[TMP0]], i64 [[VEC_START5]] ; CHECK-NEXT: [[VEC_CAST7:%.*]] = bitcast double* [[VEC_GEP6]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD8:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST7]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD4]], <6 x i32> @@ -24,51 +24,51 @@ ; CHECK-NEXT: ret <9 x double> [[TMP3]] ; entry: - %load = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %in, i32 %stride, i32 3, i32 3) + %load = call <9 x double> @llvm.matrix.column.major.load(<9 x double>* %in, i64 %stride, i1 false, i32 3, i32 3) ret <9 x double> %load } -declare <9 x double> @llvm.matrix.columnwise.load(<9 x double>*, i32, i32, i32) +declare <9 x double> @llvm.matrix.column.major.load(<9 x double>*, i64, i1, i32, i32) -define <9 x double> @strided_load_9x1(<9 x double>* 
%in, i32 %stride) { +define <9 x double> @strided_load_9x1(<9 x double>* %in, i64 %stride) { ; CHECK-LABEL: @strided_load_9x1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <9 x double>* [[IN:%.*]] to double* -; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]] -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP0]], i32 [[VEC_START]] +; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP0]], i64 [[VEC_START]] ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast double* [[VEC_GEP]] to <9 x double>* ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <9 x double>, <9 x double>* [[VEC_CAST]], align 8 ; CHECK-NEXT: ret <9 x double> [[COL_LOAD]] ; entry: - %load = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %in, i32 %stride, i32 9, i32 1) + %load = call <9 x double> @llvm.matrix.column.major.load(<9 x double>* %in, i64 %stride, i1 false, i32 9, i32 1) ret <9 x double> %load } -declare <8 x double> @llvm.matrix.columnwise.load.v8f64(<8 x double>*, i32, i32, i32) +declare <8 x double> @llvm.matrix.column.major.load.v8f64(<8 x double>*, i64, i1, i32, i32) -define <8 x double> @strided_load_4x2(<8 x double>* %in, i32 %stride) { +define <8 x double> @strided_load_4x2(<8 x double>* %in, i64 %stride) { ; CHECK-LABEL: @strided_load_4x2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x double>* [[IN:%.*]] to double* -; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]] -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP0]], i32 [[VEC_START]] +; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP0]], i64 [[VEC_START]] ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast double* [[VEC_GEP]] to <4 x double>* ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <4 x double>, <4 x double>* [[VEC_CAST]], align 8 -; CHECK-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE]] -; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, double* [[TMP0]], i32 [[VEC_START1]] +; CHECK-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE]] +; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, double* [[TMP0]], i64 [[VEC_START1]] ; CHECK-NEXT: [[VEC_CAST3:%.*]] = bitcast double* [[VEC_GEP2]] to <4 x double>* ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <4 x double>, <4 x double>* [[VEC_CAST3]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[COL_LOAD]], <4 x double> [[COL_LOAD4]], <8 x i32> ; CHECK-NEXT: ret <8 x double> [[TMP1]] ; entry: - %load = call <8 x double> @llvm.matrix.columnwise.load.v8f64(<8 x double>* %in, i32 %stride, i32 4, i32 2) + %load = call <8 x double> @llvm.matrix.column.major.load.v8f64(<8 x double>* %in, i64 %stride, i1 false, i32 4, i32 2) ret <8 x double> %load } -; CHECK: declare <9 x double> @llvm.matrix.columnwise.load.v9f64.p0v9f64(<9 x double>* nocapture, i32, i32 immarg, i32 immarg) [[READONLY:#[0-9]]] +; CHECK: declare <9 x double> @llvm.matrix.column.major.load.v9f64.p0v9f64(<9 x double>* nocapture, i64, i1 immarg, i32 immarg, i32 immarg) [[READONLY:#[0-9]]] -; CHECK: declare <8 x double> @llvm.matrix.columnwise.load.v8f64.p0v8f64(<8 x double>* nocapture, i32, i32 immarg, i32 immarg) [[READONLY]] +; CHECK: declare <8 x double> @llvm.matrix.column.major.load.v8f64.p0v8f64(<8 x double>* nocapture, i64, i1 immarg, i32 immarg, i32 immarg) [[READONLY]] ; CHECK: attributes [[READONLY]] = { argmemonly nosync nounwind readonly willreturn } diff --git 
a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-float.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-float.ll --- a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-float.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-float.ll @@ -2,20 +2,20 @@ ; RUN: opt -lower-matrix-intrinsics -S < %s | FileCheck %s ; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s -define <9 x float> @strided_load_3x3(<9 x float>* %in, i32 %stride) { +define <9 x float> @strided_load_3x3(<9 x float>* %in, i64 %stride) { ; CHECK-LABEL: @strided_load_3x3( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <9 x float>* [[IN:%.*]] to float* -; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]] -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, float* [[TMP0]], i32 [[VEC_START]] +; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, float* [[TMP0]], i64 [[VEC_START]] ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast float* [[VEC_GEP]] to <3 x float>* ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <3 x float>, <3 x float>* [[VEC_CAST]], align 4 -; CHECK-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE]] -; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr float, float* [[TMP0]], i32 [[VEC_START1]] +; CHECK-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE]] +; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr float, float* [[TMP0]], i64 [[VEC_START1]] ; CHECK-NEXT: [[VEC_CAST3:%.*]] = bitcast float* [[VEC_GEP2]] to <3 x float>* ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <3 x float>, <3 x float>* [[VEC_CAST3]], align 4 -; CHECK-NEXT: [[VEC_START5:%.*]] = mul i32 2, [[STRIDE]] -; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr float, float* [[TMP0]], i32 [[VEC_START5]] +; CHECK-NEXT: [[VEC_START5:%.*]] = mul i64 2, [[STRIDE]] +; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr float, float* [[TMP0]], i64 [[VEC_START5]] ; CHECK-NEXT: [[VEC_CAST7:%.*]] = bitcast float* [[VEC_GEP6]] to <3 x float>* ; CHECK-NEXT: [[COL_LOAD8:%.*]] = load <3 x float>, <3 x float>* [[VEC_CAST7]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x float> [[COL_LOAD]], <3 x float> [[COL_LOAD4]], <6 x i32> @@ -24,45 +24,45 @@ ; CHECK-NEXT: ret <9 x float> [[TMP3]] ; entry: - %load = call <9 x float> @llvm.matrix.columnwise.load(<9 x float>* %in, i32 %stride, i32 3, i32 3) + %load = call <9 x float> @llvm.matrix.column.major.load(<9 x float>* %in, i64 %stride, i1 false, i32 3, i32 3) ret <9 x float> %load } -declare <9 x float> @llvm.matrix.columnwise.load(<9 x float>*, i32, i32, i32) +declare <9 x float> @llvm.matrix.column.major.load(<9 x float>*, i64, i1, i32, i32) -define <9 x float> @strided_load_9x1(<9 x float>* %in, i32 %stride) { +define <9 x float> @strided_load_9x1(<9 x float>* %in, i64 %stride) { ; CHECK-LABEL: @strided_load_9x1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <9 x float>* [[IN:%.*]] to float* -; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]] -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, float* [[TMP0]], i32 [[VEC_START]] +; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, float* [[TMP0]], i64 [[VEC_START]] ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast float* [[VEC_GEP]] to <9 x float>* ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <9 x float>, <9 x float>* [[VEC_CAST]], align 4 ; CHECK-NEXT: ret <9 x float> [[COL_LOAD]] ; entry: - %load = call <9 x float> @llvm.matrix.columnwise.load(<9 x float>* %in, i32 %stride, i32 9, i32 1) + %load = call 
<9 x float> @llvm.matrix.column.major.load(<9 x float>* %in, i64 %stride, i1 false, i32 9, i32 1) ret <9 x float> %load } -declare <8 x float> @llvm.matrix.columnwise.load.v8f32(<8 x float>*, i32, i32, i32) +declare <8 x float> @llvm.matrix.column.major.load.v8f32(<8 x float>*, i64, i1, i32, i32) -define <8 x float> @strided_load_4x2(<8 x float>* %in, i32 %stride) { +define <8 x float> @strided_load_4x2(<8 x float>* %in, i64 %stride) { ; CHECK-LABEL: @strided_load_4x2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x float>* [[IN:%.*]] to float* -; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]] -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, float* [[TMP0]], i32 [[VEC_START]] +; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, float* [[TMP0]], i64 [[VEC_START]] ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast float* [[VEC_GEP]] to <4 x float>* ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <4 x float>, <4 x float>* [[VEC_CAST]], align 4 -; CHECK-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE]] -; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr float, float* [[TMP0]], i32 [[VEC_START1]] +; CHECK-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE]] +; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr float, float* [[TMP0]], i64 [[VEC_START1]] ; CHECK-NEXT: [[VEC_CAST3:%.*]] = bitcast float* [[VEC_GEP2]] to <4 x float>* ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <4 x float>, <4 x float>* [[VEC_CAST3]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[COL_LOAD]], <4 x float> [[COL_LOAD4]], <8 x i32> ; CHECK-NEXT: ret <8 x float> [[TMP1]] ; entry: - %load = call <8 x float> @llvm.matrix.columnwise.load.v8f32(<8 x float>* %in, i32 %stride, i32 4, i32 2) + %load = call <8 x float> @llvm.matrix.column.major.load.v8f32(<8 x float>* %in, i64 %stride, i1 false, i32 4, i32 2) ret <8 x float> %load } diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-i32.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-i32.ll --- a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-i32.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-i32.ll @@ -2,20 +2,20 @@ ; RUN: opt -lower-matrix-intrinsics -S < %s | FileCheck %s ; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s -define <9 x i32> @strided_load_3x3(<9 x i32>* %in, i32 %stride) { +define <9 x i32> @strided_load_3x3(<9 x i32>* %in, i64 %stride) { ; CHECK-LABEL: @strided_load_3x3( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <9 x i32>* [[IN:%.*]] to i32* -; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]] -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, i32* [[TMP0]], i32 [[VEC_START]] +; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, i32* [[TMP0]], i64 [[VEC_START]] ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast i32* [[VEC_GEP]] to <3 x i32>* ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <3 x i32>, <3 x i32>* [[VEC_CAST]], align 4 -; CHECK-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE]] -; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr i32, i32* [[TMP0]], i32 [[VEC_START1]] +; CHECK-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE]] +; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr i32, i32* [[TMP0]], i64 [[VEC_START1]] ; CHECK-NEXT: [[VEC_CAST3:%.*]] = bitcast i32* [[VEC_GEP2]] to <3 x i32>* ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <3 x i32>, <3 x i32>* [[VEC_CAST3]], align 4 -; CHECK-NEXT: [[VEC_START5:%.*]] = mul i32 2, [[STRIDE]] -; CHECK-NEXT: 
[[VEC_GEP6:%.*]] = getelementptr i32, i32* [[TMP0]], i32 [[VEC_START5]] +; CHECK-NEXT: [[VEC_START5:%.*]] = mul i64 2, [[STRIDE]] +; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr i32, i32* [[TMP0]], i64 [[VEC_START5]] ; CHECK-NEXT: [[VEC_CAST7:%.*]] = bitcast i32* [[VEC_GEP6]] to <3 x i32>* ; CHECK-NEXT: [[COL_LOAD8:%.*]] = load <3 x i32>, <3 x i32>* [[VEC_CAST7]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[COL_LOAD]], <3 x i32> [[COL_LOAD4]], <6 x i32> @@ -24,45 +24,45 @@ ; CHECK-NEXT: ret <9 x i32> [[TMP3]] ; entry: - %load = call <9 x i32> @llvm.matrix.columnwise.load(<9 x i32>* %in, i32 %stride, i32 3, i32 3) + %load = call <9 x i32> @llvm.matrix.column.major.load(<9 x i32>* %in, i64 %stride, i1 false, i32 3, i32 3) ret <9 x i32> %load } -declare <9 x i32> @llvm.matrix.columnwise.load(<9 x i32>*, i32, i32, i32) +declare <9 x i32> @llvm.matrix.column.major.load(<9 x i32>*, i64, i1, i32, i32) -define <9 x i32> @strided_load_9x1(<9 x i32>* %in, i32 %stride) { +define <9 x i32> @strided_load_9x1(<9 x i32>* %in, i64 %stride) { ; CHECK-LABEL: @strided_load_9x1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <9 x i32>* [[IN:%.*]] to i32* -; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]] -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, i32* [[TMP0]], i32 [[VEC_START]] +; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, i32* [[TMP0]], i64 [[VEC_START]] ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast i32* [[VEC_GEP]] to <9 x i32>* ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <9 x i32>, <9 x i32>* [[VEC_CAST]], align 4 ; CHECK-NEXT: ret <9 x i32> [[COL_LOAD]] ; entry: - %load = call <9 x i32> @llvm.matrix.columnwise.load(<9 x i32>* %in, i32 %stride, i32 9, i32 1) + %load = call <9 x i32> @llvm.matrix.column.major.load(<9 x i32>* %in, i64 %stride, i1 false, i32 9, i32 1) ret <9 x i32> %load } -declare <8 x i32> @llvm.matrix.columnwise.load.v8i32(<8 x i32>*, i32, i32, i32) +declare <8 x i32> @llvm.matrix.column.major.load.v8i32(<8 x i32>*, i64, i1, i32, i32) -define <8 x i32> @strided_load_4x2(<8 x i32>* %in, i32 %stride) { +define <8 x i32> @strided_load_4x2(<8 x i32>* %in, i64 %stride) { ; CHECK-LABEL: @strided_load_4x2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i32>* [[IN:%.*]] to i32* -; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]] -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, i32* [[TMP0]], i32 [[VEC_START]] +; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, i32* [[TMP0]], i64 [[VEC_START]] ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast i32* [[VEC_GEP]] to <4 x i32>* ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[VEC_CAST]], align 4 -; CHECK-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE]] -; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr i32, i32* [[TMP0]], i32 [[VEC_START1]] +; CHECK-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE]] +; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr i32, i32* [[TMP0]], i64 [[VEC_START1]] ; CHECK-NEXT: [[VEC_CAST3:%.*]] = bitcast i32* [[VEC_GEP2]] to <4 x i32>* ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <4 x i32>, <4 x i32>* [[VEC_CAST3]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[COL_LOAD]], <4 x i32> [[COL_LOAD4]], <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP1]] ; entry: - %load = call <8 x i32> @llvm.matrix.columnwise.load.v8i32(<8 x i32>* %in, i32 %stride, i32 4, i32 2) + %load = call <8 x i32> @llvm.matrix.column.major.load.v8i32(<8 x i32>* %in, 
i64 %stride, i1 false, i32 4, i32 2) ret <8 x i32> %load } diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll --- a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll @@ -8,35 +8,35 @@ ; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <6 x double> [[IN]], <6 x double> undef, <3 x i32> ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast double* [[OUT:%.*]] to <3 x double>* ; CHECK-NEXT: store <3 x double> [[SPLIT]], <3 x double>* [[VEC_CAST]], align 8 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[OUT]], i32 5 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[OUT]], i64 5 ; CHECK-NEXT: [[VEC_CAST2:%.*]] = bitcast double* [[VEC_GEP]] to <3 x double>* ; CHECK-NEXT: store <3 x double> [[SPLIT1]], <3 x double>* [[VEC_CAST2]], align 8 ; CHECK-NEXT: ret void ; - call void @llvm.matrix.columnwise.store(<6 x double> %in, double* %out, i32 5, i32 3, i32 2) + call void @llvm.matrix.column.major.store(<6 x double> %in, double* %out, i64 5, i1 false, i32 3, i32 2) ret void } -define void @strided_store_3x2_nonconst_stride(<6 x double> %in, i32 %stride, double* %out) { +define void @strided_store_3x2_nonconst_stride(<6 x double> %in, i64 %stride, double* %out) { ; CHECK-LABEL: @strided_store_3x2_nonconst_stride( ; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <6 x double> [[IN:%.*]], <6 x double> undef, <3 x i32> ; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <6 x double> [[IN]], <6 x double> undef, <3 x i32> -; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]] -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[OUT:%.*]], i32 [[VEC_START]] +; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 [[VEC_START]] ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast double* [[VEC_GEP]] to <3 x double>* ; CHECK-NEXT: store <3 x double> [[SPLIT]], <3 x double>* [[VEC_CAST]], align 8 -; CHECK-NEXT: [[VEC_START2:%.*]] = mul i32 1, [[STRIDE]] -; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, double* [[OUT]], i32 [[VEC_START2]] +; CHECK-NEXT: [[VEC_START2:%.*]] = mul i64 1, [[STRIDE]] +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, double* [[OUT]], i64 [[VEC_START2]] ; CHECK-NEXT: [[VEC_CAST4:%.*]] = bitcast double* [[VEC_GEP3]] to <3 x double>* ; CHECK-NEXT: store <3 x double> [[SPLIT1]], <3 x double>* [[VEC_CAST4]], align 8 ; CHECK-NEXT: ret void ; - call void @llvm.matrix.columnwise.store(<6 x double> %in, double* %out, i32 %stride, i32 3, i32 2) + call void @llvm.matrix.column.major.store(<6 x double> %in, double* %out, i64 %stride, i1 false, i32 3, i32 2) ret void } -declare void @llvm.matrix.columnwise.store(<6 x double>, double*, i32, i32, i32) +declare void @llvm.matrix.column.major.store(<6 x double>, double*, i64, i1, i32, i32) define void @strided_store_2x3(<10 x double> %in, double* %out) { ; CHECK-LABEL: @strided_store_2x3( @@ -47,28 +47,28 @@ ; CHECK-NEXT: [[SPLIT4:%.*]] = shufflevector <10 x double> [[IN]], <10 x double> undef, <2 x i32> ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast double* [[OUT:%.*]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[SPLIT]], <2 x double>* [[VEC_CAST]], align 8 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[OUT]], i32 4 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[OUT]], i64 4 ; CHECK-NEXT: [[VEC_CAST5:%.*]] = bitcast double* 
[[VEC_GEP]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[SPLIT1]], <2 x double>* [[VEC_CAST5]], align 8 -; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr double, double* [[OUT]], i32 8 +; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr double, double* [[OUT]], i64 8 ; CHECK-NEXT: [[VEC_CAST7:%.*]] = bitcast double* [[VEC_GEP6]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[SPLIT2]], <2 x double>* [[VEC_CAST7]], align 8 -; CHECK-NEXT: [[VEC_GEP8:%.*]] = getelementptr double, double* [[OUT]], i32 12 +; CHECK-NEXT: [[VEC_GEP8:%.*]] = getelementptr double, double* [[OUT]], i64 12 ; CHECK-NEXT: [[VEC_CAST9:%.*]] = bitcast double* [[VEC_GEP8]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[SPLIT3]], <2 x double>* [[VEC_CAST9]], align 8 -; CHECK-NEXT: [[VEC_GEP10:%.*]] = getelementptr double, double* [[OUT]], i32 16 +; CHECK-NEXT: [[VEC_GEP10:%.*]] = getelementptr double, double* [[OUT]], i64 16 ; CHECK-NEXT: [[VEC_CAST11:%.*]] = bitcast double* [[VEC_GEP10]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[SPLIT4]], <2 x double>* [[VEC_CAST11]], align 8 ; CHECK-NEXT: ret void ; - call void @llvm.matrix.columnwise.store.v10f64(<10 x double> %in, double* %out, i32 4, i32 2, i32 5) + call void @llvm.matrix.column.major.store.v10f64(<10 x double> %in, double* %out, i64 4, i1 false, i32 2, i32 5) ret void } -declare void @llvm.matrix.columnwise.store.v10f64(<10 x double>, double*, i32, i32, i32) +declare void @llvm.matrix.column.major.store.v10f64(<10 x double>, double*, i64, i1, i32, i32) -; CHECK: declare void @llvm.matrix.columnwise.store.v6f64.p0f64(<6 x double>, double* nocapture writeonly, i32, i32 immarg, i32 immarg) [[WRITEONLY:#[0-9]]] +; CHECK: declare void @llvm.matrix.column.major.store.v6f64.p0f64(<6 x double>, double* nocapture writeonly, i64, i1 immarg, i32 immarg, i32 immarg) [[WRITEONLY:#[0-9]]] -; CHECK: declare void @llvm.matrix.columnwise.store.v10f64.p0f64(<10 x double>, double* nocapture writeonly, i32, i32 immarg, i32 immarg) [[WRITEONLY]] +; CHECK: declare void @llvm.matrix.column.major.store.v10f64.p0f64(<10 x double>, double* nocapture writeonly, i64, i1 immarg, i32 immarg, i32 immarg) [[WRITEONLY]] ; CHECK: attributes [[WRITEONLY]] = { argmemonly nosync nounwind willreturn writeonly } diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-float.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-float.ll --- a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-float.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-float.ll @@ -8,35 +8,35 @@ ; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <6 x float> [[IN]], <6 x float> undef, <3 x i32> ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast float* [[OUT:%.*]] to <3 x float>* ; CHECK-NEXT: store <3 x float> [[SPLIT]], <3 x float>* [[VEC_CAST]], align 4 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, float* [[OUT]], i32 5 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, float* [[OUT]], i64 5 ; CHECK-NEXT: [[VEC_CAST2:%.*]] = bitcast float* [[VEC_GEP]] to <3 x float>* ; CHECK-NEXT: store <3 x float> [[SPLIT1]], <3 x float>* [[VEC_CAST2]], align 4 ; CHECK-NEXT: ret void ; - call void @llvm.matrix.columnwise.store(<6 x float> %in, float* %out, i32 5, i32 3, i32 2) + call void @llvm.matrix.column.major.store(<6 x float> %in, float* %out, i64 5, i1 false, i32 3, i32 2) ret void } -define void @strided_store_3x2_nonconst_stride(<6 x float> %in, i32 %stride, float* %out) { +define void @strided_store_3x2_nonconst_stride(<6 x float> %in, i64 %stride, float* %out) { ; 
CHECK-LABEL: @strided_store_3x2_nonconst_stride( ; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <6 x float> [[IN:%.*]], <6 x float> undef, <3 x i32> ; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <6 x float> [[IN]], <6 x float> undef, <3 x i32> -; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]] -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, float* [[OUT:%.*]], i32 [[VEC_START]] +; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, float* [[OUT:%.*]], i64 [[VEC_START]] ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast float* [[VEC_GEP]] to <3 x float>* ; CHECK-NEXT: store <3 x float> [[SPLIT]], <3 x float>* [[VEC_CAST]], align 4 -; CHECK-NEXT: [[VEC_START2:%.*]] = mul i32 1, [[STRIDE]] -; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr float, float* [[OUT]], i32 [[VEC_START2]] +; CHECK-NEXT: [[VEC_START2:%.*]] = mul i64 1, [[STRIDE]] +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr float, float* [[OUT]], i64 [[VEC_START2]] ; CHECK-NEXT: [[VEC_CAST4:%.*]] = bitcast float* [[VEC_GEP3]] to <3 x float>* ; CHECK-NEXT: store <3 x float> [[SPLIT1]], <3 x float>* [[VEC_CAST4]], align 4 ; CHECK-NEXT: ret void ; - call void @llvm.matrix.columnwise.store(<6 x float> %in, float* %out, i32 %stride, i32 3, i32 2) + call void @llvm.matrix.column.major.store(<6 x float> %in, float* %out, i64 %stride, i1 false, i32 3, i32 2) ret void } -declare void @llvm.matrix.columnwise.store(<6 x float>, float*, i32, i32, i32) +declare void @llvm.matrix.column.major.store(<6 x float>, float*, i64, i1, i32, i32) define void @strided_store_2x3(<10 x float> %in, float* %out) { ; CHECK-LABEL: @strided_store_2x3( @@ -47,22 +47,22 @@ ; CHECK-NEXT: [[SPLIT4:%.*]] = shufflevector <10 x float> [[IN]], <10 x float> undef, <2 x i32> ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast float* [[OUT:%.*]] to <2 x float>* ; CHECK-NEXT: store <2 x float> [[SPLIT]], <2 x float>* [[VEC_CAST]], align 4 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, float* [[OUT]], i32 4 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, float* [[OUT]], i64 4 ; CHECK-NEXT: [[VEC_CAST5:%.*]] = bitcast float* [[VEC_GEP]] to <2 x float>* ; CHECK-NEXT: store <2 x float> [[SPLIT1]], <2 x float>* [[VEC_CAST5]], align 4 -; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr float, float* [[OUT]], i32 8 +; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr float, float* [[OUT]], i64 8 ; CHECK-NEXT: [[VEC_CAST7:%.*]] = bitcast float* [[VEC_GEP6]] to <2 x float>* ; CHECK-NEXT: store <2 x float> [[SPLIT2]], <2 x float>* [[VEC_CAST7]], align 4 -; CHECK-NEXT: [[VEC_GEP8:%.*]] = getelementptr float, float* [[OUT]], i32 12 +; CHECK-NEXT: [[VEC_GEP8:%.*]] = getelementptr float, float* [[OUT]], i64 12 ; CHECK-NEXT: [[VEC_CAST9:%.*]] = bitcast float* [[VEC_GEP8]] to <2 x float>* ; CHECK-NEXT: store <2 x float> [[SPLIT3]], <2 x float>* [[VEC_CAST9]], align 4 -; CHECK-NEXT: [[VEC_GEP10:%.*]] = getelementptr float, float* [[OUT]], i32 16 +; CHECK-NEXT: [[VEC_GEP10:%.*]] = getelementptr float, float* [[OUT]], i64 16 ; CHECK-NEXT: [[VEC_CAST11:%.*]] = bitcast float* [[VEC_GEP10]] to <2 x float>* ; CHECK-NEXT: store <2 x float> [[SPLIT4]], <2 x float>* [[VEC_CAST11]], align 4 ; CHECK-NEXT: ret void ; - call void @llvm.matrix.columnwise.store.v10f32(<10 x float> %in, float* %out, i32 4, i32 2, i32 5) + call void @llvm.matrix.column.major.store.v10f32(<10 x float> %in, float* %out, i64 4, i1 false, i32 2, i32 5) ret void } -declare void @llvm.matrix.columnwise.store.v10f32(<10 x float>, float*, i32, i32, i32) +declare void 
@llvm.matrix.column.major.store.v10f32(<10 x float>, float*, i64, i1, i32, i32) diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-i32.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-i32.ll --- a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-i32.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-i32.ll @@ -8,35 +8,35 @@ ; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <6 x i32> [[IN]], <6 x i32> undef, <3 x i32> ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast i32* [[OUT:%.*]] to <3 x i32>* ; CHECK-NEXT: store <3 x i32> [[SPLIT]], <3 x i32>* [[VEC_CAST]], align 4 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, i32* [[OUT]], i32 5 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, i32* [[OUT]], i64 5 ; CHECK-NEXT: [[VEC_CAST2:%.*]] = bitcast i32* [[VEC_GEP]] to <3 x i32>* ; CHECK-NEXT: store <3 x i32> [[SPLIT1]], <3 x i32>* [[VEC_CAST2]], align 4 ; CHECK-NEXT: ret void ; - call void @llvm.matrix.columnwise.store(<6 x i32> %in, i32* %out, i32 5, i32 3, i32 2) + call void @llvm.matrix.column.major.store(<6 x i32> %in, i32* %out, i64 5, i1 false, i32 3, i32 2) ret void } -define void @strided_store_3x2_nonconst_stride(<6 x i32> %in, i32 %stride, i32* %out) { +define void @strided_store_3x2_nonconst_stride(<6 x i32> %in, i64 %stride, i32* %out) { ; CHECK-LABEL: @strided_store_3x2_nonconst_stride( ; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <6 x i32> [[IN:%.*]], <6 x i32> undef, <3 x i32> ; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <6 x i32> [[IN]], <6 x i32> undef, <3 x i32> -; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]] -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, i32* [[OUT:%.*]], i32 [[VEC_START]] +; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, i32* [[OUT:%.*]], i64 [[VEC_START]] ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast i32* [[VEC_GEP]] to <3 x i32>* ; CHECK-NEXT: store <3 x i32> [[SPLIT]], <3 x i32>* [[VEC_CAST]], align 4 -; CHECK-NEXT: [[VEC_START2:%.*]] = mul i32 1, [[STRIDE]] -; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr i32, i32* [[OUT]], i32 [[VEC_START2]] +; CHECK-NEXT: [[VEC_START2:%.*]] = mul i64 1, [[STRIDE]] +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr i32, i32* [[OUT]], i64 [[VEC_START2]] ; CHECK-NEXT: [[VEC_CAST4:%.*]] = bitcast i32* [[VEC_GEP3]] to <3 x i32>* ; CHECK-NEXT: store <3 x i32> [[SPLIT1]], <3 x i32>* [[VEC_CAST4]], align 4 ; CHECK-NEXT: ret void ; - call void @llvm.matrix.columnwise.store(<6 x i32> %in, i32* %out, i32 %stride, i32 3, i32 2) + call void @llvm.matrix.column.major.store(<6 x i32> %in, i32* %out, i64 %stride, i1 false, i32 3, i32 2) ret void } -declare void @llvm.matrix.columnwise.store(<6 x i32>, i32*, i32, i32, i32) +declare void @llvm.matrix.column.major.store(<6 x i32>, i32*, i64, i1, i32, i32) define void @strided_store_2x3(<10 x i32> %in, i32* %out) { ; CHECK-LABEL: @strided_store_2x3( @@ -47,22 +47,22 @@ ; CHECK-NEXT: [[SPLIT4:%.*]] = shufflevector <10 x i32> [[IN]], <10 x i32> undef, <2 x i32> ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast i32* [[OUT:%.*]] to <2 x i32>* ; CHECK-NEXT: store <2 x i32> [[SPLIT]], <2 x i32>* [[VEC_CAST]], align 4 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, i32* [[OUT]], i32 4 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, i32* [[OUT]], i64 4 ; CHECK-NEXT: [[VEC_CAST5:%.*]] = bitcast i32* [[VEC_GEP]] to <2 x i32>* ; CHECK-NEXT: store <2 x i32> [[SPLIT1]], <2 x i32>* [[VEC_CAST5]], align 4 -; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr i32, i32* [[OUT]], i32 8 +; 
CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr i32, i32* [[OUT]], i64 8 ; CHECK-NEXT: [[VEC_CAST7:%.*]] = bitcast i32* [[VEC_GEP6]] to <2 x i32>* ; CHECK-NEXT: store <2 x i32> [[SPLIT2]], <2 x i32>* [[VEC_CAST7]], align 4 -; CHECK-NEXT: [[VEC_GEP8:%.*]] = getelementptr i32, i32* [[OUT]], i32 12 +; CHECK-NEXT: [[VEC_GEP8:%.*]] = getelementptr i32, i32* [[OUT]], i64 12 ; CHECK-NEXT: [[VEC_CAST9:%.*]] = bitcast i32* [[VEC_GEP8]] to <2 x i32>* ; CHECK-NEXT: store <2 x i32> [[SPLIT3]], <2 x i32>* [[VEC_CAST9]], align 4 -; CHECK-NEXT: [[VEC_GEP10:%.*]] = getelementptr i32, i32* [[OUT]], i32 16 +; CHECK-NEXT: [[VEC_GEP10:%.*]] = getelementptr i32, i32* [[OUT]], i64 16 ; CHECK-NEXT: [[VEC_CAST11:%.*]] = bitcast i32* [[VEC_GEP10]] to <2 x i32>* ; CHECK-NEXT: store <2 x i32> [[SPLIT4]], <2 x i32>* [[VEC_CAST11]], align 4 ; CHECK-NEXT: ret void ; - call void @llvm.matrix.columnwise.store.v10i32(<10 x i32> %in, i32* %out, i32 4, i32 2, i32 5) + call void @llvm.matrix.column.major.store.v10i32(<10 x i32> %in, i32* %out, i64 4, i1 false, i32 2, i32 5) ret void } -declare void @llvm.matrix.columnwise.store.v10i32(<10 x i32>, i32*, i32, i32, i32) +declare void @llvm.matrix.column.major.store.v10i32(<10 x i32>, i32*, i64, i1, i32, i32) diff --git a/llvm/test/Verifier/matrix-intrinsics.ll b/llvm/test/Verifier/matrix-intrinsics.ll --- a/llvm/test/Verifier/matrix-intrinsics.ll +++ b/llvm/test/Verifier/matrix-intrinsics.ll @@ -19,22 +19,22 @@ ret <4 x float> %result.2 } -declare <4 x float> @llvm.matrix.columnwise.load.v4f32.p0v4f32(<4 x float>*, i32, i32, i32) -declare <6 x float> @llvm.matrix.columnwise.load.v6f32.p0v6f32(<6 x float>*, i32, i32, i32) -define <4 x float> @columnwise_load(<4 x float>* %m, <6 x float>* %n) { +declare <4 x float> @llvm.matrix.column.major.load.v4f32.p0v4f32(<4 x float>*, i64, i1, i32, i32) +declare <6 x float> @llvm.matrix.column.major.load.v6f32.p0v6f32(<6 x float>*, i64, i1, i32, i32) +define <4 x float> @column.major_load(<4 x float>* %m, <6 x float>* %n) { ; CHECK-NEXT: result of a matrix operation does not fit in the returned vector ; CHECK-NEXT: result of a matrix operation does not fit in the returned vector - %result.1 = call <4 x float> @llvm.matrix.columnwise.load.v4f32.p0v4f32(<4 x float>* %m, i32 2, i32 1, i32 2) - %result.2 = call <6 x float> @llvm.matrix.columnwise.load.v6f32.p0v6f32(<6 x float>* %n, i32 2, i32 3, i32 3) + %result.1 = call <4 x float> @llvm.matrix.column.major.load.v4f32.p0v4f32(<4 x float>* %m, i64 2, i1 false, i32 1, i32 2) + %result.2 = call <6 x float> @llvm.matrix.column.major.load.v6f32.p0v6f32(<6 x float>* %n, i64 2, i1 true, i32 3, i32 3) ret <4 x float> %result.1 } -declare void @llvm.matrix.columnwise.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, i32, i32) -declare void @llvm.matrix.columnwise.store.v6f32.p0v6f32(<6 x float>, <6 x float>*, i32, i32, i32) -define void @columnwise_store(<4 x float>* %m, <6 x float>* %n) { +declare void @llvm.matrix.column.major.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i64, i1, i32, i32) +declare void @llvm.matrix.column.major.store.v6f32.p0v6f32(<6 x float>, <6 x float>*, i64, i1, i32, i32) +define void @column.major_store(<4 x float>* %m, <6 x float>* %n) { ; CHECK-NEXT: result of a matrix operation does not fit in the returned vector ; CHECK-NEXT: result of a matrix operation does not fit in the returned vector - call void @llvm.matrix.columnwise.store.v4f32.p0v4f32(<4 x float> zeroinitializer, <4 x float>* %m, i32 2, i32 1, i32 2) - call void @llvm.matrix.columnwise.store.v6f32.p0v6f32(<6 x 
float> zeroinitializer, <6 x float>* %n, i32 2, i32 3, i32 3) + call void @llvm.matrix.column.major.store.v4f32.p0v4f32(<4 x float> zeroinitializer, <4 x float>* %m, i64 2, i1 false, i32 1, i32 2) + call void @llvm.matrix.column.major.store.v6f32.p0v6f32(<6 x float> zeroinitializer, <6 x float>* %n, i64 2, i1 false, i32 3, i32 3) ret void } diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -815,40 +815,52 @@ // LLVM Matrix operations. // -/// Create a columnwise, strided 2-D matrix load, as specified in the LLVM +/// Create a column major, strided 2-D matrix load, as specified in the LLVM /// MatrixBuilder. -/// data - Start address of the matrix read -/// rows - Number of rows in matrix (must be a constant) -/// columns - Number of columns in matrix (must be a constant) -/// stride - Space between columns -def LLVM_MatrixColumnsWiseLoadOp - : LLVM_OneResultOp<"intr.matrix.columnwise.load">, - Arguments<(ins LLVM_Type:$data, LLVM_Type:$stride, +/// data - Start address of the matrix read +/// rows - Number of rows in matrix (must be a constant) +/// isVolatile - True if the load operation is marked as volatile. +/// columns - Number of columns in matrix (must be a constant) +/// stride - Space between columns +def LLVM_MatrixColumnMajorLoadOp + : LLVM_OneResultOp<"intr.matrix.column.major.load">, + Arguments<(ins LLVM_Type:$data, LLVM_Type:$stride, I1Attr:$isVolatile, I32Attr:$rows, I32Attr:$columns)> { string llvmBuilder = [{ llvm::MatrixBuilder mb(builder); - $res = mb.CreateMatrixColumnwiseLoad( - $data, $rows.getZExtValue(), $columns.getZExtValue(), $stride); + const llvm::DataLayout &dl = + builder.GetInsertBlock()->getModule()->getDataLayout(); + llvm::Align align = dl.getABITypeAlign( + $data->getType()->getPointerElementType()); + $res = mb.CreateColumnMajorLoad( + $data, align, $stride, $isVolatile.getZExtValue(), $rows.getZExtValue(), + $columns.getZExtValue()); }]; let assemblyFormat = "$data `,` `<` `stride` `=` $stride `>` attr-dict" "`:` type($res) `from` type($data) `stride` type($stride)"; } -/// Create a columnwise, strided 2-D matrix store, as specified in the LLVM +/// Create a column major, strided 2-D matrix store, as specified in the LLVM /// MatrixBuilder. -/// matrix - Matrix to store -/// ptr - Pointer to write back to -/// rows - Number of rows in matrix (must be a constant) -/// columns - Number of columns in matrix (must be a constant) -/// stride - Space between columns -def LLVM_MatrixColumnsWiseStoreOp - : LLVM_ZeroResultOp<"intr.matrix.columnwise.store">, +/// matrix - Matrix to store +/// ptr - Pointer to write back to +/// isVolatile - True if the store operation is marked as volatile.
+/// rows - Number of rows in matrix (must be a constant) +/// columns - Number of columns in matrix (must be a constant) +/// stride - Space between columns +def LLVM_MatrixColumnMajorStoreOp + : LLVM_ZeroResultOp<"intr.matrix.column.major.store">, Arguments<(ins LLVM_Type:$matrix, LLVM_Type:$data, LLVM_Type:$stride, - I32Attr:$rows, I32Attr:$columns)> { + I1Attr:$isVolatile, I32Attr:$rows, I32Attr:$columns)> { string llvmBuilder = [{ llvm::MatrixBuilder mb(builder); - mb.CreateMatrixColumnwiseStore( - $matrix, $data, $stride, $rows.getZExtValue(), $columns.getZExtValue()); + const llvm::DataLayout &dl = + builder.GetInsertBlock()->getModule()->getDataLayout(); + llvm::Align align = dl.getABITypeAlign( + $data->getType()->getPointerElementType()); + mb.CreateColumnMajorStore( + $matrix, $data, align, $stride, $isVolatile.getZExtValue(), + $rows.getZExtValue(), $columns.getZExtValue()); }]; let assemblyFormat = "$matrix `,` $data `,` `<` `stride` `=` $stride `>` " "attr-dict`:` type($matrix) `to` type($data) `stride` type($stride)"; diff --git a/mlir/test/Target/llvmir-intrinsics.mlir b/mlir/test/Target/llvmir-intrinsics.mlir --- a/mlir/test/Target/llvmir-intrinsics.mlir +++ b/mlir/test/Target/llvmir-intrinsics.mlir @@ -151,7 +151,7 @@ // CHECK-LABEL: @matrix_intrinsics // 4x16 16x3 llvm.func @matrix_intrinsics(%A: !llvm<"<64 x float>">, %B: !llvm<"<48 x float>">, - %ptr: !llvm<"float*">, %stride: !llvm.i32) { + %ptr: !llvm<"float*">, %stride: !llvm.i64) { // CHECK: call <12 x float> @llvm.matrix.multiply.v12f32.v64f32.v48f32(<64 x float> %0, <48 x float> %1, i32 4, i32 16, i32 3) %C = llvm.intr.matrix.multiply %A, %B { lhs_rows = 4: i32, lhs_columns = 16: i32 , rhs_columns = 3: i32} : @@ -159,14 +159,14 @@ // CHECK: call <48 x float> @llvm.matrix.transpose.v48f32(<48 x float> %1, i32 3, i32 16) %D = llvm.intr.matrix.transpose %B { rows = 3: i32, columns = 16: i32} : !llvm<"<48 x float>"> into !llvm<"<48 x float>"> - // CHECK: call <48 x float> @llvm.matrix.columnwise.load.v48f32.p0f32(float* %2, i32 %3, i32 3, i32 16) - %E = llvm.intr.matrix.columnwise.load %ptr, - { rows = 3: i32, columns = 16: i32} : - !llvm<"<48 x float>"> from !llvm<"float*"> stride !llvm.i32 - // CHECK: call void @llvm.matrix.columnwise.store.v48f32.p0f32(<48 x float> %7, float* %2, i32 %3, i32 3, i32 16) - llvm.intr.matrix.columnwise.store %E, %ptr, - { rows = 3: i32, columns = 16: i32} : - !llvm<"<48 x float>"> to !llvm<"float*"> stride !llvm.i32 + // CHECK: call <48 x float> @llvm.matrix.column.major.load.v48f32.p0f32(float* align 4 %2, i64 %3, i1 false, i32 3, i32 16) + %E = llvm.intr.matrix.column.major.load %ptr, + { isVolatile = 0: i1, rows = 3: i32, columns = 16: i32} : + !llvm<"<48 x float>"> from !llvm<"float*"> stride !llvm.i64 + // CHECK: call void @llvm.matrix.column.major.store.v48f32.p0f32(<48 x float> %7, float* align 4 %2, i64 %3, i1 false, i32 3, i32 16) + llvm.intr.matrix.column.major.store %E, %ptr, + { isVolatile = 0: i1, rows = 3: i32, columns = 16: i32} : + !llvm<"<48 x float>"> to !llvm<"float*"> stride !llvm.i64 llvm.return } @@ -209,7 +209,7 @@ // CHECK-DAG: declare float @llvm.copysign.f32(float, float) // CHECK-DAG: declare <12 x float> @llvm.matrix.multiply.v12f32.v64f32.v48f32(<64 x float>, <48 x float>, i32 immarg, i32 immarg, i32 immarg) // CHECK-DAG: declare <48 x float> @llvm.matrix.transpose.v48f32(<48 x float>, i32 immarg, i32 immarg) -// CHECK-DAG: declare <48 x float> @llvm.matrix.columnwise.load.v48f32.p0f32(float* nocapture, i32, i32 immarg, i32 immarg) -// CHECK-DAG: declare 
void @llvm.matrix.columnwise.store.v48f32.p0f32(<48 x float>, float* nocapture writeonly, i32, i32 immarg, i32 immarg)
+// CHECK-DAG: declare <48 x float> @llvm.matrix.column.major.load.v48f32.p0f32(float* nocapture, i64, i1 immarg, i32 immarg, i32 immarg)
+// CHECK-DAG: declare void @llvm.matrix.column.major.store.v48f32.p0f32(<48 x float>, float* nocapture writeonly, i64, i1 immarg, i32 immarg, i32 immarg)
 // CHECK-DAG: declare <7 x float> @llvm.masked.load.v7f32.p0v7f32(<7 x float>*, i32 immarg, <7 x i1>, <7 x float>)
 // CHECK-DAG: declare void @llvm.masked.store.v7f32.p0v7f32(<7 x float>, <7 x float>*, i32 immarg, <7 x i1>)
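
For quick reference, the snippet below is a minimal hand-written sketch (it is not part of the patch) showing the renamed intrinsics end-to-end with the widened i64 stride, the new i1 volatile flag, and an optional align parameter attribute on the pointer operand; the function and value names are illustrative only:

; Hand-written example (not from the patch): load a 2x2 double matrix whose
; columns start 2 elements apart, scale it, and store it back as a volatile access.
define <4 x double> @column_major_roundtrip(<4 x double>* %src, <4 x double>* %dst) {
  %m = call <4 x double> @llvm.matrix.column.major.load.v4f64.p0v4f64(<4 x double>* align 16 %src, i64 2, i1 false, i32 2, i32 2)
  %m2 = fadd <4 x double> %m, %m
  call void @llvm.matrix.column.major.store.v4f64.p0v4f64(<4 x double> %m2, <4 x double>* align 16 %dst, i64 2, i1 true, i32 2, i32 2)
  ret <4 x double> %m2
}

declare <4 x double> @llvm.matrix.column.major.load.v4f64.p0v4f64(<4 x double>*, i64, i1, i32, i32)
declare void @llvm.matrix.column.major.store.v4f64.p0v4f64(<4 x double>, <4 x double>*, i64, i1, i32, i32)

The stride (2 here) must be at least the number of rows, and the align attribute on %Ptr is optional; the MLIR lowering in this patch, for instance, supplies the ABI alignment of the pointee type.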