diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -15466,7 +15466,7 @@
 * elements.

-'``llvm.matrix.columnwise.load.*``' Intrinsic
+'``llvm.matrix.column.major.load.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 Syntax:
@@ -15474,25 +15474,34 @@

 ::

-      declare vectorty @llvm.matrix.columnwise.load.*(ptrty %Ptr, i32 %Stride, i32 <Rows>, i32 <Cols>)
+      declare vectorty @llvm.matrix.column.major.load.*(
+          ptrty %Ptr, i64 %Stride, i1 <IsVolatile>, i32 <Rows>, i32 <Cols>)

 Overview:
 """""""""

-The '``llvm.matrix.columnwise.load.*``' intrinsic loads a matrix with <Rows>
+The '``llvm.matrix.column.major.load.*``' intrinsic loads a matrix with <Rows>
 rows and <Cols> columns, using a stride of %Stride between columns. For two
 consecutive columns A and B, %Stride refers to the distance (the number of
 elements) between the start of column A and the start of column B. The result
 matrix is returned embedded in the result vector. This allows for convenient
-loading of sub matrixes.
+loading of sub matrixes. If <IsVolatile> is true, the intrinsic is considered
+a :ref:`volatile memory access <volatile>`.
+
+If the %Ptr argument is known to be aligned to some boundary, this can be
+specified as an attribute on the argument.

 Arguments:
 """"""""""

-The <Rows> and <Cols> arguments must be constant integers. The returned vector
-must have <Rows> * <Cols> elements. %Stride must be >= <Rows>.
+The <IsVolatile>, <Rows> and <Cols> arguments must be constant integers. The
+returned vector must have <Rows> * <Cols> elements. %Stride must be >= <Rows>.
+
+The :ref:`align <attr_align>` parameter attribute can be provided
+for the %Ptr arguments.

-'``llvm.matrix.columnwise.store.*``' Intrinsic
+
+'``llvm.matrix.column.major.store.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 Syntax:
@@ -15500,22 +15509,31 @@

 ::

-      declare void @llvm.matrix.columnwise.store.*(vectorty %In, ptrty %Ptr, i32 %Stride, i32 <Rows>, i32 <Cols>)
+      declare void @llvm.matrix.column.major.store.*(
+          vectorty %In, ptrty %Ptr, i64 %Stride, i1 <IsVolatile>, i32 <Rows>, i32 <Cols>)

 Overview:
 """""""""

-The '``llvm.matrix.columnwise.store.*``' intrinsic stores the matrix with
+The '``llvm.matrix.column.major.store.*``' intrinsic stores the matrix with
 <Rows> rows and <Cols> columns embedded in %In, using a stride of %Stride
 between columns. For two consecutive columns A and B, %Stride refers to the
 distance (the number of elements) between the start of column A and the start
-of column B.
+of column B. If <IsVolatile> is true, the intrinsic is considered a
+:ref:`volatile memory access <volatile>`.
+
+If the %Ptr argument is known to be aligned to some boundary, this can be
+specified as an attribute on the argument.

 Arguments:
 """"""""""

-The <Rows> and <Cols> arguments must be constant integers. The vector argument
-%In must have <Rows> * <Cols> elements. %Stride must be >= <Rows>.
+The <IsVolatile>, <Rows>, <Cols> arguments must be constant integers. The
+vector argument %In must have <Rows> * <Cols> elements. %Stride must be >= <Rows>.
+
+The :ref:`align <attr_align>` parameter attribute can be provided
+for the %Ptr arguments.
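(Reading aid, not part of the diff: a minimal sketch of calls to the updated
intrinsics under the signatures documented above. The ``.v9f64.p0f64`` overload
suffix, the function name ``@copy_3x3``, and the ``align 8`` call-site
attribute are illustrative assumptions; a 3 x 3 double sub-matrix is loaded
from %Src with a stride of 5 elements between columns and stored back to %Dst,
with ``i1 false`` marking both accesses as non-volatile.)::

      define void @copy_3x3(double* %Src, double* %Dst) {
        ; i64 stride, i1 volatile flag, then the immediate <Rows> and <Cols>.
        %m = call <9 x double> @llvm.matrix.column.major.load.v9f64.p0f64(double* align 8 %Src, i64 5, i1 false, i32 3, i32 3)
        call void @llvm.matrix.column.major.store.v9f64.p0f64(<9 x double> %m, double* align 8 %Dst, i64 5, i1 false, i32 3, i32 3)
        ret void
      }

      declare <9 x double> @llvm.matrix.column.major.load.v9f64.p0f64(double*, i64, i1 immarg, i32 immarg, i32 immarg)
      declare void @llvm.matrix.column.major.store.v9f64.p0f64(<9 x double>, double*, i64, i1 immarg, i32 immarg, i32 immarg)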
+
 Half Precision Floating-Point Intrinsics
 ----------------------------------------
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1449,21 +1449,21 @@
               [IntrNoSync, IntrWillReturn, IntrNoMem, IntrSpeculatable,
                ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;

-def int_matrix_columnwise_load
+def int_matrix_column_major_load
   : Intrinsic<[llvm_anyvector_ty],
-              [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty, llvm_i32_ty,
-               llvm_i32_ty],
+              [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i64_ty, llvm_i1_ty,
+               llvm_i32_ty, llvm_i32_ty],
               [IntrNoSync, IntrWillReturn, IntrArgMemOnly, IntrReadMem,
-               NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<2>>,
-               ImmArg<ArgIndex<3>>]>;
+               NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>,
+               ImmArg<ArgIndex<4>>]>;

-def int_matrix_columnwise_store
+def int_matrix_column_major_store
   : Intrinsic<[],
               [llvm_anyvector_ty, LLVMAnyPointerType<LLVMMatchType<0>>,
-               llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+               llvm_i64_ty, llvm_i1_ty, llvm_i32_ty, llvm_i32_ty],
               [IntrNoSync, IntrWillReturn, IntrArgMemOnly, IntrWriteMem,
                WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
-               ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
+               ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;

 //===---------- Intrinsics to control hardware supported loops ----------===//
diff --git a/llvm/include/llvm/IR/MatrixBuilder.h b/llvm/include/llvm/IR/MatrixBuilder.h
--- a/llvm/include/llvm/IR/MatrixBuilder.h
+++ b/llvm/include/llvm/IR/MatrixBuilder.h
@@ -22,6 +22,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
+#include "llvm/Support/Alignment.h"

 namespace llvm {

@@ -51,14 +52,14 @@
 public:
   MatrixBuilder(IRBuilderTy &Builder) : B(Builder) {}

-  /// Create a columnwise, strided matrix load.
+  /// Create a column major, strided matrix load.
   /// \p DataPtr - Start address of the matrix read
   /// \p Rows    - Number of rows in matrix (must be a constant)
   /// \p Columns - Number of columns in matrix (must be a constant)
   /// \p Stride  - Space between columns
-  CallInst *CreateMatrixColumnwiseLoad(Value *DataPtr, unsigned Rows,
-                                       unsigned Columns, Value *Stride,
-                                       const Twine &Name = "") {
+  CallInst *CreateColumnMajorLoad(Value *DataPtr, Align Alignment,
+                                  Value *Stride, bool IsVolatile, unsigned Rows,
+                                  unsigned Columns, const Twine &Name = "") {

     // Deal with the pointer
     PointerType *PtrTy = cast<PointerType>(DataPtr->getType());
@@ -66,30 +67,41 @@

     auto *RetType = FixedVectorType::get(EltTy, Rows * Columns);

-    Value *Ops[] = {DataPtr, Stride, B.getInt32(Rows), B.getInt32(Columns)};
+    Value *Ops[] = {DataPtr, Stride, B.getInt1(IsVolatile), B.getInt32(Rows),
+                    B.getInt32(Columns)};
     Type *OverloadedTypes[] = {RetType, PtrTy};

     Function *TheFn = Intrinsic::getDeclaration(
-        getModule(), Intrinsic::matrix_columnwise_load, OverloadedTypes);
+        getModule(), Intrinsic::matrix_column_major_load, OverloadedTypes);

-    return B.CreateCall(TheFn->getFunctionType(), TheFn, Ops, Name);
+    CallInst *Call = B.CreateCall(TheFn->getFunctionType(), TheFn, Ops, Name);
+    Attribute AlignAttr =
+        Attribute::getWithAlignment(Call->getContext(), Alignment);
+    Call->addAttribute(1, AlignAttr);
+    return Call;
   }

-  /// Create a columnwise, strided matrix store.
+  /// Create a column major, strided matrix store.
   /// \p Matrix  - Matrix to store
   /// \p Ptr     - Pointer to write back to
   /// \p Stride  - Space between columns
-  CallInst *CreateMatrixColumnwiseStore(Value *Matrix, Value *Ptr,
-                                        Value *Stride, unsigned Rows,
-                                        unsigned Columns,
-                                        const Twine &Name = "") {
-    Value *Ops[] = {Matrix, Ptr, Stride, B.getInt32(Rows), B.getInt32(Columns)};
+  CallInst *CreateColumnMajorStore(Value *Matrix, Value *Ptr, Align Alignment,
+                                   Value *Stride, bool IsVolatile,
+                                   unsigned Rows, unsigned Columns,
+                                   const Twine &Name = "") {
+    Value *Ops[] = {Matrix,           Ptr,
+                    Stride,           B.getInt1(IsVolatile),
+                    B.getInt32(Rows), B.getInt32(Columns)};
     Type *OverloadedTypes[] = {Matrix->getType(), Ptr->getType()};

     Function *TheFn = Intrinsic::getDeclaration(
-        getModule(), Intrinsic::matrix_columnwise_store, OverloadedTypes);
+        getModule(), Intrinsic::matrix_column_major_store, OverloadedTypes);

-    return B.CreateCall(TheFn->getFunctionType(), TheFn, Ops, Name);
+    CallInst *Call = B.CreateCall(TheFn->getFunctionType(), TheFn, Ops, Name);
+    Attribute AlignAttr =
+        Attribute::getWithAlignment(Call->getContext(), Alignment);
+    Call->addAttribute(2, AlignAttr);
+    return Call;
   }

   /// Create a llvm.matrix.transpose call, transposing \p Matrix with \p Rows
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -4992,8 +4992,8 @@
   }
   case Intrinsic::matrix_multiply:
   case Intrinsic::matrix_transpose:
-  case Intrinsic::matrix_columnwise_load:
-  case Intrinsic::matrix_columnwise_store: {
+  case Intrinsic::matrix_column_major_load:
+  case Intrinsic::matrix_column_major_store: {
     ConstantInt *NumRows;
     ConstantInt *NumColumns;
     VectorType *TypeToCheck;
@@ -5008,14 +5008,14 @@
       NumColumns = cast<ConstantInt>(Call.getArgOperand(2));
       TypeToCheck = cast<VectorType>(Call.getType());
       break;
-    case Intrinsic::matrix_columnwise_load:
-      NumRows = cast<ConstantInt>(Call.getArgOperand(2));
-      NumColumns = cast<ConstantInt>(Call.getArgOperand(3));
-      TypeToCheck = cast<VectorType>(Call.getType());
-      break;
-    case Intrinsic::matrix_columnwise_store:
+    case Intrinsic::matrix_column_major_load:
       NumRows = cast<ConstantInt>(Call.getArgOperand(3));
       NumColumns = cast<ConstantInt>(Call.getArgOperand(4));
+      TypeToCheck = cast<VectorType>(Call.getType());
+      break;
+    case Intrinsic::matrix_column_major_store:
+      NumRows = cast<ConstantInt>(Call.getArgOperand(4));
+      NumColumns = cast<ConstantInt>(Call.getArgOperand(5));
       TypeToCheck = cast<VectorType>(Call.getArgOperand(0)->getType());
       break;
     default:
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -164,10 +164,10 @@
 ///       definition of an argument, use the produced column vectors directly.
 ///       If not, split the operand vector containing an embedded matrix into
 ///       a set of column vectors,
-/// 2.2. Lower the instruction in terms of columnwise operations, which yields
-///      a set of column vectors containing result matrix. Note that we lower
-///      all instructions that have shape information. Besides the intrinsics,
-///      this includes stores for example.
+/// 2.2. Lower the instruction in terms of column major operations, which
+///      yields a set of column vectors containing result matrix. Note that we
+///      lower all instructions that have shape information. Besides the
+///      intrinsics, this includes stores for example.
 /// 2.3. Update uses of the lowered instruction. If we have shape information
 ///      for a user, there is nothing to do, as we will look up the result
 ///      column matrix when lowering the user. For other uses, we embed the
@@ -376,7 +376,7 @@
   /// Maps instructions to their shape information. The shape information
   /// describes the shape to be used while lowering. This matches the shape of
   /// the result value of the instruction, with the only exceptions being store
-  /// instructions and the matrix_columnwise_store intrinsics. For those, the
+  /// instructions and the matrix_column_major_store intrinsics. For those, the
   /// shape information indicates that those instructions should be lowered
   /// using shape information as well.
   DenseMap<Value *, ShapeInfo> ShapeMap;
@@ -502,8 +502,8 @@
     switch (II->getIntrinsicID()) {
     case Intrinsic::matrix_multiply:
     case Intrinsic::matrix_transpose:
-    case Intrinsic::matrix_columnwise_load:
-    case Intrinsic::matrix_columnwise_store:
+    case Intrinsic::matrix_column_major_load:
+    case Intrinsic::matrix_column_major_store:
       return true;
     default:
       return false;
@@ -542,13 +542,13 @@
                          m_Value(MatrixA), m_Value(M), m_Value(N)))) {
       // Flip dimensions.
       Propagate = setShapeInfo(Inst, {N, M});
-    } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_columnwise_store>(
+    } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_column_major_store>(
                          m_Value(MatrixA), m_Value(), m_Value(),
-                         m_Value(M), m_Value(N)))) {
+                         m_Value(), m_Value(M), m_Value(N)))) {
       Propagate = setShapeInfo(Inst, {N, M});
-    } else if (match(Inst,
-                     m_Intrinsic<Intrinsic::matrix_columnwise_load>(
-                         m_Value(), m_Value(), m_Value(M), m_Value(N)))) {
+    } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_column_major_load>(
+                         m_Value(), m_Value(), m_Value(), m_Value(M),
+                         m_Value(N)))) {
       Propagate = setShapeInfo(Inst, {M, N});
     } else if (match(Inst, m_Store(m_Value(MatrixA), m_Value()))) {
       auto OpShape = ShapeMap.find(MatrixA);
@@ -620,14 +620,14 @@
       // Flip dimensions.
       if (setShapeInfo(MatrixA, {M, N}))
         pushInstruction(MatrixA, WorkList);
-    } else if (match(V, m_Intrinsic<Intrinsic::matrix_columnwise_store>(
-                        m_Value(MatrixA), m_Value(), m_Value(),
+    } else if (match(V, m_Intrinsic<Intrinsic::matrix_column_major_store>(
+                        m_Value(MatrixA), m_Value(), m_Value(), m_Value(),
                         m_Value(M), m_Value(N)))) {
       if (setShapeInfo(MatrixA, {M, N})) {
         pushInstruction(MatrixA, WorkList);
       }
     } else if (isa<LoadInst>(V) ||
-               match(V, m_Intrinsic<Intrinsic::matrix_columnwise_load>())) {
+               match(V, m_Intrinsic<Intrinsic::matrix_column_major_load>())) {
       // Nothing to do, no matrix input.
     } else if (isa<StoreInst>(V)) {
       // Nothing to do.
We forward-propagated to this so we would just @@ -666,8 +666,8 @@ switch (II->getIntrinsicID()) { case Intrinsic::matrix_multiply: case Intrinsic::matrix_transpose: - case Intrinsic::matrix_columnwise_load: - case Intrinsic::matrix_columnwise_store: + case Intrinsic::matrix_column_major_load: + case Intrinsic::matrix_column_major_store: WorkList.push_back(&Inst); break; default: @@ -763,11 +763,11 @@ case Intrinsic::matrix_transpose: LowerTranspose(Inst); break; - case Intrinsic::matrix_columnwise_load: - LowerColumnwiseLoad(Inst); + case Intrinsic::matrix_column_major_load: + LowerColumnMajorLoad(Inst); break; - case Intrinsic::matrix_columnwise_store: - LowerColumnwiseStore(Inst); + case Intrinsic::matrix_column_major_store: + LowerColumnMajorStore(Inst); break; default: return false; @@ -783,7 +783,7 @@ Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder); MatrixTy Result; for (unsigned I = 0, E = Shape.getNumVectors(); I < E; ++I) { - Value *GEP = computeVectorAddr(EltPtr, Builder.getInt32(I), Stride, + Value *GEP = computeVectorAddr(EltPtr, Builder.getInt64(I), Stride, Shape.getStride(), VType->getElementType(), Builder); Value *Vector = createVectorLoad(GEP, VType->getElementType(), Builder); @@ -800,7 +800,7 @@ IRBuilder<> &Builder) { Value *Offset = Builder.CreateAdd( - Builder.CreateMul(J, Builder.getInt32(MatrixShape.getStride())), I); + Builder.CreateMul(J, Builder.getInt64(MatrixShape.getStride())), I); unsigned AS = cast(MatrixPtr->getType())->getAddressSpace(); Value *EltPtr = @@ -813,7 +813,7 @@ Builder.CreatePointerCast(TileStart, TilePtrTy, "col.cast"); return loadMatrix(TileTy, TilePtr, - Builder.getInt32(MatrixShape.getStride()), ResultShape, + Builder.getInt64(MatrixShape.getStride()), ResultShape, Builder); } @@ -826,16 +826,16 @@ Builder); } - /// Lowers llvm.matrix.columnwise.load. + /// Lowers llvm.matrix.column.major.load. /// /// The intrinsic loads a matrix from memory using a stride between columns. 
- void LowerColumnwiseLoad(CallInst *Inst) { + void LowerColumnMajorLoad(CallInst *Inst) { assert(MatrixLayout == MatrixLayoutTy::ColumnMajor && "Intrinsic only supports column-major layout!"); Value *Ptr = Inst->getArgOperand(0); Value *Stride = Inst->getArgOperand(1); LowerLoad(Inst, Ptr, Stride, - {Inst->getArgOperand(2), Inst->getArgOperand(3)}); + {Inst->getArgOperand(3), Inst->getArgOperand(4)}); } /// Stores a sub-matrix \p StoreVal into the \p R x \p C matrix starting at \p @@ -844,7 +844,7 @@ ShapeInfo MatrixShape, Value *I, Value *J, Type *EltTy, IRBuilder<> &Builder) { Value *Offset = Builder.CreateAdd( - Builder.CreateMul(J, Builder.getInt32(MatrixShape.getStride())), I); + Builder.CreateMul(J, Builder.getInt64(MatrixShape.getStride())), I); unsigned AS = cast(MatrixPtr->getType())->getAddressSpace(); Value *EltPtr = @@ -857,7 +857,7 @@ Builder.CreatePointerCast(TileStart, TilePtrTy, "col.cast"); storeMatrix(TileTy, StoreVal, TilePtr, - Builder.getInt32(MatrixShape.getStride()), Builder); + Builder.getInt64(MatrixShape.getStride()), Builder); } /// Store matrix \p StoreVal starting at \p Ptr and using \p Stride between @@ -867,7 +867,7 @@ auto VType = cast(Ty); Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder); for (auto Vec : enumerate(StoreVal.vectors())) { - Value *GEP = computeVectorAddr(EltPtr, Builder.getInt32(Vec.index()), + Value *GEP = computeVectorAddr(EltPtr, Builder.getInt64(Vec.index()), Stride, StoreVal.getStride(), VType->getElementType(), Builder); createVectorStore(Vec.value(), GEP, VType->getElementType(), Builder); @@ -886,17 +886,17 @@ Builder); } - /// Lowers llvm.matrix.columnwise.store. + /// Lowers llvm.matrix.column.major.store. /// /// The intrinsic store a matrix back memory using a stride between columns. 
- void LowerColumnwiseStore(CallInst *Inst) { + void LowerColumnMajorStore(CallInst *Inst) { assert(MatrixLayout == MatrixLayoutTy::ColumnMajor && "Intrinsic only supports column-major layout!"); Value *Matrix = Inst->getArgOperand(0); Value *Ptr = Inst->getArgOperand(1); Value *Stride = Inst->getArgOperand(2); LowerStore(Inst, Matrix, Ptr, Stride, - {Inst->getArgOperand(3), Inst->getArgOperand(4)}); + {Inst->getArgOperand(4), Inst->getArgOperand(5)}); } // Set elements I..I+NumElts-1 to Block @@ -1208,14 +1208,14 @@ for (unsigned K = 0; K < M; K += TileSize) { const unsigned TileM = std::min(M - K, unsigned(TileSize)); MatrixTy A = - loadMatrix(APtr, LShape, Builder.getInt32(I), Builder.getInt32(K), + loadMatrix(APtr, LShape, Builder.getInt64(I), Builder.getInt64(K), {TileR, TileM}, EltType, Builder); MatrixTy B = - loadMatrix(BPtr, RShape, Builder.getInt32(K), Builder.getInt32(J), + loadMatrix(BPtr, RShape, Builder.getInt64(K), Builder.getInt64(J), {TileM, TileC}, EltType, Builder); emitMatrixMultiply(Res, A, B, AllowContract, Builder, true); } - storeMatrix(Res, CPtr, {R, M}, Builder.getInt32(I), Builder.getInt32(J), + storeMatrix(Res, CPtr, {R, M}, Builder.getInt64(I), Builder.getInt64(J), EltType, Builder); } @@ -1329,7 +1329,7 @@ if (I == ShapeMap.end()) return false; - LowerLoad(Inst, Ptr, Builder.getInt32(I->second.getStride()), I->second); + LowerLoad(Inst, Ptr, Builder.getInt64(I->second.getStride()), I->second); return true; } @@ -1339,7 +1339,7 @@ if (I == ShapeMap.end()) return false; - LowerStore(Inst, StoredVal, Ptr, Builder.getInt32(I->second.getStride()), + LowerStore(Inst, StoredVal, Ptr, Builder.getInt64(I->second.getStride()), I->second); return true; } @@ -1507,11 +1507,11 @@ prettyPrintMatrixType(II->getOperand(0), SS); SS << "." << *II->getType()->getScalarType(); break; - case Intrinsic::matrix_columnwise_load: + case Intrinsic::matrix_column_major_load: prettyPrintMatrixType(II, SS); SS << "." << *II->getType()->getScalarType(); break; - case Intrinsic::matrix_columnwise_store: + case Intrinsic::matrix_column_major_store: prettyPrintMatrixType(II->getOperand(0), SS); SS << "." 
<< *II->getOperand(0)->getType()->getScalarType(); break; @@ -1529,9 +1529,10 @@ case Intrinsic::matrix_multiply: return 3; case Intrinsic::matrix_transpose: - case Intrinsic::matrix_columnwise_load: - case Intrinsic::matrix_columnwise_store: return 2; + case Intrinsic::matrix_column_major_load: + case Intrinsic::matrix_column_major_store: + return 3; default: return 0; } @@ -1626,7 +1627,7 @@ write(std::string("(")); unsigned NumOpsToBreak = 1; - if (match(Expr, m_Intrinsic())) + if (match(Expr, m_Intrinsic())) NumOpsToBreak = 2; for (Value *Op : Ops) { diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/bigger-expressions-double.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/bigger-expressions-double.ll --- a/llvm/test/Transforms/LowerMatrixIntrinsics/bigger-expressions-double.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/bigger-expressions-double.ll @@ -11,19 +11,19 @@ ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <9 x double>* [[A_PTR:%.*]] to double* ; CHECK-NEXT: [[COL_CAST:%.*]] = bitcast double* [[TMP0]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST]], align 8 -; CHECK-NEXT: [[COL_GEP:%.*]] = getelementptr double, double* [[TMP0]], i32 3 +; CHECK-NEXT: [[COL_GEP:%.*]] = getelementptr double, double* [[TMP0]], i64 3 ; CHECK-NEXT: [[COL_CAST1:%.*]] = bitcast double* [[COL_GEP]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST1]], align 8 -; CHECK-NEXT: [[COL_GEP3:%.*]] = getelementptr double, double* [[TMP0]], i32 6 +; CHECK-NEXT: [[COL_GEP3:%.*]] = getelementptr double, double* [[TMP0]], i64 6 ; CHECK-NEXT: [[COL_CAST4:%.*]] = bitcast double* [[COL_GEP3]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD5:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST4]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <9 x double>* [[B_PTR:%.*]] to double* ; CHECK-NEXT: [[COL_CAST6:%.*]] = bitcast double* [[TMP1]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD7:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST6]], align 8 -; CHECK-NEXT: [[COL_GEP8:%.*]] = getelementptr double, double* [[TMP1]], i32 3 +; CHECK-NEXT: [[COL_GEP8:%.*]] = getelementptr double, double* [[TMP1]], i64 3 ; CHECK-NEXT: [[COL_CAST9:%.*]] = bitcast double* [[COL_GEP8]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD10:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST9]], align 8 -; CHECK-NEXT: [[COL_GEP11:%.*]] = getelementptr double, double* [[TMP1]], i32 6 +; CHECK-NEXT: [[COL_GEP11:%.*]] = getelementptr double, double* [[TMP1]], i64 6 ; CHECK-NEXT: [[COL_CAST12:%.*]] = bitcast double* [[COL_GEP11]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD13:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST12]], align 8 @@ -227,10 +227,10 @@ ; CHECK-NEXT: [[TMP108:%.*]] = bitcast <9 x double>* [[C_PTR:%.*]] to double* ; CHECK-NEXT: [[TMP109:%.*]] = bitcast double* [[TMP108]] to <3 x double>* ; CHECK-NEXT: store <3 x double> [[TMP47]], <3 x double>* [[TMP109]], align 8 -; CHECK-NEXT: [[TMP110:%.*]] = getelementptr double, double* [[TMP108]], i32 3 +; CHECK-NEXT: [[TMP110:%.*]] = getelementptr double, double* [[TMP108]], i64 3 ; CHECK-NEXT: [[TMP111:%.*]] = bitcast double* [[TMP110]] to <3 x double>* ; CHECK-NEXT: store <3 x double> [[TMP77]], <3 x double>* [[TMP111]], align 8 -; CHECK-NEXT: [[TMP112:%.*]] = getelementptr double, double* [[TMP108]], i32 6 +; CHECK-NEXT: [[TMP112:%.*]] = getelementptr double, double* [[TMP108]], i64 6 ; CHECK-NEXT: [[TMP113:%.*]] = bitcast double* [[TMP112]] to <3 x double>* ; CHECK-NEXT: store <3 x double> [[TMP107]], <3 x 
double>* [[TMP113]], align 8 ; CHECK-NEXT: ret void @@ -255,19 +255,19 @@ ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <9 x double>* [[A_PTR:%.*]] to double* ; CHECK-NEXT: [[COL_CAST:%.*]] = bitcast double* [[TMP0]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST]], align 8 -; CHECK-NEXT: [[COL_GEP:%.*]] = getelementptr double, double* [[TMP0]], i32 3 +; CHECK-NEXT: [[COL_GEP:%.*]] = getelementptr double, double* [[TMP0]], i64 3 ; CHECK-NEXT: [[COL_CAST1:%.*]] = bitcast double* [[COL_GEP]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST1]], align 8 -; CHECK-NEXT: [[COL_GEP3:%.*]] = getelementptr double, double* [[TMP0]], i32 6 +; CHECK-NEXT: [[COL_GEP3:%.*]] = getelementptr double, double* [[TMP0]], i64 6 ; CHECK-NEXT: [[COL_CAST4:%.*]] = bitcast double* [[COL_GEP3]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD5:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST4]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <9 x double>* [[B_PTR:%.*]] to double* ; CHECK-NEXT: [[COL_CAST6:%.*]] = bitcast double* [[TMP1]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD7:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST6]], align 8 -; CHECK-NEXT: [[COL_GEP8:%.*]] = getelementptr double, double* [[TMP1]], i32 3 +; CHECK-NEXT: [[COL_GEP8:%.*]] = getelementptr double, double* [[TMP1]], i64 3 ; CHECK-NEXT: [[COL_CAST9:%.*]] = bitcast double* [[COL_GEP8]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD10:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST9]], align 8 -; CHECK-NEXT: [[COL_GEP11:%.*]] = getelementptr double, double* [[TMP1]], i32 6 +; CHECK-NEXT: [[COL_GEP11:%.*]] = getelementptr double, double* [[TMP1]], i64 6 ; CHECK-NEXT: [[COL_CAST12:%.*]] = bitcast double* [[COL_GEP11]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD13:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST12]], align 8 @@ -474,10 +474,10 @@ ; CHECK-NEXT: [[TMP110:%.*]] = bitcast <9 x double>* [[C_PTR:%.*]] to double* ; CHECK-NEXT: [[COL_CAST92:%.*]] = bitcast double* [[TMP110]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD93:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST92]], align 8 -; CHECK-NEXT: [[COL_GEP94:%.*]] = getelementptr double, double* [[TMP110]], i32 3 +; CHECK-NEXT: [[COL_GEP94:%.*]] = getelementptr double, double* [[TMP110]], i64 3 ; CHECK-NEXT: [[COL_CAST95:%.*]] = bitcast double* [[COL_GEP94]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD96:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST95]], align 8 -; CHECK-NEXT: [[COL_GEP97:%.*]] = getelementptr double, double* [[TMP110]], i32 6 +; CHECK-NEXT: [[COL_GEP97:%.*]] = getelementptr double, double* [[TMP110]], i64 6 ; CHECK-NEXT: [[COL_CAST98:%.*]] = bitcast double* [[COL_GEP97]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD99:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST98]], align 8 @@ -492,10 +492,10 @@ ; CHECK-NEXT: [[TMP111:%.*]] = bitcast <9 x double>* [[C_PTR]] to double* ; CHECK-NEXT: [[TMP112:%.*]] = bitcast double* [[TMP111]] to <3 x double>* ; CHECK-NEXT: store <3 x double> [[TMP108]], <3 x double>* [[TMP112]], align 8 -; CHECK-NEXT: [[TMP113:%.*]] = getelementptr double, double* [[TMP111]], i32 3 +; CHECK-NEXT: [[TMP113:%.*]] = getelementptr double, double* [[TMP111]], i64 3 ; CHECK-NEXT: [[TMP114:%.*]] = bitcast double* [[TMP113]] to <3 x double>* ; CHECK-NEXT: store <3 x double> [[TMP109]], <3 x double>* [[TMP114]], align 8 -; CHECK-NEXT: [[TMP115:%.*]] = getelementptr double, double* [[TMP111]], i32 6 +; CHECK-NEXT: [[TMP115:%.*]] = getelementptr double, double* [[TMP111]], i64 6 
; CHECK-NEXT: [[TMP116:%.*]] = bitcast double* [[TMP115]] to <3 x double>* ; CHECK-NEXT: store <3 x double> [[TMP110]], <3 x double>* [[TMP116]], align 8 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/const-gep.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/const-gep.ll --- a/llvm/test/Transforms/LowerMatrixIntrinsics/const-gep.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/const-gep.ll @@ -14,7 +14,7 @@ ; CHECK-NEXT: store i32 [[R:%.*]], i32* [[R_ADDR]], align 4 ; CHECK-NEXT: store i32 [[C:%.*]], i32* [[C_ADDR]], align 4 ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, <2 x double>* bitcast ([5 x <4 x double>]* @foo to <2 x double>*), align 8 -; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr ([5 x <4 x double>], [5 x <4 x double>]* @foo, i32 0, i32 0, i32 2) to <2 x double>*), align 8 +; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr ([5 x <4 x double>], [5 x <4 x double>]* @foo, i32 0, i32 0, i64 2) to <2 x double>*), align 8 ; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x double> [[COL_LOAD]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> undef, double [[TMP0]], i32 0 @@ -68,7 +68,7 @@ ; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <1 x double> [[TMP25]], <1 x double> undef, <2 x i32> ; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <2 x double> [[TMP20]], <2 x double> [[TMP26]], <2 x i32> ; CHECK-NEXT: store <2 x double> [[COL_LOAD]], <2 x double>* bitcast (double* getelementptr inbounds ([5 x <4 x double>], [5 x <4 x double>]* @foo, i64 0, i64 2, i32 0) to <2 x double>*), align 8 -; CHECK-NEXT: store <2 x double> [[COL_LOAD1]], <2 x double>* bitcast (double* getelementptr ([5 x <4 x double>], [5 x <4 x double>]* @foo, i64 0, i64 2, i32 2) to <2 x double>*), align 8 +; CHECK-NEXT: store <2 x double> [[COL_LOAD1]], <2 x double>* bitcast (double* getelementptr ([5 x <4 x double>], [5 x <4 x double>]* @foo, i64 0, i64 2, i64 2) to <2 x double>*), align 8 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-add-sub-double-row-major.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-add-sub-double-row-major.ll --- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-add-sub-double-row-major.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-add-sub-double-row-major.ll @@ -11,16 +11,16 @@ ; RM-NEXT: [[TMP0:%.*]] = bitcast <6 x double>* [[A_PTR:%.*]] to double* ; RM-NEXT: [[VEC_CAST:%.*]] = bitcast double* [[TMP0]] to <3 x double>* ; RM-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST]], align 8 -; RM-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP0]], i32 3 +; RM-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP0]], i64 3 ; RM-NEXT: [[VEC_CAST1:%.*]] = bitcast double* [[VEC_GEP]] to <3 x double>* ; RM-NEXT: [[COL_LOAD2:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST1]], align 8 ; RM-NEXT: [[TMP1:%.*]] = bitcast <6 x double>* [[B_PTR:%.*]] to double* ; RM-NEXT: [[VEC_CAST3:%.*]] = bitcast double* [[TMP1]] to <2 x double>* ; RM-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST3]], align 8 -; RM-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, double* [[TMP1]], i32 2 +; RM-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, double* [[TMP1]], i64 2 ; RM-NEXT: [[VEC_CAST6:%.*]] = bitcast double* 
[[VEC_GEP5]] to <2 x double>* ; RM-NEXT: [[COL_LOAD7:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST6]], align 8 -; RM-NEXT: [[VEC_GEP8:%.*]] = getelementptr double, double* [[TMP1]], i32 4 +; RM-NEXT: [[VEC_GEP8:%.*]] = getelementptr double, double* [[TMP1]], i64 4 ; RM-NEXT: [[VEC_CAST9:%.*]] = bitcast double* [[VEC_GEP8]] to <2 x double>* ; RM-NEXT: [[COL_LOAD10:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST9]], align 8 ; RM-NEXT: [[TMP2:%.*]] = fadd <3 x double> [[COL_LOAD]], [[COL_LOAD]] @@ -28,7 +28,7 @@ ; RM-NEXT: [[TMP4:%.*]] = bitcast <6 x double>* [[A_PTR]] to double* ; RM-NEXT: [[VEC_CAST11:%.*]] = bitcast double* [[TMP4]] to <3 x double>* ; RM-NEXT: store <3 x double> [[TMP2]], <3 x double>* [[VEC_CAST11]], align 8 -; RM-NEXT: [[VEC_GEP12:%.*]] = getelementptr double, double* [[TMP4]], i32 3 +; RM-NEXT: [[VEC_GEP12:%.*]] = getelementptr double, double* [[TMP4]], i64 3 ; RM-NEXT: [[VEC_CAST13:%.*]] = bitcast double* [[VEC_GEP12]] to <3 x double>* ; RM-NEXT: store <3 x double> [[TMP3]], <3 x double>* [[VEC_CAST13]], align 8 ; RM-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[COL_LOAD4]], @@ -37,10 +37,10 @@ ; RM-NEXT: [[TMP8:%.*]] = bitcast <6 x double>* [[B_PTR]] to double* ; RM-NEXT: [[VEC_CAST14:%.*]] = bitcast double* [[TMP8]] to <2 x double>* ; RM-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[VEC_CAST14]], align 8 -; RM-NEXT: [[VEC_GEP15:%.*]] = getelementptr double, double* [[TMP8]], i32 2 +; RM-NEXT: [[VEC_GEP15:%.*]] = getelementptr double, double* [[TMP8]], i64 2 ; RM-NEXT: [[VEC_CAST16:%.*]] = bitcast double* [[VEC_GEP15]] to <2 x double>* ; RM-NEXT: store <2 x double> [[TMP6]], <2 x double>* [[VEC_CAST16]], align 8 -; RM-NEXT: [[VEC_GEP17:%.*]] = getelementptr double, double* [[TMP8]], i32 4 +; RM-NEXT: [[VEC_GEP17:%.*]] = getelementptr double, double* [[TMP8]], i64 4 ; RM-NEXT: [[VEC_CAST18:%.*]] = bitcast double* [[VEC_GEP17]] to <2 x double>* ; RM-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[VEC_CAST18]], align 8 ; RM-NEXT: [[BLOCK:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> undef, <1 x i32> zeroinitializer @@ -122,7 +122,7 @@ ; RM-NEXT: [[TMP49:%.*]] = bitcast <4 x double>* [[C_PTR:%.*]] to double* ; RM-NEXT: [[VEC_CAST52:%.*]] = bitcast double* [[TMP49]] to <2 x double>* ; RM-NEXT: [[COL_LOAD53:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST52]], align 8 -; RM-NEXT: [[VEC_GEP54:%.*]] = getelementptr double, double* [[TMP49]], i32 2 +; RM-NEXT: [[VEC_GEP54:%.*]] = getelementptr double, double* [[TMP49]], i64 2 ; RM-NEXT: [[VEC_CAST55:%.*]] = bitcast double* [[VEC_GEP54]] to <2 x double>* ; RM-NEXT: [[COL_LOAD56:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST55]], align 8 ; RM-NEXT: [[TMP50:%.*]] = fsub <2 x double> [[COL_LOAD53]], [[TMP28]] @@ -130,7 +130,7 @@ ; RM-NEXT: [[TMP52:%.*]] = bitcast <4 x double>* [[C_PTR]] to double* ; RM-NEXT: [[VEC_CAST57:%.*]] = bitcast double* [[TMP52]] to <2 x double>* ; RM-NEXT: store <2 x double> [[TMP50]], <2 x double>* [[VEC_CAST57]], align 8 -; RM-NEXT: [[VEC_GEP58:%.*]] = getelementptr double, double* [[TMP52]], i32 2 +; RM-NEXT: [[VEC_GEP58:%.*]] = getelementptr double, double* [[TMP52]], i64 2 ; RM-NEXT: [[VEC_CAST59:%.*]] = bitcast double* [[VEC_GEP58]] to <2 x double>* ; RM-NEXT: store <2 x double> [[TMP51]], <2 x double>* [[VEC_CAST59]], align 8 ; RM-NEXT: ret void diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backward.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backward.ll --- a/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backward.ll +++ 
b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backward.ll @@ -48,13 +48,13 @@ ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x double>* [[A_PTR:%.*]] to double* ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast double* [[TMP0]] to <2 x double>* ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST]], align 8 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP0]], i32 2 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP0]], i64 2 ; CHECK-NEXT: [[VEC_CAST1:%.*]] = bitcast double* [[VEC_GEP]] to <2 x double>* ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST1]], align 8 -; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, double* [[TMP0]], i32 4 +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, double* [[TMP0]], i64 4 ; CHECK-NEXT: [[VEC_CAST4:%.*]] = bitcast double* [[VEC_GEP3]] to <2 x double>* ; CHECK-NEXT: [[COL_LOAD5:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST4]], align 8 -; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr double, double* [[TMP0]], i32 6 +; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr double, double* [[TMP0]], i64 6 ; CHECK-NEXT: [[VEC_CAST7:%.*]] = bitcast double* [[VEC_GEP6]] to <2 x double>* ; CHECK-NEXT: [[COL_LOAD8:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST7]], align 8 ; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> undef, <2 x i32> diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-forward.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-forward.ll --- a/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-forward.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-forward.ll @@ -30,7 +30,7 @@ ; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x double>* [[PTR:%.*]] to double* ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast double* [[TMP16]] to <4 x double>* ; CHECK-NEXT: store <4 x double> [[TMP7]], <4 x double>* [[VEC_CAST]], align 8 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP16]], i32 4 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP16]], i64 4 ; CHECK-NEXT: [[VEC_CAST4:%.*]] = bitcast double* [[VEC_GEP]] to <4 x double>* ; CHECK-NEXT: store <4 x double> [[TMP15]], <4 x double>* [[VEC_CAST4]], align 8 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-mixed-users.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-mixed-users.ll --- a/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-mixed-users.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-mixed-users.ll @@ -30,13 +30,13 @@ ; CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x double>* [[PTR:%.*]] to double* ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast double* [[TMP20]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[VEC_CAST]], align 8 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP20]], i32 2 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP20]], i64 2 ; CHECK-NEXT: [[VEC_CAST2:%.*]] = bitcast double* [[VEC_GEP]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP8]], <2 x double>* [[VEC_CAST2]], align 8 -; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, double* [[TMP20]], i32 4 +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, double* [[TMP20]], i64 4 ; CHECK-NEXT: [[VEC_CAST4:%.*]] = bitcast double* [[VEC_GEP3]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP12]], <2 x double>* [[VEC_CAST4]], align 8 -; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, double* [[TMP20]], i32 
6 +; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, double* [[TMP20]], i64 6 ; CHECK-NEXT: [[VEC_CAST6:%.*]] = bitcast double* [[VEC_GEP5]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP16]], <2 x double>* [[VEC_CAST6]], align 8 ; CHECK-NEXT: call void @foo(<8 x double> [[TMP19]]) diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-multiple-iterations.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-multiple-iterations.ll --- a/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-multiple-iterations.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-multiple-iterations.ll @@ -11,13 +11,13 @@ ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x double>* [[A_PTR:%.*]] to double* ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast double* [[TMP1]] to <4 x double>* ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <4 x double>, <4 x double>* [[VEC_CAST]], align 8 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP1]], i32 4 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP1]], i64 4 ; CHECK-NEXT: [[VEC_CAST1:%.*]] = bitcast double* [[VEC_GEP]] to <4 x double>* ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <4 x double>, <4 x double>* [[VEC_CAST1]], align 8 -; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, double* [[TMP1]], i32 8 +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, double* [[TMP1]], i64 8 ; CHECK-NEXT: [[VEC_CAST4:%.*]] = bitcast double* [[VEC_GEP3]] to <4 x double>* ; CHECK-NEXT: [[COL_LOAD5:%.*]] = load <4 x double>, <4 x double>* [[VEC_CAST4]], align 8 -; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr double, double* [[TMP1]], i32 12 +; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr double, double* [[TMP1]], i64 12 ; CHECK-NEXT: [[VEC_CAST7:%.*]] = bitcast double* [[VEC_GEP6]] to <4 x double>* ; CHECK-NEXT: [[COL_LOAD8:%.*]] = load <4 x double>, <4 x double>* [[VEC_CAST7]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x double> [[COL_LOAD]], i64 0 @@ -55,13 +55,13 @@ ; CHECK-NEXT: [[TMP34:%.*]] = bitcast <16 x double>* [[B_PTR:%.*]] to double* ; CHECK-NEXT: [[VEC_CAST9:%.*]] = bitcast double* [[TMP34]] to <4 x double>* ; CHECK-NEXT: [[COL_LOAD10:%.*]] = load <4 x double>, <4 x double>* [[VEC_CAST9]], align 8 -; CHECK-NEXT: [[VEC_GEP11:%.*]] = getelementptr double, double* [[TMP34]], i32 4 +; CHECK-NEXT: [[VEC_GEP11:%.*]] = getelementptr double, double* [[TMP34]], i64 4 ; CHECK-NEXT: [[VEC_CAST12:%.*]] = bitcast double* [[VEC_GEP11]] to <4 x double>* ; CHECK-NEXT: [[COL_LOAD13:%.*]] = load <4 x double>, <4 x double>* [[VEC_CAST12]], align 8 -; CHECK-NEXT: [[VEC_GEP14:%.*]] = getelementptr double, double* [[TMP34]], i32 8 +; CHECK-NEXT: [[VEC_GEP14:%.*]] = getelementptr double, double* [[TMP34]], i64 8 ; CHECK-NEXT: [[VEC_CAST15:%.*]] = bitcast double* [[VEC_GEP14]] to <4 x double>* ; CHECK-NEXT: [[COL_LOAD16:%.*]] = load <4 x double>, <4 x double>* [[VEC_CAST15]], align 8 -; CHECK-NEXT: [[VEC_GEP17:%.*]] = getelementptr double, double* [[TMP34]], i32 12 +; CHECK-NEXT: [[VEC_GEP17:%.*]] = getelementptr double, double* [[TMP34]], i64 12 ; CHECK-NEXT: [[VEC_CAST18:%.*]] = bitcast double* [[VEC_GEP17]] to <4 x double>* ; CHECK-NEXT: [[COL_LOAD19:%.*]] = load <4 x double>, <4 x double>* [[VEC_CAST18]], align 8 ; CHECK-NEXT: [[TMP35:%.*]] = fmul <4 x double> [[COL_LOAD]], [[COL_LOAD10]] diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll --- a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll +++ 
b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll @@ -51,7 +51,7 @@ ; CHECK-NEXT: load(addr %A) ; CHECK-LABEL: remark: load.h:41:43: Lowered with 0 stores, 10 loads, 0 compute ops -; CHECK-NEXT: columnwise.load.3x5.double(addr %B, 5) +; CHECK-NEXT: column.major.load.3x5.double(addr %B, 5) ; CHECK-LABEL: remark: load.h:41:11: Lowered with 0 stores, 1 loads, 0 compute ops ; CHECK-NEXT: load(addr %D) @@ -60,13 +60,13 @@ ; CHECK-NEXT: load(addr %A) ; CHECK-LABEL: remark: assign.h:32:43: Lowered with 0 stores, 10 loads, 0 compute ops -; CHECK-NEXT: columnwise.load.3x5.double(addr %B, 5) +; CHECK-NEXT: column.major.load.3x5.double(addr %B, 5) ; CHECK-LABEL: remark: toplevel.c:410:0: Lowered with 10 stores, 20 loads, 10 compute ops ; CHECK-NEXT: store( ; CHECK-NEXT: fadd( ; CHECK-NEXT: load(addr %A), -; CHECK-NEXT: columnwise.load.3x5.double(addr %B, 5)), +; CHECK-NEXT: column.major.load.3x5.double(addr %B, 5)), ; CHECK-NEXT: addr %C) ; CHECK-LABEL: remark: toplevel.c:510:0: Lowered with 1 stores, 1 loads, 8 compute ops @@ -95,7 +95,7 @@ define void @toplevel(<15 x double>* %A, <15 x double>* %B, <15 x double>* %C, <2 x float>* %D) !dbg !16 { entry: %a = load <15 x double>, <15 x double> *%A, align 16, !dbg !3791 - %b = call <15 x double> @llvm.matrix.columnwise.load(<15 x double>* %B, i32 5, i32 3, i32 5), !dbg !3793 + %b = call <15 x double> @llvm.matrix.column.major.load(<15 x double>* %B, i64 5, i1 false, i32 3, i32 5), !dbg !3793 %c = fadd <15 x double> %a, %b, !dbg !100 store <15 x double> %c, <15 x double> *%C, align 16, !dbg !102 @@ -106,7 +106,7 @@ ret void } -declare <15 x double> @llvm.matrix.columnwise.load(<15 x double>*, i32, i32, i32) +declare <15 x double> @llvm.matrix.column.major.load(<15 x double>*, i64, i1, i32, i32) declare <2 x float> @llvm.matrix.transpose(<2 x float>, i32, i32) !llvm.dbg.cu = !{!0} diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-shared-subtrees.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-shared-subtrees.ll --- a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-shared-subtrees.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-shared-subtrees.ll @@ -28,8 +28,8 @@ ; YAML-NEXT: - String: ' compute ops' ; YAML-NEXT: - String: ' are shared with other expressions' ; YAML-NEXT: - String: | -; YAML: columnwise.store.4x2.double( -; YAML-NEXT: shared with remark at line 35 column 45 (transpose.2x4.double(columnwise.load.2x4.double(addr %arg1, +; YAML: column.major.store.4x2.double( +; YAML-NEXT: shared with remark at line 35 column 45 (transpose.2x4.double(column.major.load.2x4.double(addr %arg1, ; YAML-NEXT: scalar)), ; YAML-NEXT: addr %arg3, ; YAML-NEXT: 10) @@ -57,55 +57,55 @@ ; YAML-NEXT: - String: ' compute ops' ; YAML-NEXT: - String: ' are shared with other expressions' ; YAML-NEXT: - String: | -; YAML: columnwise.store.4x15.double( +; YAML: column.major.store.4x15.double( ; YAML-NEXT: fsub( -; YAML-NEXT: columnwise.load.4x15.double(addr %arg2, 20), +; YAML-NEXT: column.major.load.4x15.double(addr %arg2, 20), ; YAML-NEXT: multiply.4x2.2x15.double( -; YAML-NEXT: shared with remark at line 35 column 71 (transpose.2x4.double(columnwise.load.2x4.double(addr %arg1, +; YAML-NEXT: shared with remark at line 35 column 71 (transpose.2x4.double(column.major.load.2x4.double(addr %arg1, ; YAML-NEXT: scalar)), -; YAML-NEXT: columnwise.load.2x15.double(addr %arg3, scalar))), +; YAML-NEXT: column.major.load.2x15.double(addr %arg3, scalar))), ; YAML-NEXT: addr %arg2, ; YAML-NEXT: 10) ; STDERR-LABEL: remark: test.cpp:35:71: 
Lowered with 4 stores, 0 loads, 0 compute ops, ; STDERR-NEXT: additionally 0 stores, 4 loads, 16 compute ops are shared with other expressions -; STDERR-NEXT: columnwise.store.4x2.double( -; STDERR-NEXT: shared with remark at line 35 column 45 (transpose.2x4.double(columnwise.load.2x4.double(addr %arg1, +; STDERR-NEXT: column.major.store.4x2.double( +; STDERR-NEXT: shared with remark at line 35 column 45 (transpose.2x4.double(column.major.load.2x4.double(addr %arg1, ; STDERR-NEXT: scalar)), ; STDERR-NEXT: addr %arg3, ; STDERR-NEXT: 10) ; STDERR-LABEL: remark: test.cpp:35:45: Lowered with 30 stores, 45 loads, 120 compute ops, ; STDERR-NEXT: additionally 0 stores, 4 loads, 16 compute ops are shared with other expressions -; STDERR-NEXT: columnwise.store.4x15.double( +; STDERR-NEXT: column.major.store.4x15.double( ; STDERR-NEXT: fsub( -; STDERR-NEXT: columnwise.load.4x15.double(addr %arg2, 20), +; STDERR-NEXT: column.major.load.4x15.double(addr %arg2, 20), ; STDERR-NEXT: multiply.4x2.2x15.double( -; STDERR-NEXT: shared with remark at line 35 column 71 (transpose.2x4.double(columnwise.load.2x4.double(addr %arg1, +; STDERR-NEXT: shared with remark at line 35 column 71 (transpose.2x4.double(column.major.load.2x4.double(addr %arg1, ; STDERR-NEXT: scalar)), -; STDERR-NEXT: columnwise.load.2x15.double(addr %arg3, scalar))), +; STDERR-NEXT: column.major.load.2x15.double(addr %arg3, scalar))), ; STDERR-NEXT: addr %arg2, ; STDERR-NEXT: 10) -define void @test_2leafs(double* %arg1, double* %arg2, double* %arg3, i32 %stride, i32 %offset) !dbg !8 { +define void @test_2leafs(double* %arg1, double* %arg2, double* %arg3, i64 %stride) !dbg !8 { bb: - %shared.load = tail call <8 x double> @llvm.matrix.columnwise.load.v8f64.p0f64(double* %arg1, i32 %stride, i32 2, i32 4), !dbg !10, !noalias !10 - %shared.load.2 = tail call <30 x double> @llvm.matrix.columnwise.load.v30f64.p0f64(double* %arg3, i32 %stride, i32 2, i32 15), !dbg !10, !noalias !10 + %shared.load = tail call <8 x double> @llvm.matrix.column.major.load.v8f64.p0f64(double* %arg1, i64 %stride, i1 false, i32 2, i32 4), !dbg !10, !noalias !10 + %shared.load.2 = tail call <30 x double> @llvm.matrix.column.major.load.v30f64.p0f64(double* %arg3, i64 %stride, i1 false, i32 2, i32 15), !dbg !10, !noalias !10 %tmp17 = tail call <8 x double> @llvm.matrix.transpose.v8f64(<8 x double> %shared.load, i32 2, i32 4), !dbg !10 - tail call void @llvm.matrix.columnwise.store.v8f64.p0f64(<8 x double> %tmp17, double* %arg3, i32 10, i32 4, i32 2), !dbg !10 - %tmp18 = tail call <60 x double> @llvm.matrix.columnwise.load.v60f64.p0f64(double* %arg2, i32 20, i32 4, i32 15), !dbg !11 + tail call void @llvm.matrix.column.major.store.v8f64.p0f64(<8 x double> %tmp17, double* %arg3, i64 10, i1 false, i32 4, i32 2), !dbg !10 + %tmp18 = tail call <60 x double> @llvm.matrix.column.major.load.v60f64.p0f64(double* %arg2, i64 20, i1 false, i32 4, i32 15), !dbg !11 %tmp48 = tail call <60 x double> @llvm.matrix.multiply.v60f64.v8f64.v30f64(<8 x double> %tmp17, <30 x double> %shared.load.2, i32 4, i32 2, i32 15), !dbg !11 %tmp49 = fsub <60 x double> %tmp18, %tmp48, !dbg !11 - tail call void @llvm.matrix.columnwise.store.v60f64.p0f64(<60 x double> %tmp49, double* %arg2, i32 10, i32 4, i32 15), !dbg !11 + tail call void @llvm.matrix.column.major.store.v60f64.p0f64(<60 x double> %tmp49, double* %arg2, i64 10, i1 false, i32 4, i32 15), !dbg !11 ret void } declare <8 x double> @llvm.matrix.transpose.v8f64(<8 x double>, i32 immarg, i32 immarg) -declare <8 x double> 
@llvm.matrix.columnwise.load.v8f64.p0f64(double*, i32, i32 immarg, i32 immarg) -declare <30 x double> @llvm.matrix.columnwise.load.v30f64.p0f64(double*, i32, i32 immarg, i32 immarg) -declare <60 x double> @llvm.matrix.columnwise.load.v60f64.p0f64(double*, i32, i32 immarg, i32 immarg) -declare void @llvm.matrix.columnwise.store.v60f64.p0f64(<60 x double>, double* writeonly, i32, i32 immarg, i32 immarg) -declare void @llvm.matrix.columnwise.store.v8f64.p0f64(<8 x double>, double* writeonly, i32, i32 immarg, i32 immarg) +declare <8 x double> @llvm.matrix.column.major.load.v8f64.p0f64(double*, i64, i1 immarg, i32 immarg, i32 immarg) +declare <30 x double> @llvm.matrix.column.major.load.v30f64.p0f64(double*, i64, i1 immarg, i32 immarg, i32 immarg) +declare <60 x double> @llvm.matrix.column.major.load.v60f64.p0f64(double*, i64, i1 immarg, i32 immarg, i32 immarg) +declare void @llvm.matrix.column.major.store.v60f64.p0f64(<60 x double>, double* writeonly, i64, i1 immarg, i32 immarg, i32 immarg) +declare void @llvm.matrix.column.major.store.v8f64.p0f64(<8 x double>, double* writeonly, i64, i1 immarg, i32 immarg, i32 immarg) declare <60 x double> @llvm.matrix.multiply.v60f64.v8f64.v30f64(<8 x double>, <30 x double>, i32 immarg, i32 immarg, i32 immarg) !llvm.module.flags = !{!0, !1, !2, !3} diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll --- a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll @@ -36,54 +36,54 @@ ; CHECK-LABEL: remark: test.h:60:20: Lowered with 6 stores, 6 loads, 0 compute ops ; CHECK-NEXT: store( -; CHECK-NEXT: columnwise.load.3x3.double(addr %A, 5), +; CHECK-NEXT: column.major.load.3x3.double(addr %A, 5), ; CHECK-NEXT: addr %B) -define void @columnwise.load(<9 x double>* %A, <9 x double>* %B) !dbg !27 { - %A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !28 +define void @column.major.load(<9 x double>* %A, <9 x double>* %B) !dbg !27 { + %A.matrix = call <9 x double> @llvm.matrix.column.major.load(<9 x double>* %A, i64 5, i1 false, i32 3, i32 3), !dbg !28 store <9 x double> %A.matrix, <9 x double>* %B, !dbg !28 ret void } -declare <9 x double> @llvm.matrix.columnwise.load(<9 x double>*, i32, i32, i32) +declare <9 x double> @llvm.matrix.column.major.load(<9 x double>*, i64, i1, i32, i32) ; CHECK-LABEL: remark: test.h:70:20: Lowered with 6 stores, 6 loads, 0 compute ops -; CHECK-NEXT: columnwise.store.3x3.double( -; CHECK-NEXT: columnwise.load.3x3.double(addr %A, 5), +; CHECK-NEXT: column.major.store.3x3.double( +; CHECK-NEXT: column.major.load.3x3.double(addr %A, 5), ; CHECK-NEXT: addr %B, ; CHECK-NEXT: 10) -define void @columnwise.store(<9 x double>* %A, <9 x double>* %B) !dbg !29 { - %A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !30 - call void @llvm.matrix.columnwise.store(<9 x double> %A.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !30 +define void @column.major.store(<9 x double>* %A, <9 x double>* %B) !dbg !29 { + %A.matrix = call <9 x double> @llvm.matrix.column.major.load(<9 x double>* %A, i64 5, i1 false, i32 3, i32 3), !dbg !30 + call void @llvm.matrix.column.major.store(<9 x double> %A.matrix, <9 x double>* %B, i64 10, i1 false, i32 3, i32 3), !dbg !30 ret void } -declare void @llvm.matrix.columnwise.store(<9 x double>, <9 x double>*, i32, i32, i32) +declare void @llvm.matrix.column.major.store(<9 x double>, <9 x 
double>*, i64, i1, i32, i32) ; CHECK-LABEL: remark: test.h:80:20: Lowered with 6 stores, 6 loads, 12 compute ops -; CHECK-NEXT: columnwise.store.3x3.double( +; CHECK-NEXT: column.major.store.3x3.double( ; CHECK-NEXT: fmul( ; CHECK-NEXT: fadd( -; CHECK-NEXT: columnwise.load.3x3.double(addr %A, 5) -; CHECK-NEXT: (reused) columnwise.load.3x3.double(addr %A, 5)), -; CHECK-NEXT: (reused) columnwise.load.3x3.double(addr %A, 5)), +; CHECK-NEXT: column.major.load.3x3.double(addr %A, 5) +; CHECK-NEXT: (reused) column.major.load.3x3.double(addr %A, 5)), +; CHECK-NEXT: (reused) column.major.load.3x3.double(addr %A, 5)), ; CHECK-NEXT: addr %B, ; CHECK-NEXT: 10) define void @binaryops(<9 x double>* %A, <9 x double>* %B) !dbg !31 { - %A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !32 + %A.matrix = call <9 x double> @llvm.matrix.column.major.load(<9 x double>* %A, i64 5, i1 false, i32 3, i32 3), !dbg !32 %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix, !dbg !32 %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix, !dbg !32 - call void @llvm.matrix.columnwise.store(<9 x double> %R2.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !32 + call void @llvm.matrix.column.major.store(<9 x double> %R2.matrix, <9 x double>* %B, i64 10, i1 false, i32 3, i32 3), !dbg !32 ret void } ; CHECK-LABEL: remark: test.h:90:20: Lowered with 6 stores, 6 loads, 12 compute ops -; CHECK-NEXT: columnwise.store.3x3.double( +; CHECK-NEXT: column.major.store.3x3.double( ; CHECK-NEXT: fmul( ; CHECK-NEXT: fadd( -; CHECK-NEXT: columnwise.load.3x3.double(addr %A, 5) -; CHECK-NEXT: (reused) columnwise.load.3x3.double(addr %A, 5)), -; CHECK-NEXT: (reused) columnwise.load.3x3.double(addr %A, 5)), +; CHECK-NEXT: column.major.load.3x3.double(addr %A, 5) +; CHECK-NEXT: (reused) column.major.load.3x3.double(addr %A, 5)), +; CHECK-NEXT: (reused) column.major.load.3x3.double(addr %A, 5)), ; CHECK-NEXT: addr %B, ; CHECK-NEXT: 10) ; CHECK-NEXT: remark: test.h:90:20: Lowered with 2 stores, 12 loads, 22 compute ops @@ -94,10 +94,10 @@ ; CHECK-NEXT: addr %E) define void @multiple_expressions(<9 x double>* %A, <9 x double>* %B, <12 x double>* %C, <12 x double>* %D, <4 x double>* %E) !dbg !33 { - %A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !34 + %A.matrix = call <9 x double> @llvm.matrix.column.major.load(<9 x double>* %A, i64 5, i1 false, i32 3, i32 3), !dbg !34 %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix, !dbg !34 %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix, !dbg !34 - call void @llvm.matrix.columnwise.store(<9 x double> %R2.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !34 + call void @llvm.matrix.column.major.store(<9 x double> %R2.matrix, <9 x double>* %B, i64 10, i1 false, i32 3, i32 3), !dbg !34 %C.matrix = load <12 x double>, <12 x double>* %C, !dbg !34 %D.matrix = load <12 x double>, <12 x double>* %D, !dbg !34 @@ -108,20 +108,20 @@ } ; CHECK-LABEL: remark: test.h:100:20: Lowered with 6 stores, 6 loads, 12 compute ops -; CHECK-NEXT: columnwise.store.3x3.double( +; CHECK-NEXT: column.major.store.3x3.double( ; CHECK-NEXT: fmul( ; CHECK-NEXT: fadd( -; CHECK-NEXT: columnwise.load.3x3.double(addr %A, 5) -; CHECK-NEXT: (reused) columnwise.load.3x3.double(addr %A, 5)), -; CHECK-NEXT: (reused) columnwise.load.3x3.double(addr %A, 5)), +; CHECK-NEXT: column.major.load.3x3.double(addr %A, 5) +; CHECK-NEXT: (reused) column.major.load.3x3.double(addr %A, 5)), +; CHECK-NEXT: (reused) 
column.major.load.3x3.double(addr %A, 5)), ; CHECK-NEXT: stack addr %B, ; CHECK-NEXT: 10) define void @stackaddresses(<9 x double>* %A) !dbg !35 { %B = alloca <9 x double> - %A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !36 + %A.matrix = call <9 x double> @llvm.matrix.column.major.load(<9 x double>* %A, i64 5, i1 false, i32 3, i32 3), !dbg !36 %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix, !dbg !36 %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix, !dbg !36 - call void @llvm.matrix.columnwise.store(<9 x double> %R2.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !36 + call void @llvm.matrix.column.major.store(<9 x double> %R2.matrix, <9 x double>* %B, i64 10, i1 false, i32 3, i32 3), !dbg !36 ret void } diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll --- a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll @@ -2,20 +2,20 @@ ; RUN: opt -lower-matrix-intrinsics -S < %s | FileCheck %s ; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s -define <9 x double> @strided_load_3x3(<9 x double>* %in, i32 %stride) { +define <9 x double> @strided_load_3x3(<9 x double>* %in, i64 %stride) { ; CHECK-LABEL: @strided_load_3x3( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <9 x double>* [[IN:%.*]] to double* -; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]] -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP0]], i32 [[VEC_START]] +; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP0]], i64 [[VEC_START]] ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast double* [[VEC_GEP]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST]], align 8 -; CHECK-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE]] -; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, double* [[TMP0]], i32 [[VEC_START1]] +; CHECK-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE]] +; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, double* [[TMP0]], i64 [[VEC_START1]] ; CHECK-NEXT: [[VEC_CAST3:%.*]] = bitcast double* [[VEC_GEP2]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST3]], align 8 -; CHECK-NEXT: [[VEC_START5:%.*]] = mul i32 2, [[STRIDE]] -; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr double, double* [[TMP0]], i32 [[VEC_START5]] +; CHECK-NEXT: [[VEC_START5:%.*]] = mul i64 2, [[STRIDE]] +; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr double, double* [[TMP0]], i64 [[VEC_START5]] ; CHECK-NEXT: [[VEC_CAST7:%.*]] = bitcast double* [[VEC_GEP6]] to <3 x double>* ; CHECK-NEXT: [[COL_LOAD8:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST7]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD4]], <6 x i32> @@ -24,51 +24,51 @@ ; CHECK-NEXT: ret <9 x double> [[TMP3]] ; entry: - %load = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %in, i32 %stride, i32 3, i32 3) + %load = call <9 x double> @llvm.matrix.column.major.load(<9 x double>* %in, i64 %stride, i1 false, i32 3, i32 3) ret <9 x double> %load } -declare <9 x double> @llvm.matrix.columnwise.load(<9 x double>*, i32, i32, i32) +declare <9 x double> @llvm.matrix.column.major.load(<9 x double>*, i64, i1, i32, i32) -define <9 x double> @strided_load_9x1(<9 x double>* 
%in, i32 %stride) { +define <9 x double> @strided_load_9x1(<9 x double>* %in, i64 %stride) { ; CHECK-LABEL: @strided_load_9x1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <9 x double>* [[IN:%.*]] to double* -; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]] -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP0]], i32 [[VEC_START]] +; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP0]], i64 [[VEC_START]] ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast double* [[VEC_GEP]] to <9 x double>* ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <9 x double>, <9 x double>* [[VEC_CAST]], align 8 ; CHECK-NEXT: ret <9 x double> [[COL_LOAD]] ; entry: - %load = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %in, i32 %stride, i32 9, i32 1) + %load = call <9 x double> @llvm.matrix.column.major.load(<9 x double>* %in, i64 %stride, i1 false, i32 9, i32 1) ret <9 x double> %load } -declare <8 x double> @llvm.matrix.columnwise.load.v8f64(<8 x double>*, i32, i32, i32) +declare <8 x double> @llvm.matrix.column.major.load.v8f64(<8 x double>*, i64, i1, i32, i32) -define <8 x double> @strided_load_4x2(<8 x double>* %in, i32 %stride) { +define <8 x double> @strided_load_4x2(<8 x double>* %in, i64 %stride) { ; CHECK-LABEL: @strided_load_4x2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x double>* [[IN:%.*]] to double* -; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]] -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP0]], i32 [[VEC_START]] +; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP0]], i64 [[VEC_START]] ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast double* [[VEC_GEP]] to <4 x double>* ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <4 x double>, <4 x double>* [[VEC_CAST]], align 8 -; CHECK-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE]] -; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, double* [[TMP0]], i32 [[VEC_START1]] +; CHECK-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE]] +; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, double* [[TMP0]], i64 [[VEC_START1]] ; CHECK-NEXT: [[VEC_CAST3:%.*]] = bitcast double* [[VEC_GEP2]] to <4 x double>* ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <4 x double>, <4 x double>* [[VEC_CAST3]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[COL_LOAD]], <4 x double> [[COL_LOAD4]], <8 x i32> ; CHECK-NEXT: ret <8 x double> [[TMP1]] ; entry: - %load = call <8 x double> @llvm.matrix.columnwise.load.v8f64(<8 x double>* %in, i32 %stride, i32 4, i32 2) + %load = call <8 x double> @llvm.matrix.column.major.load.v8f64(<8 x double>* %in, i64 %stride, i1 false, i32 4, i32 2) ret <8 x double> %load } -; CHECK: declare <9 x double> @llvm.matrix.columnwise.load.v9f64.p0v9f64(<9 x double>* nocapture, i32, i32 immarg, i32 immarg) [[READONLY:#[0-9]]] +; CHECK: declare <9 x double> @llvm.matrix.column.major.load.v9f64.p0v9f64(<9 x double>* nocapture, i64, i1 immarg, i32 immarg, i32 immarg) [[READONLY:#[0-9]]] -; CHECK: declare <8 x double> @llvm.matrix.columnwise.load.v8f64.p0v8f64(<8 x double>* nocapture, i32, i32 immarg, i32 immarg) [[READONLY]] +; CHECK: declare <8 x double> @llvm.matrix.column.major.load.v8f64.p0v8f64(<8 x double>* nocapture, i64, i1 immarg, i32 immarg, i32 immarg) [[READONLY]] ; CHECK: attributes [[READONLY]] = { argmemonly nosync nounwind readonly willreturn } diff --git 
a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-float.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-float.ll --- a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-float.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-float.ll @@ -2,20 +2,20 @@ ; RUN: opt -lower-matrix-intrinsics -S < %s | FileCheck %s ; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s -define <9 x float> @strided_load_3x3(<9 x float>* %in, i32 %stride) { +define <9 x float> @strided_load_3x3(<9 x float>* %in, i64 %stride) { ; CHECK-LABEL: @strided_load_3x3( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <9 x float>* [[IN:%.*]] to float* -; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]] -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, float* [[TMP0]], i32 [[VEC_START]] +; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, float* [[TMP0]], i64 [[VEC_START]] ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast float* [[VEC_GEP]] to <3 x float>* ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <3 x float>, <3 x float>* [[VEC_CAST]], align 4 -; CHECK-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE]] -; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr float, float* [[TMP0]], i32 [[VEC_START1]] +; CHECK-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE]] +; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr float, float* [[TMP0]], i64 [[VEC_START1]] ; CHECK-NEXT: [[VEC_CAST3:%.*]] = bitcast float* [[VEC_GEP2]] to <3 x float>* ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <3 x float>, <3 x float>* [[VEC_CAST3]], align 4 -; CHECK-NEXT: [[VEC_START5:%.*]] = mul i32 2, [[STRIDE]] -; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr float, float* [[TMP0]], i32 [[VEC_START5]] +; CHECK-NEXT: [[VEC_START5:%.*]] = mul i64 2, [[STRIDE]] +; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr float, float* [[TMP0]], i64 [[VEC_START5]] ; CHECK-NEXT: [[VEC_CAST7:%.*]] = bitcast float* [[VEC_GEP6]] to <3 x float>* ; CHECK-NEXT: [[COL_LOAD8:%.*]] = load <3 x float>, <3 x float>* [[VEC_CAST7]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x float> [[COL_LOAD]], <3 x float> [[COL_LOAD4]], <6 x i32> @@ -24,45 +24,45 @@ ; CHECK-NEXT: ret <9 x float> [[TMP3]] ; entry: - %load = call <9 x float> @llvm.matrix.columnwise.load(<9 x float>* %in, i32 %stride, i32 3, i32 3) + %load = call <9 x float> @llvm.matrix.column.major.load(<9 x float>* %in, i64 %stride, i1 false, i32 3, i32 3) ret <9 x float> %load } -declare <9 x float> @llvm.matrix.columnwise.load(<9 x float>*, i32, i32, i32) +declare <9 x float> @llvm.matrix.column.major.load(<9 x float>*, i64, i1, i32, i32) -define <9 x float> @strided_load_9x1(<9 x float>* %in, i32 %stride) { +define <9 x float> @strided_load_9x1(<9 x float>* %in, i64 %stride) { ; CHECK-LABEL: @strided_load_9x1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <9 x float>* [[IN:%.*]] to float* -; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]] -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, float* [[TMP0]], i32 [[VEC_START]] +; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, float* [[TMP0]], i64 [[VEC_START]] ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast float* [[VEC_GEP]] to <9 x float>* ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <9 x float>, <9 x float>* [[VEC_CAST]], align 4 ; CHECK-NEXT: ret <9 x float> [[COL_LOAD]] ; entry: - %load = call <9 x float> @llvm.matrix.columnwise.load(<9 x float>* %in, i32 %stride, i32 9, i32 1) + %load = call 
<9 x float> @llvm.matrix.column.major.load(<9 x float>* %in, i64 %stride, i1 false, i32 9, i32 1) ret <9 x float> %load } -declare <8 x float> @llvm.matrix.columnwise.load.v8f32(<8 x float>*, i32, i32, i32) +declare <8 x float> @llvm.matrix.column.major.load.v8f32(<8 x float>*, i64, i1, i32, i32) -define <8 x float> @strided_load_4x2(<8 x float>* %in, i32 %stride) { +define <8 x float> @strided_load_4x2(<8 x float>* %in, i64 %stride) { ; CHECK-LABEL: @strided_load_4x2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x float>* [[IN:%.*]] to float* -; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]] -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, float* [[TMP0]], i32 [[VEC_START]] +; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, float* [[TMP0]], i64 [[VEC_START]] ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast float* [[VEC_GEP]] to <4 x float>* ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <4 x float>, <4 x float>* [[VEC_CAST]], align 4 -; CHECK-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE]] -; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr float, float* [[TMP0]], i32 [[VEC_START1]] +; CHECK-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE]] +; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr float, float* [[TMP0]], i64 [[VEC_START1]] ; CHECK-NEXT: [[VEC_CAST3:%.*]] = bitcast float* [[VEC_GEP2]] to <4 x float>* ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <4 x float>, <4 x float>* [[VEC_CAST3]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[COL_LOAD]], <4 x float> [[COL_LOAD4]], <8 x i32> ; CHECK-NEXT: ret <8 x float> [[TMP1]] ; entry: - %load = call <8 x float> @llvm.matrix.columnwise.load.v8f32(<8 x float>* %in, i32 %stride, i32 4, i32 2) + %load = call <8 x float> @llvm.matrix.column.major.load.v8f32(<8 x float>* %in, i64 %stride, i1 false, i32 4, i32 2) ret <8 x float> %load } diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-i32.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-i32.ll --- a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-i32.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-i32.ll @@ -2,20 +2,20 @@ ; RUN: opt -lower-matrix-intrinsics -S < %s | FileCheck %s ; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s -define <9 x i32> @strided_load_3x3(<9 x i32>* %in, i32 %stride) { +define <9 x i32> @strided_load_3x3(<9 x i32>* %in, i64 %stride) { ; CHECK-LABEL: @strided_load_3x3( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <9 x i32>* [[IN:%.*]] to i32* -; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]] -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, i32* [[TMP0]], i32 [[VEC_START]] +; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, i32* [[TMP0]], i64 [[VEC_START]] ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast i32* [[VEC_GEP]] to <3 x i32>* ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <3 x i32>, <3 x i32>* [[VEC_CAST]], align 4 -; CHECK-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE]] -; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr i32, i32* [[TMP0]], i32 [[VEC_START1]] +; CHECK-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE]] +; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr i32, i32* [[TMP0]], i64 [[VEC_START1]] ; CHECK-NEXT: [[VEC_CAST3:%.*]] = bitcast i32* [[VEC_GEP2]] to <3 x i32>* ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <3 x i32>, <3 x i32>* [[VEC_CAST3]], align 4 -; CHECK-NEXT: [[VEC_START5:%.*]] = mul i32 2, [[STRIDE]] -; CHECK-NEXT: 
[[VEC_GEP6:%.*]] = getelementptr i32, i32* [[TMP0]], i32 [[VEC_START5]] +; CHECK-NEXT: [[VEC_START5:%.*]] = mul i64 2, [[STRIDE]] +; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr i32, i32* [[TMP0]], i64 [[VEC_START5]] ; CHECK-NEXT: [[VEC_CAST7:%.*]] = bitcast i32* [[VEC_GEP6]] to <3 x i32>* ; CHECK-NEXT: [[COL_LOAD8:%.*]] = load <3 x i32>, <3 x i32>* [[VEC_CAST7]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[COL_LOAD]], <3 x i32> [[COL_LOAD4]], <6 x i32> @@ -24,45 +24,45 @@ ; CHECK-NEXT: ret <9 x i32> [[TMP3]] ; entry: - %load = call <9 x i32> @llvm.matrix.columnwise.load(<9 x i32>* %in, i32 %stride, i32 3, i32 3) + %load = call <9 x i32> @llvm.matrix.column.major.load(<9 x i32>* %in, i64 %stride, i1 false, i32 3, i32 3) ret <9 x i32> %load } -declare <9 x i32> @llvm.matrix.columnwise.load(<9 x i32>*, i32, i32, i32) +declare <9 x i32> @llvm.matrix.column.major.load(<9 x i32>*, i64, i1, i32, i32) -define <9 x i32> @strided_load_9x1(<9 x i32>* %in, i32 %stride) { +define <9 x i32> @strided_load_9x1(<9 x i32>* %in, i64 %stride) { ; CHECK-LABEL: @strided_load_9x1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <9 x i32>* [[IN:%.*]] to i32* -; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]] -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, i32* [[TMP0]], i32 [[VEC_START]] +; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, i32* [[TMP0]], i64 [[VEC_START]] ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast i32* [[VEC_GEP]] to <9 x i32>* ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <9 x i32>, <9 x i32>* [[VEC_CAST]], align 4 ; CHECK-NEXT: ret <9 x i32> [[COL_LOAD]] ; entry: - %load = call <9 x i32> @llvm.matrix.columnwise.load(<9 x i32>* %in, i32 %stride, i32 9, i32 1) + %load = call <9 x i32> @llvm.matrix.column.major.load(<9 x i32>* %in, i64 %stride, i1 false, i32 9, i32 1) ret <9 x i32> %load } -declare <8 x i32> @llvm.matrix.columnwise.load.v8i32(<8 x i32>*, i32, i32, i32) +declare <8 x i32> @llvm.matrix.column.major.load.v8i32(<8 x i32>*, i64, i1, i32, i32) -define <8 x i32> @strided_load_4x2(<8 x i32>* %in, i32 %stride) { +define <8 x i32> @strided_load_4x2(<8 x i32>* %in, i64 %stride) { ; CHECK-LABEL: @strided_load_4x2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i32>* [[IN:%.*]] to i32* -; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]] -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, i32* [[TMP0]], i32 [[VEC_START]] +; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, i32* [[TMP0]], i64 [[VEC_START]] ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast i32* [[VEC_GEP]] to <4 x i32>* ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[VEC_CAST]], align 4 -; CHECK-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE]] -; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr i32, i32* [[TMP0]], i32 [[VEC_START1]] +; CHECK-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE]] +; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr i32, i32* [[TMP0]], i64 [[VEC_START1]] ; CHECK-NEXT: [[VEC_CAST3:%.*]] = bitcast i32* [[VEC_GEP2]] to <4 x i32>* ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <4 x i32>, <4 x i32>* [[VEC_CAST3]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[COL_LOAD]], <4 x i32> [[COL_LOAD4]], <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP1]] ; entry: - %load = call <8 x i32> @llvm.matrix.columnwise.load.v8i32(<8 x i32>* %in, i32 %stride, i32 4, i32 2) + %load = call <8 x i32> @llvm.matrix.column.major.load.v8i32(<8 x i32>* %in, 
i64 %stride, i1 false, i32 4, i32 2) ret <8 x i32> %load } diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll --- a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll @@ -8,35 +8,35 @@ ; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <6 x double> [[IN]], <6 x double> undef, <3 x i32> ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast double* [[OUT:%.*]] to <3 x double>* ; CHECK-NEXT: store <3 x double> [[SPLIT]], <3 x double>* [[VEC_CAST]], align 8 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[OUT]], i32 5 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[OUT]], i64 5 ; CHECK-NEXT: [[VEC_CAST2:%.*]] = bitcast double* [[VEC_GEP]] to <3 x double>* ; CHECK-NEXT: store <3 x double> [[SPLIT1]], <3 x double>* [[VEC_CAST2]], align 8 ; CHECK-NEXT: ret void ; - call void @llvm.matrix.columnwise.store(<6 x double> %in, double* %out, i32 5, i32 3, i32 2) + call void @llvm.matrix.column.major.store(<6 x double> %in, double* %out, i64 5, i1 false, i32 3, i32 2) ret void } -define void @strided_store_3x2_nonconst_stride(<6 x double> %in, i32 %stride, double* %out) { +define void @strided_store_3x2_nonconst_stride(<6 x double> %in, i64 %stride, double* %out) { ; CHECK-LABEL: @strided_store_3x2_nonconst_stride( ; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <6 x double> [[IN:%.*]], <6 x double> undef, <3 x i32> ; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <6 x double> [[IN]], <6 x double> undef, <3 x i32> -; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]] -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[OUT:%.*]], i32 [[VEC_START]] +; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 [[VEC_START]] ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast double* [[VEC_GEP]] to <3 x double>* ; CHECK-NEXT: store <3 x double> [[SPLIT]], <3 x double>* [[VEC_CAST]], align 8 -; CHECK-NEXT: [[VEC_START2:%.*]] = mul i32 1, [[STRIDE]] -; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, double* [[OUT]], i32 [[VEC_START2]] +; CHECK-NEXT: [[VEC_START2:%.*]] = mul i64 1, [[STRIDE]] +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, double* [[OUT]], i64 [[VEC_START2]] ; CHECK-NEXT: [[VEC_CAST4:%.*]] = bitcast double* [[VEC_GEP3]] to <3 x double>* ; CHECK-NEXT: store <3 x double> [[SPLIT1]], <3 x double>* [[VEC_CAST4]], align 8 ; CHECK-NEXT: ret void ; - call void @llvm.matrix.columnwise.store(<6 x double> %in, double* %out, i32 %stride, i32 3, i32 2) + call void @llvm.matrix.column.major.store(<6 x double> %in, double* %out, i64 %stride, i1 false, i32 3, i32 2) ret void } -declare void @llvm.matrix.columnwise.store(<6 x double>, double*, i32, i32, i32) +declare void @llvm.matrix.column.major.store(<6 x double>, double*, i64, i1, i32, i32) define void @strided_store_2x3(<10 x double> %in, double* %out) { ; CHECK-LABEL: @strided_store_2x3( @@ -47,28 +47,28 @@ ; CHECK-NEXT: [[SPLIT4:%.*]] = shufflevector <10 x double> [[IN]], <10 x double> undef, <2 x i32> ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast double* [[OUT:%.*]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[SPLIT]], <2 x double>* [[VEC_CAST]], align 8 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[OUT]], i32 4 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[OUT]], i64 4 ; CHECK-NEXT: [[VEC_CAST5:%.*]] = bitcast double* 
[[VEC_GEP]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[SPLIT1]], <2 x double>* [[VEC_CAST5]], align 8 -; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr double, double* [[OUT]], i32 8 +; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr double, double* [[OUT]], i64 8 ; CHECK-NEXT: [[VEC_CAST7:%.*]] = bitcast double* [[VEC_GEP6]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[SPLIT2]], <2 x double>* [[VEC_CAST7]], align 8 -; CHECK-NEXT: [[VEC_GEP8:%.*]] = getelementptr double, double* [[OUT]], i32 12 +; CHECK-NEXT: [[VEC_GEP8:%.*]] = getelementptr double, double* [[OUT]], i64 12 ; CHECK-NEXT: [[VEC_CAST9:%.*]] = bitcast double* [[VEC_GEP8]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[SPLIT3]], <2 x double>* [[VEC_CAST9]], align 8 -; CHECK-NEXT: [[VEC_GEP10:%.*]] = getelementptr double, double* [[OUT]], i32 16 +; CHECK-NEXT: [[VEC_GEP10:%.*]] = getelementptr double, double* [[OUT]], i64 16 ; CHECK-NEXT: [[VEC_CAST11:%.*]] = bitcast double* [[VEC_GEP10]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[SPLIT4]], <2 x double>* [[VEC_CAST11]], align 8 ; CHECK-NEXT: ret void ; - call void @llvm.matrix.columnwise.store.v10f64(<10 x double> %in, double* %out, i32 4, i32 2, i32 5) + call void @llvm.matrix.column.major.store.v10f64(<10 x double> %in, double* %out, i64 4, i1 false, i32 2, i32 5) ret void } -declare void @llvm.matrix.columnwise.store.v10f64(<10 x double>, double*, i32, i32, i32) +declare void @llvm.matrix.column.major.store.v10f64(<10 x double>, double*, i64, i1, i32, i32) -; CHECK: declare void @llvm.matrix.columnwise.store.v6f64.p0f64(<6 x double>, double* nocapture writeonly, i32, i32 immarg, i32 immarg) [[WRITEONLY:#[0-9]]] +; CHECK: declare void @llvm.matrix.column.major.store.v6f64.p0f64(<6 x double>, double* nocapture writeonly, i64, i1 immarg, i32 immarg, i32 immarg) [[WRITEONLY:#[0-9]]] -; CHECK: declare void @llvm.matrix.columnwise.store.v10f64.p0f64(<10 x double>, double* nocapture writeonly, i32, i32 immarg, i32 immarg) [[WRITEONLY]] +; CHECK: declare void @llvm.matrix.column.major.store.v10f64.p0f64(<10 x double>, double* nocapture writeonly, i64, i1 immarg, i32 immarg, i32 immarg) [[WRITEONLY]] ; CHECK: attributes [[WRITEONLY]] = { argmemonly nosync nounwind willreturn writeonly } diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-float.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-float.ll --- a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-float.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-float.ll @@ -8,35 +8,35 @@ ; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <6 x float> [[IN]], <6 x float> undef, <3 x i32> ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast float* [[OUT:%.*]] to <3 x float>* ; CHECK-NEXT: store <3 x float> [[SPLIT]], <3 x float>* [[VEC_CAST]], align 4 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, float* [[OUT]], i32 5 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, float* [[OUT]], i64 5 ; CHECK-NEXT: [[VEC_CAST2:%.*]] = bitcast float* [[VEC_GEP]] to <3 x float>* ; CHECK-NEXT: store <3 x float> [[SPLIT1]], <3 x float>* [[VEC_CAST2]], align 4 ; CHECK-NEXT: ret void ; - call void @llvm.matrix.columnwise.store(<6 x float> %in, float* %out, i32 5, i32 3, i32 2) + call void @llvm.matrix.column.major.store(<6 x float> %in, float* %out, i64 5, i1 false, i32 3, i32 2) ret void } -define void @strided_store_3x2_nonconst_stride(<6 x float> %in, i32 %stride, float* %out) { +define void @strided_store_3x2_nonconst_stride(<6 x float> %in, i64 %stride, float* %out) { ; 
CHECK-LABEL: @strided_store_3x2_nonconst_stride( ; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <6 x float> [[IN:%.*]], <6 x float> undef, <3 x i32> ; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <6 x float> [[IN]], <6 x float> undef, <3 x i32> -; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]] -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, float* [[OUT:%.*]], i32 [[VEC_START]] +; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, float* [[OUT:%.*]], i64 [[VEC_START]] ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast float* [[VEC_GEP]] to <3 x float>* ; CHECK-NEXT: store <3 x float> [[SPLIT]], <3 x float>* [[VEC_CAST]], align 4 -; CHECK-NEXT: [[VEC_START2:%.*]] = mul i32 1, [[STRIDE]] -; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr float, float* [[OUT]], i32 [[VEC_START2]] +; CHECK-NEXT: [[VEC_START2:%.*]] = mul i64 1, [[STRIDE]] +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr float, float* [[OUT]], i64 [[VEC_START2]] ; CHECK-NEXT: [[VEC_CAST4:%.*]] = bitcast float* [[VEC_GEP3]] to <3 x float>* ; CHECK-NEXT: store <3 x float> [[SPLIT1]], <3 x float>* [[VEC_CAST4]], align 4 ; CHECK-NEXT: ret void ; - call void @llvm.matrix.columnwise.store(<6 x float> %in, float* %out, i32 %stride, i32 3, i32 2) + call void @llvm.matrix.column.major.store(<6 x float> %in, float* %out, i64 %stride, i1 false, i32 3, i32 2) ret void } -declare void @llvm.matrix.columnwise.store(<6 x float>, float*, i32, i32, i32) +declare void @llvm.matrix.column.major.store(<6 x float>, float*, i64, i1, i32, i32) define void @strided_store_2x3(<10 x float> %in, float* %out) { ; CHECK-LABEL: @strided_store_2x3( @@ -47,22 +47,22 @@ ; CHECK-NEXT: [[SPLIT4:%.*]] = shufflevector <10 x float> [[IN]], <10 x float> undef, <2 x i32> ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast float* [[OUT:%.*]] to <2 x float>* ; CHECK-NEXT: store <2 x float> [[SPLIT]], <2 x float>* [[VEC_CAST]], align 4 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, float* [[OUT]], i32 4 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, float* [[OUT]], i64 4 ; CHECK-NEXT: [[VEC_CAST5:%.*]] = bitcast float* [[VEC_GEP]] to <2 x float>* ; CHECK-NEXT: store <2 x float> [[SPLIT1]], <2 x float>* [[VEC_CAST5]], align 4 -; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr float, float* [[OUT]], i32 8 +; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr float, float* [[OUT]], i64 8 ; CHECK-NEXT: [[VEC_CAST7:%.*]] = bitcast float* [[VEC_GEP6]] to <2 x float>* ; CHECK-NEXT: store <2 x float> [[SPLIT2]], <2 x float>* [[VEC_CAST7]], align 4 -; CHECK-NEXT: [[VEC_GEP8:%.*]] = getelementptr float, float* [[OUT]], i32 12 +; CHECK-NEXT: [[VEC_GEP8:%.*]] = getelementptr float, float* [[OUT]], i64 12 ; CHECK-NEXT: [[VEC_CAST9:%.*]] = bitcast float* [[VEC_GEP8]] to <2 x float>* ; CHECK-NEXT: store <2 x float> [[SPLIT3]], <2 x float>* [[VEC_CAST9]], align 4 -; CHECK-NEXT: [[VEC_GEP10:%.*]] = getelementptr float, float* [[OUT]], i32 16 +; CHECK-NEXT: [[VEC_GEP10:%.*]] = getelementptr float, float* [[OUT]], i64 16 ; CHECK-NEXT: [[VEC_CAST11:%.*]] = bitcast float* [[VEC_GEP10]] to <2 x float>* ; CHECK-NEXT: store <2 x float> [[SPLIT4]], <2 x float>* [[VEC_CAST11]], align 4 ; CHECK-NEXT: ret void ; - call void @llvm.matrix.columnwise.store.v10f32(<10 x float> %in, float* %out, i32 4, i32 2, i32 5) + call void @llvm.matrix.column.major.store.v10f32(<10 x float> %in, float* %out, i64 4, i1 false, i32 2, i32 5) ret void } -declare void @llvm.matrix.columnwise.store.v10f32(<10 x float>, float*, i32, i32, i32) +declare void 
@llvm.matrix.column.major.store.v10f32(<10 x float>, float*, i64, i1, i32, i32) diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-i32.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-i32.ll --- a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-i32.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-i32.ll @@ -8,35 +8,35 @@ ; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <6 x i32> [[IN]], <6 x i32> undef, <3 x i32> ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast i32* [[OUT:%.*]] to <3 x i32>* ; CHECK-NEXT: store <3 x i32> [[SPLIT]], <3 x i32>* [[VEC_CAST]], align 4 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, i32* [[OUT]], i32 5 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, i32* [[OUT]], i64 5 ; CHECK-NEXT: [[VEC_CAST2:%.*]] = bitcast i32* [[VEC_GEP]] to <3 x i32>* ; CHECK-NEXT: store <3 x i32> [[SPLIT1]], <3 x i32>* [[VEC_CAST2]], align 4 ; CHECK-NEXT: ret void ; - call void @llvm.matrix.columnwise.store(<6 x i32> %in, i32* %out, i32 5, i32 3, i32 2) + call void @llvm.matrix.column.major.store(<6 x i32> %in, i32* %out, i64 5, i1 false, i32 3, i32 2) ret void } -define void @strided_store_3x2_nonconst_stride(<6 x i32> %in, i32 %stride, i32* %out) { +define void @strided_store_3x2_nonconst_stride(<6 x i32> %in, i64 %stride, i32* %out) { ; CHECK-LABEL: @strided_store_3x2_nonconst_stride( ; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <6 x i32> [[IN:%.*]], <6 x i32> undef, <3 x i32> ; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <6 x i32> [[IN]], <6 x i32> undef, <3 x i32> -; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]] -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, i32* [[OUT:%.*]], i32 [[VEC_START]] +; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, i32* [[OUT:%.*]], i64 [[VEC_START]] ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast i32* [[VEC_GEP]] to <3 x i32>* ; CHECK-NEXT: store <3 x i32> [[SPLIT]], <3 x i32>* [[VEC_CAST]], align 4 -; CHECK-NEXT: [[VEC_START2:%.*]] = mul i32 1, [[STRIDE]] -; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr i32, i32* [[OUT]], i32 [[VEC_START2]] +; CHECK-NEXT: [[VEC_START2:%.*]] = mul i64 1, [[STRIDE]] +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr i32, i32* [[OUT]], i64 [[VEC_START2]] ; CHECK-NEXT: [[VEC_CAST4:%.*]] = bitcast i32* [[VEC_GEP3]] to <3 x i32>* ; CHECK-NEXT: store <3 x i32> [[SPLIT1]], <3 x i32>* [[VEC_CAST4]], align 4 ; CHECK-NEXT: ret void ; - call void @llvm.matrix.columnwise.store(<6 x i32> %in, i32* %out, i32 %stride, i32 3, i32 2) + call void @llvm.matrix.column.major.store(<6 x i32> %in, i32* %out, i64 %stride, i1 false, i32 3, i32 2) ret void } -declare void @llvm.matrix.columnwise.store(<6 x i32>, i32*, i32, i32, i32) +declare void @llvm.matrix.column.major.store(<6 x i32>, i32*, i64, i1, i32, i32) define void @strided_store_2x3(<10 x i32> %in, i32* %out) { ; CHECK-LABEL: @strided_store_2x3( @@ -47,22 +47,22 @@ ; CHECK-NEXT: [[SPLIT4:%.*]] = shufflevector <10 x i32> [[IN]], <10 x i32> undef, <2 x i32> ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast i32* [[OUT:%.*]] to <2 x i32>* ; CHECK-NEXT: store <2 x i32> [[SPLIT]], <2 x i32>* [[VEC_CAST]], align 4 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, i32* [[OUT]], i32 4 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, i32* [[OUT]], i64 4 ; CHECK-NEXT: [[VEC_CAST5:%.*]] = bitcast i32* [[VEC_GEP]] to <2 x i32>* ; CHECK-NEXT: store <2 x i32> [[SPLIT1]], <2 x i32>* [[VEC_CAST5]], align 4 -; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr i32, i32* [[OUT]], i32 8 +; 
CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr i32, i32* [[OUT]], i64 8 ; CHECK-NEXT: [[VEC_CAST7:%.*]] = bitcast i32* [[VEC_GEP6]] to <2 x i32>* ; CHECK-NEXT: store <2 x i32> [[SPLIT2]], <2 x i32>* [[VEC_CAST7]], align 4 -; CHECK-NEXT: [[VEC_GEP8:%.*]] = getelementptr i32, i32* [[OUT]], i32 12 +; CHECK-NEXT: [[VEC_GEP8:%.*]] = getelementptr i32, i32* [[OUT]], i64 12 ; CHECK-NEXT: [[VEC_CAST9:%.*]] = bitcast i32* [[VEC_GEP8]] to <2 x i32>* ; CHECK-NEXT: store <2 x i32> [[SPLIT3]], <2 x i32>* [[VEC_CAST9]], align 4 -; CHECK-NEXT: [[VEC_GEP10:%.*]] = getelementptr i32, i32* [[OUT]], i32 16 +; CHECK-NEXT: [[VEC_GEP10:%.*]] = getelementptr i32, i32* [[OUT]], i64 16 ; CHECK-NEXT: [[VEC_CAST11:%.*]] = bitcast i32* [[VEC_GEP10]] to <2 x i32>* ; CHECK-NEXT: store <2 x i32> [[SPLIT4]], <2 x i32>* [[VEC_CAST11]], align 4 ; CHECK-NEXT: ret void ; - call void @llvm.matrix.columnwise.store.v10i32(<10 x i32> %in, i32* %out, i32 4, i32 2, i32 5) + call void @llvm.matrix.column.major.store.v10i32(<10 x i32> %in, i32* %out, i64 4, i1 false, i32 2, i32 5) ret void } -declare void @llvm.matrix.columnwise.store.v10i32(<10 x i32>, i32*, i32, i32, i32) +declare void @llvm.matrix.column.major.store.v10i32(<10 x i32>, i32*, i64, i1, i32, i32) diff --git a/llvm/test/Verifier/matrix-intrinsics.ll b/llvm/test/Verifier/matrix-intrinsics.ll --- a/llvm/test/Verifier/matrix-intrinsics.ll +++ b/llvm/test/Verifier/matrix-intrinsics.ll @@ -19,22 +19,22 @@ ret <4 x float> %result.2 } -declare <4 x float> @llvm.matrix.columnwise.load.v4f32.p0v4f32(<4 x float>*, i32, i32, i32) -declare <6 x float> @llvm.matrix.columnwise.load.v6f32.p0v6f32(<6 x float>*, i32, i32, i32) -define <4 x float> @columnwise_load(<4 x float>* %m, <6 x float>* %n) { +declare <4 x float> @llvm.matrix.column.major.load.v4f32.p0v4f32(<4 x float>*, i64, i1, i32, i32) +declare <6 x float> @llvm.matrix.column.major.load.v6f32.p0v6f32(<6 x float>*, i64, i1, i32, i32) +define <4 x float> @column.major_load(<4 x float>* %m, <6 x float>* %n) { ; CHECK-NEXT: result of a matrix operation does not fit in the returned vector ; CHECK-NEXT: result of a matrix operation does not fit in the returned vector - %result.1 = call <4 x float> @llvm.matrix.columnwise.load.v4f32.p0v4f32(<4 x float>* %m, i32 2, i32 1, i32 2) - %result.2 = call <6 x float> @llvm.matrix.columnwise.load.v6f32.p0v6f32(<6 x float>* %n, i32 2, i32 3, i32 3) + %result.1 = call <4 x float> @llvm.matrix.column.major.load.v4f32.p0v4f32(<4 x float>* %m, i64 2, i1 false, i32 1, i32 2) + %result.2 = call <6 x float> @llvm.matrix.column.major.load.v6f32.p0v6f32(<6 x float>* %n, i64 2, i1 true, i32 3, i32 3) ret <4 x float> %result.1 } -declare void @llvm.matrix.columnwise.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, i32, i32) -declare void @llvm.matrix.columnwise.store.v6f32.p0v6f32(<6 x float>, <6 x float>*, i32, i32, i32) -define void @columnwise_store(<4 x float>* %m, <6 x float>* %n) { +declare void @llvm.matrix.column.major.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i64, i1, i32, i32) +declare void @llvm.matrix.column.major.store.v6f32.p0v6f32(<6 x float>, <6 x float>*, i64, i1, i32, i32) +define void @column.major_store(<4 x float>* %m, <6 x float>* %n) { ; CHECK-NEXT: result of a matrix operation does not fit in the returned vector ; CHECK-NEXT: result of a matrix operation does not fit in the returned vector - call void @llvm.matrix.columnwise.store.v4f32.p0v4f32(<4 x float> zeroinitializer, <4 x float>* %m, i32 2, i32 1, i32 2) - call void @llvm.matrix.columnwise.store.v6f32.p0v6f32(<6 x 
float> zeroinitializer, <6 x float>* %n, i32 2, i32 3, i32 3) + call void @llvm.matrix.column.major.store.v4f32.p0v4f32(<4 x float> zeroinitializer, <4 x float>* %m, i64 2, i1 false, i32 1, i32 2) + call void @llvm.matrix.column.major.store.v6f32.p0v6f32(<6 x float> zeroinitializer, <6 x float>* %n, i64 2, i1 false, i32 3, i32 3) ret void } diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -815,40 +815,52 @@ // LLVM Matrix operations. // -/// Create a columnwise, strided 2-D matrix load, as specified in the LLVM +/// Create a column major, strided 2-D matrix load, as specified in the LLVM /// MatrixBuilder. -/// data - Start address of the matrix read -/// rows - Number of rows in matrix (must be a constant) -/// columns - Number of columns in matrix (must be a constant) -/// stride - Space between columns -def LLVM_MatrixColumnsWiseLoadOp - : LLVM_OneResultOp<"intr.matrix.columnwise.load">, - Arguments<(ins LLVM_Type:$data, LLVM_Type:$stride, +/// data - Start address of the matrix read +/// rows - Number of rows in matrix (must be a constant) +/// isVolatile - True if the load operation is marked as volatile. +/// columns - Number of columns in matrix (must be a constant) +/// stride - Space between columns +def LLVM_MatrixColumnMajorLoadOp + : LLVM_OneResultOp<"intr.matrix.column.major.load">, + Arguments<(ins LLVM_Type:$data, LLVM_Type:$stride, I1Attr:$isVolatile, I32Attr:$rows, I32Attr:$columns)> { string llvmBuilder = [{ llvm::MatrixBuilder mb(builder); - $res = mb.CreateMatrixColumnwiseLoad( - $data, $rows.getZExtValue(), $columns.getZExtValue(), $stride); + const llvm::DataLayout &dl = + builder.GetInsertBlock()->getModule()->getDataLayout(); + llvm::Align align = dl.getABITypeAlign( + $data->getType()->getPointerElementType()); + $res = mb.CreateColumnMajorLoad( + $data, align, $stride, $isVolatile.getZExtValue(), $rows.getZExtValue(), + $columns.getZExtValue()); }]; let assemblyFormat = "$data `,` `<` `stride` `=` $stride `>` attr-dict" "`:` type($res) `from` type($data) `stride` type($stride)"; } -/// Create a columnwise, strided 2-D matrix store, as specified in the LLVM +/// Create a column major, strided 2-D matrix store, as specified in the LLVM /// MatrixBuilder. -/// matrix - Matrix to store -/// ptr - Pointer to write back to -/// rows - Number of rows in matrix (must be a constant) -/// columns - Number of columns in matrix (must be a constant) -/// stride - Space between columns -def LLVM_MatrixColumnsWiseStoreOp - : LLVM_ZeroResultOp<"intr.matrix.columnwise.store">, +/// matrix - Matrix to store +/// ptr - Pointer to write back to +/// isVolatile - True if the store operation is marked as volatile.
+/// rows - Number of rows in matrix (must be a constant) +/// columns - Number of columns in matrix (must be a constant) +/// stride - Space between columns +def LLVM_MatrixColumnMajorStoreOp + : LLVM_ZeroResultOp<"intr.matrix.column.major.store">, Arguments<(ins LLVM_Type:$matrix, LLVM_Type:$data, LLVM_Type:$stride, - I32Attr:$rows, I32Attr:$columns)> { + I1Attr:$isVolatile, I32Attr:$rows, I32Attr:$columns)> { string llvmBuilder = [{ llvm::MatrixBuilder mb(builder); - mb.CreateMatrixColumnwiseStore( - $matrix, $data, $stride, $rows.getZExtValue(), $columns.getZExtValue()); + const llvm::DataLayout &dl = + builder.GetInsertBlock()->getModule()->getDataLayout(); + llvm::Align align = dl.getABITypeAlign( + $data->getType()->getPointerElementType()); + mb.CreateColumnMajorStore( + $matrix, $data, align, $stride, $isVolatile.getZExtValue(), + $rows.getZExtValue(), $columns.getZExtValue()); }]; let assemblyFormat = "$matrix `,` $data `,` `<` `stride` `=` $stride `>` " "attr-dict`:` type($matrix) `to` type($data) `stride` type($stride)"; diff --git a/mlir/test/Target/llvmir-intrinsics.mlir b/mlir/test/Target/llvmir-intrinsics.mlir --- a/mlir/test/Target/llvmir-intrinsics.mlir +++ b/mlir/test/Target/llvmir-intrinsics.mlir @@ -151,7 +151,7 @@ // CHECK-LABEL: @matrix_intrinsics // 4x16 16x3 llvm.func @matrix_intrinsics(%A: !llvm<"<64 x float>">, %B: !llvm<"<48 x float>">, - %ptr: !llvm<"float*">, %stride: !llvm.i32) { + %ptr: !llvm<"float*">, %stride: !llvm.i64) { // CHECK: call <12 x float> @llvm.matrix.multiply.v12f32.v64f32.v48f32(<64 x float> %0, <48 x float> %1, i32 4, i32 16, i32 3) %C = llvm.intr.matrix.multiply %A, %B { lhs_rows = 4: i32, lhs_columns = 16: i32 , rhs_columns = 3: i32} : @@ -159,14 +159,14 @@ // CHECK: call <48 x float> @llvm.matrix.transpose.v48f32(<48 x float> %1, i32 3, i32 16) %D = llvm.intr.matrix.transpose %B { rows = 3: i32, columns = 16: i32} : !llvm<"<48 x float>"> into !llvm<"<48 x float>"> - // CHECK: call <48 x float> @llvm.matrix.columnwise.load.v48f32.p0f32(float* %2, i32 %3, i32 3, i32 16) - %E = llvm.intr.matrix.columnwise.load %ptr, - { rows = 3: i32, columns = 16: i32} : - !llvm<"<48 x float>"> from !llvm<"float*"> stride !llvm.i32 - // CHECK: call void @llvm.matrix.columnwise.store.v48f32.p0f32(<48 x float> %7, float* %2, i32 %3, i32 3, i32 16) - llvm.intr.matrix.columnwise.store %E, %ptr, - { rows = 3: i32, columns = 16: i32} : - !llvm<"<48 x float>"> to !llvm<"float*"> stride !llvm.i32 + // CHECK: call <48 x float> @llvm.matrix.column.major.load.v48f32.p0f32(float* align 4 %2, i64 %3, i1 false, i32 3, i32 16) + %E = llvm.intr.matrix.column.major.load %ptr, + { isVolatile = 0: i1, rows = 3: i32, columns = 16: i32} : + !llvm<"<48 x float>"> from !llvm<"float*"> stride !llvm.i64 + // CHECK: call void @llvm.matrix.column.major.store.v48f32.p0f32(<48 x float> %7, float* align 4 %2, i64 %3, i1 false, i32 3, i32 16) + llvm.intr.matrix.column.major.store %E, %ptr, + { isVolatile = 0: i1, rows = 3: i32, columns = 16: i32} : + !llvm<"<48 x float>"> to !llvm<"float*"> stride !llvm.i64 llvm.return } @@ -209,7 +209,7 @@ // CHECK-DAG: declare float @llvm.copysign.f32(float, float) // CHECK-DAG: declare <12 x float> @llvm.matrix.multiply.v12f32.v64f32.v48f32(<64 x float>, <48 x float>, i32 immarg, i32 immarg, i32 immarg) // CHECK-DAG: declare <48 x float> @llvm.matrix.transpose.v48f32(<48 x float>, i32 immarg, i32 immarg) -// CHECK-DAG: declare <48 x float> @llvm.matrix.columnwise.load.v48f32.p0f32(float* nocapture, i32, i32 immarg, i32 immarg) -// CHECK-DAG: declare 
void @llvm.matrix.columnwise.store.v48f32.p0f32(<48 x float>, float* nocapture writeonly, i32, i32 immarg, i32 immarg)
+// CHECK-DAG: declare <48 x float> @llvm.matrix.column.major.load.v48f32.p0f32(float* nocapture, i64, i1 immarg, i32 immarg, i32 immarg)
+// CHECK-DAG: declare void @llvm.matrix.column.major.store.v48f32.p0f32(<48 x float>, float* nocapture writeonly, i64, i1 immarg, i32 immarg, i32 immarg)
 // CHECK-DAG: declare <7 x float> @llvm.masked.load.v7f32.p0v7f32(<7 x float>*, i32 immarg, <7 x i1>, <7 x float>)
 // CHECK-DAG: declare void @llvm.masked.store.v7f32.p0v7f32(<7 x float>, <7 x float>*, i32 immarg, <7 x i1>)
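
For quick reference, the snippet below is a minimal hand-written sketch (it is not part of the patch) showing the renamed intrinsics end-to-end with the widened i64 stride, the new i1 volatile flag, and an optional align parameter attribute on the pointer operand; the function and value names are illustrative only:

; Hand-written example (not from the patch): load a 2x2 double matrix whose
; columns start 2 elements apart, scale it, and store it back as a volatile access.
define <4 x double> @column_major_roundtrip(<4 x double>* %src, <4 x double>* %dst) {
  %m = call <4 x double> @llvm.matrix.column.major.load.v4f64.p0v4f64(<4 x double>* align 16 %src, i64 2, i1 false, i32 2, i32 2)
  %m2 = fadd <4 x double> %m, %m
  call void @llvm.matrix.column.major.store.v4f64.p0v4f64(<4 x double> %m2, <4 x double>* align 16 %dst, i64 2, i1 true, i32 2, i32 2)
  ret <4 x double> %m2
}

declare <4 x double> @llvm.matrix.column.major.load.v4f64.p0v4f64(<4 x double>*, i64, i1, i32, i32)
declare void @llvm.matrix.column.major.store.v4f64.p0v4f64(<4 x double>, <4 x double>*, i64, i1, i32, i32)

The stride (2 here) must be at least the number of rows, and the align attribute on %Ptr is optional; the MLIR lowering in this patch, for instance, supplies the ABI alignment of the pointee type.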