diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -14407,6 +14407,115 @@ """""""""" The argument to this intrinsic must be a vector of floating-point values. +Matrix Intrinsics +----------------- + +Operations on matrixes requiring shape information (like number of rows/columns +or the memory layout) can be expressed using the matrix intrinsics. Matrixes are +embedded in a flat vector and the intrinsics take the dimensions as arguments. +Currently column-major layout is assumed. The intrinsics support both integer +and floating point matrixes. + + +'``llvm.matrix.transpose.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare vectorty @llvm.matrix.transpose.*(vectorty %In, i32 , i32 ) + +Overview: +""""""""" + +The '``llvm.matrix.transpose.*``' intrinsic treats %In as containing a matrix +with rows and columns and returns the transposed matrix embedded in +the result vector. + +Arguments: +"""""""""" + +The and arguments must be constant integers. The vector argument +%In and the returned vector must have * elements. + +'``llvm.matrix.multiply.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare vectorty @llvm.matrix.multiply.*(vectorty %A, vectorty %B, i32 , i32 , i32 ) + +Overview: +""""""""" + +The '``llvm.matrix.multiply.*``' intrinsic treats %A as matrix with rows and columns, %B as +matrix with rows and columns and multiplies them. The result matrix is returned embedded in the +result vector. + +Arguments: +"""""""""" + +The , and arguments must be constant integers. The vector argument %A +must have * elements, %B must have * elements and the returned +vector must have * elements. + + +'``llvm.matrix.columnwise.load.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare vectorty @llvm.matrix.columnwise.load.*(ptrty %Ptr, i32 %Stride, i32 , i32 ) + +Overview: +""""""""" + +The '``llvm.matrix.columnwise.load.*``' intrinsic loads a matrix with +rows and columns, using a stride of %Stride between columns. For two +consecutive columns A and B, %Stride refers to the distance (the number of +elements) between the start of column A and the start of column B. The result +matrix is returned embedded in the result vector. This allows for convenient +loading of sub matrixes. + +Arguments: +"""""""""" + +The and arguments must be constant integers. The returned vector +must have * elements. %Stride must be >= . + +'``llvm.matrix.columnwise.store.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare void @llvm.matrix.columnwise.store.*(vectorty %In, ptrty %Ptr, i32 %Stride, i32 , i32 ) + +Overview: +""""""""" + +The '``llvm.matrix.columnwise.store.*``' intrinsic stores the matrix with + rows and columns embedded in %In, using a stride of %Stride +between columns. For two consecutive columns A and B, %Stride refers to the +distance (the number of elements) between the start of column A and the start +of column B. + +Arguments: +"""""""""" + +The and arguments must be constant integers. The vector argument +%In must have * elements. %Stride must be >= . + Half Precision Floating-Point Intrinsics ---------------------------------------- diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1242,6 +1242,42 @@ [llvm_anyvector_ty]>; } +//===----- Matrix intrinsics ---------------------------------------------===// + +def int_matrix_transpose : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, + llvm_i32_ty, + llvm_i32_ty], + [IntrNoMem, IntrSpeculatable, + IntrWillReturn, ImmArg<1>, ImmArg<2>]>; + +def int_matrix_multiply : Intrinsic<[llvm_anyvector_ty], + [llvm_anyvector_ty, + llvm_anyvector_ty, + llvm_i32_ty, + llvm_i32_ty, + llvm_i32_ty], + [IntrNoMem, IntrSpeculatable, + IntrWillReturn, ImmArg<2>, ImmArg<3>, + ImmArg<4>]>; + +def int_matrix_columnwise_load : Intrinsic<[llvm_anyvector_ty], + [LLVMAnyPointerType>, + llvm_i32_ty, + llvm_i32_ty, + llvm_i32_ty], + [IntrReadMem, IntrWillReturn, + ImmArg<2>, ImmArg<3>]>; + +def int_matrix_columnwise_store : Intrinsic<[], + [llvm_anyvector_ty, + LLVMAnyPointerType>, + llvm_i32_ty, + llvm_i32_ty, + llvm_i32_ty], + [WriteOnly<1>, IntrWillReturn, + ImmArg<3>, ImmArg<4>]>; + //===---------- Intrinsics to control hardware supported loops ----------===// // Specify that the value given is the number of iterations that the next loop diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -255,6 +255,7 @@ void initializeLowerInvokeLegacyPassPass(PassRegistry&); void initializeLowerSwitchPass(PassRegistry&); void initializeLowerTypeTestsPass(PassRegistry&); +void initializeLowerMatrixIntrinsicsLegacyPassPass(PassRegistry &); void initializeMIRCanonicalizerPass(PassRegistry &); void initializeMIRNamerPass(PassRegistry &); void initializeMIRPrintingPassPass(PassRegistry&); diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h --- a/llvm/include/llvm/Transforms/Scalar.h +++ b/llvm/include/llvm/Transforms/Scalar.h @@ -361,6 +361,12 @@ //===----------------------------------------------------------------------===// // +// LowerMatrixIntrinsics - Lower matrix intrinsics to vector operations. +// +Pass *createLowerMatrixIntrinsicsPass(); + +//===----------------------------------------------------------------------===// +// // LowerWidenableCondition - Lower widenable condition to i1 true. // Pass *createLowerWidenableConditionPass(); diff --git a/llvm/include/llvm/Transforms/Scalar/LowerMatrixIntrinsics.h b/llvm/include/llvm/Transforms/Scalar/LowerMatrixIntrinsics.h new file mode 100644 --- /dev/null +++ b/llvm/include/llvm/Transforms/Scalar/LowerMatrixIntrinsics.h @@ -0,0 +1,24 @@ +//===- LowerMatrixIntrinsics.h - Lower matrix intrinsics. -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass lowers matrix intrinsics down to vector operations. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_SCALAR_LOWERMATRIXINTRINSICSPASS_H +#define LLVM_TRANSFORMS_SCALAR_LOWERMATRIXINTRINSICSPASS_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { +struct LowerMatrixIntrinsicsPass : PassInfoMixin { + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; +} // namespace llvm + +#endif diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -146,6 +146,7 @@ #include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h" #include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h" #include "llvm/Transforms/Scalar/LowerGuardIntrinsic.h" +#include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h" #include "llvm/Transforms/Scalar/LowerWidenableCondition.h" #include "llvm/Transforms/Scalar/MakeGuardsExplicit.h" #include "llvm/Transforms/Scalar/MemCpyOptimizer.h" diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -189,6 +189,7 @@ FUNCTION_PASS("lower-expect", LowerExpectIntrinsicPass()) FUNCTION_PASS("lower-guard-intrinsic", LowerGuardIntrinsicPass()) FUNCTION_PASS("lower-constant-intrinsics", LowerConstantIntrinsicsPass()) +FUNCTION_PASS("lower-matrix-intrinsics", LowerMatrixIntrinsicsPass()) FUNCTION_PASS("lower-widenable-condition", LowerWidenableConditionPass()) FUNCTION_PASS("guard-widening", GuardWideningPass()) FUNCTION_PASS("gvn", GVN()) diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -147,6 +147,10 @@ "enable-order-file-instrumentation", cl::init(false), cl::Hidden, cl::desc("Enable order file instrumentation (default = off)")); +static cl::opt + EnableMatrix("enable-matrix", cl::init(false), cl::Hidden, + cl::desc("Enable lowering of the matrix intrinsics")); + PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; SizeLevel = 0; @@ -669,6 +673,14 @@ MPM.add(createFloat2IntPass()); MPM.add(createLowerConstantIntrinsicsPass()); + if (EnableMatrix) { + MPM.add(createLowerMatrixIntrinsicsPass()); + // CSE the pointer arithmetic of the column vectors. This allows alias + // analysis to establish no-aliasing between loads and stores of different + // columns of the same matrix. + MPM.add(createEarlyCSEPass(false)); + } + addExtensionsToPM(EP_VectorizerStart, MPM); // Re-rotate loops in all our loop nests. These may have fallout out of diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt --- a/llvm/lib/Transforms/Scalar/CMakeLists.txt +++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt @@ -47,6 +47,7 @@ LowerConstantIntrinsics.cpp LowerExpectIntrinsic.cpp LowerGuardIntrinsic.cpp + LowerMatrixIntrinsics.cpp LowerWidenableCondition.cpp MakeGuardsExplicit.cpp MemCpyOptimizer.cpp diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -0,0 +1,479 @@ +//===- LowerMatrixIntrinsics.cpp - Lower matrix intrinsics -----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Lower matrix intrinsics to vector operations. +// +// TODO: +// * Implement multiply & add fusion +// * Implement shape propagation +// * Implement optimizations to reduce or eliminateshufflevector uses by using +// shape information. +// * Add remark, summarizing the available matrix optimization opportunities. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h" +#include "llvm/ADT/GraphTraits.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/VectorUtils.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Scalar.h" + +using namespace llvm; + +#define DEBUG_TYPE "lower-matrix-intrinsics" + +namespace { + +// Given an element poitner \p BasePtr to the start of a (sub) matrix, compute +// the start address of column \p Col with type (\p EltType x \p NumRows) +// assuming \p Stride elements between start two consecutive columns. +// \p Stride must be >= \p NumRows. +// +// Consider a 4x4 matrix like below +// +// 0 1 2 3 +// 0 v_0_0 v_0_1 v_0_2 v_0_3 +// 1 v_1_0 v_1_1 v_1_2 v_1_3 +// 2 v_2_0 v_2_1 v_2_2 v_2_3 +// 3 v_3_0 v_3_1 v_3_2 v_3_3 + +// To compute the column addresses for a 2x3 sub-matrix at row 1 and column 1, +// we need a pointer to the first element of the submatrix as base pointer. +// Then we can use computeColumnAddr to compute the addresses for the columns +// of the sub-matrix. +// +// Column 0: computeColumnAddr(Base, 0 (column), 4 (stride), 2 (num rows), ..) +// -> just returns Base +// Column 1: computeColumnAddr(Base, 1 (column), 4 (stride), 2 (num rows), ..) +// -> returns Base + (1 * 4) +// Column 2: computeColumnAddr(Base, 2 (column), 4 (stride), 2 (num rows), ..) +// -> returns Base + (2 * 4) +// +// The graphic below illustrates the number of elements in a column (marked +// with |) and the number of skipped elements (marked with }). +// +// v_0_0 v_0_1 {v_0_2 {v_0_3 +// Base Col 1 Col 2 +// | | | +// v_1_0 |v_1_1 |v_1_2 |v_1_3 +// v_2_0 |v_2_1 |v_2_2 |v_2_3 +// v_3_0 {v_3_1 {v_3_2 v_3_3 +// +Value *computeColumnAddr(Value *BasePtr, Value *Col, Value *Stride, + unsigned NumRows, Type *EltType, + IRBuilder<> &Builder) { + + assert((!isa(Stride) || + cast(Stride)->getZExtValue() >= NumRows) && + "Stride must be >= the number of rows."); + unsigned AS = cast(BasePtr->getType())->getAddressSpace(); + + // Compute the start of the column with index Col as Col * Stride. + Value *ColumnStart = Builder.CreateMul(Col, Stride); + + // Get pointer to the start of the selected column. Skip GEP creation, + // if we select column 0. + if (isa(ColumnStart) && cast(ColumnStart)->isZero()) + ColumnStart = BasePtr; + else + ColumnStart = Builder.CreateGEP(EltType, BasePtr, ColumnStart); + + // Cast elementwise column start pointer to a pointer to a column + // (EltType x NumRows)*. + Type *ColumnType = VectorType::get(EltType, NumRows); + Type *ColumnPtrType = PointerType::get(ColumnType, AS); + return Builder.CreatePointerCast(ColumnStart, ColumnPtrType); +} + +/// LowerMatrixIntrinsics contains the methods used to lower matrix intrinsics. +/// +/// Currently, the lowering for each matrix intrinsic is done as follows: +/// 1. Split the operand vectors containing an embedded matrix into a set of +/// column vectors, based on the shape information from the intrinsic. +/// 2. Apply the transformation described by the intrinsic on the column +/// vectors, which yields a set of column vectors containing result matrix. +/// 3. Embed the columns of the result matrix in a flat vector and replace all +/// uses of the intrinsic result with it. +class LowerMatrixIntrinsics { + Function &Func; + const DataLayout &DL; + const TargetTransformInfo &TTI; + + /// Wrapper class representing a matrix as a set of column vectors. + /// All column vectors must have the same vector type. + class ColumnMatrixTy { + SmallVector Columns; + + public: + ColumnMatrixTy() : Columns() {} + ColumnMatrixTy(ArrayRef Cols) + : Columns(Cols.begin(), Cols.end()) {} + + Value *getColumn(unsigned i) const { return Columns[i]; } + + void setColumn(unsigned i, Value *V) { Columns[i] = V; } + + size_t getNumColumns() const { return Columns.size(); } + + const SmallVectorImpl &getColumnVectors() const { return Columns; } + + SmallVectorImpl &getColumnVectors() { return Columns; } + + void addColumn(Value *V) { Columns.push_back(V); } + + iterator_range::iterator> columns() { + return make_range(Columns.begin(), Columns.end()); + } + + /// Embed the columns of the matrix into a flat vector by concatenating + /// them. + Value *embedInVector(IRBuilder<> &Builder) const { + return Columns.size() == 1 ? Columns[0] + : concatenateVectors(Builder, Columns); + } + }; + + struct ShapeInfo { + unsigned NumRows; + unsigned NumColumns; + + ShapeInfo(unsigned NumRows = 0, unsigned NumColumns = 0) + : NumRows(NumRows), NumColumns(NumColumns) {} + + ShapeInfo(ConstantInt *NumRows, ConstantInt *NumColumns) + : NumRows(NumRows->getZExtValue()), + NumColumns(NumColumns->getZExtValue()) {} + }; + +public: + LowerMatrixIntrinsics(Function &F, TargetTransformInfo &TTI) + : Func(F), DL(F.getParent()->getDataLayout()), TTI(TTI) {} + + /// Return the set of column vectors that a matrix value is lowered to. + /// + /// We split the flat vector \p MatrixVal containing a matrix with shape \p SI + /// into column vectors. + ColumnMatrixTy getMatrix(Value *MatrixVal, const ShapeInfo &SI, + IRBuilder<> Builder) { + VectorType *VType = dyn_cast(MatrixVal->getType()); + assert(VType && "MatrixVal must be a vector type"); + assert(VType->getNumElements() == SI.NumRows * SI.NumColumns && + "The vector size must match the number of matrix elements"); + SmallVector SplitVecs; + Value *Undef = UndefValue::get(VType); + + for (unsigned MaskStart = 0; MaskStart < VType->getNumElements(); + MaskStart += SI.NumRows) { + Constant *Mask = createSequentialMask(Builder, MaskStart, SI.NumRows, 0); + Value *V = Builder.CreateShuffleVector(MatrixVal, Undef, Mask, "split"); + SplitVecs.push_back(V); + } + + return {SplitVecs}; + } + + // Replace intrinsic calls + bool VisitCallInst(CallInst *Inst) { + if (!Inst->getCalledFunction() || !Inst->getCalledFunction()->isIntrinsic()) + return false; + + switch (Inst->getCalledFunction()->getIntrinsicID()) { + case Intrinsic::matrix_multiply: + LowerMultiply(Inst); + break; + case Intrinsic::matrix_transpose: + LowerTranspose(Inst); + break; + case Intrinsic::matrix_columnwise_load: + LowerColumnwiseLoad(Inst); + break; + case Intrinsic::matrix_columnwise_store: + LowerColumnwiseStore(Inst); + break; + default: + return false; + } + Inst->eraseFromParent(); + return true; + } + + bool Visit() { + ReversePostOrderTraversal RPOT(&Func); + bool Changed = false; + for (auto *BB : RPOT) { + for (Instruction &Inst : make_early_inc_range(*BB)) { + if (CallInst *CInst = dyn_cast(&Inst)) + Changed |= VisitCallInst(CInst); + } + } + + return Changed; + } + + LoadInst *createColumnLoad(Value *ColumnPtr, Type *EltType, + IRBuilder<> Builder) { + unsigned Align = DL.getABITypeAlignment(EltType); + return Builder.CreateAlignedLoad(ColumnPtr, Align); + } + + StoreInst *createColumnStore(Value *ColumnValue, Value *ColumnPtr, + Type *EltType, IRBuilder<> Builder) { + unsigned Align = DL.getABITypeAlignment(EltType); + return Builder.CreateAlignedStore(ColumnValue, ColumnPtr, Align); + } + + /// Turns \p BasePtr into an elementwise pointer to \p EltType. + Value *createElementPtr(Value *BasePtr, Type *EltType, IRBuilder<> &Builder) { + unsigned AS = cast(BasePtr->getType())->getAddressSpace(); + Type *EltPtrType = PointerType::get(EltType, AS); + return Builder.CreatePointerCast(BasePtr, EltPtrType); + } + + /// Lowers llvm.matrix.columnwise.load. + /// + /// The intrinsic loads a matrix from memory using a stride between columns. + void LowerColumnwiseLoad(CallInst *Inst) { + IRBuilder<> Builder(Inst); + Value *Ptr = Inst->getArgOperand(0); + Value *Stride = Inst->getArgOperand(1); + auto VType = cast(Inst->getType()); + ShapeInfo Shape(cast(Inst->getArgOperand(2)), + cast(Inst->getArgOperand(3))); + Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder); + + ColumnMatrixTy Result; + // Distance between start of one column and the start of the next + for (unsigned C = 0, E = Shape.NumColumns; C < E; ++C) { + Value *GEP = + computeColumnAddr(EltPtr, Builder.getInt32(C), Stride, Shape.NumRows, + VType->getElementType(), Builder); + Value *Column = createColumnLoad(GEP, VType->getElementType(), Builder); + Result.addColumn(Column); + } + + Inst->replaceAllUsesWith(Result.embedInVector(Builder)); + } + + /// Lowers llvm.matrix.columnwise.store. + /// + /// The intrinsic store a matrix back memory using a stride between columns. + void LowerColumnwiseStore(CallInst *Inst) { + IRBuilder<> Builder(Inst); + Value *Matrix = Inst->getArgOperand(0); + Value *Ptr = Inst->getArgOperand(1); + Value *Stride = Inst->getArgOperand(2); + ShapeInfo Shape(cast(Inst->getArgOperand(3)), + cast(Inst->getArgOperand(4))); + auto VType = cast(Matrix->getType()); + Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder); + + auto LM = getMatrix(Matrix, Shape, Builder); + for (auto C : enumerate(LM.columns())) { + Value *GEP = + computeColumnAddr(EltPtr, Builder.getInt32(C.index()), Stride, + Shape.NumRows, VType->getElementType(), Builder); + createColumnStore(C.value(), GEP, VType->getElementType(), Builder); + } + } + + /// Extract a column vector of \p NumElts starting at index (\p I, \p J) from + /// the matrix \p LM represented as a vector of column vectors. + Value *extractVector(const ColumnMatrixTy &LM, unsigned I, unsigned J, + unsigned NumElts, IRBuilder<> Builder) { + Value *Col = LM.getColumn(J); + Value *Undef = UndefValue::get(Col->getType()); + Constant *Mask = createSequentialMask(Builder, I, NumElts, 0); + return Builder.CreateShuffleVector(Col, Undef, Mask, "block"); + } + + // Set elements I..I+NumElts-1 to Block + Value *insertVector(Value *Col, unsigned I, Value *Block, + IRBuilder<> Builder) { + + // First, bring Block to the same size as Col + unsigned BlockNumElts = + cast(Block->getType())->getNumElements(); + unsigned NumElts = cast(Col->getType())->getNumElements(); + assert(NumElts >= BlockNumElts && "Too few elements for current block"); + + Value *ExtendMask = + createSequentialMask(Builder, 0, BlockNumElts, NumElts - BlockNumElts); + Value *Undef = UndefValue::get(Block->getType()); + Block = Builder.CreateShuffleVector(Block, Undef, ExtendMask); + + // If Col is 7 long and I is 2 and BlockNumElts is 2 the mask is: 0, 1, 7, + // 8, 4, 5, 6 + SmallVector Mask; + unsigned i; + for (i = 0; i < I; i++) + Mask.push_back(Builder.getInt32(i)); + + unsigned VecNumElts = cast(Col->getType())->getNumElements(); + for (; i < I + BlockNumElts; i++) + Mask.push_back(Builder.getInt32(i - I + VecNumElts)); + + for (; i < VecNumElts; i++) + Mask.push_back(Builder.getInt32(i)); + + Value *MaskVal = ConstantVector::get(Mask); + + return Builder.CreateShuffleVector(Col, Block, MaskVal); + } + + Value *createMulAdd(Value *Sum, Value *A, Value *B, bool UseFPOp, + IRBuilder<> &Builder) { + Value *Mul = UseFPOp ? Builder.CreateFMul(A, B) : Builder.CreateMul(A, B); + if (!Sum) + return Mul; + + return UseFPOp ? Builder.CreateFAdd(Sum, Mul) : Builder.CreateAdd(Sum, Mul); + } + + /// Lowers llvm.matrix.multiply. + void LowerMultiply(CallInst *MatMul) { + IRBuilder<> Builder(MatMul); + auto *EltType = cast(MatMul->getType())->getElementType(); + ShapeInfo LShape(cast(MatMul->getArgOperand(2)), + cast(MatMul->getArgOperand(3))); + ShapeInfo RShape(cast(MatMul->getArgOperand(3)), + cast(MatMul->getArgOperand(4))); + + const ColumnMatrixTy &Lhs = + getMatrix(MatMul->getArgOperand(0), LShape, Builder); + const ColumnMatrixTy &Rhs = + getMatrix(MatMul->getArgOperand(1), RShape, Builder); + + const unsigned R = LShape.NumRows; + const unsigned M = LShape.NumColumns; + const unsigned C = RShape.NumColumns; + assert(M == RShape.NumRows); + + // Initialize the output + ColumnMatrixTy Result; + for (unsigned J = 0; J < C; ++J) + Result.addColumn(UndefValue::get(VectorType::get(EltType, R))); + + const unsigned VF = std::max(TTI.getRegisterBitWidth(true) / + EltType->getPrimitiveSizeInBits(), + uint64_t(1)); + + // Multiply columns from the first operand with scalars from the second + // operand. Then move along the K axes and accumulate the columns. With + // this the adds can be vectorized without reassociation. + for (unsigned J = 0; J < C; ++J) { + unsigned BlockSize = VF; + for (unsigned I = 0; I < R; I += BlockSize) { + // Gradually lower the vectorization factor to cover the remainder. + while (I + BlockSize > R) + BlockSize /= 2; + + Value *Sum = nullptr; + for (unsigned K = 0; K < M; ++K) { + Value *L = extractVector(Lhs, I, K, BlockSize, Builder); + Value *RH = Builder.CreateExtractElement(Rhs.getColumn(J), K); + Value *Splat = Builder.CreateVectorSplat(BlockSize, RH, "splat"); + Sum = createMulAdd(Sum, L, Splat, EltType->isFloatingPointTy(), + Builder); + } + Result.setColumn(J, insertVector(Result.getColumn(J), I, Sum, Builder)); + } + } + + MatMul->replaceAllUsesWith(Result.embedInVector(Builder)); + } + + /// Lowers llvm.matrix.transpose. + void LowerTranspose(CallInst *Inst) { + ColumnMatrixTy Result; + IRBuilder<> Builder(Inst); + Value *InputVal = Inst->getArgOperand(0); + VectorType *VectorTy = cast(InputVal->getType()); + ShapeInfo ArgShape(cast(Inst->getArgOperand(1)), + cast(Inst->getArgOperand(2))); + ColumnMatrixTy InputMatrix = getMatrix(InputVal, ArgShape, Builder); + + for (unsigned Row = 0; Row < ArgShape.NumRows; ++Row) { + // Build a single column vector for this row. First initialize it. + Value *ResultColumn = UndefValue::get( + VectorType::get(VectorTy->getElementType(), ArgShape.NumColumns)); + + // Go through the elements of this row and insert it into the resulting + // column vector. + for (auto C : enumerate(InputMatrix.columns())) { + Value *Elt = Builder.CreateExtractElement(C.value(), Row); + // We insert at index Column since that is the row index after the + // transpose. + ResultColumn = + Builder.CreateInsertElement(ResultColumn, Elt, C.index()); + } + Result.addColumn(ResultColumn); + } + + Inst->replaceAllUsesWith(Result.embedInVector(Builder)); + } +}; +} // namespace + +PreservedAnalyses LowerMatrixIntrinsicsPass::run(Function &F, + FunctionAnalysisManager &AM) { + auto &TTI = AM.getResult(F); + LowerMatrixIntrinsics LMT(F, TTI); + if (LMT.Visit()) { + PreservedAnalyses PA; + PA.preserveSet(); + return PA; + } + return PreservedAnalyses::all(); +} + +namespace { + +class LowerMatrixIntrinsicsLegacyPass : public FunctionPass { +public: + static char ID; + + LowerMatrixIntrinsicsLegacyPass() : FunctionPass(ID) { + initializeLowerMatrixIntrinsicsLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + auto *TTI = &getAnalysis().getTTI(F); + LowerMatrixIntrinsics LMT(F, *TTI); + bool C = LMT.Visit(); + return C; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.setPreservesCFG(); + } +}; +} // namespace + +static const char pass_name[] = "Lower the matrix intrinsics"; +char LowerMatrixIntrinsicsLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name, + false, false) +INITIALIZE_PASS_END(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name, + false, false) + +Pass *llvm::createLowerMatrixIntrinsicsPass() { + return new LowerMatrixIntrinsicsLegacyPass(); +} diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp --- a/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -82,6 +82,7 @@ initializeLowerConstantIntrinsicsPass(Registry); initializeLowerExpectIntrinsicPass(Registry); initializeLowerGuardIntrinsicLegacyPassPass(Registry); + initializeLowerMatrixIntrinsicsLegacyPassPass(Registry); initializeLowerWidenableConditionLegacyPassPass(Registry); initializeMemCpyOptLegacyPassPass(Registry); initializeMergeICmpsLegacyPassPass(Registry); diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/bigger-expressions-double.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/bigger-expressions-double.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/bigger-expressions-double.ll @@ -0,0 +1,455 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -lower-matrix-intrinsics -S < %s | FileCheck %s +; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s + + +define void @transpose_multiply(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, <9 x double>* %C.Ptr) { +; CHECK-LABEL: @transpose_multiply( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = load <9 x double>, <9 x double>* [[A_PTR:%.*]] +; CHECK-NEXT: [[B:%.*]] = load <9 x double>, <9 x double>* [[B_PTR:%.*]] +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <9 x double> [[A]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <9 x double> [[A]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <9 x double> [[A]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x double> [[SPLIT]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <3 x double> undef, double [[TMP0]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <3 x double> [[TMP1]], double [[TMP2]], i64 1 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <3 x double> [[TMP3]], double [[TMP4]], i64 2 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <3 x double> [[SPLIT]], i64 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <3 x double> undef, double [[TMP6]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 1 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <3 x double> [[TMP7]], double [[TMP8]], i64 1 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <3 x double> [[TMP9]], double [[TMP10]], i64 2 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <3 x double> [[SPLIT]], i64 2 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <3 x double> undef, double [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 2 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <3 x double> [[TMP13]], double [[TMP14]], i64 1 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 2 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <3 x double> [[TMP15]], double [[TMP16]], i64 2 +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> [[TMP11]], <6 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <6 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <6 x double> [[TMP18]], <6 x double> [[TMP19]], <9 x i32> +; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <9 x double> [[TMP20]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT4:%.*]] = shufflevector <9 x double> [[TMP20]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT5:%.*]] = shufflevector <9 x double> [[TMP20]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT6:%.*]] = shufflevector <9 x double> [[B]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT7:%.*]] = shufflevector <9 x double> [[B]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT8:%.*]] = shufflevector <9 x double> [[B]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <3 x double> [[SPLIT3]], <3 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <3 x double> [[SPLIT6]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> undef, double [[TMP21]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = fmul <1 x double> [[BLOCK]], [[SPLAT_SPLAT]] +; CHECK-NEXT: [[BLOCK9:%.*]] = shufflevector <3 x double> [[SPLIT4]], <3 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x double> [[SPLIT6]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x double> undef, double [[TMP23]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT11:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT10]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = fmul <1 x double> [[BLOCK9]], [[SPLAT_SPLAT11]] +; CHECK-NEXT: [[TMP25:%.*]] = fadd <1 x double> [[TMP22]], [[TMP24]] +; CHECK-NEXT: [[BLOCK12:%.*]] = shufflevector <3 x double> [[SPLIT5]], <3 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <3 x double> [[SPLIT6]], i64 2 +; CHECK-NEXT: [[SPLAT_SPLATINSERT13:%.*]] = insertelement <1 x double> undef, double [[TMP26]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT13]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = fmul <1 x double> [[BLOCK12]], [[SPLAT_SPLAT14]] +; CHECK-NEXT: [[TMP28:%.*]] = fadd <1 x double> [[TMP25]], [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <1 x double> [[TMP28]], <1 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <3 x double> undef, <3 x double> [[TMP29]], <3 x i32> +; CHECK-NEXT: [[BLOCK15:%.*]] = shufflevector <3 x double> [[SPLIT3]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <3 x double> [[SPLIT6]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT16:%.*]] = insertelement <1 x double> undef, double [[TMP31]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT16]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP32:%.*]] = fmul <1 x double> [[BLOCK15]], [[SPLAT_SPLAT17]] +; CHECK-NEXT: [[BLOCK18:%.*]] = shufflevector <3 x double> [[SPLIT4]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <3 x double> [[SPLIT6]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT19:%.*]] = insertelement <1 x double> undef, double [[TMP33]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT19]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP34:%.*]] = fmul <1 x double> [[BLOCK18]], [[SPLAT_SPLAT20]] +; CHECK-NEXT: [[TMP35:%.*]] = fadd <1 x double> [[TMP32]], [[TMP34]] +; CHECK-NEXT: [[BLOCK21:%.*]] = shufflevector <3 x double> [[SPLIT5]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <3 x double> [[SPLIT6]], i64 2 +; CHECK-NEXT: [[SPLAT_SPLATINSERT22:%.*]] = insertelement <1 x double> undef, double [[TMP36]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT23:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT22]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP37:%.*]] = fmul <1 x double> [[BLOCK21]], [[SPLAT_SPLAT23]] +; CHECK-NEXT: [[TMP38:%.*]] = fadd <1 x double> [[TMP35]], [[TMP37]] +; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <1 x double> [[TMP38]], <1 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <3 x double> [[TMP30]], <3 x double> [[TMP39]], <3 x i32> +; CHECK-NEXT: [[BLOCK24:%.*]] = shufflevector <3 x double> [[SPLIT3]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <3 x double> [[SPLIT6]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT25:%.*]] = insertelement <1 x double> undef, double [[TMP41]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT26:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT25]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP42:%.*]] = fmul <1 x double> [[BLOCK24]], [[SPLAT_SPLAT26]] +; CHECK-NEXT: [[BLOCK27:%.*]] = shufflevector <3 x double> [[SPLIT4]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP43:%.*]] = extractelement <3 x double> [[SPLIT6]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT28:%.*]] = insertelement <1 x double> undef, double [[TMP43]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT29:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT28]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP44:%.*]] = fmul <1 x double> [[BLOCK27]], [[SPLAT_SPLAT29]] +; CHECK-NEXT: [[TMP45:%.*]] = fadd <1 x double> [[TMP42]], [[TMP44]] +; CHECK-NEXT: [[BLOCK30:%.*]] = shufflevector <3 x double> [[SPLIT5]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP46:%.*]] = extractelement <3 x double> [[SPLIT6]], i64 2 +; CHECK-NEXT: [[SPLAT_SPLATINSERT31:%.*]] = insertelement <1 x double> undef, double [[TMP46]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT32:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT31]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP47:%.*]] = fmul <1 x double> [[BLOCK30]], [[SPLAT_SPLAT32]] +; CHECK-NEXT: [[TMP48:%.*]] = fadd <1 x double> [[TMP45]], [[TMP47]] +; CHECK-NEXT: [[TMP49:%.*]] = shufflevector <1 x double> [[TMP48]], <1 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP50:%.*]] = shufflevector <3 x double> [[TMP40]], <3 x double> [[TMP49]], <3 x i32> +; CHECK-NEXT: [[BLOCK33:%.*]] = shufflevector <3 x double> [[SPLIT3]], <3 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP51:%.*]] = extractelement <3 x double> [[SPLIT7]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT34:%.*]] = insertelement <1 x double> undef, double [[TMP51]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT35:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT34]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP52:%.*]] = fmul <1 x double> [[BLOCK33]], [[SPLAT_SPLAT35]] +; CHECK-NEXT: [[BLOCK36:%.*]] = shufflevector <3 x double> [[SPLIT4]], <3 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x double> [[SPLIT7]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT37:%.*]] = insertelement <1 x double> undef, double [[TMP53]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT38:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT37]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP54:%.*]] = fmul <1 x double> [[BLOCK36]], [[SPLAT_SPLAT38]] +; CHECK-NEXT: [[TMP55:%.*]] = fadd <1 x double> [[TMP52]], [[TMP54]] +; CHECK-NEXT: [[BLOCK39:%.*]] = shufflevector <3 x double> [[SPLIT5]], <3 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP56:%.*]] = extractelement <3 x double> [[SPLIT7]], i64 2 +; CHECK-NEXT: [[SPLAT_SPLATINSERT40:%.*]] = insertelement <1 x double> undef, double [[TMP56]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT41:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT40]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP57:%.*]] = fmul <1 x double> [[BLOCK39]], [[SPLAT_SPLAT41]] +; CHECK-NEXT: [[TMP58:%.*]] = fadd <1 x double> [[TMP55]], [[TMP57]] +; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <1 x double> [[TMP58]], <1 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <3 x double> undef, <3 x double> [[TMP59]], <3 x i32> +; CHECK-NEXT: [[BLOCK42:%.*]] = shufflevector <3 x double> [[SPLIT3]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP61:%.*]] = extractelement <3 x double> [[SPLIT7]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT43:%.*]] = insertelement <1 x double> undef, double [[TMP61]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT44:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT43]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP62:%.*]] = fmul <1 x double> [[BLOCK42]], [[SPLAT_SPLAT44]] +; CHECK-NEXT: [[BLOCK45:%.*]] = shufflevector <3 x double> [[SPLIT4]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP63:%.*]] = extractelement <3 x double> [[SPLIT7]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT46:%.*]] = insertelement <1 x double> undef, double [[TMP63]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT47:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT46]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP64:%.*]] = fmul <1 x double> [[BLOCK45]], [[SPLAT_SPLAT47]] +; CHECK-NEXT: [[TMP65:%.*]] = fadd <1 x double> [[TMP62]], [[TMP64]] +; CHECK-NEXT: [[BLOCK48:%.*]] = shufflevector <3 x double> [[SPLIT5]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP66:%.*]] = extractelement <3 x double> [[SPLIT7]], i64 2 +; CHECK-NEXT: [[SPLAT_SPLATINSERT49:%.*]] = insertelement <1 x double> undef, double [[TMP66]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT50:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT49]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP67:%.*]] = fmul <1 x double> [[BLOCK48]], [[SPLAT_SPLAT50]] +; CHECK-NEXT: [[TMP68:%.*]] = fadd <1 x double> [[TMP65]], [[TMP67]] +; CHECK-NEXT: [[TMP69:%.*]] = shufflevector <1 x double> [[TMP68]], <1 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP70:%.*]] = shufflevector <3 x double> [[TMP60]], <3 x double> [[TMP69]], <3 x i32> +; CHECK-NEXT: [[BLOCK51:%.*]] = shufflevector <3 x double> [[SPLIT3]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP71:%.*]] = extractelement <3 x double> [[SPLIT7]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT52:%.*]] = insertelement <1 x double> undef, double [[TMP71]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT53:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT52]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP72:%.*]] = fmul <1 x double> [[BLOCK51]], [[SPLAT_SPLAT53]] +; CHECK-NEXT: [[BLOCK54:%.*]] = shufflevector <3 x double> [[SPLIT4]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP73:%.*]] = extractelement <3 x double> [[SPLIT7]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT55:%.*]] = insertelement <1 x double> undef, double [[TMP73]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT56:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT55]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP74:%.*]] = fmul <1 x double> [[BLOCK54]], [[SPLAT_SPLAT56]] +; CHECK-NEXT: [[TMP75:%.*]] = fadd <1 x double> [[TMP72]], [[TMP74]] +; CHECK-NEXT: [[BLOCK57:%.*]] = shufflevector <3 x double> [[SPLIT5]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP76:%.*]] = extractelement <3 x double> [[SPLIT7]], i64 2 +; CHECK-NEXT: [[SPLAT_SPLATINSERT58:%.*]] = insertelement <1 x double> undef, double [[TMP76]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT59:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT58]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP77:%.*]] = fmul <1 x double> [[BLOCK57]], [[SPLAT_SPLAT59]] +; CHECK-NEXT: [[TMP78:%.*]] = fadd <1 x double> [[TMP75]], [[TMP77]] +; CHECK-NEXT: [[TMP79:%.*]] = shufflevector <1 x double> [[TMP78]], <1 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP80:%.*]] = shufflevector <3 x double> [[TMP70]], <3 x double> [[TMP79]], <3 x i32> +; CHECK-NEXT: [[BLOCK60:%.*]] = shufflevector <3 x double> [[SPLIT3]], <3 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP81:%.*]] = extractelement <3 x double> [[SPLIT8]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT61:%.*]] = insertelement <1 x double> undef, double [[TMP81]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT62:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT61]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP82:%.*]] = fmul <1 x double> [[BLOCK60]], [[SPLAT_SPLAT62]] +; CHECK-NEXT: [[BLOCK63:%.*]] = shufflevector <3 x double> [[SPLIT4]], <3 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP83:%.*]] = extractelement <3 x double> [[SPLIT8]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT64:%.*]] = insertelement <1 x double> undef, double [[TMP83]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT65:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT64]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP84:%.*]] = fmul <1 x double> [[BLOCK63]], [[SPLAT_SPLAT65]] +; CHECK-NEXT: [[TMP85:%.*]] = fadd <1 x double> [[TMP82]], [[TMP84]] +; CHECK-NEXT: [[BLOCK66:%.*]] = shufflevector <3 x double> [[SPLIT5]], <3 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP86:%.*]] = extractelement <3 x double> [[SPLIT8]], i64 2 +; CHECK-NEXT: [[SPLAT_SPLATINSERT67:%.*]] = insertelement <1 x double> undef, double [[TMP86]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT68:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT67]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP87:%.*]] = fmul <1 x double> [[BLOCK66]], [[SPLAT_SPLAT68]] +; CHECK-NEXT: [[TMP88:%.*]] = fadd <1 x double> [[TMP85]], [[TMP87]] +; CHECK-NEXT: [[TMP89:%.*]] = shufflevector <1 x double> [[TMP88]], <1 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP90:%.*]] = shufflevector <3 x double> undef, <3 x double> [[TMP89]], <3 x i32> +; CHECK-NEXT: [[BLOCK69:%.*]] = shufflevector <3 x double> [[SPLIT3]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP91:%.*]] = extractelement <3 x double> [[SPLIT8]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT70:%.*]] = insertelement <1 x double> undef, double [[TMP91]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT71:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT70]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP92:%.*]] = fmul <1 x double> [[BLOCK69]], [[SPLAT_SPLAT71]] +; CHECK-NEXT: [[BLOCK72:%.*]] = shufflevector <3 x double> [[SPLIT4]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP93:%.*]] = extractelement <3 x double> [[SPLIT8]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT73:%.*]] = insertelement <1 x double> undef, double [[TMP93]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT74:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT73]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP94:%.*]] = fmul <1 x double> [[BLOCK72]], [[SPLAT_SPLAT74]] +; CHECK-NEXT: [[TMP95:%.*]] = fadd <1 x double> [[TMP92]], [[TMP94]] +; CHECK-NEXT: [[BLOCK75:%.*]] = shufflevector <3 x double> [[SPLIT5]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP96:%.*]] = extractelement <3 x double> [[SPLIT8]], i64 2 +; CHECK-NEXT: [[SPLAT_SPLATINSERT76:%.*]] = insertelement <1 x double> undef, double [[TMP96]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT77:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT76]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP97:%.*]] = fmul <1 x double> [[BLOCK75]], [[SPLAT_SPLAT77]] +; CHECK-NEXT: [[TMP98:%.*]] = fadd <1 x double> [[TMP95]], [[TMP97]] +; CHECK-NEXT: [[TMP99:%.*]] = shufflevector <1 x double> [[TMP98]], <1 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP100:%.*]] = shufflevector <3 x double> [[TMP90]], <3 x double> [[TMP99]], <3 x i32> +; CHECK-NEXT: [[BLOCK78:%.*]] = shufflevector <3 x double> [[SPLIT3]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP101:%.*]] = extractelement <3 x double> [[SPLIT8]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT79:%.*]] = insertelement <1 x double> undef, double [[TMP101]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT80:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT79]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP102:%.*]] = fmul <1 x double> [[BLOCK78]], [[SPLAT_SPLAT80]] +; CHECK-NEXT: [[BLOCK81:%.*]] = shufflevector <3 x double> [[SPLIT4]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP103:%.*]] = extractelement <3 x double> [[SPLIT8]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT82:%.*]] = insertelement <1 x double> undef, double [[TMP103]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT83:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT82]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP104:%.*]] = fmul <1 x double> [[BLOCK81]], [[SPLAT_SPLAT83]] +; CHECK-NEXT: [[TMP105:%.*]] = fadd <1 x double> [[TMP102]], [[TMP104]] +; CHECK-NEXT: [[BLOCK84:%.*]] = shufflevector <3 x double> [[SPLIT5]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP106:%.*]] = extractelement <3 x double> [[SPLIT8]], i64 2 +; CHECK-NEXT: [[SPLAT_SPLATINSERT85:%.*]] = insertelement <1 x double> undef, double [[TMP106]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT86:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT85]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP107:%.*]] = fmul <1 x double> [[BLOCK84]], [[SPLAT_SPLAT86]] +; CHECK-NEXT: [[TMP108:%.*]] = fadd <1 x double> [[TMP105]], [[TMP107]] +; CHECK-NEXT: [[TMP109:%.*]] = shufflevector <1 x double> [[TMP108]], <1 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP110:%.*]] = shufflevector <3 x double> [[TMP100]], <3 x double> [[TMP109]], <3 x i32> +; CHECK-NEXT: [[TMP111:%.*]] = shufflevector <3 x double> [[TMP50]], <3 x double> [[TMP80]], <6 x i32> +; CHECK-NEXT: [[TMP112:%.*]] = shufflevector <3 x double> [[TMP110]], <3 x double> undef, <6 x i32> +; CHECK-NEXT: [[TMP113:%.*]] = shufflevector <6 x double> [[TMP111]], <6 x double> [[TMP112]], <9 x i32> +; CHECK-NEXT: store <9 x double> [[TMP113]], <9 x double>* [[C_PTR:%.*]] +; CHECK-NEXT: ret void +; +entry: + %a = load <9 x double>, <9 x double>* %A.Ptr + %b = load <9 x double>, <9 x double>* %B.Ptr + %a.trans = call <9 x double> @llvm.matrix.transpose(<9 x double> %a, i32 3, i32 3) + %c = call <9 x double> @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double> %a.trans, <9 x double> %b, i32 3, i32 3, i32 3) + store <9 x double> %c, <9 x double>* %C.Ptr + ret void +} + +declare <9 x double> @llvm.matrix.transpose(<9 x double>, i32, i32) +declare <9 x double> @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double>, <9 x double>, i32, i32, i32) + +define void @transpose_multiply_add(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, <9 x double>* %C.Ptr) { +; CHECK-LABEL: @transpose_multiply_add( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = load <9 x double>, <9 x double>* [[A_PTR:%.*]] +; CHECK-NEXT: [[B:%.*]] = load <9 x double>, <9 x double>* [[B_PTR:%.*]] +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <9 x double> [[A]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <9 x double> [[A]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <9 x double> [[A]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x double> [[SPLIT]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <3 x double> undef, double [[TMP0]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <3 x double> [[TMP1]], double [[TMP2]], i64 1 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <3 x double> [[TMP3]], double [[TMP4]], i64 2 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <3 x double> [[SPLIT]], i64 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <3 x double> undef, double [[TMP6]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 1 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <3 x double> [[TMP7]], double [[TMP8]], i64 1 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <3 x double> [[TMP9]], double [[TMP10]], i64 2 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <3 x double> [[SPLIT]], i64 2 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <3 x double> undef, double [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 2 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <3 x double> [[TMP13]], double [[TMP14]], i64 1 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 2 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <3 x double> [[TMP15]], double [[TMP16]], i64 2 +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> [[TMP11]], <6 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <6 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <6 x double> [[TMP18]], <6 x double> [[TMP19]], <9 x i32> +; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <9 x double> [[TMP20]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT4:%.*]] = shufflevector <9 x double> [[TMP20]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT5:%.*]] = shufflevector <9 x double> [[TMP20]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT6:%.*]] = shufflevector <9 x double> [[B]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT7:%.*]] = shufflevector <9 x double> [[B]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT8:%.*]] = shufflevector <9 x double> [[B]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <3 x double> [[SPLIT3]], <3 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <3 x double> [[SPLIT6]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> undef, double [[TMP21]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = fmul <1 x double> [[BLOCK]], [[SPLAT_SPLAT]] +; CHECK-NEXT: [[BLOCK9:%.*]] = shufflevector <3 x double> [[SPLIT4]], <3 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x double> [[SPLIT6]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x double> undef, double [[TMP23]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT11:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT10]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = fmul <1 x double> [[BLOCK9]], [[SPLAT_SPLAT11]] +; CHECK-NEXT: [[TMP25:%.*]] = fadd <1 x double> [[TMP22]], [[TMP24]] +; CHECK-NEXT: [[BLOCK12:%.*]] = shufflevector <3 x double> [[SPLIT5]], <3 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <3 x double> [[SPLIT6]], i64 2 +; CHECK-NEXT: [[SPLAT_SPLATINSERT13:%.*]] = insertelement <1 x double> undef, double [[TMP26]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT13]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = fmul <1 x double> [[BLOCK12]], [[SPLAT_SPLAT14]] +; CHECK-NEXT: [[TMP28:%.*]] = fadd <1 x double> [[TMP25]], [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <1 x double> [[TMP28]], <1 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <3 x double> undef, <3 x double> [[TMP29]], <3 x i32> +; CHECK-NEXT: [[BLOCK15:%.*]] = shufflevector <3 x double> [[SPLIT3]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <3 x double> [[SPLIT6]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT16:%.*]] = insertelement <1 x double> undef, double [[TMP31]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT16]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP32:%.*]] = fmul <1 x double> [[BLOCK15]], [[SPLAT_SPLAT17]] +; CHECK-NEXT: [[BLOCK18:%.*]] = shufflevector <3 x double> [[SPLIT4]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <3 x double> [[SPLIT6]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT19:%.*]] = insertelement <1 x double> undef, double [[TMP33]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT19]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP34:%.*]] = fmul <1 x double> [[BLOCK18]], [[SPLAT_SPLAT20]] +; CHECK-NEXT: [[TMP35:%.*]] = fadd <1 x double> [[TMP32]], [[TMP34]] +; CHECK-NEXT: [[BLOCK21:%.*]] = shufflevector <3 x double> [[SPLIT5]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <3 x double> [[SPLIT6]], i64 2 +; CHECK-NEXT: [[SPLAT_SPLATINSERT22:%.*]] = insertelement <1 x double> undef, double [[TMP36]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT23:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT22]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP37:%.*]] = fmul <1 x double> [[BLOCK21]], [[SPLAT_SPLAT23]] +; CHECK-NEXT: [[TMP38:%.*]] = fadd <1 x double> [[TMP35]], [[TMP37]] +; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <1 x double> [[TMP38]], <1 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <3 x double> [[TMP30]], <3 x double> [[TMP39]], <3 x i32> +; CHECK-NEXT: [[BLOCK24:%.*]] = shufflevector <3 x double> [[SPLIT3]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <3 x double> [[SPLIT6]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT25:%.*]] = insertelement <1 x double> undef, double [[TMP41]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT26:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT25]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP42:%.*]] = fmul <1 x double> [[BLOCK24]], [[SPLAT_SPLAT26]] +; CHECK-NEXT: [[BLOCK27:%.*]] = shufflevector <3 x double> [[SPLIT4]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP43:%.*]] = extractelement <3 x double> [[SPLIT6]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT28:%.*]] = insertelement <1 x double> undef, double [[TMP43]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT29:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT28]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP44:%.*]] = fmul <1 x double> [[BLOCK27]], [[SPLAT_SPLAT29]] +; CHECK-NEXT: [[TMP45:%.*]] = fadd <1 x double> [[TMP42]], [[TMP44]] +; CHECK-NEXT: [[BLOCK30:%.*]] = shufflevector <3 x double> [[SPLIT5]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP46:%.*]] = extractelement <3 x double> [[SPLIT6]], i64 2 +; CHECK-NEXT: [[SPLAT_SPLATINSERT31:%.*]] = insertelement <1 x double> undef, double [[TMP46]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT32:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT31]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP47:%.*]] = fmul <1 x double> [[BLOCK30]], [[SPLAT_SPLAT32]] +; CHECK-NEXT: [[TMP48:%.*]] = fadd <1 x double> [[TMP45]], [[TMP47]] +; CHECK-NEXT: [[TMP49:%.*]] = shufflevector <1 x double> [[TMP48]], <1 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP50:%.*]] = shufflevector <3 x double> [[TMP40]], <3 x double> [[TMP49]], <3 x i32> +; CHECK-NEXT: [[BLOCK33:%.*]] = shufflevector <3 x double> [[SPLIT3]], <3 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP51:%.*]] = extractelement <3 x double> [[SPLIT7]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT34:%.*]] = insertelement <1 x double> undef, double [[TMP51]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT35:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT34]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP52:%.*]] = fmul <1 x double> [[BLOCK33]], [[SPLAT_SPLAT35]] +; CHECK-NEXT: [[BLOCK36:%.*]] = shufflevector <3 x double> [[SPLIT4]], <3 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x double> [[SPLIT7]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT37:%.*]] = insertelement <1 x double> undef, double [[TMP53]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT38:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT37]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP54:%.*]] = fmul <1 x double> [[BLOCK36]], [[SPLAT_SPLAT38]] +; CHECK-NEXT: [[TMP55:%.*]] = fadd <1 x double> [[TMP52]], [[TMP54]] +; CHECK-NEXT: [[BLOCK39:%.*]] = shufflevector <3 x double> [[SPLIT5]], <3 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP56:%.*]] = extractelement <3 x double> [[SPLIT7]], i64 2 +; CHECK-NEXT: [[SPLAT_SPLATINSERT40:%.*]] = insertelement <1 x double> undef, double [[TMP56]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT41:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT40]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP57:%.*]] = fmul <1 x double> [[BLOCK39]], [[SPLAT_SPLAT41]] +; CHECK-NEXT: [[TMP58:%.*]] = fadd <1 x double> [[TMP55]], [[TMP57]] +; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <1 x double> [[TMP58]], <1 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <3 x double> undef, <3 x double> [[TMP59]], <3 x i32> +; CHECK-NEXT: [[BLOCK42:%.*]] = shufflevector <3 x double> [[SPLIT3]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP61:%.*]] = extractelement <3 x double> [[SPLIT7]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT43:%.*]] = insertelement <1 x double> undef, double [[TMP61]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT44:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT43]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP62:%.*]] = fmul <1 x double> [[BLOCK42]], [[SPLAT_SPLAT44]] +; CHECK-NEXT: [[BLOCK45:%.*]] = shufflevector <3 x double> [[SPLIT4]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP63:%.*]] = extractelement <3 x double> [[SPLIT7]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT46:%.*]] = insertelement <1 x double> undef, double [[TMP63]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT47:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT46]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP64:%.*]] = fmul <1 x double> [[BLOCK45]], [[SPLAT_SPLAT47]] +; CHECK-NEXT: [[TMP65:%.*]] = fadd <1 x double> [[TMP62]], [[TMP64]] +; CHECK-NEXT: [[BLOCK48:%.*]] = shufflevector <3 x double> [[SPLIT5]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP66:%.*]] = extractelement <3 x double> [[SPLIT7]], i64 2 +; CHECK-NEXT: [[SPLAT_SPLATINSERT49:%.*]] = insertelement <1 x double> undef, double [[TMP66]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT50:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT49]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP67:%.*]] = fmul <1 x double> [[BLOCK48]], [[SPLAT_SPLAT50]] +; CHECK-NEXT: [[TMP68:%.*]] = fadd <1 x double> [[TMP65]], [[TMP67]] +; CHECK-NEXT: [[TMP69:%.*]] = shufflevector <1 x double> [[TMP68]], <1 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP70:%.*]] = shufflevector <3 x double> [[TMP60]], <3 x double> [[TMP69]], <3 x i32> +; CHECK-NEXT: [[BLOCK51:%.*]] = shufflevector <3 x double> [[SPLIT3]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP71:%.*]] = extractelement <3 x double> [[SPLIT7]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT52:%.*]] = insertelement <1 x double> undef, double [[TMP71]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT53:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT52]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP72:%.*]] = fmul <1 x double> [[BLOCK51]], [[SPLAT_SPLAT53]] +; CHECK-NEXT: [[BLOCK54:%.*]] = shufflevector <3 x double> [[SPLIT4]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP73:%.*]] = extractelement <3 x double> [[SPLIT7]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT55:%.*]] = insertelement <1 x double> undef, double [[TMP73]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT56:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT55]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP74:%.*]] = fmul <1 x double> [[BLOCK54]], [[SPLAT_SPLAT56]] +; CHECK-NEXT: [[TMP75:%.*]] = fadd <1 x double> [[TMP72]], [[TMP74]] +; CHECK-NEXT: [[BLOCK57:%.*]] = shufflevector <3 x double> [[SPLIT5]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP76:%.*]] = extractelement <3 x double> [[SPLIT7]], i64 2 +; CHECK-NEXT: [[SPLAT_SPLATINSERT58:%.*]] = insertelement <1 x double> undef, double [[TMP76]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT59:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT58]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP77:%.*]] = fmul <1 x double> [[BLOCK57]], [[SPLAT_SPLAT59]] +; CHECK-NEXT: [[TMP78:%.*]] = fadd <1 x double> [[TMP75]], [[TMP77]] +; CHECK-NEXT: [[TMP79:%.*]] = shufflevector <1 x double> [[TMP78]], <1 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP80:%.*]] = shufflevector <3 x double> [[TMP70]], <3 x double> [[TMP79]], <3 x i32> +; CHECK-NEXT: [[BLOCK60:%.*]] = shufflevector <3 x double> [[SPLIT3]], <3 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP81:%.*]] = extractelement <3 x double> [[SPLIT8]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT61:%.*]] = insertelement <1 x double> undef, double [[TMP81]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT62:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT61]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP82:%.*]] = fmul <1 x double> [[BLOCK60]], [[SPLAT_SPLAT62]] +; CHECK-NEXT: [[BLOCK63:%.*]] = shufflevector <3 x double> [[SPLIT4]], <3 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP83:%.*]] = extractelement <3 x double> [[SPLIT8]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT64:%.*]] = insertelement <1 x double> undef, double [[TMP83]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT65:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT64]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP84:%.*]] = fmul <1 x double> [[BLOCK63]], [[SPLAT_SPLAT65]] +; CHECK-NEXT: [[TMP85:%.*]] = fadd <1 x double> [[TMP82]], [[TMP84]] +; CHECK-NEXT: [[BLOCK66:%.*]] = shufflevector <3 x double> [[SPLIT5]], <3 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP86:%.*]] = extractelement <3 x double> [[SPLIT8]], i64 2 +; CHECK-NEXT: [[SPLAT_SPLATINSERT67:%.*]] = insertelement <1 x double> undef, double [[TMP86]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT68:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT67]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP87:%.*]] = fmul <1 x double> [[BLOCK66]], [[SPLAT_SPLAT68]] +; CHECK-NEXT: [[TMP88:%.*]] = fadd <1 x double> [[TMP85]], [[TMP87]] +; CHECK-NEXT: [[TMP89:%.*]] = shufflevector <1 x double> [[TMP88]], <1 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP90:%.*]] = shufflevector <3 x double> undef, <3 x double> [[TMP89]], <3 x i32> +; CHECK-NEXT: [[BLOCK69:%.*]] = shufflevector <3 x double> [[SPLIT3]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP91:%.*]] = extractelement <3 x double> [[SPLIT8]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT70:%.*]] = insertelement <1 x double> undef, double [[TMP91]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT71:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT70]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP92:%.*]] = fmul <1 x double> [[BLOCK69]], [[SPLAT_SPLAT71]] +; CHECK-NEXT: [[BLOCK72:%.*]] = shufflevector <3 x double> [[SPLIT4]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP93:%.*]] = extractelement <3 x double> [[SPLIT8]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT73:%.*]] = insertelement <1 x double> undef, double [[TMP93]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT74:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT73]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP94:%.*]] = fmul <1 x double> [[BLOCK72]], [[SPLAT_SPLAT74]] +; CHECK-NEXT: [[TMP95:%.*]] = fadd <1 x double> [[TMP92]], [[TMP94]] +; CHECK-NEXT: [[BLOCK75:%.*]] = shufflevector <3 x double> [[SPLIT5]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP96:%.*]] = extractelement <3 x double> [[SPLIT8]], i64 2 +; CHECK-NEXT: [[SPLAT_SPLATINSERT76:%.*]] = insertelement <1 x double> undef, double [[TMP96]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT77:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT76]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP97:%.*]] = fmul <1 x double> [[BLOCK75]], [[SPLAT_SPLAT77]] +; CHECK-NEXT: [[TMP98:%.*]] = fadd <1 x double> [[TMP95]], [[TMP97]] +; CHECK-NEXT: [[TMP99:%.*]] = shufflevector <1 x double> [[TMP98]], <1 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP100:%.*]] = shufflevector <3 x double> [[TMP90]], <3 x double> [[TMP99]], <3 x i32> +; CHECK-NEXT: [[BLOCK78:%.*]] = shufflevector <3 x double> [[SPLIT3]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP101:%.*]] = extractelement <3 x double> [[SPLIT8]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT79:%.*]] = insertelement <1 x double> undef, double [[TMP101]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT80:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT79]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP102:%.*]] = fmul <1 x double> [[BLOCK78]], [[SPLAT_SPLAT80]] +; CHECK-NEXT: [[BLOCK81:%.*]] = shufflevector <3 x double> [[SPLIT4]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP103:%.*]] = extractelement <3 x double> [[SPLIT8]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT82:%.*]] = insertelement <1 x double> undef, double [[TMP103]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT83:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT82]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP104:%.*]] = fmul <1 x double> [[BLOCK81]], [[SPLAT_SPLAT83]] +; CHECK-NEXT: [[TMP105:%.*]] = fadd <1 x double> [[TMP102]], [[TMP104]] +; CHECK-NEXT: [[BLOCK84:%.*]] = shufflevector <3 x double> [[SPLIT5]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP106:%.*]] = extractelement <3 x double> [[SPLIT8]], i64 2 +; CHECK-NEXT: [[SPLAT_SPLATINSERT85:%.*]] = insertelement <1 x double> undef, double [[TMP106]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT86:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT85]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP107:%.*]] = fmul <1 x double> [[BLOCK84]], [[SPLAT_SPLAT86]] +; CHECK-NEXT: [[TMP108:%.*]] = fadd <1 x double> [[TMP105]], [[TMP107]] +; CHECK-NEXT: [[TMP109:%.*]] = shufflevector <1 x double> [[TMP108]], <1 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP110:%.*]] = shufflevector <3 x double> [[TMP100]], <3 x double> [[TMP109]], <3 x i32> +; CHECK-NEXT: [[TMP111:%.*]] = shufflevector <3 x double> [[TMP50]], <3 x double> [[TMP80]], <6 x i32> +; CHECK-NEXT: [[TMP112:%.*]] = shufflevector <3 x double> [[TMP110]], <3 x double> undef, <6 x i32> +; CHECK-NEXT: [[TMP113:%.*]] = shufflevector <6 x double> [[TMP111]], <6 x double> [[TMP112]], <9 x i32> +; CHECK-NEXT: [[C:%.*]] = load <9 x double>, <9 x double>* [[C_PTR:%.*]] +; CHECK-NEXT: [[RES:%.*]] = fadd <9 x double> [[C]], [[TMP113]] +; CHECK-NEXT: store <9 x double> [[RES]], <9 x double>* [[C_PTR]] +; CHECK-NEXT: ret void +; +entry: + %a = load <9 x double>, <9 x double>* %A.Ptr + %b = load <9 x double>, <9 x double>* %B.Ptr + %a.trans = call <9 x double> @llvm.matrix.transpose(<9 x double> %a, i32 3, i32 3) + %mult = call <9 x double> @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double> %a.trans, <9 x double> %b, i32 3, i32 3, i32 3) + %c = load <9 x double>, <9 x double>* %C.Ptr + %res = fadd <9 x double> %c, %mult + + store <9 x double> %res, <9 x double>* %C.Ptr + ret void +} + diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double.ll @@ -0,0 +1,254 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -lower-matrix-intrinsics -S < %s | FileCheck %s +; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s + + +define <4 x double> @multiply_2x2(<4 x double> %a, <4 x double> %b) { +; CHECK-LABEL: @multiply_2x2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <4 x double> [[B]], <4 x double> undef, <2 x i32> +; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <2 x double> [[SPLIT]], <2 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x double> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> undef, double [[TMP0]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = fmul <1 x double> [[BLOCK]], [[SPLAT_SPLAT]] +; CHECK-NEXT: [[BLOCK4:%.*]] = shufflevector <2 x double> [[SPLIT1]], <2 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT5:%.*]] = insertelement <1 x double> undef, double [[TMP2]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT6:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT5]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = fmul <1 x double> [[BLOCK4]], [[SPLAT_SPLAT6]] +; CHECK-NEXT: [[TMP4:%.*]] = fadd <1 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <1 x double> [[TMP4]], <1 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> undef, <2 x double> [[TMP5]], <2 x i32> +; CHECK-NEXT: [[BLOCK7:%.*]] = shufflevector <2 x double> [[SPLIT]], <2 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT8:%.*]] = insertelement <1 x double> undef, double [[TMP7]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT9:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT8]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = fmul <1 x double> [[BLOCK7]], [[SPLAT_SPLAT9]] +; CHECK-NEXT: [[BLOCK10:%.*]] = shufflevector <2 x double> [[SPLIT1]], <2 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT11:%.*]] = insertelement <1 x double> undef, double [[TMP9]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT12:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT11]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = fmul <1 x double> [[BLOCK10]], [[SPLAT_SPLAT12]] +; CHECK-NEXT: [[TMP11:%.*]] = fadd <1 x double> [[TMP8]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <1 x double> [[TMP11]], <1 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP12]], <2 x i32> +; CHECK-NEXT: [[BLOCK13:%.*]] = shufflevector <2 x double> [[SPLIT]], <2 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT14:%.*]] = insertelement <1 x double> undef, double [[TMP14]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT15:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT14]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = fmul <1 x double> [[BLOCK13]], [[SPLAT_SPLAT15]] +; CHECK-NEXT: [[BLOCK16:%.*]] = shufflevector <2 x double> [[SPLIT1]], <2 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT17:%.*]] = insertelement <1 x double> undef, double [[TMP16]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT18:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT17]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = fmul <1 x double> [[BLOCK16]], [[SPLAT_SPLAT18]] +; CHECK-NEXT: [[TMP18:%.*]] = fadd <1 x double> [[TMP15]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <1 x double> [[TMP18]], <1 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x double> undef, <2 x double> [[TMP19]], <2 x i32> +; CHECK-NEXT: [[BLOCK19:%.*]] = shufflevector <2 x double> [[SPLIT]], <2 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x double> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT20:%.*]] = insertelement <1 x double> undef, double [[TMP21]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT21:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT20]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = fmul <1 x double> [[BLOCK19]], [[SPLAT_SPLAT21]] +; CHECK-NEXT: [[BLOCK22:%.*]] = shufflevector <2 x double> [[SPLIT1]], <2 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x double> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x double> undef, double [[TMP23]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT23]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = fmul <1 x double> [[BLOCK22]], [[SPLAT_SPLAT24]] +; CHECK-NEXT: [[TMP25:%.*]] = fadd <1 x double> [[TMP22]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <1 x double> [[TMP25]], <1 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <2 x double> [[TMP20]], <2 x double> [[TMP26]], <2 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <2 x double> [[TMP13]], <2 x double> [[TMP27]], <4 x i32> +; CHECK-NEXT: ret <4 x double> [[TMP28]] +; +entry: + %c = call <4 x double> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2) + ret <4 x double> %c +} + +declare <4 x double> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double>, <4 x double>, i32, i32, i32) + +define <4 x double> @multiply_1x2(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: @multiply_1x2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <2 x double> [[B]], <2 x double> undef, <1 x i32> +; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <2 x double> [[SPLIT]], <2 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x double> [[SPLIT1]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> undef, double [[TMP0]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = fmul <1 x double> [[BLOCK]], [[SPLAT_SPLAT]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> undef, <2 x double> [[TMP2]], <2 x i32> +; CHECK-NEXT: [[BLOCK3:%.*]] = shufflevector <2 x double> [[SPLIT]], <2 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <1 x double> [[SPLIT1]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT4:%.*]] = insertelement <1 x double> undef, double [[TMP4]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT5:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT4]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = fmul <1 x double> [[BLOCK3]], [[SPLAT_SPLAT5]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <1 x double> [[TMP5]], <1 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <2 x i32> +; CHECK-NEXT: [[BLOCK6:%.*]] = shufflevector <2 x double> [[SPLIT]], <2 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <1 x double> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT7:%.*]] = insertelement <1 x double> undef, double [[TMP8]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT8:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT7]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = fmul <1 x double> [[BLOCK6]], [[SPLAT_SPLAT8]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <1 x double> [[TMP9]], <1 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x double> undef, <2 x double> [[TMP10]], <2 x i32> +; CHECK-NEXT: [[BLOCK9:%.*]] = shufflevector <2 x double> [[SPLIT]], <2 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <1 x double> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x double> undef, double [[TMP12]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT11:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT10]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = fmul <1 x double> [[BLOCK9]], [[SPLAT_SPLAT11]] +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <1 x double> [[TMP13]], <1 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> [[TMP14]], <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP15]], <4 x i32> +; CHECK-NEXT: ret <4 x double> [[TMP16]] +; +entry: + %c = call <4 x double> @llvm.matrix.multiply.v4f64.v2f64.v2f64(<2 x double> %a, <2 x double> %b, i32 2, i32 1, i32 2) + ret <4 x double> %c +} + +declare <4 x double> @llvm.matrix.multiply.v4f64.v2f64.v2f64(<2 x double>, <2 x double>, i32, i32, i32) + +define <9 x double> @multiply_2x3(<6 x double> %a, <6 x double> %b) { +; CHECK-LABEL: @multiply_2x3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <6 x double> [[A:%.*]], <6 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <6 x double> [[A]], <6 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <6 x double> [[B:%.*]], <6 x double> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <6 x double> [[B]], <6 x double> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT4:%.*]] = shufflevector <6 x double> [[B]], <6 x double> undef, <2 x i32> +; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <3 x double> [[SPLIT]], <3 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x double> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> undef, double [[TMP0]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = fmul <1 x double> [[BLOCK]], [[SPLAT_SPLAT]] +; CHECK-NEXT: [[BLOCK5:%.*]] = shufflevector <3 x double> [[SPLIT1]], <3 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT6:%.*]] = insertelement <1 x double> undef, double [[TMP2]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT7:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT6]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = fmul <1 x double> [[BLOCK5]], [[SPLAT_SPLAT7]] +; CHECK-NEXT: [[TMP4:%.*]] = fadd <1 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <1 x double> [[TMP4]], <1 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <3 x double> undef, <3 x double> [[TMP5]], <3 x i32> +; CHECK-NEXT: [[BLOCK8:%.*]] = shufflevector <3 x double> [[SPLIT]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT9:%.*]] = insertelement <1 x double> undef, double [[TMP7]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT10:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT9]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = fmul <1 x double> [[BLOCK8]], [[SPLAT_SPLAT10]] +; CHECK-NEXT: [[BLOCK11:%.*]] = shufflevector <3 x double> [[SPLIT1]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT12:%.*]] = insertelement <1 x double> undef, double [[TMP9]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT13:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT12]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = fmul <1 x double> [[BLOCK11]], [[SPLAT_SPLAT13]] +; CHECK-NEXT: [[TMP11:%.*]] = fadd <1 x double> [[TMP8]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <1 x double> [[TMP11]], <1 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <3 x double> [[TMP6]], <3 x double> [[TMP12]], <3 x i32> +; CHECK-NEXT: [[BLOCK14:%.*]] = shufflevector <3 x double> [[SPLIT]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT15:%.*]] = insertelement <1 x double> undef, double [[TMP14]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT16:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT15]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = fmul <1 x double> [[BLOCK14]], [[SPLAT_SPLAT16]] +; CHECK-NEXT: [[BLOCK17:%.*]] = shufflevector <3 x double> [[SPLIT1]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT18:%.*]] = insertelement <1 x double> undef, double [[TMP16]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT19:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT18]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = fmul <1 x double> [[BLOCK17]], [[SPLAT_SPLAT19]] +; CHECK-NEXT: [[TMP18:%.*]] = fadd <1 x double> [[TMP15]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <1 x double> [[TMP18]], <1 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> [[TMP19]], <3 x i32> +; CHECK-NEXT: [[BLOCK20:%.*]] = shufflevector <3 x double> [[SPLIT]], <3 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x double> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT21:%.*]] = insertelement <1 x double> undef, double [[TMP21]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT22:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT21]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = fmul <1 x double> [[BLOCK20]], [[SPLAT_SPLAT22]] +; CHECK-NEXT: [[BLOCK23:%.*]] = shufflevector <3 x double> [[SPLIT1]], <3 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x double> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT24:%.*]] = insertelement <1 x double> undef, double [[TMP23]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT25:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT24]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = fmul <1 x double> [[BLOCK23]], [[SPLAT_SPLAT25]] +; CHECK-NEXT: [[TMP25:%.*]] = fadd <1 x double> [[TMP22]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <1 x double> [[TMP25]], <1 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <3 x double> undef, <3 x double> [[TMP26]], <3 x i32> +; CHECK-NEXT: [[BLOCK26:%.*]] = shufflevector <3 x double> [[SPLIT]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x double> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT27:%.*]] = insertelement <1 x double> undef, double [[TMP28]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT28:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT27]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = fmul <1 x double> [[BLOCK26]], [[SPLAT_SPLAT28]] +; CHECK-NEXT: [[BLOCK29:%.*]] = shufflevector <3 x double> [[SPLIT1]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <2 x double> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT30:%.*]] = insertelement <1 x double> undef, double [[TMP30]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT31:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT30]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP31:%.*]] = fmul <1 x double> [[BLOCK29]], [[SPLAT_SPLAT31]] +; CHECK-NEXT: [[TMP32:%.*]] = fadd <1 x double> [[TMP29]], [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <1 x double> [[TMP32]], <1 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <3 x double> [[TMP27]], <3 x double> [[TMP33]], <3 x i32> +; CHECK-NEXT: [[BLOCK32:%.*]] = shufflevector <3 x double> [[SPLIT]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <2 x double> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT33:%.*]] = insertelement <1 x double> undef, double [[TMP35]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT34:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT33]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP36:%.*]] = fmul <1 x double> [[BLOCK32]], [[SPLAT_SPLAT34]] +; CHECK-NEXT: [[BLOCK35:%.*]] = shufflevector <3 x double> [[SPLIT1]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <2 x double> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT36:%.*]] = insertelement <1 x double> undef, double [[TMP37]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT37:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT36]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP38:%.*]] = fmul <1 x double> [[BLOCK35]], [[SPLAT_SPLAT37]] +; CHECK-NEXT: [[TMP39:%.*]] = fadd <1 x double> [[TMP36]], [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <1 x double> [[TMP39]], <1 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP41:%.*]] = shufflevector <3 x double> [[TMP34]], <3 x double> [[TMP40]], <3 x i32> +; CHECK-NEXT: [[BLOCK38:%.*]] = shufflevector <3 x double> [[SPLIT]], <3 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x double> [[SPLIT4]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT39:%.*]] = insertelement <1 x double> undef, double [[TMP42]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT40:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT39]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP43:%.*]] = fmul <1 x double> [[BLOCK38]], [[SPLAT_SPLAT40]] +; CHECK-NEXT: [[BLOCK41:%.*]] = shufflevector <3 x double> [[SPLIT1]], <3 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <2 x double> [[SPLIT4]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT42:%.*]] = insertelement <1 x double> undef, double [[TMP44]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT43:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT42]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP45:%.*]] = fmul <1 x double> [[BLOCK41]], [[SPLAT_SPLAT43]] +; CHECK-NEXT: [[TMP46:%.*]] = fadd <1 x double> [[TMP43]], [[TMP45]] +; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <1 x double> [[TMP46]], <1 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <3 x double> undef, <3 x double> [[TMP47]], <3 x i32> +; CHECK-NEXT: [[BLOCK44:%.*]] = shufflevector <3 x double> [[SPLIT]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP49:%.*]] = extractelement <2 x double> [[SPLIT4]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT45:%.*]] = insertelement <1 x double> undef, double [[TMP49]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT46:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT45]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP50:%.*]] = fmul <1 x double> [[BLOCK44]], [[SPLAT_SPLAT46]] +; CHECK-NEXT: [[BLOCK47:%.*]] = shufflevector <3 x double> [[SPLIT1]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP51:%.*]] = extractelement <2 x double> [[SPLIT4]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT48:%.*]] = insertelement <1 x double> undef, double [[TMP51]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT49:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT48]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP52:%.*]] = fmul <1 x double> [[BLOCK47]], [[SPLAT_SPLAT49]] +; CHECK-NEXT: [[TMP53:%.*]] = fadd <1 x double> [[TMP50]], [[TMP52]] +; CHECK-NEXT: [[TMP54:%.*]] = shufflevector <1 x double> [[TMP53]], <1 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP55:%.*]] = shufflevector <3 x double> [[TMP48]], <3 x double> [[TMP54]], <3 x i32> +; CHECK-NEXT: [[BLOCK50:%.*]] = shufflevector <3 x double> [[SPLIT]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP56:%.*]] = extractelement <2 x double> [[SPLIT4]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT51:%.*]] = insertelement <1 x double> undef, double [[TMP56]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT52:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT51]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP57:%.*]] = fmul <1 x double> [[BLOCK50]], [[SPLAT_SPLAT52]] +; CHECK-NEXT: [[BLOCK53:%.*]] = shufflevector <3 x double> [[SPLIT1]], <3 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP58:%.*]] = extractelement <2 x double> [[SPLIT4]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT54:%.*]] = insertelement <1 x double> undef, double [[TMP58]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT55:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT54]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP59:%.*]] = fmul <1 x double> [[BLOCK53]], [[SPLAT_SPLAT55]] +; CHECK-NEXT: [[TMP60:%.*]] = fadd <1 x double> [[TMP57]], [[TMP59]] +; CHECK-NEXT: [[TMP61:%.*]] = shufflevector <1 x double> [[TMP60]], <1 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP62:%.*]] = shufflevector <3 x double> [[TMP55]], <3 x double> [[TMP61]], <3 x i32> +; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <3 x double> [[TMP20]], <3 x double> [[TMP41]], <6 x i32> +; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <3 x double> [[TMP62]], <3 x double> undef, <6 x i32> +; CHECK-NEXT: [[TMP65:%.*]] = shufflevector <6 x double> [[TMP63]], <6 x double> [[TMP64]], <9 x i32> +; CHECK-NEXT: ret <9 x double> [[TMP65]] +; +entry: + %c = call <9 x double> @llvm.matrix.multiply.v6f64.v6f64.v6f64(<6 x double> %a, <6 x double> %b, i32 3, i32 2, i32 3) + ret <9 x double> %c +} + +declare <9 x double> @llvm.matrix.multiply.v6f64.v6f64.v6f64(<6 x double>, <6 x double>, i32, i32, i32) diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float.ll @@ -0,0 +1,254 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -lower-matrix-intrinsics -S < %s | FileCheck %s +; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s + + +define <4 x float> @multiply_2x2(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: @multiply_2x2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <4 x float> [[B]], <4 x float> undef, <2 x i32> +; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <2 x float> [[SPLIT]], <2 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x float> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x float> undef, float [[TMP0]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT]], <1 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = fmul <1 x float> [[BLOCK]], [[SPLAT_SPLAT]] +; CHECK-NEXT: [[BLOCK4:%.*]] = shufflevector <2 x float> [[SPLIT1]], <2 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT5:%.*]] = insertelement <1 x float> undef, float [[TMP2]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT6:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT5]], <1 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = fmul <1 x float> [[BLOCK4]], [[SPLAT_SPLAT6]] +; CHECK-NEXT: [[TMP4:%.*]] = fadd <1 x float> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <1 x float> [[TMP4]], <1 x float> undef, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> undef, <2 x float> [[TMP5]], <2 x i32> +; CHECK-NEXT: [[BLOCK7:%.*]] = shufflevector <2 x float> [[SPLIT]], <2 x float> undef, <1 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT8:%.*]] = insertelement <1 x float> undef, float [[TMP7]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT9:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT8]], <1 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = fmul <1 x float> [[BLOCK7]], [[SPLAT_SPLAT9]] +; CHECK-NEXT: [[BLOCK10:%.*]] = shufflevector <2 x float> [[SPLIT1]], <2 x float> undef, <1 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT11:%.*]] = insertelement <1 x float> undef, float [[TMP9]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT12:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT11]], <1 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = fmul <1 x float> [[BLOCK10]], [[SPLAT_SPLAT12]] +; CHECK-NEXT: [[TMP11:%.*]] = fadd <1 x float> [[TMP8]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <1 x float> [[TMP11]], <1 x float> undef, <2 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP12]], <2 x i32> +; CHECK-NEXT: [[BLOCK13:%.*]] = shufflevector <2 x float> [[SPLIT]], <2 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT14:%.*]] = insertelement <1 x float> undef, float [[TMP14]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT15:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT14]], <1 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = fmul <1 x float> [[BLOCK13]], [[SPLAT_SPLAT15]] +; CHECK-NEXT: [[BLOCK16:%.*]] = shufflevector <2 x float> [[SPLIT1]], <2 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT17:%.*]] = insertelement <1 x float> undef, float [[TMP16]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT18:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT17]], <1 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = fmul <1 x float> [[BLOCK16]], [[SPLAT_SPLAT18]] +; CHECK-NEXT: [[TMP18:%.*]] = fadd <1 x float> [[TMP15]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <1 x float> [[TMP18]], <1 x float> undef, <2 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x float> undef, <2 x float> [[TMP19]], <2 x i32> +; CHECK-NEXT: [[BLOCK19:%.*]] = shufflevector <2 x float> [[SPLIT]], <2 x float> undef, <1 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT20:%.*]] = insertelement <1 x float> undef, float [[TMP21]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT21:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT20]], <1 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = fmul <1 x float> [[BLOCK19]], [[SPLAT_SPLAT21]] +; CHECK-NEXT: [[BLOCK22:%.*]] = shufflevector <2 x float> [[SPLIT1]], <2 x float> undef, <1 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x float> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x float> undef, float [[TMP23]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT23]], <1 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = fmul <1 x float> [[BLOCK22]], [[SPLAT_SPLAT24]] +; CHECK-NEXT: [[TMP25:%.*]] = fadd <1 x float> [[TMP22]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <1 x float> [[TMP25]], <1 x float> undef, <2 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <2 x float> [[TMP20]], <2 x float> [[TMP26]], <2 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <2 x float> [[TMP13]], <2 x float> [[TMP27]], <4 x i32> +; CHECK-NEXT: ret <4 x float> [[TMP28]] +; +entry: + %c = call <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %a, <4 x float> %b, i32 2, i32 2, i32 2) + ret <4 x float> %c +} + +declare <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float>, <4 x float>, i32, i32, i32) + +define <4 x float> @multiply_1x2(<2 x float> %a, <2 x float> %b) { +; CHECK-LABEL: @multiply_1x2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <2 x float> [[A:%.*]], <2 x float> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <2 x float> [[B:%.*]], <2 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <2 x float> [[B]], <2 x float> undef, <1 x i32> +; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <2 x float> [[SPLIT]], <2 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x float> [[SPLIT1]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x float> undef, float [[TMP0]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT]], <1 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = fmul <1 x float> [[BLOCK]], [[SPLAT_SPLAT]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <1 x float> [[TMP1]], <1 x float> undef, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> undef, <2 x float> [[TMP2]], <2 x i32> +; CHECK-NEXT: [[BLOCK3:%.*]] = shufflevector <2 x float> [[SPLIT]], <2 x float> undef, <1 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <1 x float> [[SPLIT1]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT4:%.*]] = insertelement <1 x float> undef, float [[TMP4]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT5:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT4]], <1 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = fmul <1 x float> [[BLOCK3]], [[SPLAT_SPLAT5]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <1 x float> [[TMP5]], <1 x float> undef, <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP6]], <2 x i32> +; CHECK-NEXT: [[BLOCK6:%.*]] = shufflevector <2 x float> [[SPLIT]], <2 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <1 x float> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT7:%.*]] = insertelement <1 x float> undef, float [[TMP8]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT8:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT7]], <1 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = fmul <1 x float> [[BLOCK6]], [[SPLAT_SPLAT8]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <1 x float> [[TMP9]], <1 x float> undef, <2 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> undef, <2 x float> [[TMP10]], <2 x i32> +; CHECK-NEXT: [[BLOCK9:%.*]] = shufflevector <2 x float> [[SPLIT]], <2 x float> undef, <1 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <1 x float> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x float> undef, float [[TMP12]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT11:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT10]], <1 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = fmul <1 x float> [[BLOCK9]], [[SPLAT_SPLAT11]] +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <1 x float> [[TMP13]], <1 x float> undef, <2 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> [[TMP14]], <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> [[TMP15]], <4 x i32> +; CHECK-NEXT: ret <4 x float> [[TMP16]] +; +entry: + %c = call <4 x float> @llvm.matrix.multiply.v4f32.v2f32.v2f32(<2 x float> %a, <2 x float> %b, i32 2, i32 1, i32 2) + ret <4 x float> %c +} + +declare <4 x float> @llvm.matrix.multiply.v4f32.v2f32.v2f32(<2 x float>, <2 x float>, i32, i32, i32) + +define <9 x float> @multiply_2x3(<6 x float> %a, <6 x float> %b) { +; CHECK-LABEL: @multiply_2x3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <6 x float> [[A:%.*]], <6 x float> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <6 x float> [[A]], <6 x float> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <6 x float> [[B:%.*]], <6 x float> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <6 x float> [[B]], <6 x float> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT4:%.*]] = shufflevector <6 x float> [[B]], <6 x float> undef, <2 x i32> +; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <3 x float> [[SPLIT]], <3 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x float> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x float> undef, float [[TMP0]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT]], <1 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = fmul <1 x float> [[BLOCK]], [[SPLAT_SPLAT]] +; CHECK-NEXT: [[BLOCK5:%.*]] = shufflevector <3 x float> [[SPLIT1]], <3 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT6:%.*]] = insertelement <1 x float> undef, float [[TMP2]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT7:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT6]], <1 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = fmul <1 x float> [[BLOCK5]], [[SPLAT_SPLAT7]] +; CHECK-NEXT: [[TMP4:%.*]] = fadd <1 x float> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <1 x float> [[TMP4]], <1 x float> undef, <3 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <3 x float> undef, <3 x float> [[TMP5]], <3 x i32> +; CHECK-NEXT: [[BLOCK8:%.*]] = shufflevector <3 x float> [[SPLIT]], <3 x float> undef, <1 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT9:%.*]] = insertelement <1 x float> undef, float [[TMP7]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT10:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT9]], <1 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = fmul <1 x float> [[BLOCK8]], [[SPLAT_SPLAT10]] +; CHECK-NEXT: [[BLOCK11:%.*]] = shufflevector <3 x float> [[SPLIT1]], <3 x float> undef, <1 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT12:%.*]] = insertelement <1 x float> undef, float [[TMP9]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT13:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT12]], <1 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = fmul <1 x float> [[BLOCK11]], [[SPLAT_SPLAT13]] +; CHECK-NEXT: [[TMP11:%.*]] = fadd <1 x float> [[TMP8]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <1 x float> [[TMP11]], <1 x float> undef, <3 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <3 x float> [[TMP6]], <3 x float> [[TMP12]], <3 x i32> +; CHECK-NEXT: [[BLOCK14:%.*]] = shufflevector <3 x float> [[SPLIT]], <3 x float> undef, <1 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT15:%.*]] = insertelement <1 x float> undef, float [[TMP14]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT16:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT15]], <1 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = fmul <1 x float> [[BLOCK14]], [[SPLAT_SPLAT16]] +; CHECK-NEXT: [[BLOCK17:%.*]] = shufflevector <3 x float> [[SPLIT1]], <3 x float> undef, <1 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT18:%.*]] = insertelement <1 x float> undef, float [[TMP16]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT19:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT18]], <1 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = fmul <1 x float> [[BLOCK17]], [[SPLAT_SPLAT19]] +; CHECK-NEXT: [[TMP18:%.*]] = fadd <1 x float> [[TMP15]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <1 x float> [[TMP18]], <1 x float> undef, <3 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <3 x float> [[TMP13]], <3 x float> [[TMP19]], <3 x i32> +; CHECK-NEXT: [[BLOCK20:%.*]] = shufflevector <3 x float> [[SPLIT]], <3 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT21:%.*]] = insertelement <1 x float> undef, float [[TMP21]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT22:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT21]], <1 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = fmul <1 x float> [[BLOCK20]], [[SPLAT_SPLAT22]] +; CHECK-NEXT: [[BLOCK23:%.*]] = shufflevector <3 x float> [[SPLIT1]], <3 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x float> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT24:%.*]] = insertelement <1 x float> undef, float [[TMP23]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT25:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT24]], <1 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = fmul <1 x float> [[BLOCK23]], [[SPLAT_SPLAT25]] +; CHECK-NEXT: [[TMP25:%.*]] = fadd <1 x float> [[TMP22]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <1 x float> [[TMP25]], <1 x float> undef, <3 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <3 x float> undef, <3 x float> [[TMP26]], <3 x i32> +; CHECK-NEXT: [[BLOCK26:%.*]] = shufflevector <3 x float> [[SPLIT]], <3 x float> undef, <1 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x float> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT27:%.*]] = insertelement <1 x float> undef, float [[TMP28]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT28:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT27]], <1 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = fmul <1 x float> [[BLOCK26]], [[SPLAT_SPLAT28]] +; CHECK-NEXT: [[BLOCK29:%.*]] = shufflevector <3 x float> [[SPLIT1]], <3 x float> undef, <1 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT30:%.*]] = insertelement <1 x float> undef, float [[TMP30]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT31:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT30]], <1 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP31:%.*]] = fmul <1 x float> [[BLOCK29]], [[SPLAT_SPLAT31]] +; CHECK-NEXT: [[TMP32:%.*]] = fadd <1 x float> [[TMP29]], [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <1 x float> [[TMP32]], <1 x float> undef, <3 x i32> +; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <3 x float> [[TMP27]], <3 x float> [[TMP33]], <3 x i32> +; CHECK-NEXT: [[BLOCK32:%.*]] = shufflevector <3 x float> [[SPLIT]], <3 x float> undef, <1 x i32> +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <2 x float> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT33:%.*]] = insertelement <1 x float> undef, float [[TMP35]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT34:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT33]], <1 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP36:%.*]] = fmul <1 x float> [[BLOCK32]], [[SPLAT_SPLAT34]] +; CHECK-NEXT: [[BLOCK35:%.*]] = shufflevector <3 x float> [[SPLIT1]], <3 x float> undef, <1 x i32> +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <2 x float> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT36:%.*]] = insertelement <1 x float> undef, float [[TMP37]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT37:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT36]], <1 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP38:%.*]] = fmul <1 x float> [[BLOCK35]], [[SPLAT_SPLAT37]] +; CHECK-NEXT: [[TMP39:%.*]] = fadd <1 x float> [[TMP36]], [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <1 x float> [[TMP39]], <1 x float> undef, <3 x i32> +; CHECK-NEXT: [[TMP41:%.*]] = shufflevector <3 x float> [[TMP34]], <3 x float> [[TMP40]], <3 x i32> +; CHECK-NEXT: [[BLOCK38:%.*]] = shufflevector <3 x float> [[SPLIT]], <3 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[SPLIT4]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT39:%.*]] = insertelement <1 x float> undef, float [[TMP42]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT40:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT39]], <1 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP43:%.*]] = fmul <1 x float> [[BLOCK38]], [[SPLAT_SPLAT40]] +; CHECK-NEXT: [[BLOCK41:%.*]] = shufflevector <3 x float> [[SPLIT1]], <3 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <2 x float> [[SPLIT4]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT42:%.*]] = insertelement <1 x float> undef, float [[TMP44]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT43:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT42]], <1 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP45:%.*]] = fmul <1 x float> [[BLOCK41]], [[SPLAT_SPLAT43]] +; CHECK-NEXT: [[TMP46:%.*]] = fadd <1 x float> [[TMP43]], [[TMP45]] +; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <1 x float> [[TMP46]], <1 x float> undef, <3 x i32> +; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <3 x float> undef, <3 x float> [[TMP47]], <3 x i32> +; CHECK-NEXT: [[BLOCK44:%.*]] = shufflevector <3 x float> [[SPLIT]], <3 x float> undef, <1 x i32> +; CHECK-NEXT: [[TMP49:%.*]] = extractelement <2 x float> [[SPLIT4]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT45:%.*]] = insertelement <1 x float> undef, float [[TMP49]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT46:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT45]], <1 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP50:%.*]] = fmul <1 x float> [[BLOCK44]], [[SPLAT_SPLAT46]] +; CHECK-NEXT: [[BLOCK47:%.*]] = shufflevector <3 x float> [[SPLIT1]], <3 x float> undef, <1 x i32> +; CHECK-NEXT: [[TMP51:%.*]] = extractelement <2 x float> [[SPLIT4]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT48:%.*]] = insertelement <1 x float> undef, float [[TMP51]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT49:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT48]], <1 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP52:%.*]] = fmul <1 x float> [[BLOCK47]], [[SPLAT_SPLAT49]] +; CHECK-NEXT: [[TMP53:%.*]] = fadd <1 x float> [[TMP50]], [[TMP52]] +; CHECK-NEXT: [[TMP54:%.*]] = shufflevector <1 x float> [[TMP53]], <1 x float> undef, <3 x i32> +; CHECK-NEXT: [[TMP55:%.*]] = shufflevector <3 x float> [[TMP48]], <3 x float> [[TMP54]], <3 x i32> +; CHECK-NEXT: [[BLOCK50:%.*]] = shufflevector <3 x float> [[SPLIT]], <3 x float> undef, <1 x i32> +; CHECK-NEXT: [[TMP56:%.*]] = extractelement <2 x float> [[SPLIT4]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT51:%.*]] = insertelement <1 x float> undef, float [[TMP56]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT52:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT51]], <1 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP57:%.*]] = fmul <1 x float> [[BLOCK50]], [[SPLAT_SPLAT52]] +; CHECK-NEXT: [[BLOCK53:%.*]] = shufflevector <3 x float> [[SPLIT1]], <3 x float> undef, <1 x i32> +; CHECK-NEXT: [[TMP58:%.*]] = extractelement <2 x float> [[SPLIT4]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT54:%.*]] = insertelement <1 x float> undef, float [[TMP58]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT55:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT54]], <1 x float> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP59:%.*]] = fmul <1 x float> [[BLOCK53]], [[SPLAT_SPLAT55]] +; CHECK-NEXT: [[TMP60:%.*]] = fadd <1 x float> [[TMP57]], [[TMP59]] +; CHECK-NEXT: [[TMP61:%.*]] = shufflevector <1 x float> [[TMP60]], <1 x float> undef, <3 x i32> +; CHECK-NEXT: [[TMP62:%.*]] = shufflevector <3 x float> [[TMP55]], <3 x float> [[TMP61]], <3 x i32> +; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <3 x float> [[TMP20]], <3 x float> [[TMP41]], <6 x i32> +; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <3 x float> [[TMP62]], <3 x float> undef, <6 x i32> +; CHECK-NEXT: [[TMP65:%.*]] = shufflevector <6 x float> [[TMP63]], <6 x float> [[TMP64]], <9 x i32> +; CHECK-NEXT: ret <9 x float> [[TMP65]] +; +entry: + %c = call <9 x float> @llvm.matrix.multiply.v6f32.v6f32.v6f32(<6 x float> %a, <6 x float> %b, i32 3, i32 2, i32 3) + ret <9 x float> %c +} + +declare <9 x float> @llvm.matrix.multiply.v6f32.v6f32.v6f32(<6 x float>, <6 x float>, i32, i32, i32) diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32.ll @@ -0,0 +1,254 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -lower-matrix-intrinsics -S < %s | FileCheck %s +; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s + + +define <4 x i32> @multiply_2x2(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: @multiply_2x2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x i32> undef, i32 [[TMP0]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = mul <1 x i32> [[BLOCK]], [[SPLAT_SPLAT]] +; CHECK-NEXT: [[BLOCK4:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT5:%.*]] = insertelement <1 x i32> undef, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT6:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT5]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = mul <1 x i32> [[BLOCK4]], [[SPLAT_SPLAT6]] +; CHECK-NEXT: [[TMP4:%.*]] = add <1 x i32> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <1 x i32> [[TMP4]], <1 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> undef, <2 x i32> [[TMP5]], <2 x i32> +; CHECK-NEXT: [[BLOCK7:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT8:%.*]] = insertelement <1 x i32> undef, i32 [[TMP7]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT9:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT8]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = mul <1 x i32> [[BLOCK7]], [[SPLAT_SPLAT9]] +; CHECK-NEXT: [[BLOCK10:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT11:%.*]] = insertelement <1 x i32> undef, i32 [[TMP9]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT12:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT11]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = mul <1 x i32> [[BLOCK10]], [[SPLAT_SPLAT12]] +; CHECK-NEXT: [[TMP11:%.*]] = add <1 x i32> [[TMP8]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <1 x i32> [[TMP11]], <1 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP12]], <2 x i32> +; CHECK-NEXT: [[BLOCK13:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT14:%.*]] = insertelement <1 x i32> undef, i32 [[TMP14]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT15:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT14]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = mul <1 x i32> [[BLOCK13]], [[SPLAT_SPLAT15]] +; CHECK-NEXT: [[BLOCK16:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT17:%.*]] = insertelement <1 x i32> undef, i32 [[TMP16]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT18:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT17]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = mul <1 x i32> [[BLOCK16]], [[SPLAT_SPLAT18]] +; CHECK-NEXT: [[TMP18:%.*]] = add <1 x i32> [[TMP15]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <1 x i32> [[TMP18]], <1 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x i32> undef, <2 x i32> [[TMP19]], <2 x i32> +; CHECK-NEXT: [[BLOCK19:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT20:%.*]] = insertelement <1 x i32> undef, i32 [[TMP21]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT21:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT20]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = mul <1 x i32> [[BLOCK19]], [[SPLAT_SPLAT21]] +; CHECK-NEXT: [[BLOCK22:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x i32> undef, i32 [[TMP23]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT23]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = mul <1 x i32> [[BLOCK22]], [[SPLAT_SPLAT24]] +; CHECK-NEXT: [[TMP25:%.*]] = add <1 x i32> [[TMP22]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <1 x i32> [[TMP25]], <1 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <2 x i32> [[TMP20]], <2 x i32> [[TMP26]], <2 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> [[TMP27]], <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[TMP28]] +; +entry: + %c = call <4 x i32> @llvm.matrix.multiply.v4i32.v4i32.v4i32(<4 x i32> %a, <4 x i32> %b, i32 2, i32 2, i32 2) + ret <4 x i32> %c +} + +declare <4 x i32> @llvm.matrix.multiply.v4i32.v4i32.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32) + +define <4 x i32> @multiply_1x2(<2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: @multiply_1x2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <2 x i32> [[A:%.*]], <2 x i32> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <2 x i32> [[B:%.*]], <2 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> undef, <1 x i32> +; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i32> [[SPLIT1]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x i32> undef, i32 [[TMP0]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = mul <1 x i32> [[BLOCK]], [[SPLAT_SPLAT]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <1 x i32> [[TMP1]], <1 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> undef, <2 x i32> [[TMP2]], <2 x i32> +; CHECK-NEXT: [[BLOCK3:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <1 x i32> [[SPLIT1]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT4:%.*]] = insertelement <1 x i32> undef, i32 [[TMP4]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT5:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT4]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = mul <1 x i32> [[BLOCK3]], [[SPLAT_SPLAT5]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <1 x i32> [[TMP5]], <1 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP6]], <2 x i32> +; CHECK-NEXT: [[BLOCK6:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <1 x i32> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT7:%.*]] = insertelement <1 x i32> undef, i32 [[TMP8]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT8:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT7]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = mul <1 x i32> [[BLOCK6]], [[SPLAT_SPLAT8]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <1 x i32> [[TMP9]], <1 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> undef, <2 x i32> [[TMP10]], <2 x i32> +; CHECK-NEXT: [[BLOCK9:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <1 x i32> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x i32> undef, i32 [[TMP12]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT11:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT10]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = mul <1 x i32> [[BLOCK9]], [[SPLAT_SPLAT11]] +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <1 x i32> [[TMP13]], <1 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> [[TMP14]], <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> [[TMP15]], <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[TMP16]] +; +entry: + %c = call <4 x i32> @llvm.matrix.multiply.v4i32.v2i32.v2i32(<2 x i32> %a, <2 x i32> %b, i32 2, i32 1, i32 2) + ret <4 x i32> %c +} + +declare <4 x i32> @llvm.matrix.multiply.v4i32.v2i32.v2i32(<2 x i32>, <2 x i32>, i32, i32, i32) + +define <9 x i32> @multiply_2x3(<6 x i32> %a, <6 x i32> %b) { +; CHECK-LABEL: @multiply_2x3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <6 x i32> [[A:%.*]], <6 x i32> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <6 x i32> [[A]], <6 x i32> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <6 x i32> [[B:%.*]], <6 x i32> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <6 x i32> [[B]], <6 x i32> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT4:%.*]] = shufflevector <6 x i32> [[B]], <6 x i32> undef, <2 x i32> +; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <3 x i32> [[SPLIT]], <3 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x i32> undef, i32 [[TMP0]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = mul <1 x i32> [[BLOCK]], [[SPLAT_SPLAT]] +; CHECK-NEXT: [[BLOCK5:%.*]] = shufflevector <3 x i32> [[SPLIT1]], <3 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT6:%.*]] = insertelement <1 x i32> undef, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT7:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT6]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = mul <1 x i32> [[BLOCK5]], [[SPLAT_SPLAT7]] +; CHECK-NEXT: [[TMP4:%.*]] = add <1 x i32> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <1 x i32> [[TMP4]], <1 x i32> undef, <3 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <3 x i32> undef, <3 x i32> [[TMP5]], <3 x i32> +; CHECK-NEXT: [[BLOCK8:%.*]] = shufflevector <3 x i32> [[SPLIT]], <3 x i32> undef, <1 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT9:%.*]] = insertelement <1 x i32> undef, i32 [[TMP7]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT10:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT9]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = mul <1 x i32> [[BLOCK8]], [[SPLAT_SPLAT10]] +; CHECK-NEXT: [[BLOCK11:%.*]] = shufflevector <3 x i32> [[SPLIT1]], <3 x i32> undef, <1 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT12:%.*]] = insertelement <1 x i32> undef, i32 [[TMP9]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT13:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT12]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = mul <1 x i32> [[BLOCK11]], [[SPLAT_SPLAT13]] +; CHECK-NEXT: [[TMP11:%.*]] = add <1 x i32> [[TMP8]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <1 x i32> [[TMP11]], <1 x i32> undef, <3 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <3 x i32> [[TMP6]], <3 x i32> [[TMP12]], <3 x i32> +; CHECK-NEXT: [[BLOCK14:%.*]] = shufflevector <3 x i32> [[SPLIT]], <3 x i32> undef, <1 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT15:%.*]] = insertelement <1 x i32> undef, i32 [[TMP14]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT16:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT15]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = mul <1 x i32> [[BLOCK14]], [[SPLAT_SPLAT16]] +; CHECK-NEXT: [[BLOCK17:%.*]] = shufflevector <3 x i32> [[SPLIT1]], <3 x i32> undef, <1 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT18:%.*]] = insertelement <1 x i32> undef, i32 [[TMP16]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT19:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT18]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = mul <1 x i32> [[BLOCK17]], [[SPLAT_SPLAT19]] +; CHECK-NEXT: [[TMP18:%.*]] = add <1 x i32> [[TMP15]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <1 x i32> [[TMP18]], <1 x i32> undef, <3 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <3 x i32> [[TMP13]], <3 x i32> [[TMP19]], <3 x i32> +; CHECK-NEXT: [[BLOCK20:%.*]] = shufflevector <3 x i32> [[SPLIT]], <3 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT21:%.*]] = insertelement <1 x i32> undef, i32 [[TMP21]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT22:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT21]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = mul <1 x i32> [[BLOCK20]], [[SPLAT_SPLAT22]] +; CHECK-NEXT: [[BLOCK23:%.*]] = shufflevector <3 x i32> [[SPLIT1]], <3 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT24:%.*]] = insertelement <1 x i32> undef, i32 [[TMP23]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT25:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT24]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = mul <1 x i32> [[BLOCK23]], [[SPLAT_SPLAT25]] +; CHECK-NEXT: [[TMP25:%.*]] = add <1 x i32> [[TMP22]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <1 x i32> [[TMP25]], <1 x i32> undef, <3 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <3 x i32> undef, <3 x i32> [[TMP26]], <3 x i32> +; CHECK-NEXT: [[BLOCK26:%.*]] = shufflevector <3 x i32> [[SPLIT]], <3 x i32> undef, <1 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT27:%.*]] = insertelement <1 x i32> undef, i32 [[TMP28]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT28:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT27]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = mul <1 x i32> [[BLOCK26]], [[SPLAT_SPLAT28]] +; CHECK-NEXT: [[BLOCK29:%.*]] = shufflevector <3 x i32> [[SPLIT1]], <3 x i32> undef, <1 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT30:%.*]] = insertelement <1 x i32> undef, i32 [[TMP30]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT31:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT30]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP31:%.*]] = mul <1 x i32> [[BLOCK29]], [[SPLAT_SPLAT31]] +; CHECK-NEXT: [[TMP32:%.*]] = add <1 x i32> [[TMP29]], [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <1 x i32> [[TMP32]], <1 x i32> undef, <3 x i32> +; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <3 x i32> [[TMP27]], <3 x i32> [[TMP33]], <3 x i32> +; CHECK-NEXT: [[BLOCK32:%.*]] = shufflevector <3 x i32> [[SPLIT]], <3 x i32> undef, <1 x i32> +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT33:%.*]] = insertelement <1 x i32> undef, i32 [[TMP35]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT34:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT33]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP36:%.*]] = mul <1 x i32> [[BLOCK32]], [[SPLAT_SPLAT34]] +; CHECK-NEXT: [[BLOCK35:%.*]] = shufflevector <3 x i32> [[SPLIT1]], <3 x i32> undef, <1 x i32> +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT36:%.*]] = insertelement <1 x i32> undef, i32 [[TMP37]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT37:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT36]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP38:%.*]] = mul <1 x i32> [[BLOCK35]], [[SPLAT_SPLAT37]] +; CHECK-NEXT: [[TMP39:%.*]] = add <1 x i32> [[TMP36]], [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <1 x i32> [[TMP39]], <1 x i32> undef, <3 x i32> +; CHECK-NEXT: [[TMP41:%.*]] = shufflevector <3 x i32> [[TMP34]], <3 x i32> [[TMP40]], <3 x i32> +; CHECK-NEXT: [[BLOCK38:%.*]] = shufflevector <3 x i32> [[SPLIT]], <3 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i32> [[SPLIT4]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT39:%.*]] = insertelement <1 x i32> undef, i32 [[TMP42]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT40:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT39]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP43:%.*]] = mul <1 x i32> [[BLOCK38]], [[SPLAT_SPLAT40]] +; CHECK-NEXT: [[BLOCK41:%.*]] = shufflevector <3 x i32> [[SPLIT1]], <3 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <2 x i32> [[SPLIT4]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT42:%.*]] = insertelement <1 x i32> undef, i32 [[TMP44]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT43:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT42]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP45:%.*]] = mul <1 x i32> [[BLOCK41]], [[SPLAT_SPLAT43]] +; CHECK-NEXT: [[TMP46:%.*]] = add <1 x i32> [[TMP43]], [[TMP45]] +; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <1 x i32> [[TMP46]], <1 x i32> undef, <3 x i32> +; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <3 x i32> undef, <3 x i32> [[TMP47]], <3 x i32> +; CHECK-NEXT: [[BLOCK44:%.*]] = shufflevector <3 x i32> [[SPLIT]], <3 x i32> undef, <1 x i32> +; CHECK-NEXT: [[TMP49:%.*]] = extractelement <2 x i32> [[SPLIT4]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT45:%.*]] = insertelement <1 x i32> undef, i32 [[TMP49]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT46:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT45]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP50:%.*]] = mul <1 x i32> [[BLOCK44]], [[SPLAT_SPLAT46]] +; CHECK-NEXT: [[BLOCK47:%.*]] = shufflevector <3 x i32> [[SPLIT1]], <3 x i32> undef, <1 x i32> +; CHECK-NEXT: [[TMP51:%.*]] = extractelement <2 x i32> [[SPLIT4]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT48:%.*]] = insertelement <1 x i32> undef, i32 [[TMP51]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT49:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT48]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP52:%.*]] = mul <1 x i32> [[BLOCK47]], [[SPLAT_SPLAT49]] +; CHECK-NEXT: [[TMP53:%.*]] = add <1 x i32> [[TMP50]], [[TMP52]] +; CHECK-NEXT: [[TMP54:%.*]] = shufflevector <1 x i32> [[TMP53]], <1 x i32> undef, <3 x i32> +; CHECK-NEXT: [[TMP55:%.*]] = shufflevector <3 x i32> [[TMP48]], <3 x i32> [[TMP54]], <3 x i32> +; CHECK-NEXT: [[BLOCK50:%.*]] = shufflevector <3 x i32> [[SPLIT]], <3 x i32> undef, <1 x i32> +; CHECK-NEXT: [[TMP56:%.*]] = extractelement <2 x i32> [[SPLIT4]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT51:%.*]] = insertelement <1 x i32> undef, i32 [[TMP56]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT52:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT51]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP57:%.*]] = mul <1 x i32> [[BLOCK50]], [[SPLAT_SPLAT52]] +; CHECK-NEXT: [[BLOCK53:%.*]] = shufflevector <3 x i32> [[SPLIT1]], <3 x i32> undef, <1 x i32> +; CHECK-NEXT: [[TMP58:%.*]] = extractelement <2 x i32> [[SPLIT4]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT54:%.*]] = insertelement <1 x i32> undef, i32 [[TMP58]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT55:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT54]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP59:%.*]] = mul <1 x i32> [[BLOCK53]], [[SPLAT_SPLAT55]] +; CHECK-NEXT: [[TMP60:%.*]] = add <1 x i32> [[TMP57]], [[TMP59]] +; CHECK-NEXT: [[TMP61:%.*]] = shufflevector <1 x i32> [[TMP60]], <1 x i32> undef, <3 x i32> +; CHECK-NEXT: [[TMP62:%.*]] = shufflevector <3 x i32> [[TMP55]], <3 x i32> [[TMP61]], <3 x i32> +; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <3 x i32> [[TMP20]], <3 x i32> [[TMP41]], <6 x i32> +; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <3 x i32> [[TMP62]], <3 x i32> undef, <6 x i32> +; CHECK-NEXT: [[TMP65:%.*]] = shufflevector <6 x i32> [[TMP63]], <6 x i32> [[TMP64]], <9 x i32> +; CHECK-NEXT: ret <9 x i32> [[TMP65]] +; +entry: + %c = call <9 x i32> @llvm.matrix.multiply.v6i32.v6i32.v6i32(<6 x i32> %a, <6 x i32> %b, i32 3, i32 2, i32 3) + ret <9 x i32> %c +} + +declare <9 x i32> @llvm.matrix.multiply.v6i32.v6i32.v6i32(<6 x i32>, <6 x i32>, i32, i32, i32) diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -lower-matrix-intrinsics -S < %s | FileCheck %s +; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s + +define <9 x double> @strided_load_3x3(<9 x double>* %in, i32 %stride) { +; CHECK-LABEL: @strided_load_3x3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <9 x double>* [[IN:%.*]] to double* +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, double* [[TMP0]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[TMP2]] to <3 x double>* +; CHECK-NEXT: [[TMP4:%.*]] = load <3 x double>, <3 x double>* [[TMP3]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = mul i32 1, [[STRIDE]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr double, double* [[TMP0]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[TMP6]] to <3 x double>* +; CHECK-NEXT: [[TMP8:%.*]] = load <3 x double>, <3 x double>* [[TMP7]], align 8 +; CHECK-NEXT: [[TMP9:%.*]] = mul i32 2, [[STRIDE]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr double, double* [[TMP0]], i32 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[TMP10]] to <3 x double>* +; CHECK-NEXT: [[TMP12:%.*]] = load <3 x double>, <3 x double>* [[TMP11]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <3 x double> [[TMP4]], <3 x double> [[TMP8]], <6 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <3 x double> [[TMP12]], <3 x double> undef, <6 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <6 x double> [[TMP13]], <6 x double> [[TMP14]], <9 x i32> +; CHECK-NEXT: ret <9 x double> [[TMP15]] +; +entry: + %load = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %in, i32 %stride, i32 3, i32 3) + ret <9 x double> %load +} + +declare <9 x double> @llvm.matrix.columnwise.load(<9 x double>*, i32, i32, i32) + +define <9 x double> @strided_load_9x1(<9 x double>* %in, i32 %stride) { +; CHECK-LABEL: @strided_load_9x1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <9 x double>* [[IN:%.*]] to double* +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, double* [[TMP0]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[TMP2]] to <9 x double>* +; CHECK-NEXT: [[TMP4:%.*]] = load <9 x double>, <9 x double>* [[TMP3]], align 8 +; CHECK-NEXT: ret <9 x double> [[TMP4]] +; +entry: + %load = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %in, i32 %stride, i32 9, i32 1) + ret <9 x double> %load +} + +declare <8 x double> @llvm.matrix.columnwise.load.v8f64(<8 x double>*, i32, i32, i32) + +define <8 x double> @strided_load_4x2(<8 x double>* %in, i32 %stride) { +; CHECK-LABEL: @strided_load_4x2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x double>* [[IN:%.*]] to double* +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, double* [[TMP0]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[TMP2]] to <4 x double>* +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x double>, <4 x double>* [[TMP3]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = mul i32 1, [[STRIDE]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr double, double* [[TMP0]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[TMP6]] to <4 x double>* +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x double>, <4 x double>* [[TMP7]], align 8 +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP8]], <8 x i32> +; CHECK-NEXT: ret <8 x double> [[TMP9]] +; +entry: + %load = call <8 x double> @llvm.matrix.columnwise.load.v8f64(<8 x double>* %in, i32 %stride, i32 4, i32 2) + ret <8 x double> %load +} diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-float.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-float.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-float.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -lower-matrix-intrinsics -S < %s | FileCheck %s +; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s + +define <9 x float> @strided_load_3x3(<9 x float>* %in, i32 %stride) { +; CHECK-LABEL: @strided_load_3x3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <9 x float>* [[IN:%.*]] to float* +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, float* [[TMP0]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <3 x float>* +; CHECK-NEXT: [[TMP4:%.*]] = load <3 x float>, <3 x float>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = mul i32 1, [[STRIDE]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr float, float* [[TMP0]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <3 x float>* +; CHECK-NEXT: [[TMP8:%.*]] = load <3 x float>, <3 x float>* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = mul i32 2, [[STRIDE]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[TMP0]], i32 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <3 x float>* +; CHECK-NEXT: [[TMP12:%.*]] = load <3 x float>, <3 x float>* [[TMP11]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <3 x float> [[TMP4]], <3 x float> [[TMP8]], <6 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <3 x float> [[TMP12]], <3 x float> undef, <6 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <6 x float> [[TMP13]], <6 x float> [[TMP14]], <9 x i32> +; CHECK-NEXT: ret <9 x float> [[TMP15]] +; +entry: + %load = call <9 x float> @llvm.matrix.columnwise.load(<9 x float>* %in, i32 %stride, i32 3, i32 3) + ret <9 x float> %load +} + +declare <9 x float> @llvm.matrix.columnwise.load(<9 x float>*, i32, i32, i32) + +define <9 x float> @strided_load_9x1(<9 x float>* %in, i32 %stride) { +; CHECK-LABEL: @strided_load_9x1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <9 x float>* [[IN:%.*]] to float* +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, float* [[TMP0]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <9 x float>* +; CHECK-NEXT: [[TMP4:%.*]] = load <9 x float>, <9 x float>* [[TMP3]], align 4 +; CHECK-NEXT: ret <9 x float> [[TMP4]] +; +entry: + %load = call <9 x float> @llvm.matrix.columnwise.load(<9 x float>* %in, i32 %stride, i32 9, i32 1) + ret <9 x float> %load +} + +declare <8 x float> @llvm.matrix.columnwise.load.v8f32(<8 x float>*, i32, i32, i32) + +define <8 x float> @strided_load_4x2(<8 x float>* %in, i32 %stride) { +; CHECK-LABEL: @strided_load_4x2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x float>* [[IN:%.*]] to float* +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, float* [[TMP0]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>* +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = mul i32 1, [[STRIDE]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr float, float* [[TMP0]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>* +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP8]], <8 x i32> +; CHECK-NEXT: ret <8 x float> [[TMP9]] +; +entry: + %load = call <8 x float> @llvm.matrix.columnwise.load.v8f32(<8 x float>* %in, i32 %stride, i32 4, i32 2) + ret <8 x float> %load +} diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-i32.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-i32.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-i32.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -lower-matrix-intrinsics -S < %s | FileCheck %s +; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s + +define <9 x i32> @strided_load_3x3(<9 x i32>* %in, i32 %stride) { +; CHECK-LABEL: @strided_load_3x3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <9 x i32>* [[IN:%.*]] to i32* +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, i32* [[TMP0]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <3 x i32>* +; CHECK-NEXT: [[TMP4:%.*]] = load <3 x i32>, <3 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = mul i32 1, [[STRIDE]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP0]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <3 x i32>* +; CHECK-NEXT: [[TMP8:%.*]] = load <3 x i32>, <3 x i32>* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = mul i32 2, [[STRIDE]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP0]], i32 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <3 x i32>* +; CHECK-NEXT: [[TMP12:%.*]] = load <3 x i32>, <3 x i32>* [[TMP11]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <3 x i32> [[TMP4]], <3 x i32> [[TMP8]], <6 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <3 x i32> [[TMP12]], <3 x i32> undef, <6 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <6 x i32> [[TMP13]], <6 x i32> [[TMP14]], <9 x i32> +; CHECK-NEXT: ret <9 x i32> [[TMP15]] +; +entry: + %load = call <9 x i32> @llvm.matrix.columnwise.load(<9 x i32>* %in, i32 %stride, i32 3, i32 3) + ret <9 x i32> %load +} + +declare <9 x i32> @llvm.matrix.columnwise.load(<9 x i32>*, i32, i32, i32) + +define <9 x i32> @strided_load_9x1(<9 x i32>* %in, i32 %stride) { +; CHECK-LABEL: @strided_load_9x1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <9 x i32>* [[IN:%.*]] to i32* +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, i32* [[TMP0]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <9 x i32>* +; CHECK-NEXT: [[TMP4:%.*]] = load <9 x i32>, <9 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: ret <9 x i32> [[TMP4]] +; +entry: + %load = call <9 x i32> @llvm.matrix.columnwise.load(<9 x i32>* %in, i32 %stride, i32 9, i32 1) + ret <9 x i32> %load +} + +declare <8 x i32> @llvm.matrix.columnwise.load.v8i32(<8 x i32>*, i32, i32, i32) + +define <8 x i32> @strided_load_4x2(<8 x i32>* %in, i32 %stride) { +; CHECK-LABEL: @strided_load_4x2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i32>* [[IN:%.*]] to i32* +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, i32* [[TMP0]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = mul i32 1, [[STRIDE]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP0]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP9]] +; +entry: + %load = call <8 x i32> @llvm.matrix.columnwise.load.v8i32(<8 x i32>* %in, i32 %stride, i32 4, i32 2) + ret <8 x i32> %load +} diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -lower-matrix-intrinsics -S < %s | FileCheck %s +; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s + +define void @strided_store_3x2(<6 x double> %in, double* %out) { +; CHECK-LABEL: @strided_store_3x2( +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <6 x double> [[IN:%.*]], <6 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <6 x double> [[IN]], <6 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[OUT:%.*]] to <3 x double>* +; CHECK-NEXT: store <3 x double> [[SPLIT]], <3 x double>* [[TMP1]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, double* [[OUT]], i32 5 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[TMP2]] to <3 x double>* +; CHECK-NEXT: store <3 x double> [[SPLIT1]], <3 x double>* [[TMP3]], align 8 +; CHECK-NEXT: ret void +; + call void @llvm.matrix.columnwise.store(<6 x double> %in, double* %out, i32 5, i32 3, i32 2) + ret void +} + +define void @strided_store_3x2_nonconst_stride(<6 x double> %in, i32 %stride, double* %out) { +; CHECK-LABEL: @strided_store_3x2_nonconst_stride( +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <6 x double> [[IN:%.*]], <6 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <6 x double> [[IN]], <6 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, double* [[OUT:%.*]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[TMP2]] to <3 x double>* +; CHECK-NEXT: store <3 x double> [[SPLIT]], <3 x double>* [[TMP3]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = mul i32 1, [[STRIDE]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr double, double* [[OUT]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[TMP5]] to <3 x double>* +; CHECK-NEXT: store <3 x double> [[SPLIT1]], <3 x double>* [[TMP6]], align 8 +; CHECK-NEXT: ret void +; + call void @llvm.matrix.columnwise.store(<6 x double> %in, double* %out, i32 %stride, i32 3, i32 2) + ret void +} + + +declare void @llvm.matrix.columnwise.store(<6 x double>, double*, i32, i32, i32) + +define void @strided_store_2x3(<10 x double> %in, double* %out) { +; CHECK-LABEL: @strided_store_2x3( +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <10 x double> [[IN:%.*]], <10 x double> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <10 x double> [[IN]], <10 x double> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <10 x double> [[IN]], <10 x double> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <10 x double> [[IN]], <10 x double> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT4:%.*]] = shufflevector <10 x double> [[IN]], <10 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[OUT:%.*]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[SPLIT]], <2 x double>* [[TMP1]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, double* [[OUT]], i32 4 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[TMP2]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[SPLIT1]], <2 x double>* [[TMP3]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr double, double* [[OUT]], i32 8 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[SPLIT2]], <2 x double>* [[TMP5]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr double, double* [[OUT]], i32 12 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[SPLIT3]], <2 x double>* [[TMP7]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr double, double* [[OUT]], i32 16 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast double* [[TMP8]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[SPLIT4]], <2 x double>* [[TMP9]], align 8 +; CHECK-NEXT: ret void +; + call void @llvm.matrix.columnwise.store.v10f64(<10 x double> %in, double* %out, i32 4, i32 2, i32 5) + ret void +} + +declare void @llvm.matrix.columnwise.store.v10f64(<10 x double>, double*, i32, i32, i32) diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-float.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-float.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-float.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -lower-matrix-intrinsics -S < %s | FileCheck %s +; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s + +define void @strided_store_3x2(<6 x float> %in, float* %out) { +; CHECK-LABEL: @strided_store_3x2( +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <6 x float> [[IN:%.*]], <6 x float> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <6 x float> [[IN]], <6 x float> undef, <3 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[OUT:%.*]] to <3 x float>* +; CHECK-NEXT: store <3 x float> [[SPLIT]], <3 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, float* [[OUT]], i32 5 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <3 x float>* +; CHECK-NEXT: store <3 x float> [[SPLIT1]], <3 x float>* [[TMP3]], align 4 +; CHECK-NEXT: ret void +; + call void @llvm.matrix.columnwise.store(<6 x float> %in, float* %out, i32 5, i32 3, i32 2) + ret void +} + +define void @strided_store_3x2_nonconst_stride(<6 x float> %in, i32 %stride, float* %out) { +; CHECK-LABEL: @strided_store_3x2_nonconst_stride( +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <6 x float> [[IN:%.*]], <6 x float> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <6 x float> [[IN]], <6 x float> undef, <3 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, float* [[OUT:%.*]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <3 x float>* +; CHECK-NEXT: store <3 x float> [[SPLIT]], <3 x float>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = mul i32 1, [[STRIDE]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr float, float* [[OUT]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <3 x float>* +; CHECK-NEXT: store <3 x float> [[SPLIT1]], <3 x float>* [[TMP6]], align 4 +; CHECK-NEXT: ret void +; + call void @llvm.matrix.columnwise.store(<6 x float> %in, float* %out, i32 %stride, i32 3, i32 2) + ret void +} + + +declare void @llvm.matrix.columnwise.store(<6 x float>, float*, i32, i32, i32) + +define void @strided_store_2x3(<10 x float> %in, float* %out) { +; CHECK-LABEL: @strided_store_2x3( +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <10 x float> [[IN:%.*]], <10 x float> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <10 x float> [[IN]], <10 x float> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <10 x float> [[IN]], <10 x float> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <10 x float> [[IN]], <10 x float> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT4:%.*]] = shufflevector <10 x float> [[IN]], <10 x float> undef, <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[OUT:%.*]] to <2 x float>* +; CHECK-NEXT: store <2 x float> [[SPLIT]], <2 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, float* [[OUT]], i32 4 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <2 x float>* +; CHECK-NEXT: store <2 x float> [[SPLIT1]], <2 x float>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, float* [[OUT]], i32 8 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <2 x float>* +; CHECK-NEXT: store <2 x float> [[SPLIT2]], <2 x float>* [[TMP5]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr float, float* [[OUT]], i32 12 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <2 x float>* +; CHECK-NEXT: store <2 x float> [[SPLIT3]], <2 x float>* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr float, float* [[OUT]], i32 16 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <2 x float>* +; CHECK-NEXT: store <2 x float> [[SPLIT4]], <2 x float>* [[TMP9]], align 4 +; CHECK-NEXT: ret void +; + call void @llvm.matrix.columnwise.store.v10f32(<10 x float> %in, float* %out, i32 4, i32 2, i32 5) + ret void +} + +declare void @llvm.matrix.columnwise.store.v10f32(<10 x float>, float*, i32, i32, i32) diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-i32.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-i32.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-i32.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -lower-matrix-intrinsics -S < %s | FileCheck %s +; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s + +define void @strided_store_3x2(<6 x i32> %in, i32* %out) { +; CHECK-LABEL: @strided_store_3x2( +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <6 x i32> [[IN:%.*]], <6 x i32> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <6 x i32> [[IN]], <6 x i32> undef, <3 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[OUT:%.*]] to <3 x i32>* +; CHECK-NEXT: store <3 x i32> [[SPLIT]], <3 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, i32* [[OUT]], i32 5 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <3 x i32>* +; CHECK-NEXT: store <3 x i32> [[SPLIT1]], <3 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: ret void +; + call void @llvm.matrix.columnwise.store(<6 x i32> %in, i32* %out, i32 5, i32 3, i32 2) + ret void +} + +define void @strided_store_3x2_nonconst_stride(<6 x i32> %in, i32 %stride, i32* %out) { +; CHECK-LABEL: @strided_store_3x2_nonconst_stride( +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <6 x i32> [[IN:%.*]], <6 x i32> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <6 x i32> [[IN]], <6 x i32> undef, <3 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 0, [[STRIDE:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, i32* [[OUT:%.*]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <3 x i32>* +; CHECK-NEXT: store <3 x i32> [[SPLIT]], <3 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = mul i32 1, [[STRIDE]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[OUT]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <3 x i32>* +; CHECK-NEXT: store <3 x i32> [[SPLIT1]], <3 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: ret void +; + call void @llvm.matrix.columnwise.store(<6 x i32> %in, i32* %out, i32 %stride, i32 3, i32 2) + ret void +} + + +declare void @llvm.matrix.columnwise.store(<6 x i32>, i32*, i32, i32, i32) + +define void @strided_store_2x3(<10 x i32> %in, i32* %out) { +; CHECK-LABEL: @strided_store_2x3( +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <10 x i32> [[IN:%.*]], <10 x i32> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <10 x i32> [[IN]], <10 x i32> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <10 x i32> [[IN]], <10 x i32> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <10 x i32> [[IN]], <10 x i32> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT4:%.*]] = shufflevector <10 x i32> [[IN]], <10 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[OUT:%.*]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> [[SPLIT]], <2 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, i32* [[OUT]], i32 4 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> [[SPLIT1]], <2 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[OUT]], i32 8 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> [[SPLIT2]], <2 x i32>* [[TMP5]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[OUT]], i32 12 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> [[SPLIT3]], <2 x i32>* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[OUT]], i32 16 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> [[SPLIT4]], <2 x i32>* [[TMP9]], align 4 +; CHECK-NEXT: ret void +; + call void @llvm.matrix.columnwise.store.v10i32(<10 x i32> %in, i32* %out, i32 4, i32 2, i32 5) + ret void +} + +declare void @llvm.matrix.columnwise.store.v10i32(<10 x i32>, i32*, i32, i32, i32) diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/transpose-double.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/transpose-double.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/transpose-double.ll @@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -lower-matrix-intrinsics -S < %s | FileCheck %s +; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s + + +define <8 x double> @transpose(<8 x double> %a) { +; CHECK-LABEL: @transpose( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <8 x double> [[A:%.*]], <8 x double> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <8 x double> [[A]], <8 x double> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <8 x double> [[A]], <8 x double> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <8 x double> [[A]], <8 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x double> [[SPLIT]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> undef, double [[TMP0]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[SPLIT1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP1]], double [[TMP2]], i64 1 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> [[TMP3]], double [[TMP4]], i64 2 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x double> [[TMP5]], double [[TMP6]], i64 3 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[SPLIT]], i64 1 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x double> undef, double [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[SPLIT1]], i64 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x double> [[TMP9]], double [[TMP10]], i64 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x double> [[TMP11]], double [[TMP12]], i64 2 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x double> [[TMP13]], double [[TMP14]], i64 3 +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP15]], <8 x i32> +; CHECK-NEXT: ret <8 x double> [[TMP16]] +; +entry: + %c = call <8 x double> @llvm.matrix.transpose(<8 x double> %a, i32 2, i32 4) + ret <8 x double> %c +} + +declare <8 x double> @llvm.matrix.transpose(<8 x double>, i32, i32) + +define <8 x double> @transpose_single_column(<8 x double> %a) { +; CHECK-LABEL: @transpose_single_column( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <8 x double> [[A:%.*]], <8 x double> undef, <8 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x double> [[SPLIT]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x double> undef, double [[TMP0]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x double> [[SPLIT]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <1 x double> undef, double [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x double> [[SPLIT]], i64 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <1 x double> undef, double [[TMP4]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x double> [[SPLIT]], i64 3 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <1 x double> undef, double [[TMP6]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x double> [[SPLIT]], i64 4 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <1 x double> undef, double [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x double> [[SPLIT]], i64 5 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <1 x double> undef, double [[TMP10]], i64 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x double> [[SPLIT]], i64 6 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <1 x double> undef, double [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x double> [[SPLIT]], i64 7 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <1 x double> undef, double [[TMP14]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP3]], <2 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <1 x double> [[TMP5]], <1 x double> [[TMP7]], <2 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <1 x double> [[TMP9]], <1 x double> [[TMP11]], <2 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <1 x double> [[TMP13]], <1 x double> [[TMP15]], <2 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x double> [[TMP16]], <2 x double> [[TMP17]], <4 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <2 x double> [[TMP18]], <2 x double> [[TMP19]], <4 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x double> [[TMP20]], <4 x double> [[TMP21]], <8 x i32> +; CHECK-NEXT: ret <8 x double> [[TMP22]] +; +entry: + %c = call <8 x double> @llvm.matrix.transpose(<8 x double> %a, i32 8, i32 1) + ret <8 x double> %c +} + +declare <12 x double> @llvm.matrix.transpose.v12f64(<12 x double>, i32, i32) + +define <12 x double> @transpose_double_3x4(<12 x double> %a) { +; CHECK-LABEL: @transpose_double_3x4( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <12 x double> [[A:%.*]], <12 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <12 x double> [[A]], <12 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <12 x double> [[A]], <12 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <12 x double> [[A]], <12 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x double> [[SPLIT]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> undef, double [[TMP0]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP1]], double [[TMP2]], i64 1 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> [[TMP3]], double [[TMP4]], i64 2 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x double> [[TMP5]], double [[TMP6]], i64 3 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <3 x double> [[SPLIT]], i64 1 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x double> undef, double [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x double> [[TMP9]], double [[TMP10]], i64 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x double> [[TMP11]], double [[TMP12]], i64 2 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x double> [[TMP13]], double [[TMP14]], i64 3 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <3 x double> [[SPLIT]], i64 2 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x double> undef, double [[TMP16]], i64 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 2 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x double> [[TMP17]], double [[TMP18]], i64 1 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 2 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x double> [[TMP19]], double [[TMP20]], i64 2 +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 2 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x double> [[TMP21]], double [[TMP22]], i64 3 +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP15]], <8 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x double> [[TMP23]], <4 x double> undef, <8 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <8 x double> [[TMP24]], <8 x double> [[TMP25]], <12 x i32> +; CHECK-NEXT: ret <12 x double> [[TMP26]] +; +entry: + %c = call <12 x double> @llvm.matrix.transpose.v12f64(<12 x double> %a, i32 3, i32 4) + ret <12 x double> %c +} diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/transpose-float.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/transpose-float.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/transpose-float.ll @@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -lower-matrix-intrinsics -S < %s | FileCheck %s +; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s + + +define <8 x float> @transpose(<8 x float> %a) { +; CHECK-LABEL: @transpose( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x float> [[SPLIT]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[TMP0]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[SPLIT1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP1]], float [[TMP2]], i64 1 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP3]], float [[TMP4]], i64 2 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP5]], float [[TMP6]], i64 3 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[SPLIT]], i64 1 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> undef, float [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[SPLIT1]], i64 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP10]], i64 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x float> [[TMP11]], float [[TMP12]], i64 2 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP14]], i64 3 +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP15]], <8 x i32> +; CHECK-NEXT: ret <8 x float> [[TMP16]] +; +entry: + %c = call <8 x float> @llvm.matrix.transpose(<8 x float> %a, i32 2, i32 4) + ret <8 x float> %c +} + +declare <8 x float> @llvm.matrix.transpose(<8 x float>, i32, i32) + +define <8 x float> @transpose_single_column(<8 x float> %a) { +; CHECK-LABEL: @transpose_single_column( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <8 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x float> [[SPLIT]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x float> undef, float [[TMP0]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[SPLIT]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <1 x float> undef, float [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[SPLIT]], i64 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <1 x float> undef, float [[TMP4]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x float> [[SPLIT]], i64 3 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <1 x float> undef, float [[TMP6]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x float> [[SPLIT]], i64 4 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <1 x float> undef, float [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x float> [[SPLIT]], i64 5 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <1 x float> undef, float [[TMP10]], i64 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x float> [[SPLIT]], i64 6 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <1 x float> undef, float [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x float> [[SPLIT]], i64 7 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <1 x float> undef, float [[TMP14]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <1 x float> [[TMP1]], <1 x float> [[TMP3]], <2 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <1 x float> [[TMP5]], <1 x float> [[TMP7]], <2 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <1 x float> [[TMP9]], <1 x float> [[TMP11]], <2 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <1 x float> [[TMP13]], <1 x float> [[TMP15]], <2 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> [[TMP17]], <4 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <2 x float> [[TMP18]], <2 x float> [[TMP19]], <4 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> [[TMP21]], <8 x i32> +; CHECK-NEXT: ret <8 x float> [[TMP22]] +; +entry: + %c = call <8 x float> @llvm.matrix.transpose(<8 x float> %a, i32 8, i32 1) + ret <8 x float> %c +} + +declare <12 x float> @llvm.matrix.transpose.v12f32(<12 x float>, i32, i32) + +define <12 x float> @transpose_float_3x4(<12 x float> %a) { +; CHECK-LABEL: @transpose_float_3x4( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <12 x float> [[A:%.*]], <12 x float> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <12 x float> [[A]], <12 x float> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <12 x float> [[A]], <12 x float> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <12 x float> [[A]], <12 x float> undef, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x float> [[SPLIT]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[TMP0]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x float> [[SPLIT1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP1]], float [[TMP2]], i64 1 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <3 x float> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP3]], float [[TMP4]], i64 2 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <3 x float> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP5]], float [[TMP6]], i64 3 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <3 x float> [[SPLIT]], i64 1 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> undef, float [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <3 x float> [[SPLIT1]], i64 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP10]], i64 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <3 x float> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x float> [[TMP11]], float [[TMP12]], i64 2 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <3 x float> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP14]], i64 3 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <3 x float> [[SPLIT]], i64 2 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x float> undef, float [[TMP16]], i64 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <3 x float> [[SPLIT1]], i64 2 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[TMP17]], float [[TMP18]], i64 1 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <3 x float> [[SPLIT2]], i64 2 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP20]], i64 2 +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <3 x float> [[SPLIT3]], i64 2 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP22]], i64 3 +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP15]], <8 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x float> [[TMP23]], <4 x float> undef, <8 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <8 x float> [[TMP24]], <8 x float> [[TMP25]], <12 x i32> +; CHECK-NEXT: ret <12 x float> [[TMP26]] +; +entry: + %c = call <12 x float> @llvm.matrix.transpose.v12f32(<12 x float> %a, i32 3, i32 4) + ret <12 x float> %c +} diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/transpose-i32.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/transpose-i32.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/transpose-i32.ll @@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -lower-matrix-intrinsics -S < %s | FileCheck %s +; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s + + +define <8 x i32> @transpose(<8 x i32> %a) { +; CHECK-LABEL: @transpose( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i32> [[SPLIT]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[TMP0]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SPLIT1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[TMP2]], i64 1 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP4]], i64 2 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP6]], i64 3 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[SPLIT]], i64 1 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> undef, i32 [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i32> [[SPLIT1]], i64 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP10]], i64 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP12]], i64 2 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP14]], i64 3 +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> [[TMP15]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP16]] +; +entry: + %c = call <8 x i32> @llvm.matrix.transpose(<8 x i32> %a, i32 2, i32 4) + ret <8 x i32> %c +} + +declare <8 x i32> @llvm.matrix.transpose(<8 x i32>, i32, i32) + +define <8 x i32> @transpose_single_column(<8 x i32> %a) { +; CHECK-LABEL: @transpose_single_column( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <8 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i32> [[SPLIT]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x i32> undef, i32 [[TMP0]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[SPLIT]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <1 x i32> undef, i32 [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[SPLIT]], i64 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <1 x i32> undef, i32 [[TMP4]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[SPLIT]], i64 3 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <1 x i32> undef, i32 [[TMP6]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[SPLIT]], i64 4 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <1 x i32> undef, i32 [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[SPLIT]], i64 5 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <1 x i32> undef, i32 [[TMP10]], i64 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i32> [[SPLIT]], i64 6 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <1 x i32> undef, i32 [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[SPLIT]], i64 7 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <1 x i32> undef, i32 [[TMP14]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <1 x i32> [[TMP1]], <1 x i32> [[TMP3]], <2 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <1 x i32> [[TMP5]], <1 x i32> [[TMP7]], <2 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <1 x i32> [[TMP9]], <1 x i32> [[TMP11]], <2 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <1 x i32> [[TMP13]], <1 x i32> [[TMP15]], <2 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x i32> [[TMP16]], <2 x i32> [[TMP17]], <4 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <2 x i32> [[TMP18]], <2 x i32> [[TMP19]], <4 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i32> [[TMP20]], <4 x i32> [[TMP21]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP22]] +; +entry: + %c = call <8 x i32> @llvm.matrix.transpose(<8 x i32> %a, i32 8, i32 1) + ret <8 x i32> %c +} + +declare <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32>, i32, i32) + +define <12 x i32> @transpose_i32_3x4(<12 x i32> %a) { +; CHECK-LABEL: @transpose_i32_3x4( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <12 x i32> [[A:%.*]], <12 x i32> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <12 x i32> [[A]], <12 x i32> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <12 x i32> [[A]], <12 x i32> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <12 x i32> [[A]], <12 x i32> undef, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x i32> [[SPLIT]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[TMP0]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i32> [[SPLIT1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[TMP2]], i64 1 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <3 x i32> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP4]], i64 2 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <3 x i32> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP6]], i64 3 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <3 x i32> [[SPLIT]], i64 1 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> undef, i32 [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <3 x i32> [[SPLIT1]], i64 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP10]], i64 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <3 x i32> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP12]], i64 2 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <3 x i32> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP14]], i64 3 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <3 x i32> [[SPLIT]], i64 2 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> undef, i32 [[TMP16]], i64 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <3 x i32> [[SPLIT1]], i64 2 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP18]], i64 1 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <3 x i32> [[SPLIT2]], i64 2 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP20]], i64 2 +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <3 x i32> [[SPLIT3]], i64 2 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x i32> [[TMP21]], i32 [[TMP22]], i64 3 +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> [[TMP15]], <8 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> undef, <8 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <8 x i32> [[TMP24]], <8 x i32> [[TMP25]], <12 x i32> +; CHECK-NEXT: ret <12 x i32> [[TMP26]] +; +entry: + %c = call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> %a, i32 3, i32 4) + ret <12 x i32> %c +}