diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -15195,6 +15195,7 @@
 ::

       declare vectorty @llvm.matrix.multiply.*(vectorty %A, vectorty %B, i32 <M>, i32 <N>, i32 <K>)
+      declare vectorty @llvm.matrix.multiply.*(vectorty %A, vectorty %B, i32 <M>, i32 <N>, i32 <K>, i1 <HasNUW>, i1 <HasNSW>)

 Overview:
 """""""""
@@ -15210,6 +15211,10 @@
 must have <M> * <N> elements, %B must have <N> * <K> elements and the returned
 vector must have <M> * <K> elements.

+Optionally, two additional i1 arguments can be provided when operating on
+integer matrices. They indicate whether the NUW and NSW flags, respectively,
+should be added to the generated instructions.
+
 '``llvm.matrix.columnwise.load.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1384,7 +1384,8 @@
                 llvm_anyvector_ty,
                 llvm_i32_ty,
                 llvm_i32_ty,
-                llvm_i32_ty],
+                llvm_i32_ty,
+                llvm_vararg_ty],
                [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<2>, ImmArg<3>,
                 ImmArg<4>]>;
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -4941,6 +4941,27 @@
     NumRows = cast<ConstantInt>(Call.getArgOperand(2));
     NumColumns = cast<ConstantInt>(Call.getArgOperand(4));
     TypeToCheck = cast<VectorType>(Call.getType());
+
+    Assert(Call.getNumArgOperands() == 5 || Call.getNumArgOperands() == 7,
+           "llvm.matrix.multiply takes either 5 or 7 arguments");
+    // Verify variadic arguments (HasNUW, HasNSW)
+    if (Call.getNumArgOperands() == 7) {
+      Assert(TypeToCheck->getElementType()->isIntegerTy(),
+             "llvm.matrix.multiply with 7 arguments must operate on integer "
+             "matrices");
+      Value *Arg5 = Call.getArgOperand(5);
+      Type *Arg5Ty = Arg5->getType();
+      Assert(Arg5Ty->isIntegerTy() && Arg5Ty->getIntegerBitWidth() == 1 &&
+                 isa<ConstantInt>(Arg5),
+             "sixth argument of llvm.matrix.multiply must be an i1 integer "
+             "constant");
+      Value *Arg6 = Call.getArgOperand(6);
+      Type *Arg6Ty = Arg6->getType();
+      Assert(Arg6Ty->isIntegerTy() && Arg6Ty->getIntegerBitWidth() == 1 &&
+                 isa<ConstantInt>(Arg6),
+             "seventh argument of llvm.matrix.multiply must be an i1 integer "
+             "constant");
+    }
     break;
   case Intrinsic::matrix_transpose:
     NumRows = cast<ConstantInt>(Call.getArgOperand(1));
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -185,6 +185,27 @@
   LoopInfo &LI;
   OptimizationRemarkEmitter &ORE;

+  /// Contains a set of options for generating code for llvm.matrix.multiply.
+  struct MultiplyOptions {
+    /// Is floating-point contraction allowed?
+    bool AllowContract;
+    /// Should the NUW flag be added?
+    bool HasNUW;
+    /// Should the NSW flag be added?
+    bool HasNSW;
+
+    MultiplyOptions(CallInst *MatMul) {
+      AllowContract = AllowContractEnabled || (isa<FPMathOperator>(MatMul) &&
+                                               MatMul->hasAllowContract());
+      HasNUW = MatMul->getNumArgOperands() == 7
+                   ? cast<ConstantInt>(MatMul->getArgOperand(5))->getZExtValue()
+                   : false;
+      HasNSW = MatMul->getNumArgOperands() == 7
+                   ? cast<ConstantInt>(MatMul->getArgOperand(6))->getZExtValue()
+                   : false;
+    }
+  };
+
   /// Contains estimates of the number of operations (loads, stores, compute)
   /// required to lower a matrix operation.
   struct OpInfoTy {
     /// Number of stores emitted to generate this matrix.
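(To make the LangRef addition concrete: a 2x2 i32 multiply whose caller can rule
out both unsigned and signed wrap would be written as follows, in the same form
the new multiply-i32-nuw-nsw.ll test below uses:

  %c = call <4 x i32> (<4 x i32>, <4 x i32>, i32, i32, i32, ...) @llvm.matrix.multiply.v4i32.v4i32.v4i32(<4 x i32> %a, <4 x i32> %b, i32 2, i32 2, i32 2, i1 true, i1 true)

The lowering pass reads the two i1 immediates via MultiplyOptions above and tags
every generated mul/add with nuw/nsw accordingly.)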
@@ -932,14 +953,15 @@
 }

 Value *createMulAdd(Value *Sum, Value *A, Value *B, bool UseFPOp,
-                    IRBuilder<> &Builder, bool AllowContraction,
+                    IRBuilder<> &Builder, MultiplyOptions Opts,
                     unsigned &NumComputeOps) {
   NumComputeOps += getNumOps(A->getType());
   if (!Sum)
-    return UseFPOp ? Builder.CreateFMul(A, B) : Builder.CreateMul(A, B);
+    return UseFPOp ? Builder.CreateFMul(A, B)
+                   : Builder.CreateMul(A, B, "", Opts.HasNUW, Opts.HasNSW);

   if (UseFPOp) {
-    if (AllowContraction) {
+    if (Opts.AllowContract) {
       // Use fmuladd for floating point operations and let the backend decide
       // if that's profitable.
       Function *FMulAdd = Intrinsic::getDeclaration(
@@ -952,8 +974,8 @@
   }

   NumComputeOps += getNumOps(A->getType());
-  Value *Mul = Builder.CreateMul(A, B);
-  return Builder.CreateAdd(Sum, Mul);
+  Value *Mul = Builder.CreateMul(A, B, "", Opts.HasNUW, Opts.HasNSW);
+  return Builder.CreateAdd(Sum, Mul, "", Opts.HasNUW, Opts.HasNSW);
 }

 /// Cache \p Matrix as result of \p Inst and update the uses of \p Inst. For
@@ -980,7 +1002,7 @@
   /// Compute \p Result += \p A * \p B for input matrices with left-associating
   /// addition.
   void emitMatrixMultiply(MatrixTy &Result, const MatrixTy &A,
-                          const MatrixTy &B, bool AllowContraction,
+                          const MatrixTy &B, MultiplyOptions Opts,
                           IRBuilder<> &Builder, bool isTiled) {
     const unsigned VF = std::max<unsigned>(
         TTI.getRegisterBitWidth(true) /
@@ -1017,7 +1039,7 @@
           Value *Splat = Builder.CreateVectorSplat(BlockSize, RH, "splat");
           Sum = createMulAdd(isSumZero && K == 0 ? nullptr : Sum, L, Splat,
                              Result.getElementType()->isFloatingPointTy(),
-                             Builder, AllowContraction, NumComputeOps);
+                             Builder, Opts, NumComputeOps);
         }
         Result.setVector(J,
                          insertVector(Result.getVector(J), I, Sum, Builder));
@@ -1041,7 +1063,7 @@
           Value *LH = Builder.CreateExtractElement(A.getVector(I), K);
           Value *Splat = Builder.CreateVectorSplat(BlockSize, LH, "splat");
           Sum = createMulAdd(isSumZero && K == 0 ? nullptr : Sum, Splat, R,
-                             IsFP, Builder, AllowContraction, NumComputeOps);
+                             IsFP, Builder, Opts, NumComputeOps);
         }
         Result.setVector(I,
                          insertVector(Result.getVector(I), J, Sum, Builder));
@@ -1196,8 +1218,6 @@
     Value *BPtr = getNonAliasingPointer(LoadOp1, Store, MatMul);
     Value *CPtr = Store->getPointerOperand();

-    bool AllowContract = AllowContractEnabled || (isa<FPMathOperator>(MatMul) &&
-                                                  MatMul->hasAllowContract());
     IRBuilder<> Builder(Store);
     for (unsigned J = 0; J < C; J += TileSize)
       for (unsigned I = 0; I < R; I += TileSize) {
@@ -1213,7 +1233,7 @@
           MatrixTy B = loadMatrix(BPtr, RShape, Builder.getInt32(K),
                                   Builder.getInt32(J), {TileM, TileC}, EltType,
                                   Builder);
-          emitMatrixMultiply(Res, A, B, AllowContract, Builder, true);
+          emitMatrixMultiply(Res, A, B, MultiplyOptions(MatMul), Builder, true);
         }
       storeMatrix(Res, CPtr, {R, M}, Builder.getInt32(I), Builder.getInt32(J),
                   EltType, Builder);
@@ -1280,9 +1300,8 @@
     // Initialize the output
     MatrixTy Result(R, C, EltType);

-    bool AllowContract = AllowContractEnabled || (isa<FPMathOperator>(MatMul) &&
-                                                  MatMul->hasAllowContract());
-    emitMatrixMultiply(Result, Lhs, Rhs, AllowContract, Builder, false);
+    emitMatrixMultiply(Result, Lhs, Rhs, MultiplyOptions(MatMul), Builder,
+                       false);
     finalizeLowering(MatMul, Result, Builder);
   }
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/bigger-expressions-double.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/bigger-expressions-double.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/bigger-expressions-double.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/bigger-expressions-double.ll
@@ -240,13 +240,13 @@
   %a = load <9 x double>, <9 x double>* %A.Ptr
   %b = load <9 x double>, <9 x double>* %B.Ptr
   %a.trans = call <9 x double> @llvm.matrix.transpose(<9 x double> %a, i32 3, i32 3)
-  %c = call <9 x double> @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double> %a.trans, <9 x double> %b, i32 3, i32 3, i32 3)
+  %c = call <9 x double> (<9 x double>, <9 x double>, i32, i32, i32, ...) @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double> %a.trans, <9 x double> %b, i32 3, i32 3, i32 3)
   store <9 x double> %c, <9 x double>* %C.Ptr
   ret void
 }

 declare <9 x double> @llvm.matrix.transpose(<9 x double>, i32, i32)
-declare <9 x double> @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double>, <9 x double>, i32, i32, i32)
+declare <9 x double> @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double>, <9 x double>, i32, i32, i32, ...)

 define void @transpose_multiply_add(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, <9 x double>* %C.Ptr) {
 ; CHECK-LABEL: @transpose_multiply_add(
@@ -504,7 +504,7 @@
   %a = load <9 x double>, <9 x double>* %A.Ptr
   %b = load <9 x double>, <9 x double>* %B.Ptr
   %a.trans = call <9 x double> @llvm.matrix.transpose(<9 x double> %a, i32 3, i32 3)
-  %mult = call <9 x double> @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double> %a.trans, <9 x double> %b, i32 3, i32 3, i32 3)
+  %mult = call <9 x double> (<9 x double>, <9 x double>, i32, i32, i32, ...) @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double> %a.trans, <9 x double> %b, i32 3, i32 3, i32 3)
   %c = load <9 x double>, <9 x double>* %C.Ptr
   %res = fadd <9 x double> %c, %mult
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/const-gep.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/const-gep.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/const-gep.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/const-gep.ll
@@ -77,9 +77,9 @@
   store i32 %r, i32* %r.addr, align 4
   store i32 %c, i32* %c.addr, align 4
   %0 = load <4 x double>, <4 x double>* getelementptr inbounds ([5 x <4 x double>], [5 x <4 x double>]* @foo, i64 0, i64 0), align 16
-  %mul = call <4 x double> @llvm.matrix.multiply(<4 x double> %0, <4 x double> %0, i32 2, i32 2, i32 2)
+  %mul = call <4 x double> (<4 x double>, <4 x double>, i32, i32, i32, ...) @llvm.matrix.multiply(<4 x double> %0, <4 x double> %0, i32 2, i32 2, i32 2)
   store <4 x double> %0, <4 x double>* getelementptr inbounds ([5 x <4 x double>], [5 x <4 x double>]* @foo, i64 0, i64 2), align 16
   ret void
 }

-declare <4 x double> @llvm.matrix.multiply(<4 x double>, <4 x double>, i32, i32, i32)
+declare <4 x double> @llvm.matrix.multiply(<4 x double>, <4 x double>, i32, i32, i32, ...)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-add-sub-double-row-major.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-add-sub-double-row-major.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-add-sub-double-row-major.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-add-sub-double-row-major.ll
@@ -142,11 +142,11 @@
   store <6 x double> %add, <6 x double>* %a.ptr
   %sub = fsub <6 x double> %b,
   store <6 x double> %sub, <6 x double>* %b.ptr
-  %mul = call <4 x double> @llvm.matrix.multiply.v4f64.v6f64.v6f64(<6 x double> %add, <6 x double> %sub, i32 2, i32 3, i32 2)
+  %mul = call <4 x double> (<6 x double>, <6 x double>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f64.v6f64.v6f64(<6 x double> %add, <6 x double> %sub, i32 2, i32 3, i32 2)
   %c = load <4 x double>, <4 x double>* %c.ptr
   %res = fsub <4 x double> %c, %mul
   store <4 x double> %res, <4 x double>* %c.ptr
   ret void
 }

-declare <4 x double> @llvm.matrix.multiply.v4f64.v6f64.v6f64(<6 x double>, <6 x double>, i32, i32, i32)
+declare <4 x double> @llvm.matrix.multiply.v4f64.v6f64.v6f64(<6 x double>, <6 x double>, i32, i32, i32, ...)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-contraction-fmf.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-contraction-fmf.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-contraction-fmf.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-contraction-fmf.ll
@@ -62,8 +62,8 @@
 ; CHECK-NEXT:    ret <4 x double> [[TMP24]]
 ;
 entry:
-  %c = call contract <4 x double> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
+  %c = call contract <4 x double> (<4 x double>, <4 x double>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
   ret <4 x double> %c
 }

-declare <4 x double> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double>, <4 x double>, i32, i32, i32)
+declare <4 x double> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double>, <4 x double>, i32, i32, i32, ...)
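(The test churn in this patch is mechanical: once the intrinsic is variadic,
textual IR requires the explicit function type at every call site, even when no
extra arguments are passed. Before/after, taken from the updates here:

  %c = call <4 x double> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)

becomes

  %c = call <4 x double> (<4 x double>, <4 x double>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)

with each declare gaining a trailing ", ...".)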
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-contraction.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-contraction.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-contraction.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-contraction.ll
@@ -62,8 +62,8 @@
 ; CHECK-NEXT:    ret <4 x double> [[TMP24]]
 ;
 entry:
-  %c = call <4 x double> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
+  %c = call <4 x double> (<4 x double>, <4 x double>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
   ret <4 x double> %c
 }

-declare <4 x double> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double>, <4 x double>, i32, i32, i32)
+declare <4 x double> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double>, <4 x double>, i32, i32, i32, ...)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-row-major.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-row-major.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-row-major.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-row-major.ll
@@ -67,11 +67,11 @@
 ; RM-NEXT:    ret <4 x double> [[TMP28]]
 ;
 entry:
-  %c = call <4 x double> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
+  %c = call <4 x double> (<4 x double>, <4 x double>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
   ret <4 x double> %c
 }

-declare <4 x double> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double>, <4 x double>, i32, i32, i32)
+declare <4 x double> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double>, <4 x double>, i32, i32, i32, ...)

 define <4 x double> @multiply_1x2(<2 x double> %a, <2 x double> %b) {
@@ -112,11 +112,11 @@
 ; RM-NEXT:    ret <4 x double> [[TMP16]]
 ;
 entry:
-  %c = call <4 x double> @llvm.matrix.multiply.v4f64.v2f64.v2f64(<2 x double> %a, <2 x double> %b, i32 2, i32 1, i32 2)
+  %c = call <4 x double> (<2 x double>, <2 x double>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f64.v2f64.v2f64(<2 x double> %a, <2 x double> %b, i32 2, i32 1, i32 2)
   ret <4 x double> %c
 }

-declare <4 x double> @llvm.matrix.multiply.v4f64.v2f64.v2f64(<2 x double>, <2 x double>, i32, i32, i32)
+declare <4 x double> @llvm.matrix.multiply.v4f64.v2f64.v2f64(<2 x double>, <2 x double>, i32, i32, i32, ...)

 define <9 x double> @multiply_2x3(<6 x double> %a, <6 x double> %b) {
 ; RM-LABEL: @multiply_2x3(
@@ -249,8 +249,8 @@
 ; RM-NEXT:    ret <9 x double> [[TMP65]]
 ;
 entry:
-  %c = call <9 x double> @llvm.matrix.multiply.v6f64.v6f64.v6f64(<6 x double> %a, <6 x double> %b, i32 3, i32 2, i32 3)
+  %c = call <9 x double> (<6 x double>, <6 x double>, i32, i32, i32, ...) @llvm.matrix.multiply.v6f64.v6f64.v6f64(<6 x double> %a, <6 x double> %b, i32 3, i32 2, i32 3)
   ret <9 x double> %c
 }

-declare <9 x double> @llvm.matrix.multiply.v6f64.v6f64.v6f64(<6 x double>, <6 x double>, i32, i32, i32)
+declare <9 x double> @llvm.matrix.multiply.v6f64.v6f64.v6f64(<6 x double>, <6 x double>, i32, i32, i32, ...)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double.ll
@@ -66,11 +66,11 @@
 ; CHECK-NEXT:    ret <4 x double> [[TMP28]]
 ;
 entry:
-  %c = call <4 x double> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
+  %c = call <4 x double> (<4 x double>, <4 x double>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
   ret <4 x double> %c
 }

-declare <4 x double> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double>, <4 x double>, i32, i32, i32)
+declare <4 x double> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double>, <4 x double>, i32, i32, i32, ...)

 define <4 x double> @multiply_1x2(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: @multiply_1x2(
@@ -110,11 +110,11 @@
 ; CHECK-NEXT:    ret <4 x double> [[TMP16]]
 ;
 entry:
-  %c = call <4 x double> @llvm.matrix.multiply.v4f64.v2f64.v2f64(<2 x double> %a, <2 x double> %b, i32 2, i32 1, i32 2)
+  %c = call <4 x double> (<2 x double>, <2 x double>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f64.v2f64.v2f64(<2 x double> %a, <2 x double> %b, i32 2, i32 1, i32 2)
   ret <4 x double> %c
 }

-declare <4 x double> @llvm.matrix.multiply.v4f64.v2f64.v2f64(<2 x double>, <2 x double>, i32, i32, i32)
+declare <4 x double> @llvm.matrix.multiply.v4f64.v2f64.v2f64(<2 x double>, <2 x double>, i32, i32, i32, ...)

 define <9 x double> @multiply_2x3(<6 x double> %a, <6 x double> %b) {
 ; CHECK-LABEL: @multiply_2x3(
@@ -247,8 +247,8 @@
 ; CHECK-NEXT:    ret <9 x double> [[TMP65]]
 ;
 entry:
-  %c = call <9 x double> @llvm.matrix.multiply.v6f64.v6f64.v6f64(<6 x double> %a, <6 x double> %b, i32 3, i32 2, i32 3)
+  %c = call <9 x double> (<6 x double>, <6 x double>, i32, i32, i32, ...) @llvm.matrix.multiply.v6f64.v6f64.v6f64(<6 x double> %a, <6 x double> %b, i32 3, i32 2, i32 3)
   ret <9 x double> %c
 }

-declare <9 x double> @llvm.matrix.multiply.v6f64.v6f64.v6f64(<6 x double>, <6 x double>, i32, i32, i32)
+declare <9 x double> @llvm.matrix.multiply.v6f64.v6f64.v6f64(<6 x double>, <6 x double>, i32, i32, i32, ...)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float-contraction-fmf.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float-contraction-fmf.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float-contraction-fmf.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float-contraction-fmf.ll
@@ -62,8 +62,8 @@
 ; CHECK-NEXT:    ret <4 x float> [[TMP24]]
 ;
 entry:
-  %c = call contract <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %a, <4 x float> %b, i32 2, i32 2, i32 2)
+  %c = call contract <4 x float> (<4 x float>, <4 x float>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %a, <4 x float> %b, i32 2, i32 2, i32 2)
   ret <4 x float> %c
 }

-declare <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float>, <4 x float>, i32, i32, i32)
+declare <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float>, <4 x float>, i32, i32, i32, ...)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float-contraction.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float-contraction.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float-contraction.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float-contraction.ll
@@ -62,8 +62,8 @@
 ; CHECK-NEXT:    ret <4 x float> [[TMP24]]
 ;
 entry:
-  %c = call contract <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %a, <4 x float> %b, i32 2, i32 2, i32 2)
+  %c = call contract <4 x float> (<4 x float>, <4 x float>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %a, <4 x float> %b, i32 2, i32 2, i32 2)
   ret <4 x float> %c
 }

-declare <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float>, <4 x float>, i32, i32, i32)
+declare <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float>, <4 x float>, i32, i32, i32, ...)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float.ll
@@ -66,11 +66,11 @@
 ; CHECK-NEXT:    ret <4 x float> [[TMP28]]
 ;
 entry:
-  %c = call <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %a, <4 x float> %b, i32 2, i32 2, i32 2)
+  %c = call <4 x float> (<4 x float>, <4 x float>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %a, <4 x float> %b, i32 2, i32 2, i32 2)
   ret <4 x float> %c
 }

-declare <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float>, <4 x float>, i32, i32, i32)
+declare <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float>, <4 x float>, i32, i32, i32, ...)

 define <4 x float> @multiply_1x2(<2 x float> %a, <2 x float> %b) {
 ; CHECK-LABEL: @multiply_1x2(
@@ -110,11 +110,11 @@
 ; CHECK-NEXT:    ret <4 x float> [[TMP16]]
 ;
 entry:
-  %c = call <4 x float> @llvm.matrix.multiply.v4f32.v2f32.v2f32(<2 x float> %a, <2 x float> %b, i32 2, i32 1, i32 2)
+  %c = call <4 x float> (<2 x float>, <2 x float>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f32.v2f32.v2f32(<2 x float> %a, <2 x float> %b, i32 2, i32 1, i32 2)
   ret <4 x float> %c
 }

-declare <4 x float> @llvm.matrix.multiply.v4f32.v2f32.v2f32(<2 x float>, <2 x float>, i32, i32, i32)
+declare <4 x float> @llvm.matrix.multiply.v4f32.v2f32.v2f32(<2 x float>, <2 x float>, i32, i32, i32, ...)

 define <9 x float> @multiply_2x3(<6 x float> %a, <6 x float> %b) {
 ; CHECK-LABEL: @multiply_2x3(
@@ -247,8 +247,8 @@
 ; CHECK-NEXT:    ret <9 x float> [[TMP65]]
 ;
 entry:
-  %c = call <9 x float> @llvm.matrix.multiply.v6f32.v6f32.v6f32(<6 x float> %a, <6 x float> %b, i32 3, i32 2, i32 3)
+  %c = call <9 x float> (<6 x float>, <6 x float>, i32, i32, i32, ...) @llvm.matrix.multiply.v6f32.v6f32.v6f32(<6 x float> %a, <6 x float> %b, i32 3, i32 2, i32 3)
   ret <9 x float> %c
 }

-declare <9 x float> @llvm.matrix.multiply.v6f32.v6f32.v6f32(<6 x float>, <6 x float>, i32, i32, i32)
+declare <9 x float> @llvm.matrix.multiply.v6f32.v6f32.v6f32(<6 x float>, <6 x float>, i32, i32, i32, ...)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-multiple-blocks.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-multiple-blocks.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-multiple-blocks.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-multiple-blocks.ll
@@ -277,7 +277,7 @@
 entry:
   %a = load <6 x double>, <6 x double>* %A, align 16
   %b = load <6 x double>, <6 x double>* %B, align 16
-  %c = call <9 x double> @llvm.matrix.multiply(<6 x double> %a, <6 x double> %b, i32 3, i32 2, i32 3)
+  %c = call <9 x double> (<6 x double>, <6 x double>, i32, i32, i32, ...) @llvm.matrix.multiply(<6 x double> %a, <6 x double> %b, i32 3, i32 2, i32 3)
   store <9 x double> %c, <9 x double>* %C, align 16
   br i1 %cond, label %true, label %false
@@ -295,9 +295,9 @@
 end:
   %a.2 = load <6 x double>, <6 x double>* %A, align 16
   %b.2 = load <6 x double>, <6 x double>* %B, align 16
-  %c.2 = call <9 x double> @llvm.matrix.multiply(<6 x double> %a.2, <6 x double> %b.2, i32 3, i32 2, i32 3)
+  %c.2 = call <9 x double> (<6 x double>, <6 x double>, i32, i32, i32, ...) @llvm.matrix.multiply(<6 x double> %a.2, <6 x double> %b.2, i32 3, i32 2, i32 3)
   store <9 x double> %c.2, <9 x double>* %C, align 16
   ret void
 }

-declare <9 x double> @llvm.matrix.multiply(<6 x double>, <6 x double>, i32, i32, i32)
+declare <9 x double> @llvm.matrix.multiply(<6 x double>, <6 x double>, i32, i32, i32, ...)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll
@@ -264,10 +264,10 @@

   %a = load <16 x double>, <16 x double>* %A, align 16
   %b = load <16 x double>, <16 x double>* %B, align 16
-  %c = call <16 x double> @llvm.matrix.multiply(<16 x double> %a, <16 x double> %b, i32 4, i32 4, i32 4)
+  %c = call <16 x double> (<16 x double>, <16 x double>, i32, i32, i32, ...) @llvm.matrix.multiply(<16 x double> %a, <16 x double> %b, i32 4, i32 4, i32 4)
   store <16 x double> %c, <16 x double>* %C, align 16
   ret void
 }

-declare <16 x double> @llvm.matrix.multiply(<16 x double>, <16 x double>, i32, i32, i32)
+declare <16 x double> @llvm.matrix.multiply(<16 x double>, <16 x double>, i32, i32, i32, ...)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32-nuw-nsw.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32-nuw-nsw.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32-nuw-nsw.ll
@@ -0,0 +1,207 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -lower-matrix-intrinsics -S < %s | FileCheck %s
+; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s
+
+declare <4 x i32> @llvm.matrix.multiply.v4i32.v4i32.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, ...)
+
+define <4 x i32> @multiply_2x2_nuw(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @multiply_2x2_nuw(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SPLIT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[SPLIT1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[SPLIT2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[SPLIT3:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x i32> undef, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw <1 x i32> [[BLOCK]], [[SPLAT_SPLAT]]
+; CHECK-NEXT:    [[BLOCK4:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT5:%.*]] = insertelement <1 x i32> undef, i32 [[TMP2]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT6:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT5]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw <1 x i32> [[BLOCK4]], [[SPLAT_SPLAT6]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add nuw <1 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <1 x i32> [[TMP4]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> undef, <2 x i32> [[TMP5]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK7:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT8:%.*]] = insertelement <1 x i32> undef, i32 [[TMP7]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT9:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT8]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = mul nuw <1 x i32> [[BLOCK7]], [[SPLAT_SPLAT9]]
+; CHECK-NEXT:    [[BLOCK10:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT11:%.*]] = insertelement <1 x i32> undef, i32 [[TMP9]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT12:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT11]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw <1 x i32> [[BLOCK10]], [[SPLAT_SPLAT12]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add nuw <1 x i32> [[TMP8]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <1 x i32> [[TMP11]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP12]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[BLOCK13:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT14:%.*]] = insertelement <1 x i32> undef, i32 [[TMP14]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT15:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT14]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = mul nuw <1 x i32> [[BLOCK13]], [[SPLAT_SPLAT15]]
+; CHECK-NEXT:    [[BLOCK16:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT17:%.*]] = insertelement <1 x i32> undef, i32 [[TMP16]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT18:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT17]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = mul nuw <1 x i32> [[BLOCK16]], [[SPLAT_SPLAT18]]
+; CHECK-NEXT:    [[TMP18:%.*]] = add nuw <1 x i32> [[TMP15]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <1 x i32> [[TMP18]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x i32> undef, <2 x i32> [[TMP19]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK19:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT20:%.*]] = insertelement <1 x i32> undef, i32 [[TMP21]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT21:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT20]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = mul nuw <1 x i32> [[BLOCK19]], [[SPLAT_SPLAT21]]
+; CHECK-NEXT:    [[BLOCK22:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x i32> undef, i32 [[TMP23]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT23]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP24:%.*]] = mul nuw <1 x i32> [[BLOCK22]], [[SPLAT_SPLAT24]]
+; CHECK-NEXT:    [[TMP25:%.*]] = add nuw <1 x i32> [[TMP22]], [[TMP24]]
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <1 x i32> [[TMP25]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <2 x i32> [[TMP20]], <2 x i32> [[TMP26]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> [[TMP27]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x i32> [[TMP28]]
+;
+entry:
+  %c = call <4 x i32> (<4 x i32>, <4 x i32>, i32, i32, i32, ...) @llvm.matrix.multiply.v4i32.v4i32.v4i32(<4 x i32> %a, <4 x i32> %b, i32 2, i32 2, i32 2, i1 true, i1 false)
+  ret <4 x i32> %c
+}
+
+
+define <4 x i32> @multiply_2x2_nsw(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @multiply_2x2_nsw(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SPLIT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[SPLIT1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[SPLIT2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[SPLIT3:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x i32> undef, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nsw <1 x i32> [[BLOCK]], [[SPLAT_SPLAT]]
+; CHECK-NEXT:    [[BLOCK4:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT5:%.*]] = insertelement <1 x i32> undef, i32 [[TMP2]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT6:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT5]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nsw <1 x i32> [[BLOCK4]], [[SPLAT_SPLAT6]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <1 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <1 x i32> [[TMP4]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> undef, <2 x i32> [[TMP5]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK7:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT8:%.*]] = insertelement <1 x i32> undef, i32 [[TMP7]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT9:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT8]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = mul nsw <1 x i32> [[BLOCK7]], [[SPLAT_SPLAT9]]
+; CHECK-NEXT:    [[BLOCK10:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT11:%.*]] = insertelement <1 x i32> undef, i32 [[TMP9]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT12:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT11]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nsw <1 x i32> [[BLOCK10]], [[SPLAT_SPLAT12]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add nsw <1 x i32> [[TMP8]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <1 x i32> [[TMP11]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP12]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[BLOCK13:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT14:%.*]] = insertelement <1 x i32> undef, i32 [[TMP14]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT15:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT14]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = mul nsw <1 x i32> [[BLOCK13]], [[SPLAT_SPLAT15]]
+; CHECK-NEXT:    [[BLOCK16:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT17:%.*]] = insertelement <1 x i32> undef, i32 [[TMP16]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT18:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT17]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = mul nsw <1 x i32> [[BLOCK16]], [[SPLAT_SPLAT18]]
+; CHECK-NEXT:    [[TMP18:%.*]] = add nsw <1 x i32> [[TMP15]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <1 x i32> [[TMP18]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x i32> undef, <2 x i32> [[TMP19]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK19:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT20:%.*]] = insertelement <1 x i32> undef, i32 [[TMP21]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT21:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT20]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = mul nsw <1 x i32> [[BLOCK19]], [[SPLAT_SPLAT21]]
+; CHECK-NEXT:    [[BLOCK22:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x i32> undef, i32 [[TMP23]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT23]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP24:%.*]] = mul nsw <1 x i32> [[BLOCK22]], [[SPLAT_SPLAT24]]
+; CHECK-NEXT:    [[TMP25:%.*]] = add nsw <1 x i32> [[TMP22]], [[TMP24]]
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <1 x i32> [[TMP25]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <2 x i32> [[TMP20]], <2 x i32> [[TMP26]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> [[TMP27]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x i32> [[TMP28]]
+;
+entry:
+  %c = call <4 x i32> (<4 x i32>, <4 x i32>, i32, i32, i32, ...) @llvm.matrix.multiply.v4i32.v4i32.v4i32(<4 x i32> %a, <4 x i32> %b, i32 2, i32 2, i32 2, i1 false, i1 true)
+  ret <4 x i32> %c
+}
+
+define <4 x i32> @multiply_2x2_nuw_nsw(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @multiply_2x2_nuw_nsw(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SPLIT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[SPLIT1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[SPLIT2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[SPLIT3:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x i32> undef, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw nsw <1 x i32> [[BLOCK]], [[SPLAT_SPLAT]]
+; CHECK-NEXT:    [[BLOCK4:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT5:%.*]] = insertelement <1 x i32> undef, i32 [[TMP2]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT6:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT5]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw nsw <1 x i32> [[BLOCK4]], [[SPLAT_SPLAT6]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add nuw nsw <1 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <1 x i32> [[TMP4]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> undef, <2 x i32> [[TMP5]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK7:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT8:%.*]] = insertelement <1 x i32> undef, i32 [[TMP7]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT9:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT8]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = mul nuw nsw <1 x i32> [[BLOCK7]], [[SPLAT_SPLAT9]]
+; CHECK-NEXT:    [[BLOCK10:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT11:%.*]] = insertelement <1 x i32> undef, i32 [[TMP9]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT12:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT11]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw nsw <1 x i32> [[BLOCK10]], [[SPLAT_SPLAT12]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add nuw nsw <1 x i32> [[TMP8]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <1 x i32> [[TMP11]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP12]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[BLOCK13:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT14:%.*]] = insertelement <1 x i32> undef, i32 [[TMP14]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT15:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT14]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = mul nuw nsw <1 x i32> [[BLOCK13]], [[SPLAT_SPLAT15]]
+; CHECK-NEXT:    [[BLOCK16:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT17:%.*]] = insertelement <1 x i32> undef, i32 [[TMP16]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT18:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT17]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = mul nuw nsw <1 x i32> [[BLOCK16]], [[SPLAT_SPLAT18]]
+; CHECK-NEXT:    [[TMP18:%.*]] = add nuw nsw <1 x i32> [[TMP15]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <1 x i32> [[TMP18]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x i32> undef, <2 x i32> [[TMP19]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK19:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT20:%.*]] = insertelement <1 x i32> undef, i32 [[TMP21]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT21:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT20]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = mul nuw nsw <1 x i32> [[BLOCK19]], [[SPLAT_SPLAT21]]
+; CHECK-NEXT:    [[BLOCK22:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x i32> undef, i32 [[TMP23]], i32 0
+; CHECK-NEXT:    [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT23]], <1 x i32> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP24:%.*]] = mul nuw nsw <1 x i32> [[BLOCK22]], [[SPLAT_SPLAT24]]
+; CHECK-NEXT:    [[TMP25:%.*]] = add nuw nsw <1 x i32> [[TMP22]], [[TMP24]]
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <1 x i32> [[TMP25]], <1 x i32> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <2 x i32> [[TMP20]], <2 x i32> [[TMP26]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> [[TMP27]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x i32> [[TMP28]]
+;
+entry:
+  %c = call <4 x i32> (<4 x i32>, <4 x i32>, i32, i32, i32, ...) @llvm.matrix.multiply.v4i32.v4i32.v4i32(<4 x i32> %a, <4 x i32> %b, i32 2, i32 2, i32 2, i1 true, i1 true)
+  ret <4 x i32> %c
+}
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32-row-major.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32-row-major.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32-row-major.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32-row-major.ll
@@ -67,11 +67,11 @@
 ; RM-NEXT:    ret <4 x i32> [[TMP28]]
 ;
 entry:
-  %c = call <4 x i32> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x i32> %a, <4 x i32> %b, i32 2, i32 2, i32 2)
+  %c = call <4 x i32> (<4 x i32>, <4 x i32>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x i32> %a, <4 x i32> %b, i32 2, i32 2, i32 2)
   ret <4 x i32> %c
 }

-declare <4 x i32> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x i32>, <4 x i32>, i32, i32, i32)
+declare <4 x i32> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x i32>, <4 x i32>, i32, i32, i32, ...)

 define <4 x i32> @multiply_1x2(<2 x i32> %a, <2 x i32> %b) {
@@ -112,11 +112,11 @@
 ; RM-NEXT:    ret <4 x i32> [[TMP16]]
 ;
 entry:
-  %c = call <4 x i32> @llvm.matrix.multiply.v4f64.v2f64.v2f64(<2 x i32> %a, <2 x i32> %b, i32 2, i32 1, i32 2)
+  %c = call <4 x i32> (<2 x i32>, <2 x i32>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f64.v2f64.v2f64(<2 x i32> %a, <2 x i32> %b, i32 2, i32 1, i32 2)
   ret <4 x i32> %c
 }

-declare <4 x i32> @llvm.matrix.multiply.v4f64.v2f64.v2f64(<2 x i32>, <2 x i32>, i32, i32, i32)
+declare <4 x i32> @llvm.matrix.multiply.v4f64.v2f64.v2f64(<2 x i32>, <2 x i32>, i32, i32, i32, ...)

 define <9 x i32> @multiply_2x3(<6 x i32> %a, <6 x i32> %b) {
 ; RM-LABEL: @multiply_2x3(
@@ -249,8 +249,8 @@
 ; RM-NEXT:    ret <9 x i32> [[TMP65]]
 ;
 entry:
-  %c = call <9 x i32> @llvm.matrix.multiply.v6f64.v6f64.v6f64(<6 x i32> %a, <6 x i32> %b, i32 3, i32 2, i32 3)
+  %c = call <9 x i32> (<6 x i32>, <6 x i32>, i32, i32, i32, ...) @llvm.matrix.multiply.v6f64.v6f64.v6f64(<6 x i32> %a, <6 x i32> %b, i32 3, i32 2, i32 3)
   ret <9 x i32> %c
 }

-declare <9 x i32> @llvm.matrix.multiply.v6f64.v6f64.v6f64(<6 x i32>, <6 x i32>, i32, i32, i32)
+declare <9 x i32> @llvm.matrix.multiply.v6f64.v6f64.v6f64(<6 x i32>, <6 x i32>, i32, i32, i32, ...)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32.ll
@@ -66,11 +66,11 @@
 ; CHECK-NEXT:    ret <4 x i32> [[TMP28]]
 ;
 entry:
-  %c = call <4 x i32> @llvm.matrix.multiply.v4i32.v4i32.v4i32(<4 x i32> %a, <4 x i32> %b, i32 2, i32 2, i32 2)
+  %c = call <4 x i32> (<4 x i32>, <4 x i32>, i32, i32, i32, ...) @llvm.matrix.multiply.v4i32.v4i32.v4i32(<4 x i32> %a, <4 x i32> %b, i32 2, i32 2, i32 2)
   ret <4 x i32> %c
 }

-declare <4 x i32> @llvm.matrix.multiply.v4i32.v4i32.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32)
+declare <4 x i32> @llvm.matrix.multiply.v4i32.v4i32.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, ...)

 define <4 x i32> @multiply_1x2(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: @multiply_1x2(
@@ -110,11 +110,11 @@
 ; CHECK-NEXT:    ret <4 x i32> [[TMP16]]
 ;
 entry:
-  %c = call <4 x i32> @llvm.matrix.multiply.v4i32.v2i32.v2i32(<2 x i32> %a, <2 x i32> %b, i32 2, i32 1, i32 2)
+  %c = call <4 x i32> (<2 x i32>, <2 x i32>, i32, i32, i32, ...) @llvm.matrix.multiply.v4i32.v2i32.v2i32(<2 x i32> %a, <2 x i32> %b, i32 2, i32 1, i32 2)
   ret <4 x i32> %c
 }

-declare <4 x i32> @llvm.matrix.multiply.v4i32.v2i32.v2i32(<2 x i32>, <2 x i32>, i32, i32, i32)
+declare <4 x i32> @llvm.matrix.multiply.v4i32.v2i32.v2i32(<2 x i32>, <2 x i32>, i32, i32, i32, ...)

 define <9 x i32> @multiply_2x3(<6 x i32> %a, <6 x i32> %b) {
 ; CHECK-LABEL: @multiply_2x3(
@@ -247,8 +247,8 @@
 ; CHECK-NEXT:    ret <9 x i32> [[TMP65]]
 ;
 entry:
-  %c = call <9 x i32> @llvm.matrix.multiply.v6i32.v6i32.v6i32(<6 x i32> %a, <6 x i32> %b, i32 3, i32 2, i32 3)
+  %c = call <9 x i32> (<6 x i32>, <6 x i32>, i32, i32, i32, ...) @llvm.matrix.multiply.v6i32.v6i32.v6i32(<6 x i32> %a, <6 x i32> %b, i32 3, i32 2, i32 3)
   ret <9 x i32> %c
 }

-declare <9 x i32> @llvm.matrix.multiply.v6i32.v6i32.v6i32(<6 x i32>, <6 x i32>, i32, i32, i32)
+declare <9 x i32> @llvm.matrix.multiply.v6i32.v6i32.v6i32(<6 x i32>, <6 x i32>, i32, i32, i32, ...)
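(For contrast with multiply-i32-nuw-nsw.ll above: these 5-argument integer calls
are unchanged in behavior and still lower to plain mul/add. Only when the
optional i1 arguments are passed does each generated block carry the flags; a
sketch of one accumulation step, with illustrative names in place of the
autogenerated TMP/BLOCK values the CHECK lines use:

  %mul = mul nuw nsw <1 x i32> %block, %splat
  %sum.next = add nuw nsw <1 x i32> %sum, %mul
)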
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backwards-unsupported.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backwards-unsupported.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backwards-unsupported.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backwards-unsupported.ll
@@ -85,7 +85,7 @@
 if.end:                                           ; preds = %if.then, %if.else
   %merge = phi <9 x double> [ %A.trans, %if.then], [ %B.trans, %if.else ]
-  %res = tail call <9 x double> @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double> %C, <9 x double> %merge, i32 3, i32 3, i32 3)
+  %res = tail call <9 x double> (<9 x double>, <9 x double>, i32, i32, i32, ...) @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double> %C, <9 x double> %merge, i32 3, i32 3, i32 3)
   ret <9 x double> %res
 }
@@ -126,10 +126,10 @@
 ;
   %A.trans = tail call <9 x double> @llvm.matrix.transpose.v9f64(<9 x double> %A, i32 3, i32 3)
   %A.foo = call <9 x double> @foo(<9 x double> %A.trans)
-  %res = tail call <9 x double> @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double> %B, <9 x double> %A.foo, i32 3, i32 3, i32 3)
+  %res = tail call <9 x double> (<9 x double>, <9 x double>, i32, i32, i32, ...) @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double> %B, <9 x double> %A.foo, i32 3, i32 3, i32 3)
   ret <9 x double> %res
 }

-declare <9 x double> @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double>, <9 x double>, i32 immarg, i32 immarg, i32 immarg)
+declare <9 x double> @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double>, <9 x double>, i32 immarg, i32 immarg, i32 immarg, ...)
 declare <9 x double> @llvm.matrix.transpose.v9f64(<9 x double>, i32 immarg, i32 immarg)
 declare <9 x double> @foo(<9 x double>)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-shared-subtrees.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-shared-subtrees.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-shared-subtrees.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-shared-subtrees.ll
@@ -94,7 +94,7 @@
   %tmp17 = tail call <8 x double> @llvm.matrix.transpose.v8f64(<8 x double> %shared.load, i32 2, i32 4), !dbg !10
   tail call void @llvm.matrix.columnwise.store.v8f64.p0f64(<8 x double> %tmp17, double* %arg3, i32 10, i32 4, i32 2), !dbg !10
   %tmp18 = tail call <60 x double> @llvm.matrix.columnwise.load.v60f64.p0f64(double* %arg2, i32 20, i32 4, i32 15), !dbg !11
-  %tmp48 = tail call <60 x double> @llvm.matrix.multiply.v60f64.v8f64.v30f64(<8 x double> %tmp17, <30 x double> %shared.load.2, i32 4, i32 2, i32 15), !dbg !11
+  %tmp48 = tail call <60 x double> (<8 x double>, <30 x double>, i32, i32, i32, ...) @llvm.matrix.multiply.v60f64.v8f64.v30f64(<8 x double> %tmp17, <30 x double> %shared.load.2, i32 4, i32 2, i32 15), !dbg !11
   %tmp49 = fsub <60 x double> %tmp18, %tmp48, !dbg !11
   tail call void @llvm.matrix.columnwise.store.v60f64.p0f64(<60 x double> %tmp49, double* %arg2, i32 10, i32 4, i32 15), !dbg !11
   ret void
@@ -106,7 +106,7 @@
 declare <60 x double> @llvm.matrix.columnwise.load.v60f64.p0f64(double*, i32, i32 immarg, i32 immarg)
 declare void @llvm.matrix.columnwise.store.v60f64.p0f64(<60 x double>, double* writeonly, i32, i32 immarg, i32 immarg)
 declare void @llvm.matrix.columnwise.store.v8f64.p0f64(<8 x double>, double* writeonly, i32, i32 immarg, i32 immarg)
-declare <60 x double> @llvm.matrix.multiply.v60f64.v8f64.v30f64(<8 x double>, <30 x double>, i32 immarg, i32 immarg, i32 immarg)
+declare <60 x double> @llvm.matrix.multiply.v60f64.v8f64.v30f64(<8 x double>, <30 x double>, i32 immarg, i32 immarg, i32 immarg, ...)

 !llvm.module.flags = !{!0, !1, !2, !3}
 !llvm.dbg.cu = !{!4}
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll
@@ -27,12 +27,12 @@
 define void @multiply(<12 x double>* %A, <12 x double>* %B, <4 x double>* %C) !dbg !25 {
   %A.matrix = load <12 x double>, <12 x double>* %A, !dbg !26
   %B.matrix = load <12 x double>, <12 x double>* %B, !dbg !26
-  %t = call <4 x double> @llvm.matrix.multiply(<12 x double> %A.matrix, <12 x double> %B.matrix, i32 2, i32 6, i32 2), !dbg !26
+  %t = call <4 x double> (<12 x double>, <12 x double>, i32, i32, i32, ...) @llvm.matrix.multiply(<12 x double> %A.matrix, <12 x double> %B.matrix, i32 2, i32 6, i32 2), !dbg !26
   store <4 x double> %t, <4 x double>* %C, !dbg !26
   ret void
 }

-declare <4 x double> @llvm.matrix.multiply(<12 x double>, <12 x double>, i32, i32, i32)
+declare <4 x double> @llvm.matrix.multiply(<12 x double>, <12 x double>, i32, i32, i32, ...)

 ; CHECK-LABEL: remark: test.h:60:20: Lowered with 6 stores, 6 loads, 0 compute ops
 ; CHECK-NEXT:  store(
@@ -101,7 +101,7 @@
   %C.matrix = load <12 x double>, <12 x double>* %C, !dbg !34
   %D.matrix = load <12 x double>, <12 x double>* %D, !dbg !34
-  %Mult.matrix = call <4 x double> @llvm.matrix.multiply(<12 x double> %C.matrix, <12 x double> %D.matrix, i32 2, i32 6, i32 2), !dbg !34
+  %Mult.matrix = call <4 x double> (<12 x double>, <12 x double>, i32, i32, i32, ...) @llvm.matrix.multiply(<12 x double> %C.matrix, <12 x double> %D.matrix, i32 2, i32 6, i32 2), !dbg !34
   store <4 x double> %Mult.matrix, <4 x double>* %E, !dbg !34
   ret void
diff --git a/llvm/test/Verifier/matrix-intrinsics.ll b/llvm/test/Verifier/matrix-intrinsics.ll
--- a/llvm/test/Verifier/matrix-intrinsics.ll
+++ b/llvm/test/Verifier/matrix-intrinsics.ll
@@ -10,12 +10,34 @@
   ret <4 x float> %result.2
 }

-declare <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float>, <4 x float>, i32, i32, i32)
+declare <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float>, <4 x float>, i32, i32, i32, ...)
 define <4 x float> @multiply(<4 x float> %m) {
 ; CHECK-NEXT: result of a matrix operation does not fit in the returned vector
 ; CHECK-NEXT: result of a matrix operation does not fit in the returned vector
-  %result.1 = call <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %m, <4 x float> %m, i32 3, i32 2, i32 2)
-  %result.2 = call <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %result.1, <4 x float> %m, i32 2, i32 2, i32 1)
+  %result.1 = call <4 x float> (<4 x float>, <4 x float>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %m, <4 x float> %m, i32 3, i32 2, i32 2)
+  %result.2 = call <4 x float> (<4 x float>, <4 x float>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %result.1, <4 x float> %m, i32 2, i32 2, i32 1)
+  ret <4 x float> %result.2
+}
+
+declare <6 x i32> @llvm.matrix.multiply.v6i32.v6i32.v6i32(<6 x i32>, <6 x i32>, i32, i32, i32, ...)
+define <4 x float> @multiply_num_args(<4 x float> %m, <6 x i32> %ia, i1 %c) {
+; CHECK-NEXT: llvm.matrix.multiply takes either 5 or 7 arguments
+; CHECK-NEXT: llvm.matrix.multiply takes either 5 or 7 arguments
+  %result.1 = call <4 x float> (<4 x float>, <4 x float>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %m, <4 x float> %m, i32 3, i32 2, i32 2, i32 1)
+  %result.2 = call <4 x float> (<4 x float>, <4 x float>, i32, i32, i32, ...) @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %result.1, <4 x float> %m, i32 2, i32 2, i32 1, i1 0, i1 1, i1 0)
+
+; CHECK-NEXT: sixth argument of llvm.matrix.multiply must be an i1 integer constant
+; CHECK-NEXT: seventh argument of llvm.matrix.multiply must be an i1 integer constant
+  %result.3 = call <6 x i32> (<6 x i32>, <6 x i32>, i32, i32, i32, ...) @llvm.matrix.multiply.v6i32.v6i32.v6i32(<6 x i32> %ia, <6 x i32> %ia, i32 2, i32 3, i32 2, i32 0, i1 1)
+  %result.4 = call <6 x i32> (<6 x i32>, <6 x i32>, i32, i32, i32, ...) @llvm.matrix.multiply.v6i32.v6i32.v6i32(<6 x i32> %ia, <6 x i32> %ia, i32 2, i32 3, i32 2, i1 0, i8* null)
+
+; CHECK-NEXT: llvm.matrix.multiply takes either 5 or 7 arguments
+  %result.6 = call <6 x i32> (<6 x i32>, <6 x i32>, i32, i32, i32, ...) @llvm.matrix.multiply.v6i32.v6i32.v6i32(<6 x i32> %ia, <6 x i32> %ia, i32 2, i32 3, i32 2, i1 0, i1 0, i32 12)
+
+; CHECK-NEXT: sixth argument of llvm.matrix.multiply must be an i1 integer constant
+; CHECK-NEXT: seventh argument of llvm.matrix.multiply must be an i1 integer constant
+  %result.8 = call <6 x i32> (<6 x i32>, <6 x i32>, i32, i32, i32, ...) @llvm.matrix.multiply.v6i32.v6i32.v6i32(<6 x i32> %ia, <6 x i32> %ia, i32 2, i32 3, i32 2, i1 %c, i1 0)
+  %result.7 = call <6 x i32> (<6 x i32>, <6 x i32>, i32, i32, i32, ...) @llvm.matrix.multiply.v6i32.v6i32.v6i32(<6 x i32> %ia, <6 x i32> %ia, i32 2, i32 3, i32 2, i1 0, i1 %c)
+  ret <4 x float> %result.2
+}