diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -187,6 +187,29 @@
   LoopInfo *LI;
   OptimizationRemarkEmitter *ORE;
 
+  /// Contains a set of options for generating code for llvm.matrix.multiply.
+  struct MultiplyOptions {
+    /// Is floating-point contraction allowed?
+    bool AllowContract;
+    /// Add NUW? Set only when the OB_nuw bundle's first input is constant 1.
+    bool HasNUW;
+    /// Add NSW? Set only when the OB_nsw bundle's first input is constant 1.
+    bool HasNSW;
+
+    explicit MultiplyOptions(CallInst *MatMul) {
+      AllowContract = AllowContractEnabled || (isa<FPMathOperator>(MatMul) &&
+                                               MatMul->hasAllowContract());
+      auto IsSet = [MatMul](uint32_t ID) {
+        auto OB = MatMul->getOperandBundle(ID);
+        auto *Flag = dyn_cast_or_null<ConstantInt>(
+            OB && !OB->Inputs.empty() ? OB->Inputs[0].get() : nullptr);
+        return Flag && Flag->isOne();
+      };
+      HasNUW = IsSet(LLVMContext::OB_nuw);
+      HasNSW = IsSet(LLVMContext::OB_nsw);
+    }
+  };
+
   /// Contains estimates of the number of operations (loads, stores, compute) required to lower a matrix operation.
   struct OpInfoTy {
     /// Number of stores emitted to generate this matrix.
@@ -960,14 +983,15 @@
   }
 
   Value *createMulAdd(Value *Sum, Value *A, Value *B, bool UseFPOp,
-                      IRBuilder<> &Builder, bool AllowContraction,
+                      IRBuilder<> &Builder, MultiplyOptions Opts,
                       unsigned &NumComputeOps) {
     NumComputeOps += getNumOps(A->getType());
     if (!Sum)
-      return UseFPOp ? Builder.CreateFMul(A, B) : Builder.CreateMul(A, B);
+      return UseFPOp ? Builder.CreateFMul(A, B)
+                     : Builder.CreateMul(A, B, "", Opts.HasNUW, Opts.HasNSW);
 
     if (UseFPOp) {
-      if (AllowContraction) {
+      if (Opts.AllowContract) {
         // Use fmuladd for floating point operations and let the backend decide
         // if that's profitable.
Function *FMulAdd = Intrinsic::getDeclaration( @@ -980,8 +1004,8 @@ } NumComputeOps += getNumOps(A->getType()); - Value *Mul = Builder.CreateMul(A, B); - return Builder.CreateAdd(Sum, Mul); + Value *Mul = Builder.CreateMul(A, B, "", Opts.HasNUW, Opts.HasNSW); + return Builder.CreateAdd(Sum, Mul, "", Opts.HasNUW, Opts.HasNSW); } /// Cache \p Matrix as result of \p Inst and update the uses of \p Inst. For @@ -1008,7 +1032,7 @@ /// Compute \p Result += \p A * \p B for input matrices with left-associating /// addition. void emitMatrixMultiply(MatrixTy &Result, const MatrixTy &A, - const MatrixTy &B, bool AllowContraction, + const MatrixTy &B, MultiplyOptions Opts, IRBuilder<> &Builder, bool isTiled) { const unsigned VF = std::max( TTI.getRegisterBitWidth(true) / @@ -1045,7 +1069,7 @@ Value *Splat = Builder.CreateVectorSplat(BlockSize, RH, "splat"); Sum = createMulAdd(isSumZero && K == 0 ? nullptr : Sum, L, Splat, Result.getElementType()->isFloatingPointTy(), - Builder, AllowContraction, NumComputeOps); + Builder, Opts, NumComputeOps); } Result.setVector(J, insertVector(Result.getVector(J), I, Sum, Builder)); @@ -1069,7 +1093,7 @@ Value *LH = Builder.CreateExtractElement(A.getVector(I), K); Value *Splat = Builder.CreateVectorSplat(BlockSize, LH, "splat"); Sum = createMulAdd(isSumZero && K == 0 ? 
nullptr : Sum, Splat, R, - IsFP, Builder, AllowContraction, NumComputeOps); + IsFP, Builder, Opts, NumComputeOps); } Result.setVector(I, insertVector(Result.getVector(I), J, Sum, Builder)); @@ -1224,8 +1248,6 @@ Value *BPtr = getNonAliasingPointer(LoadOp1, Store, MatMul); Value *CPtr = Store->getPointerOperand(); - bool AllowContract = AllowContractEnabled || (isa(MatMul) && - MatMul->hasAllowContract()); IRBuilder<> Builder(Store); for (unsigned J = 0; J < C; J += TileSize) for (unsigned I = 0; I < R; I += TileSize) { @@ -1243,7 +1265,7 @@ loadMatrix(BPtr, LoadOp1->getAlign(), LoadOp1->isVolatile(), RShape, Builder.getInt64(K), Builder.getInt64(J), {TileM, TileC}, EltType, Builder); - emitMatrixMultiply(Res, A, B, AllowContract, Builder, true); + emitMatrixMultiply(Res, A, B, MultiplyOptions(MatMul), Builder, true); } storeMatrix(Res, CPtr, Store->getAlign(), Store->isVolatile(), {R, M}, Builder.getInt64(I), Builder.getInt64(J), EltType, Builder); @@ -1310,9 +1332,8 @@ // Initialize the output MatrixTy Result(R, C, EltType); - bool AllowContract = AllowContractEnabled || (isa(MatMul) && - MatMul->hasAllowContract()); - emitMatrixMultiply(Result, Lhs, Rhs, AllowContract, Builder, false); + emitMatrixMultiply(Result, Lhs, Rhs, MultiplyOptions(MatMul), Builder, + false); finalizeLowering(MatMul, Result, Builder); } diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32-nuw-nsw.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32-nuw-nsw.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-i32-nuw-nsw.ll @@ -0,0 +1,207 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -lower-matrix-intrinsics -S < %s | FileCheck %s +; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s + +declare <4 x i32> @llvm.matrix.multiply.v4i32.v4i32.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32) + +define <4 x i32> @multiply_2x2_nuw(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: 
@multiply_2x2_nuw( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x i32> undef, i32 [[TMP0]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw <1 x i32> [[BLOCK]], [[SPLAT_SPLAT]] +; CHECK-NEXT: [[BLOCK4:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT5:%.*]] = insertelement <1 x i32> undef, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT6:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT5]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw <1 x i32> [[BLOCK4]], [[SPLAT_SPLAT6]] +; CHECK-NEXT: [[TMP4:%.*]] = add nuw <1 x i32> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <1 x i32> [[TMP4]], <1 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> undef, <2 x i32> [[TMP5]], <2 x i32> +; CHECK-NEXT: [[BLOCK7:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT8:%.*]] = insertelement <1 x i32> undef, i32 [[TMP7]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT9:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT8]], <1 x i32> undef, <1 x i32> 
zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = mul nuw <1 x i32> [[BLOCK7]], [[SPLAT_SPLAT9]] +; CHECK-NEXT: [[BLOCK10:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT11:%.*]] = insertelement <1 x i32> undef, i32 [[TMP9]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT12:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT11]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = mul nuw <1 x i32> [[BLOCK10]], [[SPLAT_SPLAT12]] +; CHECK-NEXT: [[TMP11:%.*]] = add nuw <1 x i32> [[TMP8]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <1 x i32> [[TMP11]], <1 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP12]], <2 x i32> +; CHECK-NEXT: [[BLOCK13:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT14:%.*]] = insertelement <1 x i32> undef, i32 [[TMP14]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT15:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT14]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = mul nuw <1 x i32> [[BLOCK13]], [[SPLAT_SPLAT15]] +; CHECK-NEXT: [[BLOCK16:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT17:%.*]] = insertelement <1 x i32> undef, i32 [[TMP16]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT18:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT17]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = mul nuw <1 x i32> [[BLOCK16]], [[SPLAT_SPLAT18]] +; CHECK-NEXT: [[TMP18:%.*]] = add nuw <1 x i32> [[TMP15]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <1 x i32> [[TMP18]], <1 x i32> undef, <2 x i32> +; CHECK-NEXT: 
[[TMP20:%.*]] = shufflevector <2 x i32> undef, <2 x i32> [[TMP19]], <2 x i32> +; CHECK-NEXT: [[BLOCK19:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT20:%.*]] = insertelement <1 x i32> undef, i32 [[TMP21]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT21:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT20]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = mul nuw <1 x i32> [[BLOCK19]], [[SPLAT_SPLAT21]] +; CHECK-NEXT: [[BLOCK22:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x i32> undef, i32 [[TMP23]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT23]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = mul nuw <1 x i32> [[BLOCK22]], [[SPLAT_SPLAT24]] +; CHECK-NEXT: [[TMP25:%.*]] = add nuw <1 x i32> [[TMP22]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <1 x i32> [[TMP25]], <1 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <2 x i32> [[TMP20]], <2 x i32> [[TMP26]], <2 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> [[TMP27]], <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[TMP28]] +; +entry: + %c = call <4 x i32> @llvm.matrix.multiply.v4i32.v4i32.v4i32(<4 x i32> %a, <4 x i32> %b, i32 2, i32 2, i32 2) [ "nuw"(i1 true), "nsw"(i1 false) ] + ret <4 x i32> %c +} + + +define <4 x i32> @multiply_2x2_nsw(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: @multiply_2x2_nsw( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> 
undef, <2 x i32> +; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x i32> undef, i32 [[TMP0]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <1 x i32> [[BLOCK]], [[SPLAT_SPLAT]] +; CHECK-NEXT: [[BLOCK4:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT5:%.*]] = insertelement <1 x i32> undef, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT6:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT5]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = mul nsw <1 x i32> [[BLOCK4]], [[SPLAT_SPLAT6]] +; CHECK-NEXT: [[TMP4:%.*]] = add nsw <1 x i32> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <1 x i32> [[TMP4]], <1 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> undef, <2 x i32> [[TMP5]], <2 x i32> +; CHECK-NEXT: [[BLOCK7:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT8:%.*]] = insertelement <1 x i32> undef, i32 [[TMP7]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT9:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT8]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = mul nsw <1 x i32> [[BLOCK7]], [[SPLAT_SPLAT9]] +; CHECK-NEXT: [[BLOCK10:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT11:%.*]] 
= insertelement <1 x i32> undef, i32 [[TMP9]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT12:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT11]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = mul nsw <1 x i32> [[BLOCK10]], [[SPLAT_SPLAT12]] +; CHECK-NEXT: [[TMP11:%.*]] = add nsw <1 x i32> [[TMP8]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <1 x i32> [[TMP11]], <1 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP12]], <2 x i32> +; CHECK-NEXT: [[BLOCK13:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT14:%.*]] = insertelement <1 x i32> undef, i32 [[TMP14]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT15:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT14]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = mul nsw <1 x i32> [[BLOCK13]], [[SPLAT_SPLAT15]] +; CHECK-NEXT: [[BLOCK16:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT17:%.*]] = insertelement <1 x i32> undef, i32 [[TMP16]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT18:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT17]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = mul nsw <1 x i32> [[BLOCK16]], [[SPLAT_SPLAT18]] +; CHECK-NEXT: [[TMP18:%.*]] = add nsw <1 x i32> [[TMP15]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <1 x i32> [[TMP18]], <1 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x i32> undef, <2 x i32> [[TMP19]], <2 x i32> +; CHECK-NEXT: [[BLOCK19:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT20:%.*]] = insertelement <1 x 
i32> undef, i32 [[TMP21]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT21:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT20]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = mul nsw <1 x i32> [[BLOCK19]], [[SPLAT_SPLAT21]] +; CHECK-NEXT: [[BLOCK22:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x i32> undef, i32 [[TMP23]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT23]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = mul nsw <1 x i32> [[BLOCK22]], [[SPLAT_SPLAT24]] +; CHECK-NEXT: [[TMP25:%.*]] = add nsw <1 x i32> [[TMP22]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <1 x i32> [[TMP25]], <1 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <2 x i32> [[TMP20]], <2 x i32> [[TMP26]], <2 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> [[TMP27]], <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[TMP28]] +; +entry: + %c = call <4 x i32> @llvm.matrix.multiply.v4i32.v4i32.v4i32(<4 x i32> %a, <4 x i32> %b, i32 2, i32 2, i32 2) [ "nuw"(i1 false), "nsw"(i1 true) ] + ret <4 x i32> %c +} + +define <4 x i32> @multiply_2x2_nuw_nsw(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: @multiply_2x2_nuw_nsw( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 0 +; CHECK-NEXT: 
[[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x i32> undef, i32 [[TMP0]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw <1 x i32> [[BLOCK]], [[SPLAT_SPLAT]] +; CHECK-NEXT: [[BLOCK4:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT5:%.*]] = insertelement <1 x i32> undef, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT6:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT5]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw nsw <1 x i32> [[BLOCK4]], [[SPLAT_SPLAT6]] +; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw <1 x i32> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <1 x i32> [[TMP4]], <1 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> undef, <2 x i32> [[TMP5]], <2 x i32> +; CHECK-NEXT: [[BLOCK7:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT8:%.*]] = insertelement <1 x i32> undef, i32 [[TMP7]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT9:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT8]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = mul nuw nsw <1 x i32> [[BLOCK7]], [[SPLAT_SPLAT9]] +; CHECK-NEXT: [[BLOCK10:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT11:%.*]] = insertelement <1 x i32> undef, i32 [[TMP9]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT12:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT11]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = mul nuw nsw <1 x i32> [[BLOCK10]], [[SPLAT_SPLAT12]] +; CHECK-NEXT: 
[[TMP11:%.*]] = add nuw nsw <1 x i32> [[TMP8]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <1 x i32> [[TMP11]], <1 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP12]], <2 x i32> +; CHECK-NEXT: [[BLOCK13:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT14:%.*]] = insertelement <1 x i32> undef, i32 [[TMP14]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT15:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT14]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = mul nuw nsw <1 x i32> [[BLOCK13]], [[SPLAT_SPLAT15]] +; CHECK-NEXT: [[BLOCK16:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT17:%.*]] = insertelement <1 x i32> undef, i32 [[TMP16]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT18:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT17]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = mul nuw nsw <1 x i32> [[BLOCK16]], [[SPLAT_SPLAT18]] +; CHECK-NEXT: [[TMP18:%.*]] = add nuw nsw <1 x i32> [[TMP15]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <1 x i32> [[TMP18]], <1 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x i32> undef, <2 x i32> [[TMP19]], <2 x i32> +; CHECK-NEXT: [[BLOCK19:%.*]] = shufflevector <2 x i32> [[SPLIT]], <2 x i32> undef, <1 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT20:%.*]] = insertelement <1 x i32> undef, i32 [[TMP21]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT21:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT20]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = mul nuw nsw <1 x i32> [[BLOCK19]], [[SPLAT_SPLAT21]] +; CHECK-NEXT: 
[[BLOCK22:%.*]] = shufflevector <2 x i32> [[SPLIT1]], <2 x i32> undef, <1 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x i32> undef, i32 [[TMP23]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT23]], <1 x i32> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = mul nuw nsw <1 x i32> [[BLOCK22]], [[SPLAT_SPLAT24]] +; CHECK-NEXT: [[TMP25:%.*]] = add nuw nsw <1 x i32> [[TMP22]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <1 x i32> [[TMP25]], <1 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <2 x i32> [[TMP20]], <2 x i32> [[TMP26]], <2 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> [[TMP27]], <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[TMP28]] +; +entry: + %c = call <4 x i32> @llvm.matrix.multiply.v4i32.v4i32.v4i32(<4 x i32> %a, <4 x i32> %b, i32 2, i32 2, i32 2) [ "nuw"(i1 true), "nsw"(i1 true) ] + ret <4 x i32> %c +}