Diff 235135

llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp

Show All 37 Lines
using namespace llvm;		using namespace llvm;
using namespace PatternMatch;		using namespace PatternMatch;

#define DEBUG_TYPE "lower-matrix-intrinsics"		#define DEBUG_TYPE "lower-matrix-intrinsics"

static cl::opt<bool> EnableShapePropagation("matrix-propagate-shape",		static cl::opt<bool> EnableShapePropagation("matrix-propagate-shape",
cl::init(true));		cl::init(true));

		static cl::opt<bool> AllowContractEnabled(
		"matrix-allow-contract", cl::init(false), cl::Hidden,
		cl::desc("Allow the use of FMAs if available and profitable. This may "
		"result in different results, due to less rounding error."));

		anemetUnsubmitted Not Done Reply Inline Actions Shouldn't this be using the fpflags from the intrinsic? I.e. I was thinking of a clang flag. anemet: Shouldn't this be using the fpflags from the intrinsic? I.e. I was thinking of a clang flag.
		fhahnAuthorUnsubmitted Done Reply Inline Actions We can certainly add this on the clang level, although I am not sure how easy/hard it will be to add additional flags there. fhahn: We can certainly add this on the clang level, although I am not sure how easy/hard it will be…
		anemetUnsubmitted Not Done Reply Inline Actions Can we do both? I.e. propagate fp-contract from the intrinsic's fpflag and also allow the backend to globally enable this? I'd be fine with that. anemet: Can we do both? I.e. propagate fp-contract from the intrinsic's fpflag and also allow the…
		fhahnAuthorUnsubmitted Done Reply Inline Actions Yep I've updated it to respect the FMF and also kept the option fhahn: Yep I've updated it to respect the FMF and also kept the option
namespace {		namespace {

// Given an element poitner \p BasePtr to the start of a (sub) matrix, compute		// Given an element poitner \p BasePtr to the start of a (sub) matrix, compute
// the start address of column \p Col with type (\p EltType x \p NumRows)		// the start address of column \p Col with type (\p EltType x \p NumRows)
// assuming \p Stride elements between start two consecutive columns.		// assuming \p Stride elements between start two consecutive columns.
// \p Stride must be >= \p NumRows.		// \p Stride must be >= \p NumRows.
//		//
// Consider a 4x4 matrix like below		// Consider a 4x4 matrix like below
▲ Show 20 Lines • Show All 477 Lines • ▼ Show 20 Lines	for (; i < VecNumElts; i++)
Mask.push_back(Builder.getInt32(i));		Mask.push_back(Builder.getInt32(i));

Value *MaskVal = ConstantVector::get(Mask);		Value *MaskVal = ConstantVector::get(Mask);

return Builder.CreateShuffleVector(Col, Block, MaskVal);		return Builder.CreateShuffleVector(Col, Block, MaskVal);
}		}

Value createMulAdd(Value Sum, Value A, Value B, bool UseFPOp,		Value createMulAdd(Value Sum, Value A, Value B, bool UseFPOp,
IRBuilder<> &Builder) {		IRBuilder<> &Builder, bool AllowContraction) {
Value *Mul = UseFPOp ? Builder.CreateFMul(A, B) : Builder.CreateMul(A, B);
if (!Sum)		if (!Sum)
return Mul;		return UseFPOp ? Builder.CreateFMul(A, B) : Builder.CreateMul(A, B);

		if (UseFPOp) {
		if (AllowContraction) {
		// Use fmuladd for floating point operations and let the backend decide
		// if that's profitable.
		Value *FMulAdd = Intrinsic::getDeclaration(
		Func.getParent(), Intrinsic::fmuladd, A->getType());
		return Builder.CreateCall(FMulAdd, {A, B, Sum});
		}
		Value *Mul = Builder.CreateFMul(A, B);
		return Builder.CreateFAdd(Sum, Mul);
		}

return UseFPOp ? Builder.CreateFAdd(Sum, Mul) : Builder.CreateAdd(Sum, Mul);		Value *Mul = Builder.CreateMul(A, B);
		return Builder.CreateAdd(Sum, Mul);
}		}

/// Cache \p Matrix as result of \p Inst and update the uses of \p Inst. For		/// Cache \p Matrix as result of \p Inst and update the uses of \p Inst. For
/// users with shape information, there's nothing to do: the will use the		/// users with shape information, there's nothing to do: the will use the
/// cached value when they are lowered. For other users, \p Matrix is		/// cached value when they are lowered. For other users, \p Matrix is
/// flattened and the uses are updated to use it. Also marks \p Inst for		/// flattened and the uses are updated to use it. Also marks \p Inst for
/// deletion.		/// deletion.
void finalizeLowering(Instruction *Inst, ColumnMatrixTy Matrix,		void finalizeLowering(Instruction *Inst, ColumnMatrixTy Matrix,
Show All 33 Lines	void LowerMultiply(CallInst *MatMul) {
ColumnMatrixTy Result;		ColumnMatrixTy Result;
for (unsigned J = 0; J < C; ++J)		for (unsigned J = 0; J < C; ++J)
Result.addColumn(UndefValue::get(VectorType::get(EltType, R)));		Result.addColumn(UndefValue::get(VectorType::get(EltType, R)));

const unsigned VF = std::max(TTI.getRegisterBitWidth(true) /		const unsigned VF = std::max(TTI.getRegisterBitWidth(true) /
EltType->getPrimitiveSizeInBits(),		EltType->getPrimitiveSizeInBits(),
uint64_t(1));		uint64_t(1));

		bool AllowContract = AllowContractEnabled \|\| (isa<FPMathOperator>(MatMul) &&
		MatMul->hasAllowContract());
// Multiply columns from the first operand with scalars from the second		// Multiply columns from the first operand with scalars from the second
// operand. Then move along the K axes and accumulate the columns. With		// operand. Then move along the K axes and accumulate the columns. With
// this the adds can be vectorized without reassociation.		// this the adds can be vectorized without reassociation.
for (unsigned J = 0; J < C; ++J) {		for (unsigned J = 0; J < C; ++J) {
unsigned BlockSize = VF;		unsigned BlockSize = VF;
for (unsigned I = 0; I < R; I += BlockSize) {		for (unsigned I = 0; I < R; I += BlockSize) {
// Gradually lower the vectorization factor to cover the remainder.		// Gradually lower the vectorization factor to cover the remainder.
while (I + BlockSize > R)		while (I + BlockSize > R)
BlockSize /= 2;		BlockSize /= 2;

Value *Sum = nullptr;		Value *Sum = nullptr;
for (unsigned K = 0; K < M; ++K) {		for (unsigned K = 0; K < M; ++K) {
Value *L = extractVector(Lhs, I, K, BlockSize, Builder);		Value *L = extractVector(Lhs, I, K, BlockSize, Builder);
Value *RH = Builder.CreateExtractElement(Rhs.getColumn(J), K);		Value *RH = Builder.CreateExtractElement(Rhs.getColumn(J), K);
Value *Splat = Builder.CreateVectorSplat(BlockSize, RH, "splat");		Value *Splat = Builder.CreateVectorSplat(BlockSize, RH, "splat");
Sum = createMulAdd(Sum, L, Splat, EltType->isFloatingPointTy(),		Sum = createMulAdd(Sum, L, Splat, EltType->isFloatingPointTy(),
Builder);		Builder, AllowContract);
}		}
Result.setColumn(J, insertVector(Result.getColumn(J), I, Sum, Builder));		Result.setColumn(J, insertVector(Result.getColumn(J), I, Sum, Builder));
}		}
}		}
finalizeLowering(MatMul, Result, Builder);		finalizeLowering(MatMul, Result, Builder);
}		}

/// Lowers llvm.matrix.transpose.		/// Lowers llvm.matrix.transpose.
▲ Show 20 Lines • Show All 87 Lines • Show Last 20 Lines

llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-contraction-fmf.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
				; RUN: opt -lower-matrix-intrinsics -S < %s \| FileCheck %s
				; RUN: opt -passes='lower-matrix-intrinsics' -S < %s \| FileCheck %s


				define <4 x double> @multiply_2x2(<4 x double> %a, <4 x double> %b) {
				; CHECK-LABEL: @multiply_2x2(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[SPLIT:%.]] = shufflevector <4 x double> [[A:%.]], <4 x double> undef, <2 x i32> <i32 0, i32 1>
				; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> undef, <2 x i32> <i32 2, i32 3>
				; CHECK-NEXT: [[SPLIT2:%.]] = shufflevector <4 x double> [[B:%.]], <4 x double> undef, <2 x i32> <i32 0, i32 1>
				; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <4 x double> [[B]], <4 x double> undef, <2 x i32> <i32 2, i32 3>
				; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <2 x double> [[SPLIT]], <2 x double> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x double> [[SPLIT2]], i64 0
				; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> undef, double [[TMP0]], i32 0
				; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP1:%.*]] = fmul <1 x double> [[BLOCK]], [[SPLAT_SPLAT]]
				; CHECK-NEXT: [[BLOCK4:%.*]] = shufflevector <2 x double> [[SPLIT1]], <2 x double> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[SPLIT2]], i64 1
				; CHECK-NEXT: [[SPLAT_SPLATINSERT5:%.*]] = insertelement <1 x double> undef, double [[TMP2]], i32 0
				; CHECK-NEXT: [[SPLAT_SPLAT6:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT5]], <1 x double> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP3:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK4]], <1 x double> [[SPLAT_SPLAT6]], <1 x double> [[TMP1]])
				; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> undef, <2 x i32> <i32 0, i32 undef>
				; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> undef, <2 x double> [[TMP4]], <2 x i32> <i32 2, i32 1>
				; CHECK-NEXT: [[BLOCK7:%.*]] = shufflevector <2 x double> [[SPLIT]], <2 x double> undef, <1 x i32> <i32 1>
				; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[SPLIT2]], i64 0
				; CHECK-NEXT: [[SPLAT_SPLATINSERT8:%.*]] = insertelement <1 x double> undef, double [[TMP6]], i32 0
				; CHECK-NEXT: [[SPLAT_SPLAT9:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT8]], <1 x double> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP7:%.*]] = fmul <1 x double> [[BLOCK7]], [[SPLAT_SPLAT9]]
				; CHECK-NEXT: [[BLOCK10:%.*]] = shufflevector <2 x double> [[SPLIT1]], <2 x double> undef, <1 x i32> <i32 1>
				; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[SPLIT2]], i64 1
				; CHECK-NEXT: [[SPLAT_SPLATINSERT11:%.*]] = insertelement <1 x double> undef, double [[TMP8]], i32 0
				; CHECK-NEXT: [[SPLAT_SPLAT12:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT11]], <1 x double> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP9:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK10]], <1 x double> [[SPLAT_SPLAT12]], <1 x double> [[TMP7]])
				; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <1 x double> [[TMP9]], <1 x double> undef, <2 x i32> <i32 0, i32 undef>
				; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP10]], <2 x i32> <i32 0, i32 2>
				; CHECK-NEXT: [[BLOCK13:%.*]] = shufflevector <2 x double> [[SPLIT]], <2 x double> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[SPLIT3]], i64 0
				; CHECK-NEXT: [[SPLAT_SPLATINSERT14:%.*]] = insertelement <1 x double> undef, double [[TMP12]], i32 0
				; CHECK-NEXT: [[SPLAT_SPLAT15:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT14]], <1 x double> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP13:%.*]] = fmul <1 x double> [[BLOCK13]], [[SPLAT_SPLAT15]]
				; CHECK-NEXT: [[BLOCK16:%.*]] = shufflevector <2 x double> [[SPLIT1]], <2 x double> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[SPLIT3]], i64 1
				; CHECK-NEXT: [[SPLAT_SPLATINSERT17:%.*]] = insertelement <1 x double> undef, double [[TMP14]], i32 0
				; CHECK-NEXT: [[SPLAT_SPLAT18:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT17]], <1 x double> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP15:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK16]], <1 x double> [[SPLAT_SPLAT18]], <1 x double> [[TMP13]])
				; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <1 x double> [[TMP15]], <1 x double> undef, <2 x i32> <i32 0, i32 undef>
				; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x double> undef, <2 x double> [[TMP16]], <2 x i32> <i32 2, i32 1>
				; CHECK-NEXT: [[BLOCK19:%.*]] = shufflevector <2 x double> [[SPLIT]], <2 x double> undef, <1 x i32> <i32 1>
				; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x double> [[SPLIT3]], i64 0
				; CHECK-NEXT: [[SPLAT_SPLATINSERT20:%.*]] = insertelement <1 x double> undef, double [[TMP18]], i32 0
				; CHECK-NEXT: [[SPLAT_SPLAT21:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT20]], <1 x double> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP19:%.*]] = fmul <1 x double> [[BLOCK19]], [[SPLAT_SPLAT21]]
				; CHECK-NEXT: [[BLOCK22:%.*]] = shufflevector <2 x double> [[SPLIT1]], <2 x double> undef, <1 x i32> <i32 1>
				; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[SPLIT3]], i64 1
				; CHECK-NEXT: [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x double> undef, double [[TMP20]], i32 0
				; CHECK-NEXT: [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT23]], <1 x double> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP21:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK22]], <1 x double> [[SPLAT_SPLAT24]], <1 x double> [[TMP19]])
				; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <1 x double> [[TMP21]], <1 x double> undef, <2 x i32> <i32 0, i32 undef>
				; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x double> [[TMP17]], <2 x double> [[TMP22]], <2 x i32> <i32 0, i32 2>
				; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> [[TMP23]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
				; CHECK-NEXT: ret <4 x double> [[TMP24]]
				;
				entry:
				%c = call contract <4 x double> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
				ret <4 x double> %c
				}

				declare <4 x double> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double>, <4 x double>, i32, i32, i32)

llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-contraction.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
				; RUN: opt -lower-matrix-intrinsics -matrix-allow-contract -S < %s \| FileCheck %s
				; RUN: opt -passes='lower-matrix-intrinsics' -matrix-allow-contract -S < %s \| FileCheck %s


				define <4 x double> @multiply_2x2(<4 x double> %a, <4 x double> %b) {
				; CHECK-LABEL: @multiply_2x2(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[SPLIT:%.]] = shufflevector <4 x double> [[A:%.]], <4 x double> undef, <2 x i32> <i32 0, i32 1>
				; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> undef, <2 x i32> <i32 2, i32 3>
				; CHECK-NEXT: [[SPLIT2:%.]] = shufflevector <4 x double> [[B:%.]], <4 x double> undef, <2 x i32> <i32 0, i32 1>
				; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <4 x double> [[B]], <4 x double> undef, <2 x i32> <i32 2, i32 3>
				; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <2 x double> [[SPLIT]], <2 x double> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x double> [[SPLIT2]], i64 0
				; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> undef, double [[TMP0]], i32 0
				; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP1:%.*]] = fmul <1 x double> [[BLOCK]], [[SPLAT_SPLAT]]
				; CHECK-NEXT: [[BLOCK4:%.*]] = shufflevector <2 x double> [[SPLIT1]], <2 x double> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[SPLIT2]], i64 1
				; CHECK-NEXT: [[SPLAT_SPLATINSERT5:%.*]] = insertelement <1 x double> undef, double [[TMP2]], i32 0
				; CHECK-NEXT: [[SPLAT_SPLAT6:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT5]], <1 x double> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP3:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK4]], <1 x double> [[SPLAT_SPLAT6]], <1 x double> [[TMP1]])
				; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> undef, <2 x i32> <i32 0, i32 undef>
				; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> undef, <2 x double> [[TMP4]], <2 x i32> <i32 2, i32 1>
				; CHECK-NEXT: [[BLOCK7:%.*]] = shufflevector <2 x double> [[SPLIT]], <2 x double> undef, <1 x i32> <i32 1>
				; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[SPLIT2]], i64 0
				; CHECK-NEXT: [[SPLAT_SPLATINSERT8:%.*]] = insertelement <1 x double> undef, double [[TMP6]], i32 0
				; CHECK-NEXT: [[SPLAT_SPLAT9:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT8]], <1 x double> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP7:%.*]] = fmul <1 x double> [[BLOCK7]], [[SPLAT_SPLAT9]]
				; CHECK-NEXT: [[BLOCK10:%.*]] = shufflevector <2 x double> [[SPLIT1]], <2 x double> undef, <1 x i32> <i32 1>
				; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[SPLIT2]], i64 1
				; CHECK-NEXT: [[SPLAT_SPLATINSERT11:%.*]] = insertelement <1 x double> undef, double [[TMP8]], i32 0
				; CHECK-NEXT: [[SPLAT_SPLAT12:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT11]], <1 x double> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP9:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK10]], <1 x double> [[SPLAT_SPLAT12]], <1 x double> [[TMP7]])
				; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <1 x double> [[TMP9]], <1 x double> undef, <2 x i32> <i32 0, i32 undef>
				; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP10]], <2 x i32> <i32 0, i32 2>
				; CHECK-NEXT: [[BLOCK13:%.*]] = shufflevector <2 x double> [[SPLIT]], <2 x double> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[SPLIT3]], i64 0
				; CHECK-NEXT: [[SPLAT_SPLATINSERT14:%.*]] = insertelement <1 x double> undef, double [[TMP12]], i32 0
				; CHECK-NEXT: [[SPLAT_SPLAT15:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT14]], <1 x double> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP13:%.*]] = fmul <1 x double> [[BLOCK13]], [[SPLAT_SPLAT15]]
				; CHECK-NEXT: [[BLOCK16:%.*]] = shufflevector <2 x double> [[SPLIT1]], <2 x double> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[SPLIT3]], i64 1
				; CHECK-NEXT: [[SPLAT_SPLATINSERT17:%.*]] = insertelement <1 x double> undef, double [[TMP14]], i32 0
				; CHECK-NEXT: [[SPLAT_SPLAT18:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT17]], <1 x double> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP15:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK16]], <1 x double> [[SPLAT_SPLAT18]], <1 x double> [[TMP13]])
				; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <1 x double> [[TMP15]], <1 x double> undef, <2 x i32> <i32 0, i32 undef>
				; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x double> undef, <2 x double> [[TMP16]], <2 x i32> <i32 2, i32 1>
				; CHECK-NEXT: [[BLOCK19:%.*]] = shufflevector <2 x double> [[SPLIT]], <2 x double> undef, <1 x i32> <i32 1>
				; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x double> [[SPLIT3]], i64 0
				; CHECK-NEXT: [[SPLAT_SPLATINSERT20:%.*]] = insertelement <1 x double> undef, double [[TMP18]], i32 0
				; CHECK-NEXT: [[SPLAT_SPLAT21:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT20]], <1 x double> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP19:%.*]] = fmul <1 x double> [[BLOCK19]], [[SPLAT_SPLAT21]]
				; CHECK-NEXT: [[BLOCK22:%.*]] = shufflevector <2 x double> [[SPLIT1]], <2 x double> undef, <1 x i32> <i32 1>
				; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[SPLIT3]], i64 1
				; CHECK-NEXT: [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x double> undef, double [[TMP20]], i32 0
				; CHECK-NEXT: [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT23]], <1 x double> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP21:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK22]], <1 x double> [[SPLAT_SPLAT24]], <1 x double> [[TMP19]])
				; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <1 x double> [[TMP21]], <1 x double> undef, <2 x i32> <i32 0, i32 undef>
				; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x double> [[TMP17]], <2 x double> [[TMP22]], <2 x i32> <i32 0, i32 2>
				; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> [[TMP23]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
				; CHECK-NEXT: ret <4 x double> [[TMP24]]
				;
				entry:
				%c = call <4 x double> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
				ret <4 x double> %c
				}

				declare <4 x double> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double>, <4 x double>, i32, i32, i32)

llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float-contraction-fmf.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
				; RUN: opt -lower-matrix-intrinsics -S < %s \| FileCheck %s
				; RUN: opt -passes='lower-matrix-intrinsics' -S < %s \| FileCheck %s


				define <4 x float> @multiply_2x2(<4 x float> %a, <4 x float> %b) {
				; CHECK-LABEL: @multiply_2x2(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[SPLIT:%.]] = shufflevector <4 x float> [[A:%.]], <4 x float> undef, <2 x i32> <i32 0, i32 1>
				; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 2, i32 3>
				; CHECK-NEXT: [[SPLIT2:%.]] = shufflevector <4 x float> [[B:%.]], <4 x float> undef, <2 x i32> <i32 0, i32 1>
				; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <4 x float> [[B]], <4 x float> undef, <2 x i32> <i32 2, i32 3>
				; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <2 x float> [[SPLIT]], <2 x float> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x float> [[SPLIT2]], i64 0
				; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x float> undef, float [[TMP0]], i32 0
				; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT]], <1 x float> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP1:%.*]] = fmul <1 x float> [[BLOCK]], [[SPLAT_SPLAT]]
				; CHECK-NEXT: [[BLOCK4:%.*]] = shufflevector <2 x float> [[SPLIT1]], <2 x float> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[SPLIT2]], i64 1
				; CHECK-NEXT: [[SPLAT_SPLATINSERT5:%.*]] = insertelement <1 x float> undef, float [[TMP2]], i32 0
				; CHECK-NEXT: [[SPLAT_SPLAT6:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT5]], <1 x float> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP3:%.*]] = call <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK4]], <1 x float> [[SPLAT_SPLAT6]], <1 x float> [[TMP1]])
				; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <1 x float> [[TMP3]], <1 x float> undef, <2 x i32> <i32 0, i32 undef>
				; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> undef, <2 x float> [[TMP4]], <2 x i32> <i32 2, i32 1>
				; CHECK-NEXT: [[BLOCK7:%.*]] = shufflevector <2 x float> [[SPLIT]], <2 x float> undef, <1 x i32> <i32 1>
				; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[SPLIT2]], i64 0
				; CHECK-NEXT: [[SPLAT_SPLATINSERT8:%.*]] = insertelement <1 x float> undef, float [[TMP6]], i32 0
				; CHECK-NEXT: [[SPLAT_SPLAT9:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT8]], <1 x float> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP7:%.*]] = fmul <1 x float> [[BLOCK7]], [[SPLAT_SPLAT9]]
				; CHECK-NEXT: [[BLOCK10:%.*]] = shufflevector <2 x float> [[SPLIT1]], <2 x float> undef, <1 x i32> <i32 1>
				; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[SPLIT2]], i64 1
				; CHECK-NEXT: [[SPLAT_SPLATINSERT11:%.*]] = insertelement <1 x float> undef, float [[TMP8]], i32 0
				; CHECK-NEXT: [[SPLAT_SPLAT12:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT11]], <1 x float> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP9:%.*]] = call <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK10]], <1 x float> [[SPLAT_SPLAT12]], <1 x float> [[TMP7]])
				; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <1 x float> [[TMP9]], <1 x float> undef, <2 x i32> <i32 0, i32 undef>
				; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP10]], <2 x i32> <i32 0, i32 2>
				; CHECK-NEXT: [[BLOCK13:%.*]] = shufflevector <2 x float> [[SPLIT]], <2 x float> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[SPLIT3]], i64 0
				; CHECK-NEXT: [[SPLAT_SPLATINSERT14:%.*]] = insertelement <1 x float> undef, float [[TMP12]], i32 0
				; CHECK-NEXT: [[SPLAT_SPLAT15:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT14]], <1 x float> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP13:%.*]] = fmul <1 x float> [[BLOCK13]], [[SPLAT_SPLAT15]]
				; CHECK-NEXT: [[BLOCK16:%.*]] = shufflevector <2 x float> [[SPLIT1]], <2 x float> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[SPLIT3]], i64 1
				; CHECK-NEXT: [[SPLAT_SPLATINSERT17:%.*]] = insertelement <1 x float> undef, float [[TMP14]], i32 0
				; CHECK-NEXT: [[SPLAT_SPLAT18:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT17]], <1 x float> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP15:%.*]] = call <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK16]], <1 x float> [[SPLAT_SPLAT18]], <1 x float> [[TMP13]])
				; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <1 x float> [[TMP15]], <1 x float> undef, <2 x i32> <i32 0, i32 undef>
				; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x float> undef, <2 x float> [[TMP16]], <2 x i32> <i32 2, i32 1>
				; CHECK-NEXT: [[BLOCK19:%.*]] = shufflevector <2 x float> [[SPLIT]], <2 x float> undef, <1 x i32> <i32 1>
				; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[SPLIT3]], i64 0
				; CHECK-NEXT: [[SPLAT_SPLATINSERT20:%.*]] = insertelement <1 x float> undef, float [[TMP18]], i32 0
				; CHECK-NEXT: [[SPLAT_SPLAT21:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT20]], <1 x float> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP19:%.*]] = fmul <1 x float> [[BLOCK19]], [[SPLAT_SPLAT21]]
				; CHECK-NEXT: [[BLOCK22:%.*]] = shufflevector <2 x float> [[SPLIT1]], <2 x float> undef, <1 x i32> <i32 1>
				; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[SPLIT3]], i64 1
				; CHECK-NEXT: [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x float> undef, float [[TMP20]], i32 0
				; CHECK-NEXT: [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT23]], <1 x float> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP21:%.*]] = call <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK22]], <1 x float> [[SPLAT_SPLAT24]], <1 x float> [[TMP19]])
				; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <1 x float> [[TMP21]], <1 x float> undef, <2 x i32> <i32 0, i32 undef>
				; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x float> [[TMP17]], <2 x float> [[TMP22]], <2 x i32> <i32 0, i32 2>
				; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> [[TMP23]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
				; CHECK-NEXT: ret <4 x float> [[TMP24]]
				;
				entry:
				%c = call contract <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %a, <4 x float> %b, i32 2, i32 2, i32 2)
				ret <4 x float> %c
				}

				declare <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float>, <4 x float>, i32, i32, i32)

llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float-contraction.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
				; RUN: opt -lower-matrix-intrinsics -matrix-allow-contract -S < %s \| FileCheck %s
				; RUN: opt -passes='lower-matrix-intrinsics' -matrix-allow-contract -S < %s \| FileCheck %s


				define <4 x float> @multiply_2x2(<4 x float> %a, <4 x float> %b) {
				; CHECK-LABEL: @multiply_2x2(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[SPLIT:%.]] = shufflevector <4 x float> [[A:%.]], <4 x float> undef, <2 x i32> <i32 0, i32 1>
				; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 2, i32 3>
				; CHECK-NEXT: [[SPLIT2:%.]] = shufflevector <4 x float> [[B:%.]], <4 x float> undef, <2 x i32> <i32 0, i32 1>
				; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <4 x float> [[B]], <4 x float> undef, <2 x i32> <i32 2, i32 3>
				; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <2 x float> [[SPLIT]], <2 x float> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x float> [[SPLIT2]], i64 0
				; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x float> undef, float [[TMP0]], i32 0
				; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT]], <1 x float> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP1:%.*]] = fmul <1 x float> [[BLOCK]], [[SPLAT_SPLAT]]
				; CHECK-NEXT: [[BLOCK4:%.*]] = shufflevector <2 x float> [[SPLIT1]], <2 x float> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[SPLIT2]], i64 1
				; CHECK-NEXT: [[SPLAT_SPLATINSERT5:%.*]] = insertelement <1 x float> undef, float [[TMP2]], i32 0
				; CHECK-NEXT: [[SPLAT_SPLAT6:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT5]], <1 x float> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP3:%.*]] = call <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK4]], <1 x float> [[SPLAT_SPLAT6]], <1 x float> [[TMP1]])
				; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <1 x float> [[TMP3]], <1 x float> undef, <2 x i32> <i32 0, i32 undef>
				; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> undef, <2 x float> [[TMP4]], <2 x i32> <i32 2, i32 1>
				; CHECK-NEXT: [[BLOCK7:%.*]] = shufflevector <2 x float> [[SPLIT]], <2 x float> undef, <1 x i32> <i32 1>
				; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[SPLIT2]], i64 0
				; CHECK-NEXT: [[SPLAT_SPLATINSERT8:%.*]] = insertelement <1 x float> undef, float [[TMP6]], i32 0
				; CHECK-NEXT: [[SPLAT_SPLAT9:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT8]], <1 x float> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP7:%.*]] = fmul <1 x float> [[BLOCK7]], [[SPLAT_SPLAT9]]
				; CHECK-NEXT: [[BLOCK10:%.*]] = shufflevector <2 x float> [[SPLIT1]], <2 x float> undef, <1 x i32> <i32 1>
				; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[SPLIT2]], i64 1
				; CHECK-NEXT: [[SPLAT_SPLATINSERT11:%.*]] = insertelement <1 x float> undef, float [[TMP8]], i32 0
				; CHECK-NEXT: [[SPLAT_SPLAT12:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT11]], <1 x float> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP9:%.*]] = call <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK10]], <1 x float> [[SPLAT_SPLAT12]], <1 x float> [[TMP7]])
				; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <1 x float> [[TMP9]], <1 x float> undef, <2 x i32> <i32 0, i32 undef>
				; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP10]], <2 x i32> <i32 0, i32 2>
				; CHECK-NEXT: [[BLOCK13:%.*]] = shufflevector <2 x float> [[SPLIT]], <2 x float> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[SPLIT3]], i64 0
				; CHECK-NEXT: [[SPLAT_SPLATINSERT14:%.*]] = insertelement <1 x float> undef, float [[TMP12]], i32 0
				; CHECK-NEXT: [[SPLAT_SPLAT15:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT14]], <1 x float> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP13:%.*]] = fmul <1 x float> [[BLOCK13]], [[SPLAT_SPLAT15]]
				; CHECK-NEXT: [[BLOCK16:%.*]] = shufflevector <2 x float> [[SPLIT1]], <2 x float> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[SPLIT3]], i64 1
				; CHECK-NEXT: [[SPLAT_SPLATINSERT17:%.*]] = insertelement <1 x float> undef, float [[TMP14]], i32 0
				; CHECK-NEXT: [[SPLAT_SPLAT18:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT17]], <1 x float> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP15:%.*]] = call <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK16]], <1 x float> [[SPLAT_SPLAT18]], <1 x float> [[TMP13]])
				; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <1 x float> [[TMP15]], <1 x float> undef, <2 x i32> <i32 0, i32 undef>
				; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x float> undef, <2 x float> [[TMP16]], <2 x i32> <i32 2, i32 1>
				; CHECK-NEXT: [[BLOCK19:%.*]] = shufflevector <2 x float> [[SPLIT]], <2 x float> undef, <1 x i32> <i32 1>
				; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[SPLIT3]], i64 0
				; CHECK-NEXT: [[SPLAT_SPLATINSERT20:%.*]] = insertelement <1 x float> undef, float [[TMP18]], i32 0
				; CHECK-NEXT: [[SPLAT_SPLAT21:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT20]], <1 x float> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP19:%.*]] = fmul <1 x float> [[BLOCK19]], [[SPLAT_SPLAT21]]
				; CHECK-NEXT: [[BLOCK22:%.*]] = shufflevector <2 x float> [[SPLIT1]], <2 x float> undef, <1 x i32> <i32 1>
				; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[SPLIT3]], i64 1
				; CHECK-NEXT: [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x float> undef, float [[TMP20]], i32 0
				; CHECK-NEXT: [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT23]], <1 x float> undef, <1 x i32> zeroinitializer
				; CHECK-NEXT: [[TMP21:%.*]] = call <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK22]], <1 x float> [[SPLAT_SPLAT24]], <1 x float> [[TMP19]])
				; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <1 x float> [[TMP21]], <1 x float> undef, <2 x i32> <i32 0, i32 undef>
				; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x float> [[TMP17]], <2 x float> [[TMP22]], <2 x i32> <i32 0, i32 2>
				; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> [[TMP23]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
				; CHECK-NEXT: ret <4 x float> [[TMP24]]
				;
				entry:
				%c = call contract <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %a, <4 x float> %b, i32 2, i32 2, i32 2)
				ret <4 x float> %c
				}

				declare <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float>, <4 x float>, i32, i32, i32)

This is an archive of the discontinued LLVM Phabricator instance.

[Matrix] Use fmuladd for matrix.multiply if allowed.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 235135

llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp

llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-contraction-fmf.ll

llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-contraction.ll

llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float-contraction-fmf.ll

llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float-contraction.ll

This is an archive of the discontinued LLVM Phabricator instance.

[Matrix] Use fmuladd for matrix.multiply if allowed.ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 235135

llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp

llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-contraction-fmf.ll

llvm/test/Transforms/LowerMatrixIntrinsics/multiply-double-contraction.ll

llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float-contraction-fmf.ll

llvm/test/Transforms/LowerMatrixIntrinsics/multiply-float-contraction.ll

[Matrix] Use fmuladd for matrix.multiply if allowed.
ClosedPublic