Diff 374457

llvm/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 47,472 Lines • ▼ Show 20 Lines	static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
assert((NumEltsPer128BitChunk % 2 == 0) &&		assert((NumEltsPer128BitChunk % 2 == 0) &&
"Vector type should have an even number of elements in each lane");		"Vector type should have an even number of elements in each lane");
for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {		for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {		for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
// Ignore undefined components.		// Ignore undefined components.
int LIdx = LMask[i + j], RIdx = RMask[i + j];		int LIdx = LMask[i + j], RIdx = RMask[i + j];
if (LIdx < 0 \|\| RIdx < 0 \|\|		if (LIdx < 0 \|\| RIdx < 0 \|\|
(!A.getNode() && (LIdx < (int)NumElts \|\| RIdx < (int)NumElts)) \|\|		(!A.getNode() && (LIdx < (int)NumElts \|\| RIdx < (int)NumElts)) \|\|
(!B.getNode() && (LIdx >= (int)NumElts \|\| RIdx >= (int)NumElts)))		(!B.getNode() && (LIdx >= (int)NumElts \|\| RIdx >= (int)NumElts)))
		xbolva00Unsubmitted Not Done Reply Inline Actions Do we really need this output here? Simplify it a bit? Something like you wrote "Combine the FADD(A, FMA(B, C, 0)) to FMA(B, C, A)"? xbolva00: Do we really need this output here? Simplify it a bit? Something like you wrote "Combine the…
		LiuChen3AuthorUnsubmitted Done Reply Inline Actions Good idea. LiuChen3: Good idea.
continue;		continue;

// Check that successive odd/even elements are being operated on. If not,		// Check that successive odd/even elements are being operated on. If not,
// this is not a horizontal operation.		// this is not a horizontal operation.
if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&		if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
!((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))		!((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
return false;		return false;

Show All 14 Lines	static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,

SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.		SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.		SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.

bool IsIdentityPostShuffle =		bool IsIdentityPostShuffle =
isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);		isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
if (IsIdentityPostShuffle)		if (IsIdentityPostShuffle)
PostShuffleMask.clear();		PostShuffleMask.clear();

		pengfeiUnsubmitted Not Done Reply Inline Actions We don't need else after return. See the Lint comment. pengfei: We don't need else after return. See the Lint comment.
// Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).		// Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&		if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
		pengfeiUnsubmitted Not Done Reply Inline Actions indent pengfei: indent
isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))		isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
return false;		return false;

		pengfeiUnsubmitted Not Done Reply Inline Actions Why do we still need this? pengfei: Why do we still need this?
		LiuChen3AuthorUnsubmitted Done Reply Inline Actions We need to transform FMA(A, B, 0) to MUL(A, B) first. LiuChen3: We need to transform FMA(A, B, 0) to MUL(A, B) first.
		LiuChen3AuthorUnsubmitted Done Reply Inline Actions My bad. I see what you mean now. LiuChen3: My bad. I see what you mean now.
// If the source nodes are already used in HorizOps then always accept this.		// If the source nodes are already used in HorizOps then always accept this.
		pengfeiUnsubmitted Not Done Reply Inline Actions Can these be MulOp0 = Op0->getOperand(1); MulOp1 = Op0->getOperand(2); pengfei: Can these be ``` MulOp0 = Op0->getOperand(1); MulOp1 = Op0->getOperand(2); ```
// Shuffle folding should merge these back together.		// Shuffle folding should merge these back together.
bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {		bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
return User->getOpcode() == HOpcode && User->getValueType(0) == VT;		return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
});		});
		pengfeiUnsubmitted Not Done Reply Inline Actions I think we can use `bool IsConj`, `SDValue MulOp0, MulOp1` instead of `CFmul`. Then you don't need to create a temp mul node. pengfei: I think we can use `bool IsConj`, `SDValue MulOp0, MulOp1` instead of `CFmul`. Then you don't…
		LiuChen3AuthorUnsubmitted Done Reply Inline Actions It seems we would create more temp nodes. Is that better? LiuChen3: It seems we would create more temp nodes. Is that better?
		pengfeiUnsubmitted Not Done Reply Inline Actions They are temp variables rather than nodes, and the compiler will likely optimize them away. pengfei: They are temp variables rather than nodes, and the compiler will likely optimize them away.
		pengfeiUnsubmitted Not Done Reply Inline Actions I think we can then use if ((Opcode == X86ISD::VFMULC \|\| Opcode == X86ISD::VFCMULC)) { ... return true; } if ((Opcode == X86ISD::VFMADDC \|\| Opcode == X86ISD::VFCMADDC) ... { ... return true; } return false; pengfei: I think we can then use ``` if ((Opcode == X86ISD::VFMULC \|\| Opcode == X86ISD::VFCMULC)) { ...
bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {		bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
return User->getOpcode() == HOpcode && User->getValueType(0) == VT;		return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
});		});
bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;		bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;

// Assume a SingleSource HOP if we only shuffle one input and don't need to		// Assume a SingleSource HOP if we only shuffle one input and don't need to
// shuffle the result.		// shuffle the result.
if (!ForceHorizOp &&		if (!ForceHorizOp &&
		pengfeiUnsubmitted Not Done Reply Inline Actions Better to add parentheses. pengfei: Better to add parentheses.
!shouldUseHorizontalOp(NewLHS == NewRHS &&		!shouldUseHorizontalOp(NewLHS == NewRHS &&
(NumShuffles < 2 \|\| !IsIdentityPostShuffle),		(NumShuffles < 2 \|\| !IsIdentityPostShuffle),
DAG, Subtarget))		DAG, Subtarget))
		LiuChen3AuthorUnsubmitted Done Reply Inline Actions Maybe we can just check hasNoSignedZeros() and hasAllowContract() as pengfei said? LiuChen3: Maybe we can just check hasNoSignedZeros() and hasAllowContract() as pengfei said?
		pengfeiUnsubmitted Not Done Reply Inline Actions Yeah, I prefer checking both at line 47582. pengfei: Yeah, I prefer checking both at line 47582.
return false;		return false;
		pengfeiUnsubmitted Not Done Reply Inline Actions I think we can remove the assert now. pengfei: I think we can remove the assert now.

LHS = DAG.getBitcast(VT, NewLHS);		LHS = DAG.getBitcast(VT, NewLHS);
RHS = DAG.getBitcast(VT, NewRHS);		RHS = DAG.getBitcast(VT, NewRHS);
return true;		return true;
}		}

// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.		// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,		static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
Show All 31 Lines	if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 \|\| VT == MVT::v4i32 \|\|
if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,		if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
PostShuffleMask)) {		PostShuffleMask)) {
auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,		auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {		ArrayRef<SDValue> Ops) {
return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);		return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
};		};
SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,		SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
{LHS, RHS}, HOpBuilder);		{LHS, RHS}, HOpBuilder);
if (!PostShuffleMask.empty())		if (!PostShuffleMask.empty())
		pengfeiUnsubmitted Not Done Reply Inline Actions Should this be AllowContract(Op0->getFlags()) && (ISD::isBuildVectorAllZeros(Op0->getOperand(0).getNode()) && Op0->getFlags().hasNoSignedZeros()) \|\| IsVectorAllNegativeZero(Op0->getOperand(0).getNode())) I.e, check `AllowContract` together with `IsVectorAllNegativeZero` as well. pengfei: Should this be ``` AllowContract(Op0->getFlags()) && (ISD::isBuildVectorAllZeros(Op0…
		LiuChen3AuthorUnsubmitted Done Reply Inline Actions AllowContract will check hasNoSignedZeros(). It seems that we can only do this combination when the fast-math flag is set, no matter whether the third operand is +0.0 or -0.0. Whether it is +0.0 or -0.0 affects the conversion of FMA(a, b, ±0.0) to FMUL(a, b). LiuChen3: AllowContract will check hasNoSignedZeros(). It seems that we can only do this combination when…
HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,		HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
DAG.getUNDEF(VT), PostShuffleMask);		DAG.getUNDEF(VT), PostShuffleMask);
return HorizBinOp;		return HorizBinOp;
}		}
}		}
break;		break;
}		}

Show All 13 Lines
// t22: v16f32 = bitcast t7		// t22: v16f32 = bitcast t7
// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22		// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
// t24: v32f16 = bitcast t23		// t24: v32f16 = bitcast t23
static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,		static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {		const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);		EVT VT = N->getValueType(0);
SDValue LHS = N->getOperand(0);		SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);		SDValue RHS = N->getOperand(1);
int CombineOpcode =		int CombineOpcode =
N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;		N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
auto isConjugationConstant = [](const Constant *c) {		auto isConjugationConstant = [](const Constant *c) {
if (const auto *CI = dyn_cast<ConstantInt>(c)) {		if (const auto *CI = dyn_cast<ConstantInt>(c)) {
APInt ConjugationInt32 = APInt(32, 0x80000000, true);		APInt ConjugationInt32 = APInt(32, 0x80000000, true);
APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL, true);		APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL, true);
switch (CI->getBitWidth()) {		switch (CI->getBitWidth()) {
case 16:		case 16:
return false;		return false;
case 32:		case 32:
		pengfeiUnsubmitted Not Done Reply Inline Actions This seems to have been changed unintentionally. pengfei: This seems to have been changed unintentionally.
		LiuChen3AuthorUnsubmitted Done Reply Inline Actions Sorry about this. Looks like I accidentally changed something here. LiuChen3: Sorry about this. Looks like I accidentally changed something here.
return CI->getValue() == ConjugationInt32;		return CI->getValue() == ConjugationInt32;
case 64:		case 64:
return CI->getValue() == ConjugationInt64;		return CI->getValue() == ConjugationInt64;
default:		default:
llvm_unreachable("Unexpected bit width");		llvm_unreachable("Unexpected bit width");
}		}
}		}
if (const auto *CF = dyn_cast<ConstantFP>(c))		if (const auto *CF = dyn_cast<ConstantFP>(c))
return CF->isNegativeZeroValue();		return CF->isNegativeZeroValue();
return false;		return false;
};		};
auto combineConjugation = [&](SDValue &r) {		auto combineConjugation = [&](SDValue &r) {
if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {		if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
		pengfeiUnsubmitted Not Done Reply Inline Actions The indentation is wrong too. The same below. pengfei: The indentation is wrong too. The same below.
SDValue XOR = LHS.getOperand(0);		SDValue XOR = LHS.getOperand(0);
if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {		if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
SDValue XORRHS = XOR.getOperand(1);		SDValue XORRHS = XOR.getOperand(1);
if (XORRHS.getOpcode() == ISD::BITCAST && XORRHS.hasOneUse())		if (XORRHS.getOpcode() == ISD::BITCAST && XORRHS.hasOneUse())
XORRHS = XORRHS.getOperand(0);		XORRHS = XORRHS.getOperand(0);
if (XORRHS.getOpcode() == X86ISD::VBROADCAST_LOAD &&		if (XORRHS.getOpcode() == X86ISD::VBROADCAST_LOAD &&
XORRHS.getOperand(1).getNumOperands()) {		XORRHS.getOperand(1).getNumOperands()) {
ConstantPoolSDNode *CP =		ConstantPoolSDNode *CP =
Show All 14 Lines	static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
if (combineConjugation(Res))		if (combineConjugation(Res))
return Res;		return Res;
std::swap(LHS, RHS);		std::swap(LHS, RHS);
if (combineConjugation(Res))		if (combineConjugation(Res))
return Res;		return Res;
return Res;		return Res;
}		}

// Try to combine the following nodes		// Try to combine the following nodes:
// t21: v16f32 = X86ISD::VFMULC/VFCMULC t7, t8		// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
// t15: v32f16 = bitcast t21
// t16: v32f16 = fadd nnan ninf nsz arcp contract afn reassoc t15, t2
// into X86ISD::VFMADDC/VFCMADDC if possible:
// t22: v16f32 = bitcast t2
// t23: v16f32 = nnan ninf nsz arcp contract afn reassoc
// X86ISD::VFMADDC/VFCMADDC t7, t8, t22
// t24: v32f16 = bitcast t23
static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,		static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {		const X86Subtarget &Subtarget) {
auto AllowContract = [&DAG](SDNode *N) {		auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast \|\|		return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast \|\|
N->getFlags().hasAllowContract();		Flags.hasAllowContract();
		};

		auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
		return DAG.getTarget().Options.NoSignedZerosFPMath \|\|
		Flags.hasNoSignedZeros();
		};
		auto IsVectorAllNegativeZero = [](const SDNode *N) {
		if (N->getOpcode() != X86ISD::VBROADCAST_LOAD)
		return false;
		assert(N->getSimpleValueType(0).getScalarType() == MVT::f32 &&
		"Unexpected vector type!");
		if (ConstantPoolSDNode *CP =
		dyn_cast<ConstantPoolSDNode>(N->getOperand(1)->getOperand(0))) {
		APInt AI = APInt(32, 0x80008000, true);
		if (const auto *CI = dyn_cast<ConstantInt>(CP->getConstVal()))
		return CI->getValue() == AI;
		if (const auto *CF = dyn_cast<ConstantFP>(CP->getConstVal()))
		return CF->getValue() == APFloat(APFloat::IEEEsingle(), AI);
		}
		return false;
};		};
if (N->getOpcode() != ISD::FADD \|\| !Subtarget.hasFP16() \|\| !AllowContract(N))
		if (N->getOpcode() != ISD::FADD \|\| !Subtarget.hasFP16() \|\|
		!AllowContract(N->getFlags()))
return SDValue();		return SDValue();

EVT VT = N->getValueType(0);		EVT VT = N->getValueType(0);
if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)		if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
return SDValue();		return SDValue();

SDValue LHS = N->getOperand(0);		SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);		SDValue RHS = N->getOperand(1);
SDValue CFmul, FAddOp1;		bool IsConj;
auto GetCFmulFrom = [&CFmul, &AllowContract](SDValue N) -> bool {		SDValue FAddOp1, MulOp0, MulOp1;
		auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
		&IsVectorAllNegativeZero,
		&HasNoSignedZero](SDValue N) -> bool {
if (!N.hasOneUse() \|\| N.getOpcode() != ISD::BITCAST)		if (!N.hasOneUse() \|\| N.getOpcode() != ISD::BITCAST)
return false;		return false;
SDValue Op0 = N.getOperand(0);		SDValue Op0 = N.getOperand(0);
unsigned Opcode = Op0.getOpcode();		unsigned Opcode = Op0.getOpcode();
if (Op0.hasOneUse() && AllowContract(Op0.getNode()) &&		if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
(Opcode == X86ISD::VFMULC \|\| Opcode == X86ISD::VFCMULC))		if ((Opcode == X86ISD::VFMULC \|\| Opcode == X86ISD::VFCMULC)) {
CFmul = Op0;		MulOp0 = Op0.getOperand(0);
return !!CFmul;		MulOp1 = Op0.getOperand(1);
		IsConj = Opcode == X86ISD::VFCMULC;
		return true;
		}
		if ((Opcode == X86ISD::VFMADDC \|\| Opcode == X86ISD::VFCMADDC) &&
		((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
		HasNoSignedZero(Op0->getFlags())) \|\|
		IsVectorAllNegativeZero(Op0->getOperand(2).getNode()))) {
		MulOp0 = Op0.getOperand(0);
		MulOp1 = Op0.getOperand(1);
		IsConj = Opcode == X86ISD::VFCMADDC;
		return true;
		}
		}
		return false;
};		};

if (GetCFmulFrom(LHS))		if (GetCFmulFrom(LHS))
FAddOp1 = RHS;		FAddOp1 = RHS;
else if (GetCFmulFrom(RHS))		else if (GetCFmulFrom(RHS))
FAddOp1 = LHS;		FAddOp1 = LHS;
else		else
return SDValue();		return SDValue();

MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);		MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
assert(CFmul->getValueType(0) == CVT && "Complex type mismatch");
FAddOp1 = DAG.getBitcast(CVT, FAddOp1);		FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
unsigned newOp = CFmul.getOpcode() == X86ISD::VFMULC ? X86ISD::VFMADDC		unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
: X86ISD::VFCMADDC;
// FIXME: How do we handle when fast math flags of FADD are different from		// FIXME: How do we handle when fast math flags of FADD are different from
// CFMUL's?		// CFMUL's?
CFmul = DAG.getNode(newOp, SDLoc(N), CVT, FAddOp1, CFmul.getOperand(0),		SDValue CFmul =
CFmul.getOperand(1), N->getFlags());		DAG.getNode(NewOp, SDLoc(N), CVT, FAddOp1, MulOp0, MulOp1, N->getFlags());
return DAG.getBitcast(VT, CFmul);		return DAG.getBitcast(VT, CFmul);
}		}

/// Do target-specific dag combines on floating-point adds/subs.		/// Do target-specific dag combines on floating-point adds/subs.
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,		static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {		const X86Subtarget &Subtarget) {
if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))		if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
return HOp;		return HOp;
▲ Show 20 Lines • Show All 6,057 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/avx512fp16-combine-vfmac-fadd.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown --fp-contract=fast --enable-no-signed-zeros-fp-math -mattr=avx512fp16 \| FileCheck %s --check-prefixes=CHECK,NO-SZ
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown --fp-contract=fast -mattr=avx512fp16 \| FileCheck %s --check-prefixes=CHECK,HAS-SZ
				pengfeiUnsubmitted Not Done Reply Inline Actions How about `CHECK,NO-SZ` pengfei: How about `CHECK,NO-SZ`

				pengfeiUnsubmitted Not Done Reply Inline Actions How about `CHECK,HAS-SZ` pengfei: How about `CHECK,HAS-SZ`
				; FADD(acc, FMA(a, b, +0.0)) can be combined to FMA(a, b, acc) if the nsz flag is set.
				define dso_local <32 x half> @test1(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {
				; NO-SZ-LABEL: test1:
				; NO-SZ: # %bb.0: # %entry
				; NO-SZ-NEXT: vfcmaddcph %zmm1, %zmm0, %zmm2
				; NO-SZ-NEXT: vmovaps %zmm2, %zmm0
				; NO-SZ-NEXT: retq
				;
				; HAS-SZ-LABEL: test1:
				; HAS-SZ: # %bb.0: # %entry
				; HAS-SZ-NEXT: vxorps %xmm3, %xmm3, %xmm3
				; HAS-SZ-NEXT: vfcmaddcph %zmm2, %zmm1, %zmm3
				; HAS-SZ-NEXT: vaddph %zmm0, %zmm3, %zmm0
				; HAS-SZ-NEXT: retq
				entry:
				%0 = bitcast <32 x half> %a to <16 x float>
				%1 = bitcast <32 x half> %b to <16 x float>
				%2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4)
				%3 = bitcast <16 x float> %2 to <32 x half>
				%add.i = fadd <32 x half> %3, %acc
				ret <32 x half> %add.i
				}

				define dso_local <32 x half> @test2(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {
				; NO-SZ-LABEL: test2:
				; NO-SZ: # %bb.0: # %entry
				; NO-SZ-NEXT: vfmaddcph %zmm1, %zmm0, %zmm2
				; NO-SZ-NEXT: vmovaps %zmm2, %zmm0
				; NO-SZ-NEXT: retq
				;
				; HAS-SZ-LABEL: test2:
				; HAS-SZ: # %bb.0: # %entry
				; HAS-SZ-NEXT: vxorps %xmm3, %xmm3, %xmm3
				; HAS-SZ-NEXT: vfmaddcph %zmm2, %zmm1, %zmm3
				; HAS-SZ-NEXT: vaddph %zmm0, %zmm3, %zmm0
				; HAS-SZ-NEXT: retq
				entry:
				%0 = bitcast <32 x half> %a to <16 x float>
				%1 = bitcast <32 x half> %b to <16 x float>
				%2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4)
				%3 = bitcast <16 x float> %2 to <32 x half>
				%add.i = fadd <32 x half> %3, %acc
				ret <32 x half> %add.i
				}

				define dso_local <16 x half> @test3(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {
				; NO-SZ-LABEL: test3:
				; NO-SZ: # %bb.0: # %entry
				; NO-SZ-NEXT: vfcmaddcph %ymm1, %ymm0, %ymm2
				; NO-SZ-NEXT: vmovaps %ymm2, %ymm0
				; NO-SZ-NEXT: retq
				;
				; HAS-SZ-LABEL: test3:
				; HAS-SZ: # %bb.0: # %entry
				; HAS-SZ-NEXT: vxorps %xmm3, %xmm3, %xmm3
				; HAS-SZ-NEXT: vfcmaddcph %ymm2, %ymm1, %ymm3
				; HAS-SZ-NEXT: vaddph %ymm0, %ymm3, %ymm0
				; HAS-SZ-NEXT: retq
				entry:
				%0 = bitcast <16 x half> %a to <8 x float>
				%1 = bitcast <16 x half> %b to <8 x float>
				%2 = tail call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> zeroinitializer, i8 -1)
				%3 = bitcast <8 x float> %2 to <16 x half>
				%add.i = fadd <16 x half> %3, %acc
				ret <16 x half> %add.i
				}

				define dso_local <16 x half> @test4(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {
				; NO-SZ-LABEL: test4:
				; NO-SZ: # %bb.0: # %entry
				; NO-SZ-NEXT: vfmaddcph %ymm1, %ymm0, %ymm2
				; NO-SZ-NEXT: vmovaps %ymm2, %ymm0
				; NO-SZ-NEXT: retq
				;
				; HAS-SZ-LABEL: test4:
				; HAS-SZ: # %bb.0: # %entry
				; HAS-SZ-NEXT: vxorps %xmm3, %xmm3, %xmm3
				; HAS-SZ-NEXT: vfmaddcph %ymm2, %ymm1, %ymm3
				; HAS-SZ-NEXT: vaddph %ymm0, %ymm3, %ymm0
				; HAS-SZ-NEXT: retq
				entry:
				%0 = bitcast <16 x half> %a to <8 x float>
				%1 = bitcast <16 x half> %b to <8 x float>
				%2 = tail call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> zeroinitializer, i8 -1)
				%3 = bitcast <8 x float> %2 to <16 x half>
				%add.i = fadd <16 x half> %3, %acc
				ret <16 x half> %add.i
				}

				define dso_local <8 x half> @test5(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {
				; NO-SZ-LABEL: test5:
				; NO-SZ: # %bb.0: # %entry
				; NO-SZ-NEXT: vfcmaddcph %xmm1, %xmm0, %xmm2
				; NO-SZ-NEXT: vmovaps %xmm2, %xmm0
				; NO-SZ-NEXT: retq
				;
				; HAS-SZ-LABEL: test5:
				; HAS-SZ: # %bb.0: # %entry
				; HAS-SZ-NEXT: vxorps %xmm3, %xmm3, %xmm3
				; HAS-SZ-NEXT: vfcmaddcph %xmm2, %xmm1, %xmm3
				; HAS-SZ-NEXT: vaddph %xmm0, %xmm3, %xmm0
				; HAS-SZ-NEXT: retq
				entry:
				%0 = bitcast <8 x half> %a to <4 x float>
				%1 = bitcast <8 x half> %b to <4 x float>
				%2 = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> zeroinitializer, i8 -1)
				%3 = bitcast <4 x float> %2 to <8 x half>
				%add.i = fadd <8 x half> %3, %acc
				ret <8 x half> %add.i
				}

				define dso_local <8 x half> @test6(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {
				; NO-SZ-LABEL: test6:
				; NO-SZ: # %bb.0: # %entry
				; NO-SZ-NEXT: vfmaddcph %xmm1, %xmm0, %xmm2
				; NO-SZ-NEXT: vmovaps %xmm2, %xmm0
				; NO-SZ-NEXT: retq
				;
				; HAS-SZ-LABEL: test6:
				; HAS-SZ: # %bb.0: # %entry
				; HAS-SZ-NEXT: vxorps %xmm3, %xmm3, %xmm3
				; HAS-SZ-NEXT: vfmaddcph %xmm2, %xmm1, %xmm3
				; HAS-SZ-NEXT: vaddph %xmm0, %xmm3, %xmm0
				; HAS-SZ-NEXT: retq
				entry:
				%0 = bitcast <8 x half> %a to <4 x float>
				%1 = bitcast <8 x half> %b to <4 x float>
				%2 = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> zeroinitializer, i8 -1)
				%3 = bitcast <4 x float> %2 to <8 x half>
				%add.i = fadd <8 x half> %3, %acc
				ret <8 x half> %add.i
				}

				; FADD(acc, FMA(a, b, -0.0)) can be combined to FMA(a, b, acc) regardless of whether the nsz flag is set.
				define dso_local <32 x half> @test13(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {
				; CHECK-LABEL: test13:
				; CHECK: # %bb.0: # %entry
				; CHECK-NEXT: vfcmaddcph %zmm1, %zmm0, %zmm2
				; CHECK-NEXT: vmovaps %zmm2, %zmm0
				; CHECK-NEXT: retq
				entry:
				%0 = bitcast <32 x half> %a to <16 x float>
				%1 = bitcast <32 x half> %b to <16 x float>
				%2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i16 -1, i32 4)
				%3 = bitcast <16 x float> %2 to <32 x half>
				%add.i = fadd <32 x half> %3, %acc
				ret <32 x half> %add.i
				}

				define dso_local <32 x half> @test14(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {
				; CHECK-LABEL: test14:
				; CHECK: # %bb.0: # %entry
				; CHECK-NEXT: vfmaddcph %zmm1, %zmm0, %zmm2
				; CHECK-NEXT: vmovaps %zmm2, %zmm0
				; CHECK-NEXT: retq
				entry:
				%0 = bitcast <32 x half> %a to <16 x float>
				%1 = bitcast <32 x half> %b to <16 x float>
				%2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i16 -1, i32 4)
				%3 = bitcast <16 x float> %2 to <32 x half>
				%add.i = fadd <32 x half> %3, %acc
				ret <32 x half> %add.i
				}

				define dso_local <16 x half> @test15(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {
				; CHECK-LABEL: test15:
				; CHECK: # %bb.0: # %entry
				; CHECK-NEXT: vfcmaddcph %ymm1, %ymm0, %ymm2
				; CHECK-NEXT: vmovaps %ymm2, %ymm0
				; CHECK-NEXT: retq
				entry:
				%0 = bitcast <16 x half> %a to <8 x float>
				%1 = bitcast <16 x half> %b to <8 x float>
				%2 = tail call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)
				%3 = bitcast <8 x float> %2 to <16 x half>
				%add.i = fadd <16 x half> %3, %acc
				ret <16 x half> %add.i
				}

				define dso_local <16 x half> @test16(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {
				; CHECK-LABEL: test16:
				; CHECK: # %bb.0: # %entry
				; CHECK-NEXT: vfmaddcph %ymm1, %ymm0, %ymm2
				; CHECK-NEXT: vmovaps %ymm2, %ymm0
				; CHECK-NEXT: retq
				entry:
				%0 = bitcast <16 x half> %a to <8 x float>
				%1 = bitcast <16 x half> %b to <8 x float>
				%2 = tail call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)
				%3 = bitcast <8 x float> %2 to <16 x half>
				%add.i = fadd <16 x half> %3, %acc
				ret <16 x half> %add.i
				}
				LiuChen3AuthorUnsubmitted Done Reply Inline Actions Should we do this combine standalone? LiuChen3: Should we do this combine standalone?

				define dso_local <8 x half> @test17(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {
				; CHECK-LABEL: test17:
				; CHECK: # %bb.0: # %entry
				; CHECK-NEXT: vfcmaddcph %xmm1, %xmm0, %xmm2
				; CHECK-NEXT: vmovaps %xmm2, %xmm0
				; CHECK-NEXT: retq
				entry:
				%0 = bitcast <8 x half> %a to <4 x float>
				%1 = bitcast <8 x half> %b to <4 x float>
				%2 = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)
				%3 = bitcast <4 x float> %2 to <8 x half>
				%add.i = fadd <8 x half> %3, %acc
				ret <8 x half> %add.i
				}

				define dso_local <8 x half> @test18(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {
				; CHECK-LABEL: test18:
				; CHECK: # %bb.0: # %entry
				; CHECK-NEXT: vfmaddcph %xmm1, %xmm0, %xmm2
				; CHECK-NEXT: vmovaps %xmm2, %xmm0
				; CHECK-NEXT: retq
				entry:
				%0 = bitcast <8 x half> %a to <4 x float>
				%1 = bitcast <8 x half> %b to <4 x float>
				%2 = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)
				%3 = bitcast <4 x float> %2 to <8 x half>
				%add.i = fadd <8 x half> %3, %acc
				ret <8 x half> %add.i
				}

				declare <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32 immarg)
				declare <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32 immarg)
				declare <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
				declare <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
				declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
				declare <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)

This is an archive of the discontinued LLVM Phabricator instance.

[X86][FP16] Combine the FADD(A, FMA(B, C, 0)) to FMA(B, C, A)
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 374457

llvm/lib/Target/X86/X86ISelLowering.cpp

llvm/test/CodeGen/X86/avx512fp16-combine-vfmac-fadd.ll

This is an archive of the discontinued LLVM Phabricator instance.

[X86][FP16] Combine the FADD(A, FMA(B, C, 0)) to FMA(B, C, A)ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 374457

llvm/lib/Target/X86/X86ISelLowering.cpp

llvm/test/CodeGen/X86/avx512fp16-combine-vfmac-fadd.ll

[X86][FP16] Combine the FADD(A, FMA(B, C, 0)) to FMA(B, C, A)
ClosedPublic