Diff 69739

../lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 4,860 Lines • ▼ Show 20 Lines	else if (auto *CFN = dyn_cast<ConstantFPSDNode>(Op.getNode()))
SplitElementToMask(CFN->getValueAPF().bitcastToAPInt());		SplitElementToMask(CFN->getValueAPF().bitcastToAPInt());
else		else
return false;		return false;
}		}

return true;		return true;
}		}

static const Constant *getTargetShuffleMaskConstant(SDValue MaskNode) {		static const Constant *getTargetConstantFromNode(SDValue MaskNode) {
		spatel [Unsubmitted · Not Done]: It's good to generalize this, but you should also change the variable names from Mask* to something generic, since it's not about shuffles anymore. Please make this change ahead of the fneg changes to reduce the size of the patch.
MaskNode = peekThroughBitcasts(MaskNode);		MaskNode = peekThroughBitcasts(MaskNode);

auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);		auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
		spatel [Unsubmitted · Done]: MaskLoad -> Load?
if (!MaskLoad)		if (!MaskLoad)
return nullptr;		return nullptr;

SDValue Ptr = MaskLoad->getBasePtr();		SDValue Ptr = MaskLoad->getBasePtr();
if (Ptr->getOpcode() == X86ISD::Wrapper \|\|		if (Ptr->getOpcode() == X86ISD::Wrapper \|\|
Ptr->getOpcode() == X86ISD::WrapperRIP)		Ptr->getOpcode() == X86ISD::WrapperRIP)
Ptr = Ptr->getOperand(0);		Ptr = Ptr->getOperand(0);

auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);		auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
		spatel [Unsubmitted · Done]: MaskCP -> ConstantPoolNode?
if (!MaskCP \|\| MaskCP->isMachineConstantPoolEntry())		if (!MaskCP \|\| MaskCP->isMachineConstantPoolEntry())
return nullptr;		return nullptr;

return dyn_cast<Constant>(MaskCP->getConstVal());		return dyn_cast<Constant>(MaskCP->getConstVal());
}		}

/// Calculates the shuffle mask corresponding to the target-specific opcode.		/// Calculates the shuffle mask corresponding to the target-specific opcode.
/// If the mask could be calculated, returns it in \p Mask, returns the shuffle		/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
▲ Show 20 Lines • Show All 97 Lines • ▼ Show 20 Lines	case X86ISD::VPERMILPV: {
IsUnary = true;		IsUnary = true;
SDValue MaskNode = N->getOperand(1);		SDValue MaskNode = N->getOperand(1);
unsigned MaskEltSize = VT.getScalarSizeInBits();		unsigned MaskEltSize = VT.getScalarSizeInBits();
SmallVector<uint64_t, 32> RawMask;		SmallVector<uint64_t, 32> RawMask;
if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {		if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
DecodeVPERMILPMask(VT, RawMask, Mask);		DecodeVPERMILPMask(VT, RawMask, Mask);
break;		break;
}		}
if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {		if (auto *C = getTargetConstantFromNode(MaskNode)) {
DecodeVPERMILPMask(C, MaskEltSize, Mask);		DecodeVPERMILPMask(C, MaskEltSize, Mask);
break;		break;
}		}
return false;		return false;
}		}
case X86ISD::PSHUFB: {		case X86ISD::PSHUFB: {
IsUnary = true;		IsUnary = true;
SDValue MaskNode = N->getOperand(1);		SDValue MaskNode = N->getOperand(1);
SmallVector<uint64_t, 32> RawMask;		SmallVector<uint64_t, 32> RawMask;
if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {		if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
DecodePSHUFBMask(RawMask, Mask);		DecodePSHUFBMask(RawMask, Mask);
break;		break;
}		}
if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {		if (auto *C = getTargetConstantFromNode(MaskNode)) {
DecodePSHUFBMask(C, Mask);		DecodePSHUFBMask(C, Mask);
break;		break;
}		}
return false;		return false;
}		}
case X86ISD::VPERMI:		case X86ISD::VPERMI:
ImmN = N->getOperand(N->getNumOperands()-1);		ImmN = N->getOperand(N->getNumOperands()-1);
DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);		DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
Show All 32 Lines	case X86ISD::VPERMIL2: {
SDValue CtrlNode = N->getOperand(3);		SDValue CtrlNode = N->getOperand(3);
if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {		if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
unsigned CtrlImm = CtrlOp->getZExtValue();		unsigned CtrlImm = CtrlOp->getZExtValue();
SmallVector<uint64_t, 32> RawMask;		SmallVector<uint64_t, 32> RawMask;
if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {		if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);		DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
break;		break;
}		}
if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {		if (auto *C = getTargetConstantFromNode(MaskNode)) {
DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);		DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
break;		break;
}		}
}		}
return false;		return false;
}		}
case X86ISD::VPPERM: {		case X86ISD::VPPERM: {
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);		IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
SDValue MaskNode = N->getOperand(2);		SDValue MaskNode = N->getOperand(2);
SmallVector<uint64_t, 32> RawMask;		SmallVector<uint64_t, 32> RawMask;
if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {		if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
DecodeVPPERMMask(RawMask, Mask);		DecodeVPPERMMask(RawMask, Mask);
break;		break;
}		}
if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {		if (auto *C = getTargetConstantFromNode(MaskNode)) {
DecodeVPPERMMask(C, Mask);		DecodeVPPERMMask(C, Mask);
break;		break;
}		}
return false;		return false;
}		}
case X86ISD::VPERMV: {		case X86ISD::VPERMV: {
IsUnary = true;		IsUnary = true;
// Unlike most shuffle nodes, VPERMV's mask operand is operand 0.		// Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
Ops.push_back(N->getOperand(1));		Ops.push_back(N->getOperand(1));
SDValue MaskNode = N->getOperand(0);		SDValue MaskNode = N->getOperand(0);
SmallVector<uint64_t, 32> RawMask;		SmallVector<uint64_t, 32> RawMask;
unsigned MaskEltSize = VT.getScalarSizeInBits();		unsigned MaskEltSize = VT.getScalarSizeInBits();
if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {		if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
DecodeVPERMVMask(RawMask, Mask);		DecodeVPERMVMask(RawMask, Mask);
break;		break;
}		}
if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {		if (auto *C = getTargetConstantFromNode(MaskNode)) {
DecodeVPERMVMask(C, VT, Mask);		DecodeVPERMVMask(C, VT, Mask);
break;		break;
}		}
return false;		return false;
}		}
case X86ISD::VPERMV3: {		case X86ISD::VPERMV3: {
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);		IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
// Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.		// Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
Ops.push_back(N->getOperand(0));		Ops.push_back(N->getOperand(0));
Ops.push_back(N->getOperand(2));		Ops.push_back(N->getOperand(2));
SDValue MaskNode = N->getOperand(1);		SDValue MaskNode = N->getOperand(1);
if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {		if (auto *C = getTargetConstantFromNode(MaskNode)) {
DecodeVPERMV3Mask(C, VT, Mask);		DecodeVPERMV3Mask(C, VT, Mask);
break;		break;
}		}
return false;		return false;
}		}
default: llvm_unreachable("unknown target shuffle node");		default: llvm_unreachable("unknown target shuffle node");
}		}

▲ Show 20 Lines • Show All 25,242 Lines • ▼ Show 20 Lines	case X86ISD::FMSUB:
return DAG.getNode(X86ISD::FNMADD, DL, VT, Arg.getOperand(0),		return DAG.getNode(X86ISD::FNMADD, DL, VT, Arg.getOperand(0),
Arg.getOperand(1), Arg.getOperand(2));		Arg.getOperand(1), Arg.getOperand(2));
case X86ISD::FNMADD:		case X86ISD::FNMADD:
return DAG.getNode(X86ISD::FMSUB, DL, VT, Arg.getOperand(0),		return DAG.getNode(X86ISD::FMSUB, DL, VT, Arg.getOperand(0),
Arg.getOperand(1), Arg.getOperand(2));		Arg.getOperand(1), Arg.getOperand(2));
case X86ISD::FNMSUB:		case X86ISD::FNMSUB:
return DAG.getNode(X86ISD::FMADD, DL, VT, Arg.getOperand(0),		return DAG.getNode(X86ISD::FMADD, DL, VT, Arg.getOperand(0),
Arg.getOperand(1), Arg.getOperand(2));		Arg.getOperand(1), Arg.getOperand(2));
		case X86ISD::FMADD_RND:
		return DAG.getNode(X86ISD::FNMSUB_RND, DL, VT, Arg.getOperand(0),
		Arg.getOperand(1), Arg.getOperand(2), Arg.getOperand(3));
		case X86ISD::FMSUB_RND:
		return DAG.getNode(X86ISD::FNMADD_RND, DL, VT, Arg.getOperand(0),
		Arg.getOperand(1), Arg.getOperand(2), Arg.getOperand(3));
		case X86ISD::FNMADD_RND:
		return DAG.getNode(X86ISD::FMSUB_RND, DL, VT, Arg.getOperand(0),
		Arg.getOperand(1), Arg.getOperand(2), Arg.getOperand(3));
		case X86ISD::FNMSUB_RND:
		return DAG.getNode(X86ISD::FMADD_RND, DL, VT, Arg.getOperand(0),
		Arg.getOperand(1), Arg.getOperand(2), Arg.getOperand(3));
}		}
		spatel [Unsubmitted · Not Done]: Are the *_RND changes independent of the fneg bug fix? If yes, can you separate them from this patch?
		delena [Author · Unsubmitted · Not Done]: The goal of this patch is to combine FNEG with FMA intrinsics. *_RND nodes are generated from intrinsics only. Here we are combining the following patterns: FNEG(avx512.mask.fm* a, b, c) and FNEG(avx512.mask.fm* a, b, c, Rounding).
}		}
return SDValue();		return SDValue();
}		}

static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,		static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {		const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);		EVT VT = N->getValueType(0);
if (VT.is512BitVector() && !Subtarget.hasDQI()) {		if (VT.is512BitVector() && !Subtarget.hasDQI()) {
Show All 11 Lines	switch (N->getOpcode()) {
case X86ISD::FAND: IntOpcode = ISD::AND; break;		case X86ISD::FAND: IntOpcode = ISD::AND; break;
case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;		case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
}		}
SDValue IntOp = DAG.getNode(IntOpcode, dl, MVT::v8i64, Op0, Op1);		SDValue IntOp = DAG.getNode(IntOpcode, dl, MVT::v8i64, Op0, Op1);
return DAG.getBitcast(VT, IntOp);		return DAG.getBitcast(VT, IntOp);
}		}
return SDValue();		return SDValue();
}		}

		/// returns true if the node \p N is FNEG(x) or FXOR (x, 0x80000000)
		spatel [Unsubmitted · Done]: returns -> Returns. Add a period to the end of the sentence.
		bool isFNEG(const SDNode *N) {
		if (N->getOpcode() == ISD::FNEG)
		return true;

		if (N->getOpcode() == X86ISD::FXOR) {
		unsigned EltBits = N->getSimpleValueType(0).getScalarSizeInBits();
		SDValue Op1 = N->getOperand(1);

		auto isSingBitValue = [&](const ConstantFP *C) {
		spatel [Unsubmitted · Done]: Sing -> Sign
		return C->getValueAPF().bitcastToAPInt() == APInt::getSignBit(EltBits);
		};
		if (Op1.getOpcode() == X86ISD::VBROADCAST) {
		if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
		return isSingBitValue(cast<ConstantFP>(C));

		} else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
		if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
		return isSingBitValue(CN->getConstantFPValue());
		}
		}
		return false;
		}

/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.		/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,		static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {		const X86Subtarget &Subtarget) {
assert(N->getOpcode() == X86ISD::FOR \|\| N->getOpcode() == X86ISD::FXOR);		assert(N->getOpcode() == X86ISD::FOR \|\| N->getOpcode() == X86ISD::FXOR);

// F[X]OR(0.0, x) -> x		// F[X]OR(0.0, x) -> x
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))		if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
if (C->getValueAPF().isPosZero())		if (C->getValueAPF().isPosZero())
return N->getOperand(1);		return N->getOperand(1);

// F[X]OR(x, 0.0) -> x		// F[X]OR(x, 0.0) -> x
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))		if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
if (C->getValueAPF().isPosZero())		if (C->getValueAPF().isPosZero())
return N->getOperand(0);		return N->getOperand(0);

		if (isFNEG(N))
		if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
		return NewVal;
return lowerX86FPLogicOp(N, DAG, Subtarget);		return lowerX86FPLogicOp(N, DAG, Subtarget);
}		}

/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.		/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {		static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == X86ISD::FMIN \|\| N->getOpcode() == X86ISD::FMAX);		assert(N->getOpcode() == X86ISD::FMIN \|\| N->getOpcode() == X86ISD::FMAX);

// Only perform optimizations if UnsafeMath is used.		// Only perform optimizations if UnsafeMath is used.
▲ Show 20 Lines • Show All 393 Lines • ▼ Show 20 Lines	static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
EVT ScalarVT = VT.getScalarType();		EVT ScalarVT = VT.getScalarType();
if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) \|\| !Subtarget.hasAnyFMA())		if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) \|\| !Subtarget.hasAnyFMA())
return SDValue();		return SDValue();

SDValue A = N->getOperand(0);		SDValue A = N->getOperand(0);
SDValue B = N->getOperand(1);		SDValue B = N->getOperand(1);
SDValue C = N->getOperand(2);		SDValue C = N->getOperand(2);

bool NegA = (A.getOpcode() == ISD::FNEG);		bool NegA = isFNEG(A.getNode());
bool NegB = (B.getOpcode() == ISD::FNEG);		bool NegB = isFNEG(B.getNode());
bool NegC = (C.getOpcode() == ISD::FNEG);		bool NegC = isFNEG(C.getNode());

// Negative multiplication when NegA xor NegB		// Negative multiplication when NegA xor NegB
bool NegMul = (NegA != NegB);		bool NegMul = (NegA != NegB);
if (NegA)		if (NegA)
A = A.getOperand(0);		A = A.getOperand(0);
if (NegB)		if (NegB)
B = B.getOperand(0);		B = B.getOperand(0);
if (NegC)		if (NegC)
C = C.getOperand(0);		C = C.getOperand(0);

unsigned Opcode;		unsigned NewOpcode;
if (!NegMul)		if (!NegMul)
Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;		NewOpcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
else		else
Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;		NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;

return DAG.getNode(Opcode, dl, VT, A, B, C);		if (N->getOpcode() == X86ISD::FMADD_RND) {
		switch (NewOpcode) {
		case X86ISD::FMADD: NewOpcode = X86ISD::FMADD_RND; break;
		case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB_RND; break;
		case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
		case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
		}
		return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
		}
		return DAG.getNode(NewOpcode, dl, VT, A, B, C);
}		}

static SDValue combineZext(SDNode *N, SelectionDAG &DAG,		static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,		TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {		const X86Subtarget &Subtarget) {
// (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->		// (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
// (and (i32 x86isd::setcc_carry), 1)		// (and (i32 x86isd::setcc_carry), 1)
// This eliminates the zext. This transformation is necessary because		// This eliminates the zext. This transformation is necessary because
▲ Show 20 Lines • Show All 713 Lines • ▼ Show 20 Lines	SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::VPERMV:		case X86ISD::VPERMV:
case X86ISD::VPERMV3:		case X86ISD::VPERMV3:
case X86ISD::VPERMIL2:		case X86ISD::VPERMIL2:
case X86ISD::VPERMILPI:		case X86ISD::VPERMILPI:
case X86ISD::VPERMILPV:		case X86ISD::VPERMILPV:
case X86ISD::VPERM2X128:		case X86ISD::VPERM2X128:
case X86ISD::VZEXT_MOVL:		case X86ISD::VZEXT_MOVL:
case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);		case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
		case X86ISD::FMADD:
		case X86ISD::FMADD_RND:
case ISD::FMA: return combineFMA(N, DAG, Subtarget);		case ISD::FMA: return combineFMA(N, DAG, Subtarget);
case ISD::MGATHER:		case ISD::MGATHER:
case ISD::MSCATTER: return combineGatherScatter(N, DAG);		case ISD::MSCATTER: return combineGatherScatter(N, DAG);
case X86ISD::LSUB: return combineLockSub(N, DAG, Subtarget);		case X86ISD::LSUB: return combineLockSub(N, DAG, Subtarget);
case X86ISD::TESTM: return combineTestM(N, DAG);		case X86ISD::TESTM: return combineTestM(N, DAG);
case X86ISD::PCMPEQ:		case X86ISD::PCMPEQ:
case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);		case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
}		}
▲ Show 20 Lines • Show All 859 Lines • Show Last 20 Lines

../test/CodeGen/X86/avx2-fma-fneg-combine.ll

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 -mattr=+fma \| FileCheck %s

				; This test checks combinations of FNEG and FMA intrinsics

				define <8 x float> @test1(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
				; CHECK-LABEL: test1:
				; CHECK: # BB#0: # %entry
				; CHECK-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0
				; CHECK-NEXT: retq
				entry:
				%sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
				%0 = tail call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %sub.i) #2
				ret <8 x float> %0
				}

				declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)


				define <4 x float> @test2(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
				; CHECK-LABEL: test2:
				; CHECK: # BB#0: # %entry
				; CHECK-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
				; CHECK-NEXT: retq
				entry:
				%0 = tail call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c) #2
				%sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %0
				ret <4 x float> %sub.i
				}

				declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c)

				define <4 x float> @test3(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
				; CHECK-LABEL: test3:
				; CHECK: # BB#0: # %entry
				; CHECK-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0
				; CHECK-NEXT: vbroadcastss {{.*}}(%rip), %xmm1
				; CHECK-NEXT: vxorps %xmm1, %xmm0, %xmm0
				; CHECK-NEXT: retq
				entry:
				%0 = tail call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c) #2
				%sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %0
				ret <4 x float> %sub.i
				}

				declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c)

				define <8 x float> @test4(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
				; CHECK-LABEL: test4:
				; CHECK: # BB#0: # %entry
				; CHECK-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
				; CHECK-NEXT: retq
				entry:
				%0 = tail call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c) #2
				%sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %0
				ret <8 x float> %sub.i
				}

				define <8 x float> @test5(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
				; CHECK-LABEL: test5:
				; CHECK: # BB#0: # %entry
				; CHECK-NEXT: vbroadcastss {{.*}}(%rip), %ymm3
				; CHECK-NEXT: vxorps %ymm3, %ymm2, %ymm2
				; CHECK-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0
				; CHECK-NEXT: retq
				entry:
				%sub.c = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
				%0 = tail call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %sub.c) #2
				ret <8 x float> %0
				}

				declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)

../test/CodeGen/X86/fma-fneg-combine.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw -mattr=+avx512vl -mattr=+avx512dq \| FileCheck %s			; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw -mattr=+avx512vl -mattr=+avx512dq \| FileCheck %s

	; This test checks combinations of FNEG and FMA intrinsics on AVX-512 target			; This test checks combinations of FNEG and FMA intrinsics on AVX-512 target
	; PR28892			; PR28892

	define <16 x float> @test1(<16 x float> %a, <16 x float> %b, <16 x float> %c) {			define <16 x float> @test1(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
	; CHECK-LABEL: test1:			; CHECK-LABEL: test1:
	; CHECK: # BB#0: # %entry			; CHECK: # BB#0: # %entry
	; CHECK-NEXT: vxorps {{.*}}(%rip){1to16}, %zmm2, %zmm2			; CHECK-NEXT: vfmsub213ps %zmm2, %zmm1, %zmm0
	; CHECK-NEXT: vfmadd213ps %zmm2, %zmm1, %zmm0
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	entry:			entry:
	%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c			%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
	%0 = tail call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %sub.i, i16 -1, i32 4) #2			%0 = tail call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %sub.i, i16 -1, i32 4) #2
	ret <16 x float> %0			ret <16 x float> %0
	}			}

	declare <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)			declare <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
	declare <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)			declare <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
	declare <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)			declare <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)


	define <16 x float> @test2(<16 x float> %a, <16 x float> %b, <16 x float> %c) {			define <16 x float> @test2(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
	; CHECK-LABEL: test2:			; CHECK-LABEL: test2:
	; CHECK: # BB#0: # %entry			; CHECK: # BB#0: # %entry
	; CHECK-NEXT: vfmadd213ps %zmm2, %zmm1, %zmm0			; CHECK-NEXT: vfnmsub213ps %zmm2, %zmm1, %zmm0
	; CHECK-NEXT: vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm0
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	entry:			entry:
	%0 = tail call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 -1, i32 4) #2			%0 = tail call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 -1, i32 4) #2
	%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %0			%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %0
	ret <16 x float> %sub.i			ret <16 x float> %sub.i
	}			}

	define <16 x float> @test3(<16 x float> %a, <16 x float> %b, <16 x float> %c) {			define <16 x float> @test3(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
	; CHECK-LABEL: test3:			; CHECK-LABEL: test3:
	; CHECK: # BB#0: # %entry			; CHECK: # BB#0: # %entry
	; CHECK-NEXT: vfnmadd213ps %zmm2, %zmm1, %zmm0			; CHECK-NEXT: vfmsub213ps %zmm2, %zmm1, %zmm0
	; CHECK-NEXT: vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm0
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	entry:			entry:
	%0 = tail call <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 -1, i32 4) #2			%0 = tail call <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 -1, i32 4) #2
	%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %0			%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %0
	ret <16 x float> %sub.i			ret <16 x float> %sub.i
	}			}

	define <16 x float> @test4(<16 x float> %a, <16 x float> %b, <16 x float> %c) {			define <16 x float> @test4(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
	; CHECK-LABEL: test4:			; CHECK-LABEL: test4:
	; CHECK: # BB#0: # %entry			; CHECK: # BB#0: # %entry
	; CHECK-NEXT: vfnmsub213ps %zmm2, %zmm1, %zmm0			; CHECK-NEXT: vfmadd213ps %zmm2, %zmm1, %zmm0
	; CHECK-NEXT: vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm0
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	entry:			entry:
	%0 = tail call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 -1, i32 4) #2			%0 = tail call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 -1, i32 4) #2
	%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %0			%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %0
	ret <16 x float> %sub.i			ret <16 x float> %sub.i
	}			}

	define <16 x float> @test5(<16 x float> %a, <16 x float> %b, <16 x float> %c) {			define <16 x float> @test5(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
	; CHECK-LABEL: test5:			; CHECK-LABEL: test5:
	; CHECK: # BB#0: # %entry			; CHECK: # BB#0: # %entry
	; CHECK-NEXT: vxorps {{.*}}(%rip){1to16}, %zmm2, %zmm2			; CHECK-NEXT: vfmsub213ps {ru-sae}, %zmm2, %zmm1, %zmm0
	; CHECK-NEXT: vfmadd213ps {ru-sae}, %zmm2, %zmm1, %zmm0
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	entry:			entry:
	%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c			%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
	%0 = tail call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %sub.i, i16 -1, i32 2) #2			%0 = tail call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %sub.i, i16 -1, i32 2) #2
	ret <16 x float> %0			ret <16 x float> %0
	}			}

	define <16 x float> @test6(<16 x float> %a, <16 x float> %b, <16 x float> %c) {			define <16 x float> @test6(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
	; CHECK-LABEL: test6:			; CHECK-LABEL: test6:
	; CHECK: # BB#0: # %entry			; CHECK: # BB#0: # %entry
	; CHECK-NEXT: vfnmsub213ps {ru-sae}, %zmm2, %zmm1, %zmm0			; CHECK-NEXT: vfmadd213ps {ru-sae}, %zmm2, %zmm1, %zmm0
				RKSimon [Unsubmitted · Not Done]: Is this case fixed by D23108?
	; CHECK-NEXT: vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm0
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	entry:			entry:
	%0 = tail call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 -1, i32 2) #2			%0 = tail call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 -1, i32 2) #2
	%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %0			%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %0
	ret <16 x float> %sub.i			ret <16 x float> %sub.i
	}			}


	define <8 x float> @test7(<8 x float> %a, <8 x float> %b, <8 x float> %c) {			define <8 x float> @test7(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
	; CHECK-LABEL: test7:			; CHECK-LABEL: test7:
	; CHECK: # BB#0: # %entry			; CHECK: # BB#0: # %entry
	; CHECK-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0			; CHECK-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
	; CHECK-NEXT: vxorps {{.*}}(%rip){1to8}, %ymm0, %ymm0
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	entry:			entry:
	%0 = tail call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c) #2			%0 = tail call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c) #2
	%sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %0			%sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %0
	ret <8 x float> %sub.i			ret <8 x float> %sub.i
	}			}

	define <8 x float> @test8(<8 x float> %a, <8 x float> %b, <8 x float> %c) {			define <8 x float> @test8(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
	; CHECK-LABEL: test8:			; CHECK-LABEL: test8:
	; CHECK: # BB#0: # %entry			; CHECK: # BB#0: # %entry
	; CHECK-NEXT: vxorps {{.*}}(%rip){1to8}, %ymm2, %ymm2			; CHECK-NEXT: vxorps {{.*}}(%rip){1to8}, %ymm2, %ymm2
	; CHECK-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0			; CHECK-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	entry:			entry:
	%sub.c = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c			%sub.c = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
	%0 = tail call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %sub.c) #2			%0 = tail call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %sub.c) #2
	ret <8 x float> %0			ret <8 x float> %0
	}			}

	declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)			declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)

This is an archive of the discontinued LLVM Phabricator instance.

X86: FMA intrinsic + FNEG - sequence optimization
ClosedPublic

Details

Diff Detail

Event Timeline

Index: lib/Target/X86/X86ISelLowering.cpp

Revision Contents

Diff 69739

../lib/Target/X86/X86ISelLowering.cpp

../test/CodeGen/X86/avx2-fma-fneg-combine.ll

../test/CodeGen/X86/fma-fneg-combine.ll

This is an archive of the discontinued LLVM Phabricator instance.

X86: FMA intrinsic + FNEG - sequence optimization
ClosedPublic

Details

Diff Detail

Event Timeline

Index: lib/Target/X86/X86ISelLowering.cpp

Revision Contents

Diff 69739

../lib/Target/X86/X86ISelLowering.cpp

../test/CodeGen/X86/avx2-fma-fneg-combine.ll

../test/CodeGen/X86/fma-fneg-combine.ll

X86: FMA intrinsic + FNEG - sequence optimization
ClosedPublic