Diff 129219

lib/CodeGen/SelectionDAG/DAGCombiner.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 8,771 Lines • ▼ Show 20 Lines	if ((N0.getOpcode() == ISD::ADDE \|\| N0.getOpcode() == ISD::ADDCARRY) &&
(!LegalOperations \|\| TLI.isOperationLegal(N0.getOpcode(), VT))) {		(!LegalOperations \|\| TLI.isOperationLegal(N0.getOpcode(), VT))) {
SDLoc SL(N);		SDLoc SL(N);
auto X = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0));		auto X = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0));
auto Y = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));		auto Y = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));
auto VTs = DAG.getVTList(VT, N0->getValueType(1));		auto VTs = DAG.getVTList(VT, N0->getValueType(1));
return DAG.getNode(N0.getOpcode(), SL, VTs, X, Y, N0.getOperand(2));		return DAG.getNode(N0.getOpcode(), SL, VTs, X, Y, N0.getOperand(2));
}		}

		if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
		RKSimonUnsubmitted Not Done Reply Inline Actions Add comment describing the combine - any chance that you can add additional tests for this? RKSimon: Add comment describing the combine - any chance that you can add additional tests for this?
		zviAuthorUnsubmitted Not Done Reply Inline Actions Sure. zvi: Sure.
		SDValue N00 = N0.getOperand(0);
		if (N00.getOpcode() == ISD::SIGN_EXTEND \|\|
		N00.getOpcode() == ISD::ZERO_EXTEND \|\|
		N00.getOpcode() == ISD::ANY_EXTEND) {
		if (N00.getOperand(0)->getValueType(0).getVectorElementType() ==
		VT.getVectorElementType())
		return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N0->getOperand(0)), VT,
		N00.getOperand(0), N0.getOperand(1));
		}
		}

if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))		if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
return NewVSel;		return NewVSel;

return SDValue();		return SDValue();
}		}

static SDNode getBuildPairElt(SDNode N, unsigned i) {		static SDNode getBuildPairElt(SDNode N, unsigned i) {
SDValue Elt = N->getOperand(i);		SDValue Elt = N->getOperand(i);
▲ Show 20 Lines • Show All 8,967 Lines • Show Last 20 Lines

lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

	Show First 20 Lines • Show All 32,759 Lines • ▼ Show 20 Lines
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();			const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	if (!TLI.isTypeLegal(In.getValueType()) \|\| !TLI.isTypeLegal(VT))			if (!TLI.isTypeLegal(In.getValueType()) \|\| !TLI.isTypeLegal(VT))
	return SDValue();			return SDValue();
	if (auto USatVal = detectUSatPattern(In, VT))			if (auto USatVal = detectUSatPattern(In, VT))
	if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))			if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
	return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);			return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
	return SDValue();			return SDValue();
	}			}
				/// Build a binary node over two vector operands, first splitting the
				/// operands into equally sized pieces when the result type \p VT is wider
				/// than the size limit selected from the subtarget features.
				///
				/// The limit applied to VT.getSizeInBits() is 512 bits when
				/// Subtarget.hasBWI(), 256 bits when Subtarget.hasAVX2(), and 128 bits
				/// otherwise. When no split is needed the builder is invoked once on the
				/// original operands; otherwise it is invoked once per piece and the
				/// partial results are joined with ISD::CONCAT_VECTORS of type \p VT.
				///
				/// \param DAG       DAG used to create the new nodes.
				/// \param Subtarget Queried for hasBWI()/hasAVX2() to pick the size limit.
				/// \param DL        Debug location passed to every created node.
				/// \param VT        Type of the overall result; its bit width decides the
				///                  number of pieces.
				/// \param Op0       First operand; its type determines the piece type.
				/// \param Op1       Second operand, split at the same element offsets.
				/// \param Builder   Callable invoked as Builder(DAG, DL, LHS, RHS) that
				///                  returns the node for one pair of operands.
				/// \returns the single built node, or the concatenation of the pieces.
				template <typename F>
				SDValue LowerBinTo(SelectionDAG &DAG, const X86Subtarget &Subtarget, SDLoc DL,
				                   EVT VT, SDValue Op0, SDValue Op1, F Builder) {
				  // Number of pieces the operation has to be split into.
				  unsigned NumSubs = 1;
				  if (Subtarget.hasBWI()) {
				    if (VT.getSizeInBits() > 512)
				      NumSubs = VT.getSizeInBits() / 512;
				  } else if (Subtarget.hasAVX2()) {
				    if (VT.getSizeInBits() > 256)
				      NumSubs = VT.getSizeInBits() / 256;
				  } else {
				    if (VT.getSizeInBits() > 128)
				      NumSubs = VT.getSizeInBits() / 128;
				  }

				  // No split required: build directly on the original operands.
				  if (NumSubs == 1)
				    return Builder(DAG, DL, /*VT,*/ Op0, Op1);

				  // The piece type is derived from the operand type, not from VT, because
				  // the builder may produce a result type that differs from its inputs.
				  SmallVector<SDValue, 4> Subs;
				  EVT InVT = Op0.getValueType();
				  EVT SubVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
				                               InVT.getVectorNumElements() / NumSubs);
				  const unsigned SubNumElts = SubVT.getVectorNumElements();
				  const unsigned SubSizeInBits = SubVT.getSizeInBits();
				  for (unsigned i = 0; i != NumSubs; ++i) {
				    unsigned Idx = i * SubNumElts;
				    SDValue LHS = extractSubVector(Op0, Idx, DAG, DL, SubSizeInBits);
				    SDValue RHS = extractSubVector(Op1, Idx, DAG, DL, SubSizeInBits);
				    Subs.push_back(Builder(DAG, DL, /*SubVT,*/ LHS, RHS));
				  }
				  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
				}
	/// This function detects the AVG pattern between vectors of unsigned i8/i16,			/// This function detects the AVG pattern between vectors of unsigned i8/i16,
	/// which is c = (a + b + 1) / 2, and replace this operation with the efficient			/// which is c = (a + b + 1) / 2, and replace this operation with the efficient
	/// X86ISD::AVG instruction.			/// X86ISD::AVG instruction.
	static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,			static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
	const X86Subtarget &Subtarget,			const X86Subtarget &Subtarget,
	const SDLoc &DL) {			const SDLoc &DL) {
	if (!VT.isVector() \|\| !VT.isSimple())			if (!VT.isVector() \|\| !VT.isSimple())
	return SDValue();			return SDValue();
	Show All 39 Lines
	return false;			return false;
	uint64_t Val = C->getZExtValue();			uint64_t Val = C->getZExtValue();
	if (Val < Min \|\| Val > Max)			if (Val < Min \|\| Val > Max)
	return false;			return false;
	}			}
	return true;			return true;
	};			};

	// Split vectors to legal target size and apply AVG.
	auto LowerToAVG = [&](SDValue Op0, SDValue Op1) {
	unsigned NumSubs = 1;
	if (Subtarget.hasBWI()) {
	if (VT.getSizeInBits() > 512)
	NumSubs = VT.getSizeInBits() / 512;
	} else if (Subtarget.hasAVX2()) {
	if (VT.getSizeInBits() > 256)
	NumSubs = VT.getSizeInBits() / 256;
	} else {
	if (VT.getSizeInBits() > 128)
	NumSubs = VT.getSizeInBits() / 128;
	}

	if (NumSubs == 1)
	return DAG.getNode(X86ISD::AVG, DL, VT, Op0, Op1);

	SmallVector<SDValue, 4> Subs;
	EVT SubVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
	VT.getVectorNumElements() / NumSubs);
	for (unsigned i = 0; i != NumSubs; ++i) {
	unsigned Idx = i * SubVT.getVectorNumElements();
	SDValue LHS = extractSubVector(Op0, Idx, DAG, DL, SubVT.getSizeInBits());
	SDValue RHS = extractSubVector(Op1, Idx, DAG, DL, SubVT.getSizeInBits());
	Subs.push_back(DAG.getNode(X86ISD::AVG, DL, SubVT, LHS, RHS));
	}
	return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
	};

	// Check if each element of the vector is left-shifted by one.			// Check if each element of the vector is left-shifted by one.
	auto LHS = In.getOperand(0);			auto LHS = In.getOperand(0);
	auto RHS = In.getOperand(1);			auto RHS = In.getOperand(1);
	if (!IsConstVectorInRange(RHS, 1, 1))			if (!IsConstVectorInRange(RHS, 1, 1))
	return SDValue();			return SDValue();
	if (LHS.getOpcode() != ISD::ADD)			if (LHS.getOpcode() != ISD::ADD)
	return SDValue();			return SDValue();

	// Detect a pattern of a + b + 1 where the order doesn't matter.			// Detect a pattern of a + b + 1 where the order doesn't matter.
	SDValue Operands[3];			SDValue Operands[3];
	Operands[0] = LHS.getOperand(0);			Operands[0] = LHS.getOperand(0);
	Operands[1] = LHS.getOperand(1);			Operands[1] = LHS.getOperand(1);

				auto AVGBuilder = [](SelectionDAG &DAG, SDLoc DL, SDValue Op0, SDValue Op1) {
				return DAG.getNode(X86ISD::AVG, DL, /VT/ Op0.getValueType(), Op0, Op1);
				};

	// Take care of the case when one of the operands is a constant vector whose			// Take care of the case when one of the operands is a constant vector whose
	// element is in the range [1, 256].			// element is in the range [1, 256].
	if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&			if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
	Operands[0].getOpcode() == ISD::ZERO_EXTEND &&			Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
	Operands[0].getOperand(0).getValueType() == VT) {			Operands[0].getOperand(0).getValueType() == VT) {
	// The pattern is detected. Subtract one from the constant vector, then			// The pattern is detected. Subtract one from the constant vector, then
	// demote it and emit X86ISD::AVG instruction.			// demote it and emit X86ISD::AVG instruction.
	SDValue VecOnes = DAG.getConstant(1, DL, InVT);			SDValue VecOnes = DAG.getConstant(1, DL, InVT);
	Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);			Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
	Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);			Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
	return LowerToAVG(Operands[0].getOperand(0), Operands[1]);			return LowerBinTo(DAG, Subtarget, DL, VT, Operands[0].getOperand(0),
				Operands[1], AVGBuilder);
	}			}

	if (Operands[0].getOpcode() == ISD::ADD)			if (Operands[0].getOpcode() == ISD::ADD)
	std::swap(Operands[0], Operands[1]);			std::swap(Operands[0], Operands[1]);
	else if (Operands[1].getOpcode() != ISD::ADD)			else if (Operands[1].getOpcode() != ISD::ADD)
	return SDValue();			return SDValue();
	Operands[2] = Operands[1].getOperand(0);			Operands[2] = Operands[1].getOperand(0);
	Operands[1] = Operands[1].getOperand(1);			Operands[1] = Operands[1].getOperand(1);

	// Now we have three operands of two additions. Check that one of them is a			// Now we have three operands of two additions. Check that one of them is a
	// constant vector with ones, and the other two are promoted from i8/i16.			// constant vector with ones, and the other two are promoted from i8/i16.
	for (int i = 0; i < 3; ++i) {			for (int i = 0; i < 3; ++i) {
	if (!IsConstVectorInRange(Operands[i], 1, 1))			if (!IsConstVectorInRange(Operands[i], 1, 1))
	continue;			continue;
	std::swap(Operands[i], Operands[2]);			std::swap(Operands[i], Operands[2]);

	// Check if Operands[0] and Operands[1] are results of type promotion.			// Check if Operands[0] and Operands[1] are results of type promotion.
	for (int j = 0; j < 2; ++j)			for (int j = 0; j < 2; ++j)
	if (Operands[j].getOpcode() != ISD::ZERO_EXTEND \|\|			if (Operands[j].getOpcode() != ISD::ZERO_EXTEND \|\|
	Operands[j].getOperand(0).getValueType() != VT)			Operands[j].getOperand(0).getValueType() != VT)
	return SDValue();			return SDValue();

	// The pattern is detected, emit X86ISD::AVG instruction.			// The pattern is detected, emit X86ISD::AVG instruction.
	return LowerToAVG(Operands[0].getOperand(0), Operands[1].getOperand(0));			return LowerBinTo(DAG, Subtarget, DL, VT, Operands[0].getOperand(0),
				Operands[1].getOperand(0), AVGBuilder);
	}			}

	return SDValue();			return SDValue();
	}			}

	static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,			static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,			TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {			const X86Subtarget &Subtarget) {
	▲ Show 20 Lines • Show All 2,882 Lines • ▼ Show 20 Lines
	!SplatVal.isOneValue())			!SplatVal.isOneValue())
	return SDValue();			return SDValue();

	SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));			SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
	unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;			unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
	return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);			return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
	}			}

				/// Try to match an ADD whose operands \p Op0 and \p Op1 are BUILD_VECTORs
				/// gathering the even-indexed and the odd-indexed elements of one and the
				/// same MUL node, and build X86ISD::VPMADDWD from the MUL's operands
				/// truncated to i16 elements (through LowerBinTo).
				///
				/// Returns an empty SDValue when the subtarget lacks SSE2, when \p VT is not
				/// a vector of i32 with a power-of-two element count of at least 4, when
				/// the operands do not have the expected shape, or when canReduceVMulWidth
				/// rejects the MUL or reports MULU16.
				static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
				SDLoc DL, EVT VT, const X86Subtarget &Subtarget) {
				// Example of pattern we try to detect:
				// t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
				// (add (build_vector (extract_elt t, 0),
				// (extract_elt t, 2),
				// (extract_elt t, 4),
				// (extract_elt t, 6)),
				// (build_vector (extract_elt t, 1),
				// (extract_elt t, 3),
				// (extract_elt t, 5),
				// (extract_elt t, 7)))

				if (!Subtarget.hasSSE2())
				return SDValue();

				// Only i32 vectors with a power-of-two element count >= 4 are handled.
				if (!VT.isVector() \|\| VT.getVectorElementType() != MVT::i32 \|\|
				VT.getVectorNumElements() < 4 \|\|
				!isPowerOf2_32(VT.getVectorNumElements()))
				return SDValue();

				unsigned ValNumElts = VT.getVectorNumElements();

				craig.topperUnsubmitted Not Done Reply Inline Actions Second line is indented 2 extra spaces. craig.topper: Second line is indented 2 extra spaces.
				// Helper for examining one ADD operand: checks that Op is a BUILD_VECTOR
				// whose i-th operand is an EXTRACT_VECTOR_ELT with constant index
				// ExpectedIndices[i], all taken from the same MUL node. On success the
				// MUL is stored in RetMul.
				auto IsBuildVectorOfExtractsFromMul = [](
				SDValue Op, ArrayRef<unsigned> ExpectedIndices, SDValue &RetMul) {
				if (Op->getOpcode() != ISD::BUILD_VECTOR)
				return false;
				SDValue Mul;
				for (unsigned i = 0, e = ExpectedIndices.size(); i != e; ++i) {
				// TODO: Be more tolerant to undefs.
				if (Op->getOperand(i)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
				return false;
				auto *Idx = dyn_cast<ConstantSDNode>(Op->getOperand(i)->getOperand(1));
				if (!Idx \|\| Idx->getZExtValue() != ExpectedIndices[i])
				craig.topperUnsubmitted Done Reply Inline Actions Do you need a hasSSE2 check on the v4i32? I don't see one before this call in combineAdd. craig.topper: Do you need a hasSSE2 check on the v4i32? I don't see one before this call in combineAdd.
				return false;
				if (Mul) {
				// Check that the extract is from the same MUL previously seen.
				if (Mul != Op->getOperand(i)->getOperand(0))
				return false;
				} else {
				// First time an extract_elt's source vector is visited. It must be a
				// MUL with exactly twice as many vector elements as the BUILD_VECTOR.
				craig.topperUnsubmitted Done Reply Inline Actions What ensures the multiply has exactly 2X the elements of the build_vector? Couldn't it have more? Which would cause the truncate later to fail. craig.topper: What ensures the multiply has exactly 2X the elements of the build_vector? Couldn't it have…
				zviAuthorUnsubmitted Done Reply Inline Actions You're right! Will fix and add tests zvi: You're right! Will fix and add tests
				Mul = Op->getOperand(i)->getOperand(0);
				if (Mul->getOpcode() != ISD::MUL \|\|
				Mul.getValueType().getVectorNumElements() !=
				2 * ExpectedIndices.size())
				craig.topperUnsubmitted Done Reply Inline Actions tolerant* craig.topper: tolerant*
				return false;
				}
				}
				RetMul = Mul;
				return true;
				};
				SDValue L, R;
				// Expected extract indices for the "even" and the "odd" BUILD_VECTOR.
				// NOTE(review): each table holds 16 entries, while ValNumElts is only
				// checked above to be a power of two >= 4. For a VT with more than 16
				// elements the ArrayRefs created below would extend past the end of
				// these arrays - confirm such types cannot reach this point, or add an
				// upper bound on ValNumElts.
				const unsigned ExpectedEvenIndices[] = {0, 2, 4, 6, 8, 10, 12, 14,
				16, 18, 20, 22, 24, 26, 28, 30};
				const unsigned ExpectedOddIndices[] = {1, 3, 5, 7, 9, 11, 13, 15,
				17, 19, 21, 23, 25, 27, 29, 31};
				// Try the two possible orderings: (add even, odd) and (add odd, even).
				// In either ordering both operands must extract from the same MUL.
				if (!(IsBuildVectorOfExtractsFromMul(
				Op0, makeArrayRef(ExpectedEvenIndices, ValNumElts), L) &&
				IsBuildVectorOfExtractsFromMul(
				Op1, makeArrayRef(ExpectedOddIndices, ValNumElts), R) &&
				L == R) &&
				!(IsBuildVectorOfExtractsFromMul(
				Op0, makeArrayRef(ExpectedOddIndices, ValNumElts), L) &&
				IsBuildVectorOfExtractsFromMul(
				Op1, makeArrayRef(ExpectedEvenIndices, ValNumElts), R) &&
				craig.topperUnsubmitted Done Reply Inline Actions Is there anything that guarantees even indices will be on the LHS? craig.topper: Is there anything that guarantees even indices will be on the LHS?
				zviAuthorUnsubmitted Done Reply Inline Actions Will add checks for both orderings. thanks zvi: Will add checks for both orderings. thanks
				L == R))
				return SDValue();

				// Check if the Mul source can be safely shrunk.
				ShrinkMode Mode;
				if (!canReduceVMulWidth(L.getNode(), DAG, Mode) \|\| Mode == MULU16)
				return SDValue();

				// Builds one VPMADDWD from a pair of i32-element operands: both are
				// truncated to i16 elements and the result has half as many i32 elements.
				auto PMADDBuilder = [](SelectionDAG &DAG, SDLoc DL, SDValue Op0,
				SDValue Op1) {
				// Shrink by adding truncate nodes and let DAGCombine fold with the
				// sources.
				EVT InVT = Op0.getValueType();
				assert(InVT.getScalarType() == MVT::i32 &&
				"Unexpected scalar element type");
				assert(InVT == Op1.getValueType() && "Operands' types mismatch");
				EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
				InVT.getVectorNumElements() / 2);
				EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
				InVT.getVectorNumElements());
				return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
				DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Op0),
				DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Op1));
				};
				// The MUL's two operands are the inputs; LowerBinTo splits them when VT
				// is wider than the size limit it selects for the subtarget.
				return LowerBinTo(DAG, Subtarget, DL, VT, L.getOperand(0), L.getOperand(1),
				PMADDBuilder);
				}

	static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,			static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {			const X86Subtarget &Subtarget) {
	const SDNodeFlags Flags = N->getFlags();			const SDNodeFlags Flags = N->getFlags();
	if (Flags.hasVectorReduction()) {			if (Flags.hasVectorReduction()) {
	if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))			if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
	return Sad;			return Sad;
	if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))			if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
	return MAdd;			return MAdd;
	}			}
	EVT VT = N->getValueType(0);			EVT VT = N->getValueType(0);
	SDValue Op0 = N->getOperand(0);			SDValue Op0 = N->getOperand(0);
	SDValue Op1 = N->getOperand(1);			SDValue Op1 = N->getOperand(1);

				if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
				return MAdd;

	// Try to synthesize horizontal adds from adds of shuffles.			// Try to synthesize horizontal adds from adds of shuffles.
	if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 \|\| VT == MVT::v4i32)) \|\|			if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 \|\| VT == MVT::v4i32)) \|\|
	(Subtarget.hasInt256() && (VT == MVT::v16i16 \|\| VT == MVT::v8i32))) &&			(Subtarget.hasInt256() && (VT == MVT::v16i16 \|\| VT == MVT::v8i32))) &&
	isHorizontalBinOp(Op0, Op1, true))			isHorizontalBinOp(Op0, Op1, true))
	return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);			return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);

	if (SDValue V = combineIncDecVector(N, DAG))			if (SDValue V = combineIncDecVector(N, DAG))
	return V;			return V;

	return combineAddOrSubToADCOrSBB(N, DAG);			return combineAddOrSubToADCOrSBB(N, DAG);
	}			}

	static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,			static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {			const X86Subtarget &Subtarget) {
	SDValue Op0 = N->getOperand(0);			SDValue Op0 = N->getOperand(0);
	SDValue Op1 = N->getOperand(1);			SDValue Op1 = N->getOperand(1);
	EVT VT = N->getValueType(0);			EVT VT = N->getValueType(0);

				RKSimonUnsubmitted Not Done Reply Inline Actions Can't you do this in a fully general manner, checking both LHS+RHS at the same time that one is the odd and the other is the even? It seems at the moment you can only check for all_odd+all_even or all_even+all_odd. RKSimon: Can't you do this in a fully general manner, checking both LHS+RHS at the same time that one is…
	// PSUBUS is supported, starting from SSE2, but special preprocessing			// PSUBUS is supported, starting from SSE2, but special preprocessing
	// for v8i32 requires umin, which appears in SSE41.			// for v8i32 requires umin, which appears in SSE41.
	if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 \|\| VT == MVT::v8i16)) &&			if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 \|\| VT == MVT::v8i16)) &&
	!(Subtarget.hasSSE41() && (VT == MVT::v8i32)) &&			!(Subtarget.hasSSE41() && (VT == MVT::v8i32)) &&
	!(Subtarget.hasAVX2() && (VT == MVT::v32i8 \|\| VT == MVT::v16i16)) &&			!(Subtarget.hasAVX2() && (VT == MVT::v32i8 \|\| VT == MVT::v16i16)) &&
	!(Subtarget.hasAVX512() && Subtarget.hasBWI() &&			!(Subtarget.hasAVX512() && Subtarget.hasBWI() &&
	(VT == MVT::v64i8 \|\| VT == MVT::v32i16 \|\| VT == MVT::v16i32 \|\|			(VT == MVT::v64i8 \|\| VT == MVT::v32i16 \|\| VT == MVT::v16i32 \|\|
	VT == MVT::v8i64)))			VT == MVT::v8i64)))
	return SDValue();			return SDValue();

	SDValue SubusLHS, SubusRHS;			SDValue SubusLHS, SubusRHS;
	// Try to find umax(a,b) - b or a - umin(a,b) patterns			// Try to find umax(a,b) - b or a - umin(a,b) patterns
	// they may be converted to subus(a,b).			// they may be converted to subus(a,b).
	// TODO: Need to add IR cannonicialization for this code.			// TODO: Need to add IR cannonicialization for this code.
	if (Op0.getOpcode() == ISD::UMAX) {			if (Op0.getOpcode() == ISD::UMAX) {
	SubusRHS = Op1;			SubusRHS = Op1;
	SDValue MaxLHS = Op0.getOperand(0);			SDValue MaxLHS = Op0.getOperand(0);
	SDValue MaxRHS = Op0.getOperand(1);			SDValue MaxRHS = Op0.getOperand(1);
	if (MaxLHS == Op1)			if (MaxLHS == Op1)
	SubusLHS = MaxRHS;			SubusLHS = MaxRHS;
	else if (MaxRHS == Op1)			else if (MaxRHS == Op1)
	SubusLHS = MaxLHS;			SubusLHS = MaxLHS;
				RKSimonUnsubmitted Not Done Reply Inline Actions Mul.getValueType().getVectorNumElements() != (2 * e) RKSimon: Mul.getValueType().getVectorNumElements() != (2 * e)
	else			else
	return SDValue();			return SDValue();
	} else if (Op1.getOpcode() == ISD::UMIN) {			} else if (Op1.getOpcode() == ISD::UMIN) {
	SubusLHS = Op0;			SubusLHS = Op0;
	SDValue MinLHS = Op1.getOperand(0);			SDValue MinLHS = Op1.getOperand(0);
	SDValue MinRHS = Op1.getOperand(1);			SDValue MinRHS = Op1.getOperand(1);
	if (MinLHS == Op0)			if (MinLHS == Op0)
	SubusRHS = MinRHS;			SubusRHS = MinRHS;
	▲ Show 20 Lines • Show All 1,559 Lines • Show Last 20 Lines

test/CodeGen/X86/madd.ll

Show First 20 Lines • Show All 310 Lines • ▼ Show 20 Lines	middle.block:
%bin.rdx20 = add <16 x i32> %bin.rdx18, %rdx.shuf19		%bin.rdx20 = add <16 x i32> %bin.rdx18, %rdx.shuf19
%13 = extractelement <16 x i32> %bin.rdx20, i32 0		%13 = extractelement <16 x i32> %bin.rdx20, i32 0
ret i32 %13		ret i32 %13
}		}

define <4 x i32> @pmaddwd_8(<8 x i16> %A, <8 x i16> %B) {		define <4 x i32> @pmaddwd_8(<8 x i16> %A, <8 x i16> %B) {
; SSE2-LABEL: pmaddwd_8:		; SSE2-LABEL: pmaddwd_8:
; SSE2: # %bb.0:		; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2		; SSE2-NEXT: pmaddwd %xmm1, %xmm0
; SSE2-NEXT: pmulhw %xmm1, %xmm2
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: retq		; SSE2-NEXT: retq
;		;
; AVX-LABEL: pmaddwd_8:		; AVX-LABEL: pmaddwd_8:
; AVX: # %bb.0:		; AVX: # %bb.0:
; AVX-NEXT: vpmovsxwd %xmm0, %ymm0		; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpmovsxwd %xmm1, %ymm1
; AVX-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq		; AVX-NEXT: retq
%a = sext <8 x i16> %A to <8 x i32>		%a = sext <8 x i16> %A to <8 x i32>
%b = sext <8 x i16> %B to <8 x i32>		%b = sext <8 x i16> %B to <8 x i32>
%m = mul nsw <8 x i32> %a, %b		%m = mul nsw <8 x i32> %a, %b
%odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>		%odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>		%even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
%ret = add <4 x i32> %odd, %even		%ret = add <4 x i32> %odd, %even
ret <4 x i32> %ret		ret <4 x i32> %ret
}		}

define <8 x i32> @pmaddwd_16(<16 x i16> %A, <16 x i16> %B) {		define <4 x i32> @pmaddwd_8_swapped(<8 x i16> %A, <8 x i16> %B) {
; SSE2-LABEL: pmaddwd_16:		; SSE2-LABEL: pmaddwd_8_swapped:
; SSE2: # %bb.0:		; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm4		; SSE2-NEXT: pmaddwd %xmm1, %xmm0
; SSE2-NEXT: pmulhw %xmm2, %xmm4		; SSE2-NEXT: retq
		;
		; AVX-LABEL: pmaddwd_8_swapped:
		; AVX: # %bb.0:
		; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
		; AVX-NEXT: retq
		%a = sext <8 x i16> %A to <8 x i32>
		%b = sext <8 x i16> %B to <8 x i32>
		%m = mul nsw <8 x i32> %a, %b
		%odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
		%even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
		%ret = add <4 x i32> %even, %odd
		ret <4 x i32> %ret
		}

		define <4 x i32> @larger_mul(<16 x i16> %A, <16 x i16> %B) {
		; SSE2-LABEL: larger_mul:
		; SSE2: # %bb.0:
		; SSE2-NEXT: movdqa %xmm0, %xmm1
		; SSE2-NEXT: pmulhw %xmm2, %xmm1
; SSE2-NEXT: pmullw %xmm2, %xmm0		; SSE2-NEXT: pmullw %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2		; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]		; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]		; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: movdqa %xmm1, %xmm4		; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pmulhw %xmm3, %xmm4		; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE2-NEXT: pmullw %xmm3, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,2]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm2[0,2]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
; SSE2-NEXT: paddd %xmm4, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]		; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
; SSE2-NEXT: paddd %xmm5, %xmm0		; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: retq		; SSE2-NEXT: retq
		RKSimonUnsubmitted Done Reply Inline Actions What's missing to get SSE2 to lower to 2 x pmaddwd? (TBH I'm more interested in AVX1 but it'd be good for SSE as well). RKSimon: What's missing to get SSE2 to lower to 2 x pmaddwd? (TBH I'm more interested in AVX1 but it'd be…
		zviAuthorUnsubmitted Done Reply Inline Actions That's a good idea. Though it might be tricky to split the 'mul' operands so that type-legalization won't mess-up illegal types. I can try to rework this patch or leave it as a follow-up if it gets too messy. zvi: That's a good idea. Though it might be tricky to split the 'mul' operands so that type…
		RKSimonUnsubmitted Done Reply Inline Actions I did something similar in D41440 for PAVG - we could pull out and generalize the 'LowerToAVG' code to split into legal ops and concat the results. RKSimon: I did something similar in D41440 for PAVG - we could pull out and generalize the 'LowerToAVG'…
;		;
; AVX2-LABEL: pmaddwd_16:		; AVX2-LABEL: larger_mul:
; AVX2: # %bb.0:		; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0		; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vpmovsxwd %xmm1, %ymm3
; AVX2-NEXT: vpmulld %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1		; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1
; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0		; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm0[0,2],ymm2[4,6],ymm0[4,6]		; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3]		; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,3],ymm0[1,3],ymm2[5,7],ymm0[5,7]		; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]		; AVX2-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0		; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq		; AVX2-NEXT: retq
;		;
; AVX512-LABEL: pmaddwd_16:		; AVX512-LABEL: larger_mul:
; AVX512: # %bb.0:		; AVX512: # %bb.0:
; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0		; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512-NEXT: vpmovsxwd %ymm1, %zmm1		; AVX512-NEXT: vpmovsxwd %ymm1, %zmm1
; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0		; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1		; AVX512-NEXT: vpextrd $2, %xmm0, %eax
; AVX512-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]		; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm1
; AVX512-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]		; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]		; AVX512-NEXT: vmovd %xmm2, %eax
; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]		; AVX512-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
; AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0		; AVX512-NEXT: vpextrd $2, %xmm2, %eax
		; AVX512-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
		; AVX512-NEXT: vpextrd $3, %xmm0, %eax
		; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
		; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
		; AVX512-NEXT: vpextrd $1, %xmm2, %eax
		; AVX512-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
		; AVX512-NEXT: vpextrd $3, %xmm2, %eax
		; AVX512-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
		; AVX512-NEXT: vpaddd %xmm0, %xmm1, %xmm0
		; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq		; AVX512-NEXT: retq
%a = sext <16 x i16> %A to <16 x i32>		%a = sext <16 x i16> %A to <16 x i32>
%b = sext <16 x i16> %B to <16 x i32>		%b = sext <16 x i16> %B to <16 x i32>
%m = mul nsw <16 x i32> %a, %b		%m = mul nsw <16 x i32> %a, %b
		%odd = shufflevector <16 x i32> %m, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
		%even = shufflevector <16 x i32> %m, <16 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
		%ret = add <4 x i32> %odd, %even
		ret <4 x i32> %ret
		}

		define <8 x i32> @pmaddwd_16(<16 x i16> %A, <16 x i16> %B) {
		; SSE2-LABEL: pmaddwd_16:
		; SSE2: # %bb.0:
		; SSE2-NEXT: pmaddwd %xmm2, %xmm0
		; SSE2-NEXT: pmaddwd %xmm3, %xmm1
		; SSE2-NEXT: retq
		;
		; AVX-LABEL: pmaddwd_16:
		; AVX: # %bb.0:
		; AVX-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
		; AVX-NEXT: retq
		%a = sext <16 x i16> %A to <16 x i32>
		%b = sext <16 x i16> %B to <16 x i32>
		%m = mul nsw <16 x i32> %a, %b
%odd = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>		%odd = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
%even = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>		%even = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
%ret = add <8 x i32> %odd, %even		%ret = add <8 x i32> %odd, %even
ret <8 x i32> %ret		ret <8 x i32> %ret
}		}

define <16 x i32> @pmaddwd_32(<32 x i16> %A, <32 x i16> %B) {		define <16 x i32> @pmaddwd_32(<32 x i16> %A, <32 x i16> %B) {
; SSE2-LABEL: pmaddwd_32:		; SSE2-LABEL: pmaddwd_32:
; SSE2: # %bb.0:		; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm8		; SSE2-NEXT: pmaddwd %xmm4, %xmm0
; SSE2-NEXT: pmulhw %xmm4, %xmm8		; SSE2-NEXT: pmaddwd %xmm5, %xmm1
; SSE2-NEXT: pmullw %xmm4, %xmm0		; SSE2-NEXT: pmaddwd %xmm6, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm9		; SSE2-NEXT: pmaddwd %xmm7, %xmm3
; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pmulhw %xmm5, %xmm4
; SSE2-NEXT: pmullw %xmm5, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm8
; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pmulhw %xmm6, %xmm4
; SSE2-NEXT: pmullw %xmm6, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm6
; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pmulhw %xmm7, %xmm4
; SSE2-NEXT: pmullw %xmm7, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm7
; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[0,2]
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[0,2]
; SSE2-NEXT: movdqa %xmm1, %xmm10
; SSE2-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm8[0,2]
; SSE2-NEXT: movdqa %xmm0, %xmm11
; SSE2-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm9[0,2]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm7[1,3]
; SSE2-NEXT: paddd %xmm4, %xmm3
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm6[1,3]
; SSE2-NEXT: paddd %xmm5, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm8[1,3]
; SSE2-NEXT: paddd %xmm10, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm9[1,3]
; SSE2-NEXT: paddd %xmm11, %xmm0
; SSE2-NEXT: retq		; SSE2-NEXT: retq
;		;
; AVX2-LABEL: pmaddwd_32:		; AVX2-LABEL: pmaddwd_32:
; AVX2: # %bb.0:		; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxwd %xmm1, %ymm4		; AVX2-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1		; AVX2-NEXT: vpmaddwd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm5
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vpmovsxwd %xmm3, %ymm6
; AVX2-NEXT: vpmulld %ymm6, %ymm4, %ymm4
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3
; AVX2-NEXT: vpmovsxwd %xmm3, %ymm3
; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpmovsxwd %xmm2, %ymm3
; AVX2-NEXT: vpmulld %ymm3, %ymm5, %ymm3
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX2-NEXT: vpmovsxwd %xmm2, %ymm2
; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,2],ymm1[0,2],ymm4[4,6],ymm1[4,6]
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,2],ymm0[0,2],ymm3[4,6],ymm0[4,6]
; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,1,3]
; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,3],ymm1[1,3],ymm4[5,7],ymm1[5,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3]
; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,3],ymm0[1,3],ymm3[5,7],ymm0[5,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpaddd %ymm0, %ymm5, %ymm0
; AVX2-NEXT: retq		; AVX2-NEXT: retq
;		;
; AVX512F-LABEL: pmaddwd_32:		; AVX512F-LABEL: pmaddwd_32:
; AVX512F: # %bb.0:		; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0		; AVX512F-NEXT: vpmaddwd %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1		; AVX512F-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2		; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmulld %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vpmovsxwd %ymm3, %zmm2
; AVX512F-NEXT: vpmulld %zmm2, %zmm1, %zmm1
; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
; AVX512F-NEXT: vpaddd %zmm3, %zmm2, %zmm0
; AVX512F-NEXT: retq		; AVX512F-NEXT: retq
;		;
; AVX512BW-LABEL: pmaddwd_32:		; AVX512BW-LABEL: pmaddwd_32:
; AVX512BW: # %bb.0:		; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxwd %ymm0, %zmm2		; AVX512BW-NEXT: vpmaddwd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512BW-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512BW-NEXT: vpmovsxwd %ymm1, %zmm3
; AVX512BW-NEXT: vpmulld %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512BW-NEXT: vpmovsxwd %ymm1, %zmm1
; AVX512BW-NEXT: vpmulld %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa32 {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm1
; AVX512BW-NEXT: vmovdqa32 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm3
; AVX512BW-NEXT: vpaddd %zmm3, %zmm1, %zmm0
; AVX512BW-NEXT: retq		; AVX512BW-NEXT: retq
%a = sext <32 x i16> %A to <32 x i32>		%a = sext <32 x i16> %A to <32 x i32>
%b = sext <32 x i16> %B to <32 x i32>		%b = sext <32 x i16> %B to <32 x i32>
%m = mul nsw <32 x i32> %a, %b		%m = mul nsw <32 x i32> %a, %b
%odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>		%odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
%even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>		%even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
%ret = add <16 x i32> %odd, %even		%ret = add <16 x i32> %odd, %even
ret <16 x i32> %ret		ret <16 x i32> %ret
}		}

define <4 x i32> @pmaddwd_const(<8 x i16> %A) {		define <4 x i32> @pmaddwd_const(<8 x i16> %A) {
; SSE2-LABEL: pmaddwd_const:		; SSE2-LABEL: pmaddwd_const:
; SSE2: # %bb.0:		; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32767,32768,0,0,1,7,42,32]		; SSE2-NEXT: pmaddwd {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pmulhw %xmm1, %xmm2
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: retq		; SSE2-NEXT: retq
;		;
; AVX-LABEL: pmaddwd_const:		; AVX-LABEL: pmaddwd_const:
; AVX: # %bb.0:		; AVX: # %bb.0:
; AVX-NEXT: vpmovsxwd %xmm0, %ymm0		; AVX-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq		; AVX-NEXT: retq
%a = sext <8 x i16> %A to <8 x i32>		%a = sext <8 x i16> %A to <8 x i32>
%m = mul nsw <8 x i32> %a, <i32 32767, i32 -32768, i32 0, i32 0, i32 1, i32 7, i32 42, i32 32>		%m = mul nsw <8 x i32> %a, <i32 32767, i32 -32768, i32 0, i32 0, i32 1, i32 7, i32 42, i32 32>
%odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>		%odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>		%even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
%ret = add <4 x i32> %odd, %even		%ret = add <4 x i32> %odd, %even
ret <4 x i32> %ret		ret <4 x i32> %ret
}		}

; Check that there is not selection for unsigned multiplication		; Do not select unsigned i16 multiplication
define <4 x i32> @pmaddwd_negative1(<8 x i16> %A, <8 x i16> %B) {		define <4 x i32> @pmaddwd_negative1(<8 x i16> %A, <8 x i16> %B) {
; SSE2-LABEL: pmaddwd_negative1:		; SSE2-LABEL: pmaddwd_negative1:
; SSE2: # %bb.0:		; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2		; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pmulhuw %xmm1, %xmm2		; SSE2-NEXT: pmulhuw %xmm1, %xmm2
; SSE2-NEXT: pmullw %xmm1, %xmm0		; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1		; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]		; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
Show All 17 Lines	; AVX-NEXT: retq
%b = zext <8 x i16> %B to <8 x i32>		%b = zext <8 x i16> %B to <8 x i32>
%m = mul nuw <8 x i32> %a, %b		%m = mul nuw <8 x i32> %a, %b
%odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>		%odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>		%even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
%ret = add <4 x i32> %odd, %even		%ret = add <4 x i32> %odd, %even
ret <4 x i32> %ret		ret <4 x i32> %ret
}		}

; Check that there is not selection for out-of-bounds constant		; Do not select if constant is too large
define <4 x i32> @pmaddwd_negative2(<8 x i16> %A) {		define <4 x i32> @pmaddwd_negative2(<8 x i16> %A) {
; SSE2-LABEL: pmaddwd_negative2:		; SSE2-LABEL: pmaddwd_negative2:
; SSE2: # %bb.0:		; SSE2: # %bb.0:
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]		; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: psrad $16, %xmm1		; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]		; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE2-NEXT: psrad $16, %xmm0		; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,7,42,32]		; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,7,42,32]
Show All 36 Lines

This is an archive of the discontinued LLVM Phabricator instance.

X86: Add pattern matching for PMADDWD
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 129219

lib/CodeGen/SelectionDAG/DAGCombiner.cpp

lib/Target/X86/X86ISelLowering.cpp

test/CodeGen/X86/madd.ll

This is an archive of the discontinued LLVM Phabricator instance.

X86: Add pattern matching for PMADDWD
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 129219

lib/CodeGen/SelectionDAG/DAGCombiner.cpp

lib/Target/X86/X86ISelLowering.cpp

test/CodeGen/X86/madd.ll

X86: Add pattern matching for PMADDWD
ClosedPublic