Diff 129470

lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

	Show First 20 Lines • Show All 32,676 Lines • ▼ Show 20 Lines
	SDValue Op0 = N->getOperand(0);			SDValue Op0 = N->getOperand(0);
	SDValue Op1 = N->getOperand(1);			SDValue Op1 = N->getOperand(1);

	// TODO: There's nothing special about i32, any integer type above i16 should			// TODO: There's nothing special about i32, any integer type above i16 should
	// work just as well.			// work just as well.
	if (!VT.isVector() \|\| !VT.isSimple() \|\|			if (!VT.isVector() \|\| !VT.isSimple() \|\|
	!(VT.getVectorElementType() == MVT::i32))			!(VT.getVectorElementType() == MVT::i32))
	return SDValue();			return SDValue();

				craig.topperUnsubmitted Not Done Reply Inline Actions Second line is indented 2 extra spaces. craig.topper: Second line is indented 2 extra spaces.
	unsigned RegSize = 128;			unsigned RegSize = 128;
	if (Subtarget.hasBWI())			if (Subtarget.hasBWI())
	RegSize = 512;			RegSize = 512;
	else if (Subtarget.hasAVX2())			else if (Subtarget.hasAVX2())
	RegSize = 256;			RegSize = 256;

	// We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.			// We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
	// TODO: We should be able to handle larger vectors by splitting them before			// TODO: We should be able to handle larger vectors by splitting them before
	// feeding them into several SADs, and then reducing over those.			// feeding them into several SADs, and then reducing over those.
	if (VT.getSizeInBits() / 4 > RegSize)			if (VT.getSizeInBits() / 4 > RegSize)
	return SDValue();			return SDValue();

				craig.topperUnsubmitted Done Reply Inline Actions Do you need a hasSSE2 check on the v4i32? I don't see one before this call in combineAdd. craig.topper: Do you need a hasSSE2 check on the v4i32? I don't see one before this call in combineAdd.
	// We know N is a reduction add, which means one of its operands is a phi.			// We know N is a reduction add, which means one of its operands is a phi.
	// To match SAD, we need the other operand to be a vector select.			// To match SAD, we need the other operand to be a vector select.
	SDValue SelectOp, Phi;			SDValue SelectOp, Phi;
	if (Op0.getOpcode() == ISD::VSELECT) {			if (Op0.getOpcode() == ISD::VSELECT) {
	SelectOp = Op0;			SelectOp = Op0;
	Phi = Op1;			Phi = Op1;
	} else if (Op1.getOpcode() == ISD::VSELECT) {			} else if (Op1.getOpcode() == ISD::VSELECT) {
	SelectOp = Op1;			SelectOp = Op1;
				craig.topperUnsubmitted Done Reply Inline Actions What ensures the multiply has exactly 2X the elements of the build_vector? Couldn't it have more? Which would cause the truncate later to fail. craig.topper: What ensures the multiply has exactly 2X the elements of the build_vector? Couldn't it have…
				zviAuthorUnsubmitted Done Reply Inline Actions You're right! Will fix and add tests zvi: You're right! Will fix and add tests
	Phi = Op0;			Phi = Op0;
	} else			} else
	return SDValue();			return SDValue();

				craig.topperUnsubmitted Done Reply Inline Actions tolerant* craig.topper: tolerant*
	// Check whether we have an abs-diff pattern feeding into the select.			// Check whether we have an abs-diff pattern feeding into the select.
	if(!detectZextAbsDiff(SelectOp, Op0, Op1))			if(!detectZextAbsDiff(SelectOp, Op0, Op1))
	return SDValue();			return SDValue();

	// SAD pattern detected. Now build a SAD instruction and an addition for			// SAD pattern detected. Now build a SAD instruction and an addition for
	// reduction. Note that the number of elements of the result of SAD is less			// reduction. Note that the number of elements of the result of SAD is less
	// than the number of elements of its input. Therefore, we could only update			// than the number of elements of its input. Therefore, we could only update
	// part of elements in the reduction vector.			// part of elements in the reduction vector.
	SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);			SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);

	// The output of PSADBW is a vector of i64.			// The output of PSADBW is a vector of i64.
	// We need to turn the vector of i64 into a vector of i32.			// We need to turn the vector of i64 into a vector of i32.
	// If the reduction vector is at least as wide as the psadbw result, just			// If the reduction vector is at least as wide as the psadbw result, just
	// bitcast. If it's narrower, truncate - the high i32 of each i64 is zero			// bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
	// anyway.			// anyway.
	MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);			MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
	if (VT.getSizeInBits() >= ResVT.getSizeInBits())			if (VT.getSizeInBits() >= ResVT.getSizeInBits())
	Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);			Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
	else			else
	Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);			Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);

				craig.topperUnsubmitted Done Reply Inline Actions Is there anything that guarantees even indices will be on the LHS? craig.topper: Is there anything that guarantees even indices will be on the LHS?
				zviAuthorUnsubmitted Done Reply Inline Actions Will add checks for both orderings. thanks zvi: Will add checks for both orderings. thanks
	if (VT.getSizeInBits() > ResVT.getSizeInBits()) {			if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
	// Fill the upper elements with zero to match the add width.			// Fill the upper elements with zero to match the add width.
	SDValue Zero = DAG.getConstant(0, DL, VT);			SDValue Zero = DAG.getConstant(0, DL, VT);
	Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad,			Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad,
	DAG.getIntPtrConstant(0, DL));			DAG.getIntPtrConstant(0, DL));
	}			}

	return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);			return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
	Show All 21 Lines
	!SplatVal.isOneValue())			!SplatVal.isOneValue())
	return SDValue();			return SDValue();

	SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));			SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
	unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;			unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
	return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);			return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
	}			}

				/// Try to replace an ADD of two BUILD_VECTORs, which gather the even and the
				/// odd elements of one vector MUL, with an X86ISD::VPMADDWD node.
				///
				/// \param DAG       DAG in which the replacement nodes are created.
				/// \param Op0       First operand of the ADD being combined.
				/// \param Op1       Second operand of the ADD being combined.
				/// \param DL        Debug location used for the new nodes.
				/// \param VT        Result type of the ADD; must be a vector of i32 with a
				///                  power-of-two element count of at least 4.
				/// \param Subtarget Used to require SSE2 and passed on to
				///                  SplitBinaryOpsAndApply.
				/// \returns the replacement value, or an empty SDValue if the pattern does
				///          not match.
				static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
				                            SDLoc DL, EVT VT, const X86Subtarget &Subtarget) {
				  // Example of pattern we try to detect:
				  // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
				  //(add (build_vector (extract_elt t, 0),
				  //                   (extract_elt t, 2),
				  //                   (extract_elt t, 4),
				  //                   (extract_elt t, 6)),
				  //     (build_vector (extract_elt t, 1),
				  //                   (extract_elt t, 3),
				  //                   (extract_elt t, 5),
				  //                   (extract_elt t, 7)))

				  if (!Subtarget.hasSSE2())
				    return SDValue();

				  if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
				      VT.getVectorNumElements() < 4 ||
				      !isPowerOf2_32(VT.getVectorNumElements()))
				    return SDValue();

				  const unsigned ValNumElts = VT.getVectorNumElements();

				  // Helper for examining one ADD operand. Succeeds when Op is a BUILD_VECTOR
				  // whose i-th operand is (extract_elt Mul, 2 * i + Parity) for one common
				  // MUL node that has twice as many elements as the BUILD_VECTOR. Parity 0
				  // selects the even elements and Parity 1 the odd ones.
				  //
				  // The expected index is computed rather than read from a fixed 16-entry
				  // table: VT has no upper bound on its element count here, so a table
				  // lookup would read out of bounds for adds wider than 16 elements.
				  //
				  // Review note (RKSimon, not done): Can't you do this in a fully general
				  // manner, checking both LHS+RHS at the same time that one is the odd and
				  // the other is the even? It seems at the moment you can only check for
				  // all_odd+all_even or all_even+all_odd.
				  auto IsBuildVectorOfExtractsFromMul = [ValNumElts](SDValue Op,
				                                                     unsigned Parity,
				                                                     SDValue &RetMul) {
				    if (Op->getOpcode() != ISD::BUILD_VECTOR)
				      return false;
				    SDValue Mul;
				    for (unsigned i = 0; i != ValNumElts; ++i) {
				      // TODO: Be more tolerant to undefs.
				      SDValue Elt = Op->getOperand(i);
				      if (Elt->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
				        return false;
				      auto *Idx = dyn_cast<ConstantSDNode>(Elt->getOperand(1));
				      if (!Idx || Idx->getZExtValue() != 2 * i + Parity)
				        return false;
				      SDValue Src = Elt->getOperand(0);
				      if (Mul) {
				        // Check that the extract is from the same MUL previously seen.
				        if (Mul != Src)
				          return false;
				        continue;
				      }
				      // First time an extract_elt's source vector is visited. It must be a
				      // MUL with exactly twice as many vector elements as the BUILD_VECTOR.
				      // Review note (RKSimon, not done): suggested spelling this check as
				      // Mul.getValueType().getVectorNumElements() != (2 * e).
				      if (Src->getOpcode() != ISD::MUL ||
				          Src.getValueType().getVectorNumElements() != 2 * ValNumElts)
				        return false;
				      Mul = Src;
				    }
				    RetMul = Mul;
				    return true;
				  };

				  // Try the two possible orderings: (add even, odd) , (add odd, even).
				  // Both operands must extract from the same MUL node.
				  SDValue L, R;
				  auto MatchesOrdering = [&](unsigned Op0Parity) {
				    return IsBuildVectorOfExtractsFromMul(Op0, Op0Parity, L) &&
				           IsBuildVectorOfExtractsFromMul(Op1, 1 - Op0Parity, R) && L == R;
				  };
				  if (!MatchesOrdering(0) && !MatchesOrdering(1))
				    return SDValue();

				  // Check if the Mul source can be safely shrunk. The MULU16 mode is
				  // rejected.
				  ShrinkMode Mode;
				  if (!canReduceVMulWidth(L.getNode(), DAG, Mode) || Mode == MULU16)
				    return SDValue();

				  // Builds one VPMADDWD from a pair of i32-element operands of equal type.
				  auto PMADDBuilder = [](SelectionDAG &DAG, SDLoc DL, SDValue Op0,
				                         SDValue Op1) {
				    // Shrink by adding truncate nodes and let DAGCombine fold with the
				    // sources.
				    EVT InVT = Op0.getValueType();
				    assert(InVT.getScalarType() == MVT::i32 &&
				           "Unexpected scalar element type");
				    assert(InVT == Op1.getValueType() && "Operands' types mismatch");
				    // The result has half as many i32 elements as the inputs; the
				    // truncated inputs keep the element count but use i16 elements.
				    EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
				                                 InVT.getVectorNumElements() / 2);
				    EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
				                                   InVT.getVectorNumElements());
				    return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
				                       DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Op0),
				                       DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Op1));
				  };
				  return SplitBinaryOpsAndApply(DAG, Subtarget, DL, VT, L.getOperand(0),
				                                L.getOperand(1), PMADDBuilder);
				}

	static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,			static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {			const X86Subtarget &Subtarget) {
	const SDNodeFlags Flags = N->getFlags();			const SDNodeFlags Flags = N->getFlags();
	if (Flags.hasVectorReduction()) {			if (Flags.hasVectorReduction()) {
	if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))			if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
	return Sad;			return Sad;
	if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))			if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
	return MAdd;			return MAdd;
	}			}
	EVT VT = N->getValueType(0);			EVT VT = N->getValueType(0);
	SDValue Op0 = N->getOperand(0);			SDValue Op0 = N->getOperand(0);
	SDValue Op1 = N->getOperand(1);			SDValue Op1 = N->getOperand(1);

				if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
				return MAdd;

	// Try to synthesize horizontal adds from adds of shuffles.			// Try to synthesize horizontal adds from adds of shuffles.
	if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 \|\| VT == MVT::v4i32)) \|\|			if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 \|\| VT == MVT::v4i32)) \|\|
	(Subtarget.hasInt256() && (VT == MVT::v16i16 \|\| VT == MVT::v8i32))) &&			(Subtarget.hasInt256() && (VT == MVT::v16i16 \|\| VT == MVT::v8i32))) &&
	isHorizontalBinOp(Op0, Op1, true))			isHorizontalBinOp(Op0, Op1, true))
	return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);			return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);

	if (SDValue V = combineIncDecVector(N, DAG))			if (SDValue V = combineIncDecVector(N, DAG))
	return V;			return V;
	▲ Show 20 Lines • Show All 1,599 Lines • Show Last 20 Lines

test/CodeGen/X86/madd.ll

Show First 20 Lines • Show All 310 Lines • ▼ Show 20 Lines	middle.block:
%bin.rdx20 = add <16 x i32> %bin.rdx18, %rdx.shuf19		%bin.rdx20 = add <16 x i32> %bin.rdx18, %rdx.shuf19
%13 = extractelement <16 x i32> %bin.rdx20, i32 0		%13 = extractelement <16 x i32> %bin.rdx20, i32 0
ret i32 %13		ret i32 %13
}		}

define <4 x i32> @pmaddwd_8(<8 x i16> %A, <8 x i16> %B) {		define <4 x i32> @pmaddwd_8(<8 x i16> %A, <8 x i16> %B) {
; SSE2-LABEL: pmaddwd_8:		; SSE2-LABEL: pmaddwd_8:
; SSE2: # %bb.0:		; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2		; SSE2-NEXT: pmaddwd %xmm1, %xmm0
; SSE2-NEXT: pmulhw %xmm1, %xmm2
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: retq		; SSE2-NEXT: retq
;		;
; AVX-LABEL: pmaddwd_8:		; AVX-LABEL: pmaddwd_8:
; AVX: # %bb.0:		; AVX: # %bb.0:
; AVX-NEXT: vpmovsxwd %xmm0, %ymm0		; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpmovsxwd %xmm1, %ymm1
; AVX-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq		; AVX-NEXT: retq
%a = sext <8 x i16> %A to <8 x i32>		%a = sext <8 x i16> %A to <8 x i32>
%b = sext <8 x i16> %B to <8 x i32>		%b = sext <8 x i16> %B to <8 x i32>
%m = mul nsw <8 x i32> %a, %b		%m = mul nsw <8 x i32> %a, %b
%odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>		%odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>		%even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
%ret = add <4 x i32> %odd, %even		%ret = add <4 x i32> %odd, %even
ret <4 x i32> %ret		ret <4 x i32> %ret
}		}

define <8 x i32> @pmaddwd_16(<16 x i16> %A, <16 x i16> %B) {		define <4 x i32> @pmaddwd_8_swapped(<8 x i16> %A, <8 x i16> %B) {
; SSE2-LABEL: pmaddwd_16:		; SSE2-LABEL: pmaddwd_8_swapped:
; SSE2: # %bb.0:		; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm4		; SSE2-NEXT: pmaddwd %xmm1, %xmm0
; SSE2-NEXT: pmulhw %xmm2, %xmm4		; SSE2-NEXT: retq
		;
		; AVX-LABEL: pmaddwd_8_swapped:
		; AVX: # %bb.0:
		; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
		; AVX-NEXT: retq
		%a = sext <8 x i16> %A to <8 x i32>
		%b = sext <8 x i16> %B to <8 x i32>
		%m = mul nsw <8 x i32> %a, %b
		%odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
		%even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
		%ret = add <4 x i32> %even, %odd
		ret <4 x i32> %ret
		}

		define <4 x i32> @larger_mul(<16 x i16> %A, <16 x i16> %B) {
		; SSE2-LABEL: larger_mul:
		; SSE2: # %bb.0:
		; SSE2-NEXT: movdqa %xmm0, %xmm1
		; SSE2-NEXT: pmulhw %xmm2, %xmm1
; SSE2-NEXT: pmullw %xmm2, %xmm0		; SSE2-NEXT: pmullw %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2		; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]		; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]		; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: movdqa %xmm1, %xmm4		; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pmulhw %xmm3, %xmm4		; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE2-NEXT: pmullw %xmm3, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,2]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm2[0,2]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
; SSE2-NEXT: paddd %xmm4, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]		; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
; SSE2-NEXT: paddd %xmm5, %xmm0		; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: retq		; SSE2-NEXT: retq
		RKSimonUnsubmitted Done Reply Inline Actions What's missing to get SSE2 to lower to 2 x pmaddwd? (TBH I'm more interested in AVX1 but it'd be good for SSE as well). RKSimon: What's missing to get SSE2 to lower to 2 x pmaddwd? (TBH I'm more interested in AVX1 but it'd be…
		zviAuthorUnsubmitted Done Reply Inline Actions That's a good idea. Though it might be tricky to split the 'mul' operands so that type-legalization won't mess up illegal types. I can try to rework this patch or leave it as a follow-up if it gets too messy. zvi: That's a good idea. Though it might be tricky to split the 'mul' operands so that type…
		RKSimonUnsubmitted Done Reply Inline Actions I did something similar in D41440 for PAVG - we could pull out and generalize the 'LowerToAVG' code to split into legal ops and concat the results. RKSimon: I did something similar in D41440 for PAVG - we could pull out and generalize the 'LowerToAVG'…
;		;
; AVX2-LABEL: pmaddwd_16:		; AVX2-LABEL: larger_mul:
; AVX2: # %bb.0:		; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0		; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vpmovsxwd %xmm1, %ymm3
; AVX2-NEXT: vpmulld %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1		; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1
; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0		; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm0[0,2],ymm2[4,6],ymm0[4,6]		; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3]		; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,3],ymm0[1,3],ymm2[5,7],ymm0[5,7]		; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]		; AVX2-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0		; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq		; AVX2-NEXT: retq
;		;
; AVX512-LABEL: pmaddwd_16:		; AVX512-LABEL: larger_mul:
; AVX512: # %bb.0:		; AVX512: # %bb.0:
; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0		; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512-NEXT: vpmovsxwd %ymm1, %zmm1		; AVX512-NEXT: vpmovsxwd %ymm1, %zmm1
; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0		; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1		; AVX512-NEXT: vpextrd $2, %xmm0, %eax
; AVX512-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]		; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm1
; AVX512-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]		; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]		; AVX512-NEXT: vmovd %xmm2, %eax
; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]		; AVX512-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
; AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0		; AVX512-NEXT: vpextrd $2, %xmm2, %eax
		; AVX512-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
		; AVX512-NEXT: vpextrd $3, %xmm0, %eax
		; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
		; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
		; AVX512-NEXT: vpextrd $1, %xmm2, %eax
		; AVX512-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
		; AVX512-NEXT: vpextrd $3, %xmm2, %eax
		; AVX512-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
		; AVX512-NEXT: vpaddd %xmm0, %xmm1, %xmm0
		; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq		; AVX512-NEXT: retq
%a = sext <16 x i16> %A to <16 x i32>		%a = sext <16 x i16> %A to <16 x i32>
%b = sext <16 x i16> %B to <16 x i32>		%b = sext <16 x i16> %B to <16 x i32>
%m = mul nsw <16 x i32> %a, %b		%m = mul nsw <16 x i32> %a, %b
		%odd = shufflevector <16 x i32> %m, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
		%even = shufflevector <16 x i32> %m, <16 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
		%ret = add <4 x i32> %odd, %even
		ret <4 x i32> %ret
		}

		define <8 x i32> @pmaddwd_16(<16 x i16> %A, <16 x i16> %B) {
		; SSE2-LABEL: pmaddwd_16:
		; SSE2: # %bb.0:
		; SSE2-NEXT: pmaddwd %xmm2, %xmm0
		; SSE2-NEXT: pmaddwd %xmm3, %xmm1
		; SSE2-NEXT: retq
		;
		; AVX-LABEL: pmaddwd_16:
		; AVX: # %bb.0:
		; AVX-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
		; AVX-NEXT: retq
		%a = sext <16 x i16> %A to <16 x i32>
		%b = sext <16 x i16> %B to <16 x i32>
		%m = mul nsw <16 x i32> %a, %b
%odd = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>		%odd = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
%even = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>		%even = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
%ret = add <8 x i32> %odd, %even		%ret = add <8 x i32> %odd, %even
ret <8 x i32> %ret		ret <8 x i32> %ret
}		}

define <16 x i32> @pmaddwd_32(<32 x i16> %A, <32 x i16> %B) {		define <16 x i32> @pmaddwd_32(<32 x i16> %A, <32 x i16> %B) {
; SSE2-LABEL: pmaddwd_32:		; SSE2-LABEL: pmaddwd_32:
; SSE2: # %bb.0:		; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm8		; SSE2-NEXT: pmaddwd %xmm4, %xmm0
; SSE2-NEXT: pmulhw %xmm4, %xmm8		; SSE2-NEXT: pmaddwd %xmm5, %xmm1
; SSE2-NEXT: pmullw %xmm4, %xmm0		; SSE2-NEXT: pmaddwd %xmm6, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm9		; SSE2-NEXT: pmaddwd %xmm7, %xmm3
; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pmulhw %xmm5, %xmm4
; SSE2-NEXT: pmullw %xmm5, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm8
; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pmulhw %xmm6, %xmm4
; SSE2-NEXT: pmullw %xmm6, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm6
; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pmulhw %xmm7, %xmm4
; SSE2-NEXT: pmullw %xmm7, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm7
; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[0,2]
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[0,2]
; SSE2-NEXT: movdqa %xmm1, %xmm10
; SSE2-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm8[0,2]
; SSE2-NEXT: movdqa %xmm0, %xmm11
; SSE2-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm9[0,2]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm7[1,3]
; SSE2-NEXT: paddd %xmm4, %xmm3
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm6[1,3]
; SSE2-NEXT: paddd %xmm5, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm8[1,3]
; SSE2-NEXT: paddd %xmm10, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm9[1,3]
; SSE2-NEXT: paddd %xmm11, %xmm0
; SSE2-NEXT: retq		; SSE2-NEXT: retq
;		;
; AVX2-LABEL: pmaddwd_32:		; AVX2-LABEL: pmaddwd_32:
; AVX2: # %bb.0:		; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxwd %xmm1, %ymm4		; AVX2-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1		; AVX2-NEXT: vpmaddwd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm5
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vpmovsxwd %xmm3, %ymm6
; AVX2-NEXT: vpmulld %ymm6, %ymm4, %ymm4
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3
; AVX2-NEXT: vpmovsxwd %xmm3, %ymm3
; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpmovsxwd %xmm2, %ymm3
; AVX2-NEXT: vpmulld %ymm3, %ymm5, %ymm3
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX2-NEXT: vpmovsxwd %xmm2, %ymm2
; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,2],ymm1[0,2],ymm4[4,6],ymm1[4,6]
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,2],ymm0[0,2],ymm3[4,6],ymm0[4,6]
; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,1,3]
; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,3],ymm1[1,3],ymm4[5,7],ymm1[5,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3]
; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,3],ymm0[1,3],ymm3[5,7],ymm0[5,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpaddd %ymm0, %ymm5, %ymm0
; AVX2-NEXT: retq		; AVX2-NEXT: retq
;		;
; AVX512F-LABEL: pmaddwd_32:		; AVX512F-LABEL: pmaddwd_32:
; AVX512F: # %bb.0:		; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0		; AVX512F-NEXT: vpmaddwd %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1		; AVX512F-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2		; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmulld %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vpmovsxwd %ymm3, %zmm2
; AVX512F-NEXT: vpmulld %zmm2, %zmm1, %zmm1
; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
; AVX512F-NEXT: vpaddd %zmm3, %zmm2, %zmm0
; AVX512F-NEXT: retq		; AVX512F-NEXT: retq
;		;
; AVX512BW-LABEL: pmaddwd_32:		; AVX512BW-LABEL: pmaddwd_32:
; AVX512BW: # %bb.0:		; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxwd %ymm0, %zmm2		; AVX512BW-NEXT: vpmaddwd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512BW-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512BW-NEXT: vpmovsxwd %ymm1, %zmm3
; AVX512BW-NEXT: vpmulld %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512BW-NEXT: vpmovsxwd %ymm1, %zmm1
; AVX512BW-NEXT: vpmulld %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa32 {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm1
; AVX512BW-NEXT: vmovdqa32 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm3
; AVX512BW-NEXT: vpaddd %zmm3, %zmm1, %zmm0
; AVX512BW-NEXT: retq		; AVX512BW-NEXT: retq
%a = sext <32 x i16> %A to <32 x i32>		%a = sext <32 x i16> %A to <32 x i32>
%b = sext <32 x i16> %B to <32 x i32>		%b = sext <32 x i16> %B to <32 x i32>
%m = mul nsw <32 x i32> %a, %b		%m = mul nsw <32 x i32> %a, %b
%odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>		%odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
%even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>		%even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
%ret = add <16 x i32> %odd, %even		%ret = add <16 x i32> %odd, %even
ret <16 x i32> %ret		ret <16 x i32> %ret
}		}

define <4 x i32> @pmaddwd_const(<8 x i16> %A) {		define <4 x i32> @pmaddwd_const(<8 x i16> %A) {
; SSE2-LABEL: pmaddwd_const:		; SSE2-LABEL: pmaddwd_const:
; SSE2: # %bb.0:		; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32767,32768,0,0,1,7,42,32]		; SSE2-NEXT: pmaddwd {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pmulhw %xmm1, %xmm2
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: retq		; SSE2-NEXT: retq
;		;
; AVX-LABEL: pmaddwd_const:		; AVX-LABEL: pmaddwd_const:
; AVX: # %bb.0:		; AVX: # %bb.0:
; AVX-NEXT: vpmovsxwd %xmm0, %ymm0		; AVX-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq		; AVX-NEXT: retq
%a = sext <8 x i16> %A to <8 x i32>		%a = sext <8 x i16> %A to <8 x i32>
%m = mul nsw <8 x i32> %a, <i32 32767, i32 -32768, i32 0, i32 0, i32 1, i32 7, i32 42, i32 32>		%m = mul nsw <8 x i32> %a, <i32 32767, i32 -32768, i32 0, i32 0, i32 1, i32 7, i32 42, i32 32>
%odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>		%odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>		%even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
%ret = add <4 x i32> %odd, %even		%ret = add <4 x i32> %odd, %even
ret <4 x i32> %ret		ret <4 x i32> %ret
}		}

; Check that there is not selection for unsigned multiplication		; Do not select unsigned i16 multiplication
define <4 x i32> @pmaddwd_negative1(<8 x i16> %A, <8 x i16> %B) {		define <4 x i32> @pmaddwd_negative1(<8 x i16> %A, <8 x i16> %B) {
; SSE2-LABEL: pmaddwd_negative1:		; SSE2-LABEL: pmaddwd_negative1:
; SSE2: # %bb.0:		; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2		; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pmulhuw %xmm1, %xmm2		; SSE2-NEXT: pmulhuw %xmm1, %xmm2
; SSE2-NEXT: pmullw %xmm1, %xmm0		; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1		; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]		; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
Show All 17 Lines	; AVX-NEXT: retq
%b = zext <8 x i16> %B to <8 x i32>		%b = zext <8 x i16> %B to <8 x i32>
%m = mul nuw <8 x i32> %a, %b		%m = mul nuw <8 x i32> %a, %b
%odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>		%odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>		%even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
%ret = add <4 x i32> %odd, %even		%ret = add <4 x i32> %odd, %even
ret <4 x i32> %ret		ret <4 x i32> %ret
}		}

; Check that there is not selection for out-of-bounds constant		; Do not select if constant is too large
define <4 x i32> @pmaddwd_negative2(<8 x i16> %A) {		define <4 x i32> @pmaddwd_negative2(<8 x i16> %A) {
; SSE2-LABEL: pmaddwd_negative2:		; SSE2-LABEL: pmaddwd_negative2:
; SSE2: # %bb.0:		; SSE2: # %bb.0:
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]		; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: psrad $16, %xmm1		; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]		; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE2-NEXT: psrad $16, %xmm0		; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,7,42,32]		; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,7,42,32]
Show All 36 Lines

This is an archive of the discontinued LLVM Phabricator instance.

X86: Add pattern matching for PMADDWD
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 129470

lib/Target/X86/X86ISelLowering.cpp

test/CodeGen/X86/madd.ll

This is an archive of the discontinued LLVM Phabricator instance.

X86: Add pattern matching for PMADDWDClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 129470

lib/Target/X86/X86ISelLowering.cpp

test/CodeGen/X86/madd.ll

X86: Add pattern matching for PMADDWD
ClosedPublic