Diff 129145

lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

	Show First 20 Lines • Show All 32,759 Lines • ▼ Show 20 Lines
	!SplatVal.isOneValue())			!SplatVal.isOneValue())
	return SDValue();			return SDValue();

	SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));			SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
	unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;			unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
	return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);			return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
	}			}

				// Try to recognise an ADD whose two operands are BUILD_VECTORs of the even
				// and the odd elements extracted from one and the same vector MUL, and
				// replace it with a single X86ISD::VPMADDWD node.
				//
				// Op0/Op1 are the two operands of the ADD and VT is the ADD's result type.
				// Returns the new VPMADDWD node on success, or an empty SDValue when the
				// subtarget, the type, or the operand shape does not match.
				static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1, EVT VT,
				const X86Subtarget &Subtarget) {
				// Example of pattern we try to detect:
				// t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
				//(add (build_vector (extract_elt t, 0),
				// (extract_elt t, 2),
				// (extract_elt t, 4),
				// (extract_elt t, 6)),
				// (build_vector (extract_elt t, 1),
				// (extract_elt t, 3),
				// (extract_elt t, 5),
				// (extract_elt t, 7)))

				// Nothing is matched without SSE2.
				if (!Subtarget.hasSSE2())
				return SDValue();
				// Accepted result types: v4i32 always, v8i32 only with AVX2, v16i32 only
				// with BWI. Anything else bails out.
				if (VT != MVT::v4i32 && (VT != MVT::v8i32 \|\| !Subtarget.hasAVX2()) &&
				(VT != MVT::v16i32 \|\| !Subtarget.hasBWI()))
				return SDValue();
				unsigned ValNumElts = VT.getVectorNumElements();

				// Helper for examining one ADD operand.
				// Returns true when Op is a BUILD_VECTOR whose i-th operand is an
				// EXTRACT_VECTOR_ELT with constant index ExpectedIndices[i], and every
				// extract reads from the same MUL node. RetMul is written only on success.
				auto IsBuildVectorOfExtractsFromMul = [](
				SDValue Op, ArrayRef<unsigned> ExpectedIndices, SDValue &RetMul) {
				if (Op->getOpcode() != ISD::BUILD_VECTOR)
				return false;
				SDValue Mul;
				for (unsigned i = 0, e = ExpectedIndices.size(); i != e; ++i) {
				//TODO: Be more tolerant to undefs.
				if (Op->getOperand(i)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
				return false;
				craig.topperUnsubmitted Not Done Reply Inline Actions Second line is indented 2 extra spaces. craig.topper: Second line is indented 2 extra spaces.
				// The extract index must be a compile-time constant equal to the
				// expected lane number for this position.
				auto *Idx = dyn_cast<ConstantSDNode>(Op->getOperand(i)->getOperand(1));
				if (!Idx \|\| Idx->getZExtValue() != ExpectedIndices[i])
				return false;
				if (Mul) {
				// Check that the extract is from the same MUL previously seen.
				if (Mul != Op->getOperand(i)->getOperand(0))
				return false;
				} else {
				// First time an extract_elt's source vector is visited. Must be a MUL
				// with 2X number of vector elements than the BUILD_VECTOR.
				Mul = Op->getOperand(i)->getOperand(0);
				if (Mul->getOpcode() != ISD::MUL \|\|
				craig.topperUnsubmitted Done Reply Inline Actions Do you need a hasSSE2 check on the v4i32? I don't see one before this call in combineAdd. craig.topper: Do you need a hasSSE2 check on the v4i32? I don't see one before this call in combineAdd.
				Mul.getValueType().getVectorNumElements() !=
				2 * ExpectedIndices.size())
				return false;
				}
				}
				RetMul = Mul;
				return true;
				};
				craig.topperUnsubmitted Done Reply Inline Actions What ensures the multiply has exactly 2X the elements of the build_vector? Couldn't it have more? Which would cause the truncate later to fail. craig.topper: What ensures the multiply has exactly 2X the elements of the build_vector? Couldn't it have…
				zviAuthorUnsubmitted Done Reply Inline Actions You're right! Will fix and add tests zvi: You're right! Will fix and add tests
				SDValue L, R;
				// Index tables hold 16 entries each; only the first ValNumElts entries
				// are passed to the helper (ValNumElts is 4, 8 or 16 given the VT check
				// above).
				const unsigned ExpectedEvenIndices[] = {0, 2, 4, 6, 8, 10, 12, 14,
				16, 18, 20, 22, 24, 26, 28, 30};
				const unsigned ExpectedOddIndices[] = {1, 3, 5, 7, 9, 11, 13, 15,
				craig.topperUnsubmitted Done Reply Inline Actions tolerant* craig.topper: tolerant*
				17, 19, 21, 23, 25, 27, 29, 31};
				// Try the two possible orderings: (add even, odd) , (add odd, even)
				// In either ordering both operands must resolve to the same MUL (L == R).
				if (!(IsBuildVectorOfExtractsFromMul(
				Op0, makeArrayRef(ExpectedEvenIndices, ValNumElts), L) &&
				IsBuildVectorOfExtractsFromMul(
				Op1, makeArrayRef(ExpectedOddIndices, ValNumElts), R) &&
				L == R) &&
				!(IsBuildVectorOfExtractsFromMul(
				Op0, makeArrayRef(ExpectedOddIndices, ValNumElts), L) &&
				IsBuildVectorOfExtractsFromMul(
				Op1, makeArrayRef(ExpectedEvenIndices, ValNumElts), R) &&
				L == R))
				return SDValue();

				// Check if the Mul source can be safely shrunk.
				// NOTE(review): MULU16 is rejected here, presumably because the target
				// node multiplies signed 16-bit values - confirm against
				// canReduceVMulWidth, which is defined outside this view.
				ShrinkMode Mode;
				if (!canReduceVMulWidth(L.getNode(), DAG, Mode) \|\| Mode == MULU16)
				return SDValue();

				// Shrink by adding truncate nodes and let DAGCombine fold with the
				// sources.
				craig.topperUnsubmitted Done Reply Inline Actions Is there anything that guarantees even indices will be on the LHS? craig.topper: Is there anything that guarantees even indices will be on the LHS?
				zviAuthorUnsubmitted Done Reply Inline Actions Will add checks for both orderings. thanks zvi: Will add checks for both orderings. thanks
				// The truncated operand type has twice as many i16 lanes as VT has i32
				// lanes, matching the element count required of the MUL above.
				MVT TruncVT;
				switch (ValNumElts) {
				default: llvm_unreachable("Unexpected number of elements");
				case 4: TruncVT = MVT::v8i16; break;
				case 8: TruncVT = MVT::v16i16; break;
				case 16: TruncVT = MVT::v32i16; break;
				}
				// Build VPMADDWD over the two MUL operands, each truncated to TruncVT.
				return DAG.getNode(X86ISD::VPMADDWD, SDLoc(L), VT,
				DAG.getNode(ISD::TRUNCATE, SDLoc(L.getOperand(0)),
				TruncVT, L.getOperand(0)),
				DAG.getNode(ISD::TRUNCATE, SDLoc(L.getOperand(1)),
				TruncVT, L.getOperand(1)));
				}

	static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,			static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {			const X86Subtarget &Subtarget) {
	const SDNodeFlags Flags = N->getFlags();			const SDNodeFlags Flags = N->getFlags();
	if (Flags.hasVectorReduction()) {			if (Flags.hasVectorReduction()) {
	if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))			if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
	return Sad;			return Sad;
	if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))			if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
	return MAdd;			return MAdd;
	}			}
	EVT VT = N->getValueType(0);			EVT VT = N->getValueType(0);
	SDValue Op0 = N->getOperand(0);			SDValue Op0 = N->getOperand(0);
	SDValue Op1 = N->getOperand(1);			SDValue Op1 = N->getOperand(1);

				if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, VT, Subtarget))
				return MAdd;

	// Try to synthesize horizontal adds from adds of shuffles.			// Try to synthesize horizontal adds from adds of shuffles.
	if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 \|\| VT == MVT::v4i32)) \|\|			if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 \|\| VT == MVT::v4i32)) \|\|
	(Subtarget.hasInt256() && (VT == MVT::v16i16 \|\| VT == MVT::v8i32))) &&			(Subtarget.hasInt256() && (VT == MVT::v16i16 \|\| VT == MVT::v8i32))) &&
	isHorizontalBinOp(Op0, Op1, true))			isHorizontalBinOp(Op0, Op1, true))
	return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);			return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);

	if (SDValue V = combineIncDecVector(N, DAG))			if (SDValue V = combineIncDecVector(N, DAG))
	return V;			return V;
	Show All 15 Lines
	!(Subtarget.hasAVX512() && Subtarget.hasBWI() &&			!(Subtarget.hasAVX512() && Subtarget.hasBWI() &&
	(VT == MVT::v64i8 \|\| VT == MVT::v32i16 \|\| VT == MVT::v16i32 \|\|			(VT == MVT::v64i8 \|\| VT == MVT::v32i16 \|\| VT == MVT::v16i32 \|\|
	VT == MVT::v8i64)))			VT == MVT::v8i64)))
	return SDValue();			return SDValue();

	SDValue SubusLHS, SubusRHS;			SDValue SubusLHS, SubusRHS;
	// Try to find umax(a,b) - b or a - umin(a,b) patterns			// Try to find umax(a,b) - b or a - umin(a,b) patterns
	// they may be converted to subus(a,b).			// they may be converted to subus(a,b).
	// TODO: Need to add IR cannonicialization for this code.			// TODO: Need to add IR cannonicialization for this code.
				RKSimonUnsubmitted Not Done Reply Inline Actions Can't you do this in a fully general manner, checking both LHS+RHS at the same time that one is the odd and the other is the even? It seems at the moment you can only check for all_odd+all_even or all_even+all_odd. RKSimon: Can't you do this in a fully general manner, checking both LHS+RHS at the same time that one is…
	if (Op0.getOpcode() == ISD::UMAX) {			if (Op0.getOpcode() == ISD::UMAX) {
	SubusRHS = Op1;			SubusRHS = Op1;
	SDValue MaxLHS = Op0.getOperand(0);			SDValue MaxLHS = Op0.getOperand(0);
	SDValue MaxRHS = Op0.getOperand(1);			SDValue MaxRHS = Op0.getOperand(1);
	if (MaxLHS == Op1)			if (MaxLHS == Op1)
	SubusLHS = MaxRHS;			SubusLHS = MaxRHS;
	else if (MaxRHS == Op1)			else if (MaxRHS == Op1)
	SubusLHS = MaxLHS;			SubusLHS = MaxLHS;
	else			else
	return SDValue();			return SDValue();
	} else if (Op1.getOpcode() == ISD::UMIN) {			} else if (Op1.getOpcode() == ISD::UMIN) {
	SubusLHS = Op0;			SubusLHS = Op0;
	SDValue MinLHS = Op1.getOperand(0);			SDValue MinLHS = Op1.getOperand(0);
	SDValue MinRHS = Op1.getOperand(1);			SDValue MinRHS = Op1.getOperand(1);
	if (MinLHS == Op0)			if (MinLHS == Op0)
	SubusRHS = MinRHS;			SubusRHS = MinRHS;
	else if (MinRHS == Op0)			else if (MinRHS == Op0)
	SubusRHS = MinLHS;			SubusRHS = MinLHS;
	else			else
	return SDValue();			return SDValue();
	} else			} else
	return SDValue();			return SDValue();
				RKSimonUnsubmitted Not Done Reply Inline Actions Mul.getValueType().getVectorNumElements() != (2 * e) RKSimon: Mul.getValueType().getVectorNumElements() != (2 * e)

	// PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with			// PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
	// special preprocessing in some cases.			// special preprocessing in some cases.
	if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64)			if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64)
	return DAG.getNode(X86ISD::SUBUS, SDLoc(N), VT, SubusLHS, SubusRHS);			return DAG.getNode(X86ISD::SUBUS, SDLoc(N), VT, SubusLHS, SubusRHS);

	// Special preprocessing case can be only applied			// Special preprocessing case can be only applied
	// if the value was zero extended from 16 bit,			// if the value was zero extended from 16 bit,
	▲ Show 20 Lines • Show All 1,545 Lines • Show Last 20 Lines

test/CodeGen/X86/madd.ll

Show First 20 Lines • Show All 310 Lines • ▼ Show 20 Lines	middle.block:
%bin.rdx20 = add <16 x i32> %bin.rdx18, %rdx.shuf19		%bin.rdx20 = add <16 x i32> %bin.rdx18, %rdx.shuf19
%13 = extractelement <16 x i32> %bin.rdx20, i32 0		%13 = extractelement <16 x i32> %bin.rdx20, i32 0
ret i32 %13		ret i32 %13
}		}

define <4 x i32> @pmaddwd_8(<8 x i16> %A, <8 x i16> %B) {		define <4 x i32> @pmaddwd_8(<8 x i16> %A, <8 x i16> %B) {
; SSE2-LABEL: pmaddwd_8:		; SSE2-LABEL: pmaddwd_8:
; SSE2: # %bb.0:		; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2		; SSE2-NEXT: pmaddwd %xmm1, %xmm0
; SSE2-NEXT: pmulhw %xmm1, %xmm2
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: retq		; SSE2-NEXT: retq
;		;
; AVX-LABEL: pmaddwd_8:		; AVX-LABEL: pmaddwd_8:
; AVX: # %bb.0:		; AVX: # %bb.0:
; AVX-NEXT: vpmovsxwd %xmm0, %ymm0		; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpmovsxwd %xmm1, %ymm1
; AVX-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq		; AVX-NEXT: retq
%a = sext <8 x i16> %A to <8 x i32>		%a = sext <8 x i16> %A to <8 x i32>
%b = sext <8 x i16> %B to <8 x i32>		%b = sext <8 x i16> %B to <8 x i32>
%m = mul nsw <8 x i32> %a, %b		%m = mul nsw <8 x i32> %a, %b
%odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>		%odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>		%even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
%ret = add <4 x i32> %odd, %even		%ret = add <4 x i32> %odd, %even
ret <4 x i32> %ret		ret <4 x i32> %ret
}		}

		; Same sext/mul/shufflevector sequence as @pmaddwd_8, but the final add takes
		; its operands in the opposite order (%even, %odd). Note the value names are
		; the reverse of the lanes they select: %odd picks lanes 0,2,4,6 and %even
		; picks lanes 1,3,5,7. Both run configurations are expected to produce a
		; single pmaddwd/vpmaddwd.
		define <4 x i32> @pmaddwd_8_swapped(<8 x i16> %A, <8 x i16> %B) {
		; SSE2-LABEL: pmaddwd_8_swapped:
		; SSE2: # %bb.0:
		; SSE2-NEXT: pmaddwd %xmm1, %xmm0
		; SSE2-NEXT: retq
		;
		; AVX-LABEL: pmaddwd_8_swapped:
		; AVX: # %bb.0:
		; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
		; AVX-NEXT: retq
		%a = sext <8 x i16> %A to <8 x i32>
		%b = sext <8 x i16> %B to <8 x i32>
		%m = mul nsw <8 x i32> %a, %b
		%odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
		%even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
		%ret = add <4 x i32> %even, %odd
		ret <4 x i32> %ret
		}

		; The multiply here is <16 x i32> while the shuffles and the add produce
		; <4 x i32>, so the multiply has four times (not twice) as many lanes as the
		; result. Only lanes 0-7 of the product are used. The SSE2 and AVX512 check
		; lines below contain no pmaddwd; the AVX2 check lines end in a vpmaddwd on
		; xmm registers.
		define <4 x i32> @larger_mul(<16 x i16> %A, <16 x i16> %B) {
		; SSE2-LABEL: larger_mul:
		; SSE2: # %bb.0:
		; SSE2-NEXT: movdqa %xmm0, %xmm1
		; SSE2-NEXT: pmulhw %xmm2, %xmm1
		; SSE2-NEXT: pmullw %xmm2, %xmm0
		; SSE2-NEXT: movdqa %xmm0, %xmm2
		; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
		; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
		; SSE2-NEXT: movdqa %xmm0, %xmm1
		; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
		; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
		; SSE2-NEXT: paddd %xmm1, %xmm0
		; SSE2-NEXT: retq
		;
		; AVX2-LABEL: larger_mul:
		; AVX2: # %bb.0:
		; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
		; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
		; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
		; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1
		; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
		; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
		; AVX2-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
		; AVX2-NEXT: vzeroupper
		; AVX2-NEXT: retq
		;
		; AVX512-LABEL: larger_mul:
		; AVX512: # %bb.0:
		; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0
		; AVX512-NEXT: vpmovsxwd %ymm1, %zmm1
		; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
		; AVX512-NEXT: vpextrd $2, %xmm0, %eax
		; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm1
		; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
		; AVX512-NEXT: vmovd %xmm2, %eax
		; AVX512-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
		; AVX512-NEXT: vpextrd $2, %xmm2, %eax
		; AVX512-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
		; AVX512-NEXT: vpextrd $3, %xmm0, %eax
		; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
		; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
		; AVX512-NEXT: vpextrd $1, %xmm2, %eax
		; AVX512-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
		; AVX512-NEXT: vpextrd $3, %xmm2, %eax
		; AVX512-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
		; AVX512-NEXT: vpaddd %xmm0, %xmm1, %xmm0
		; AVX512-NEXT: vzeroupper
		; AVX512-NEXT: retq
		%a = sext <16 x i16> %A to <16 x i32>
		%b = sext <16 x i16> %B to <16 x i32>
		%m = mul nsw <16 x i32> %a, %b
		%odd = shufflevector <16 x i32> %m, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
		%even = shufflevector <16 x i32> %m, <16 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
		%ret = add <4 x i32> %odd, %even
		ret <4 x i32> %ret
		}

define <8 x i32> @pmaddwd_16(<16 x i16> %A, <16 x i16> %B) {		define <8 x i32> @pmaddwd_16(<16 x i16> %A, <16 x i16> %B) {
; SSE2-LABEL: pmaddwd_16:		; SSE2-LABEL: pmaddwd_16:
; SSE2: # %bb.0:		; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm4		; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: pmulhw %xmm2, %xmm4		; SSE2-NEXT: pmulhw %xmm2, %xmm4
; SSE2-NEXT: pmullw %xmm2, %xmm0		; SSE2-NEXT: pmullw %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2		; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]		; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]		; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SSE2-NEXT: movdqa %xmm1, %xmm4		; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pmulhw %xmm3, %xmm4		; SSE2-NEXT: pmulhw %xmm3, %xmm4
; SSE2-NEXT: pmullw %xmm3, %xmm1		; SSE2-NEXT: pmullw %xmm3, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm3		; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]		; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]		; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; SSE2-NEXT: movdqa %xmm1, %xmm4		; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,2]		; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,2]
; SSE2-NEXT: movdqa %xmm0, %xmm5		; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm2[0,2]		; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm2[0,2]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]		; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
; SSE2-NEXT: paddd %xmm4, %xmm1		; SSE2-NEXT: paddd %xmm4, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]		; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
; SSE2-NEXT: paddd %xmm5, %xmm0		; SSE2-NEXT: paddd %xmm5, %xmm0
; SSE2-NEXT: retq		; SSE2-NEXT: retq
		RKSimonUnsubmitted Done Reply Inline Actions What's missing to get SSE2 to lower to 2 x pmaddwd? (TBH I'm more interested in AVX1 but it'd be good for SSE as well). RKSimon: What's missing to get SSE2 to lower to 2 x pmaddwd? (TBH I'm more interested in AVX1 but it'd be…
		zviAuthorUnsubmitted Done Reply Inline Actions That's a good idea. Though it might be tricky to split the 'mul' operands so that type-legalization won't mess-up illegal types. I can try to rework this patch or leave it as a follow-up if it gets too messy. zvi: That's a good idea. Though it might be tricky to split the 'mul' operands so that type…
		RKSimonUnsubmitted Done Reply Inline Actions I did something similar in D41440 for PAVG - we could pull out and generalize the 'LowerToAVG' code to split into legal ops and concat the results. RKSimon: I did something similar in D41440 for PAVG - we could pull out and generalize the 'LowerToAVG'…
;		;
; AVX2-LABEL: pmaddwd_16:		; AVX-LABEL: pmaddwd_16:
; AVX2: # %bb.0:		; AVX: # %bb.0:
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm2		; AVX-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0		; AVX-NEXT: retq
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vpmovsxwd %xmm1, %ymm3
; AVX2-NEXT: vpmulld %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1
; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm0[0,2],ymm2[4,6],ymm0[4,6]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3]
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,3],ymm0[1,3],ymm2[5,7],ymm0[5,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: pmaddwd_16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512-NEXT: vpmovsxwd %ymm1, %zmm1
; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
; AVX512-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; AVX512-NEXT: retq
%a = sext <16 x i16> %A to <16 x i32>		%a = sext <16 x i16> %A to <16 x i32>
%b = sext <16 x i16> %B to <16 x i32>		%b = sext <16 x i16> %B to <16 x i32>
%m = mul nsw <16 x i32> %a, %b		%m = mul nsw <16 x i32> %a, %b
%odd = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>		%odd = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
%even = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>		%even = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
%ret = add <8 x i32> %odd, %even		%ret = add <8 x i32> %odd, %even
ret <8 x i32> %ret		ret <8 x i32> %ret
}		}
▲ Show 20 Lines • Show All 85 Lines • ▼ Show 20 Lines
; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2		; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]		; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm3		; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
; AVX512F-NEXT: vpaddd %zmm3, %zmm2, %zmm0		; AVX512F-NEXT: vpaddd %zmm3, %zmm2, %zmm0
; AVX512F-NEXT: retq		; AVX512F-NEXT: retq
;		;
; AVX512BW-LABEL: pmaddwd_32:		; AVX512BW-LABEL: pmaddwd_32:
; AVX512BW: # %bb.0:		; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxwd %ymm0, %zmm2		; AVX512BW-NEXT: vpmaddwd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512BW-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512BW-NEXT: vpmovsxwd %ymm1, %zmm3
; AVX512BW-NEXT: vpmulld %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512BW-NEXT: vpmovsxwd %ymm1, %zmm1
; AVX512BW-NEXT: vpmulld %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa32 {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm1
; AVX512BW-NEXT: vmovdqa32 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm3
; AVX512BW-NEXT: vpaddd %zmm3, %zmm1, %zmm0
; AVX512BW-NEXT: retq		; AVX512BW-NEXT: retq
%a = sext <32 x i16> %A to <32 x i32>		%a = sext <32 x i16> %A to <32 x i32>
%b = sext <32 x i16> %B to <32 x i32>		%b = sext <32 x i16> %B to <32 x i32>
%m = mul nsw <32 x i32> %a, %b		%m = mul nsw <32 x i32> %a, %b
%odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>		%odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
%even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>		%even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
%ret = add <16 x i32> %odd, %even		%ret = add <16 x i32> %odd, %even
ret <16 x i32> %ret		ret <16 x i32> %ret
}		}

define <4 x i32> @pmaddwd_const(<8 x i16> %A) {		define <4 x i32> @pmaddwd_const(<8 x i16> %A) {
; SSE2-LABEL: pmaddwd_const:		; SSE2-LABEL: pmaddwd_const:
; SSE2: # %bb.0:		; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32767,32768,0,0,1,7,42,32]		; SSE2-NEXT: pmaddwd {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pmulhw %xmm1, %xmm2
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: retq		; SSE2-NEXT: retq
;		;
; AVX-LABEL: pmaddwd_const:		; AVX-LABEL: pmaddwd_const:
; AVX: # %bb.0:		; AVX: # %bb.0:
; AVX-NEXT: vpmovsxwd %xmm0, %ymm0		; AVX-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq		; AVX-NEXT: retq
%a = sext <8 x i16> %A to <8 x i32>		%a = sext <8 x i16> %A to <8 x i32>
%m = mul nsw <8 x i32> %a, <i32 32767, i32 -32768, i32 0, i32 0, i32 1, i32 7, i32 42, i32 32>		%m = mul nsw <8 x i32> %a, <i32 32767, i32 -32768, i32 0, i32 0, i32 1, i32 7, i32 42, i32 32>
%odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>		%odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>		%even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
%ret = add <4 x i32> %odd, %even		%ret = add <4 x i32> %odd, %even
ret <4 x i32> %ret		ret <4 x i32> %ret
}		}
▲ Show 20 Lines • Show All 80 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

X86: Add pattern matching for PMADDWD
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 129145

lib/Target/X86/X86ISelLowering.cpp

test/CodeGen/X86/madd.ll

This is an archive of the discontinued LLVM Phabricator instance.

X86: Add pattern matching for PMADDWD
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 129145

lib/Target/X86/X86ISelLowering.cpp

test/CodeGen/X86/madd.ll

X86: Add pattern matching for PMADDWD
ClosedPublic