Diff 211556

llvm/trunk/include/llvm/CodeGen/SelectionDAG.h

Show First 20 Lines • Show All 1,582 Lines • ▼ Show 20 Lines	public:
/// that element from the source vector.		/// that element from the source vector.
SDValue getSplatValue(SDValue V);		SDValue getSplatValue(SDValue V);

/// Match a binop + shuffle pyramid that represents a horizontal reduction		/// Match a binop + shuffle pyramid that represents a horizontal reduction
/// over the elements of a vector starting from the EXTRACT_VECTOR_ELT node \p		/// over the elements of a vector starting from the EXTRACT_VECTOR_ELT node \p
/// Extract. The reduction must use one of the opcodes listed in \p		/// Extract. The reduction must use one of the opcodes listed in \p
/// CandidateBinOps and on success \p BinOp will contain the matching opcode.		/// CandidateBinOps and on success \p BinOp will contain the matching opcode.
/// Returns the vector that is being reduced on, or SDValue() if a reduction		/// Returns the vector that is being reduced on, or SDValue() if a reduction
/// was not matched.		/// was not matched. If \p AllowPartials is set then in the case of a
		/// reduction pattern that only matches the first few stages, the extracted
		/// subvector of the start of the reduction is returned.
SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp,		SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp,
ArrayRef<ISD::NodeType> CandidateBinOps);		ArrayRef<ISD::NodeType> CandidateBinOps,
		bool AllowPartials = false);

/// Utility function used by legalize and lowering to		/// Utility function used by legalize and lowering to
/// "unroll" a vector operation by splitting out the scalars and operating		/// "unroll" a vector operation by splitting out the scalars and operating
/// on each element individually. If the ResNE is 0, fully unroll the vector		/// on each element individually. If the ResNE is 0, fully unroll the vector
/// op. If ResNE is less than the width of the vector op, unroll up to ResNE.		/// op. If ResNE is less than the width of the vector op, unroll up to ResNE.
/// If the ResNE is greater than the width of the vector op, unroll the		/// If the ResNE is greater than the width of the vector op, unroll the
/// vector op and fill the end of the resulting vector with UNDEFS.		/// vector op and fill the end of the resulting vector with UNDEFS.
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE = 0);		SDValue UnrollVectorOp(SDNode *N, unsigned ResNE = 0);
▲ Show 20 Lines • Show All 167 Lines • Show Last 20 Lines

llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

	Show First 20 Lines • Show All 8,999 Lines • ▼ Show 20 Lines
	}			}

	void SDNode::intersectFlagsWith(const SDNodeFlags Flags) {			void SDNode::intersectFlagsWith(const SDNodeFlags Flags) {
	this->Flags.intersectWith(Flags);			this->Flags.intersectWith(Flags);
	}			}

	SDValue			SDValue
	SelectionDAG::matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp,			SelectionDAG::matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp,
	ArrayRef<ISD::NodeType> CandidateBinOps) {			ArrayRef<ISD::NodeType> CandidateBinOps,
				bool AllowPartials) {
	// The pattern must end in an extract from index 0.			// The pattern must end in an extract from index 0.
	if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT ||			if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
	!isNullConstant(Extract->getOperand(1)))			!isNullConstant(Extract->getOperand(1)))
	return SDValue();			return SDValue();

	SDValue Op = Extract->getOperand(0);			SDValue Op = Extract->getOperand(0);
	unsigned Stages = Log2_32(Op.getValueType().getVectorNumElements());			unsigned Stages = Log2_32(Op.getValueType().getVectorNumElements());

	// Match against one of the candidate binary ops.			// Match against one of the candidate binary ops.
	if (llvm::none_of(CandidateBinOps, [Op](ISD::NodeType BinOp) {			if (llvm::none_of(CandidateBinOps, [Op](ISD::NodeType BinOp) {
	return Op.getOpcode() == unsigned(BinOp);			return Op.getOpcode() == unsigned(BinOp);
	}))			}))
	return SDValue();			return SDValue();
				unsigned CandidateBinOp = Op.getOpcode();

				// Matching failed - attempt to see if we did enough stages that a partial
				// reduction from a subvector is possible.
				auto PartialReduction = [&](SDValue Op, unsigned NumSubElts) {
				if (!AllowPartials || !Op)
				return SDValue();
				EVT OpVT = Op.getValueType();
				EVT OpSVT = OpVT.getScalarType();
				EVT SubVT = EVT::getVectorVT(*getContext(), OpSVT, NumSubElts);
				if (!TLI->isExtractSubvectorCheap(SubVT, OpVT, 0))
				return SDValue();
				BinOp = (ISD::NodeType)CandidateBinOp;
				return getNode(
				ISD::EXTRACT_SUBVECTOR, SDLoc(Op), SubVT, Op,
				getConstant(0, SDLoc(Op), TLI->getVectorIdxTy(getDataLayout())));
				};

	// At each stage, we're looking for something that looks like:			// At each stage, we're looking for something that looks like:
	// %s = shufflevector <8 x i32> %op, <8 x i32> undef,			// %s = shufflevector <8 x i32> %op, <8 x i32> undef,
	// <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,			// <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
	// i32 undef, i32 undef, i32 undef, i32 undef>			// i32 undef, i32 undef, i32 undef, i32 undef>
	// %a = binop <8 x i32> %op, %s			// %a = binop <8 x i32> %op, %s
	// Where the mask changes according to the stage. E.g. for a 3-stage pyramid,			// Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
	// we expect something like:			// we expect something like:
	// <4,5,6,7,u,u,u,u>			// <4,5,6,7,u,u,u,u>
	// <2,3,u,u,u,u,u,u>			// <2,3,u,u,u,u,u,u>
	// <1,u,u,u,u,u,u,u>			// <1,u,u,u,u,u,u,u>
	unsigned CandidateBinOp = Op.getOpcode();			// While a partial reduction match would be:
				// <2,3,u,u,u,u,u,u>
				// <1,u,u,u,u,u,u,u>
				SDValue PrevOp;
	for (unsigned i = 0; i < Stages; ++i) {			for (unsigned i = 0; i < Stages; ++i) {
				unsigned MaskEnd = (1 << i);

	if (Op.getOpcode() != CandidateBinOp)			if (Op.getOpcode() != CandidateBinOp)
	return SDValue();			return PartialReduction(PrevOp, MaskEnd);

	SDValue Op0 = Op.getOperand(0);			SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);			SDValue Op1 = Op.getOperand(1);

	ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(Op0);			ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(Op0);
	if (Shuffle) {			if (Shuffle) {
	Op = Op1;			Op = Op1;
	} else {			} else {
	Shuffle = dyn_cast<ShuffleVectorSDNode>(Op1);			Shuffle = dyn_cast<ShuffleVectorSDNode>(Op1);
	Op = Op0;			Op = Op0;
	}			}

	// The first operand of the shuffle should be the same as the other operand			// The first operand of the shuffle should be the same as the other operand
	// of the binop.			// of the binop.
	if (!Shuffle || Shuffle->getOperand(0) != Op)			if (!Shuffle || Shuffle->getOperand(0) != Op)
	return SDValue();			return PartialReduction(PrevOp, MaskEnd);

	// Verify the shuffle has the expected (at this stage of the pyramid) mask.			// Verify the shuffle has the expected (at this stage of the pyramid) mask.
	for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)			for (int Index = 0; Index < (int)MaskEnd; ++Index)
	if (Shuffle->getMaskElt(Index) != MaskEnd + Index)			if (Shuffle->getMaskElt(Index) != (MaskEnd + Index))
	return SDValue();			return PartialReduction(PrevOp, MaskEnd);

				PrevOp = Op;
	}			}

	BinOp = (ISD::NodeType)CandidateBinOp;			BinOp = (ISD::NodeType)CandidateBinOp;
	return Op;			return Op;
	}			}

	SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) {			SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) {
	assert(N->getNumValues() == 1 &&			assert(N->getNumValues() == 1 &&
	▲ Show 20 Lines • Show All 529 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 35,682 Lines • ▼ Show 20 Lines
/// Try to convert a vector reduction sequence composed of binops and shuffles		/// Try to convert a vector reduction sequence composed of binops and shuffles
/// into horizontal ops.		/// into horizontal ops.
static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,		static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {		const X86Subtarget &Subtarget) {
assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");		assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");

// TODO: Allow FADD with reduction and/or reassociation and no-signed-zeros.		// TODO: Allow FADD with reduction and/or reassociation and no-signed-zeros.
ISD::NodeType Opc;		ISD::NodeType Opc;
SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD});		SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD}, true);
if (!Rdx)		if (!Rdx)
return SDValue();		return SDValue();

SDValue Index = ExtElt->getOperand(1);		SDValue Index = ExtElt->getOperand(1);
assert(isNullConstant(Index) &&		assert(isNullConstant(Index) &&
"Reduction doesn't end in an extract from index 0");		"Reduction doesn't end in an extract from index 0");

EVT VT = ExtElt->getValueType(0);		EVT VT = ExtElt->getValueType(0);
EVT VecVT = ExtElt->getOperand(0).getValueType();		EVT VecVT = Rdx.getValueType();
if (VecVT.getScalarType() != VT)		if (VecVT.getScalarType() != VT)
return SDValue();		return SDValue();

// Must be a >=128-bit vector with pow2 elements.		// Must be a >=128-bit vector with pow2 elements.
if ((VecVT.getSizeInBits() % 128) != 0 ||		if ((VecVT.getSizeInBits() % 128) != 0 ||
!isPowerOf2_32(VecVT.getVectorNumElements()))		!isPowerOf2_32(VecVT.getVectorNumElements()))
return SDValue();		return SDValue();

SDLoc DL(ExtElt);		SDLoc DL(ExtElt);

// vXi8 reduction - sum lo/hi halves then use PSADBW.		// vXi8 reduction - sum lo/hi halves then use PSADBW.
if (VT == MVT::i8) {		if (VT == MVT::i8) {
while (Rdx.getValueSizeInBits() > 128) {		while (Rdx.getValueSizeInBits() > 128) {
EVT RdxVT = Rdx.getValueType();		unsigned HalfSize = VecVT.getSizeInBits() / 2;
unsigned HalfSize = RdxVT.getSizeInBits() / 2;		unsigned HalfElts = VecVT.getVectorNumElements() / 2;
unsigned HalfElts = RdxVT.getVectorNumElements() / 2;
SDValue Lo = extractSubVector(Rdx, 0, DAG, DL, HalfSize);		SDValue Lo = extractSubVector(Rdx, 0, DAG, DL, HalfSize);
SDValue Hi = extractSubVector(Rdx, HalfElts, DAG, DL, HalfSize);		SDValue Hi = extractSubVector(Rdx, HalfElts, DAG, DL, HalfSize);
Rdx = DAG.getNode(ISD::ADD, DL, Lo.getValueType(), Lo, Hi);		Rdx = DAG.getNode(ISD::ADD, DL, Lo.getValueType(), Lo, Hi);
		VecVT = Rdx.getValueType();
}		}
assert(Rdx.getValueType() == MVT::v16i8 && "v16i8 reduction expected");		assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");

SDValue Hi = DAG.getVectorShuffle(		SDValue Hi = DAG.getVectorShuffle(
MVT::v16i8, DL, Rdx, Rdx,		MVT::v16i8, DL, Rdx, Rdx,
{8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});		{8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);		Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,		Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
getZeroVector(MVT::v16i8, Subtarget, DAG, DL));		getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
Rdx = DAG.getBitcast(MVT::v16i8, Rdx);		Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
Show All 11 Lines	static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
// across the whole vector, so we need an extract + hop preliminary stage.		// across the whole vector, so we need an extract + hop preliminary stage.
// This is the only step where the operands of the hop are not the same value.		// This is the only step where the operands of the hop are not the same value.
// TODO: We could extend this to handle 512-bit or even longer vectors.		// TODO: We could extend this to handle 512-bit or even longer vectors.
if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||		if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {		((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
unsigned NumElts = VecVT.getVectorNumElements();		unsigned NumElts = VecVT.getVectorNumElements();
SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);		SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);		SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
VecVT = EVT::getVectorVT(*DAG.getContext(), VT, NumElts / 2);		Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Hi, Lo);		VecVT = Rdx.getValueType();
}		}
if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&		if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
!((VecVT == MVT::v4f32 \|\| VecVT == MVT::v2f64) && Subtarget.hasSSE3()))		!((VecVT == MVT::v4f32 \|\| VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
return SDValue();		return SDValue();

// extract (add (shuf X), X), 0 --> extract (hadd X, X), 0		// extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
assert(Rdx.getValueType() == VecVT && "Unexpected reduction match");
unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());		unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
for (unsigned i = 0; i != ReductionSteps; ++i)		for (unsigned i = 0; i != ReductionSteps; ++i)
Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);		Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);

return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);		return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
}		}

/// Detect vector gather/scatter index generation and convert it from being a		/// Detect vector gather/scatter index generation and convert it from being a
▲ Show 20 Lines • Show All 9,836 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/phaddsub-extract.ll

	Show First 20 Lines • Show All 1,693 Lines • ▼ Show 20 Lines
	; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]			; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
	; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0			; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
	; AVX-SLOW-NEXT: vmovd %xmm0, %eax			; AVX-SLOW-NEXT: vmovd %xmm0, %eax
	; AVX-SLOW-NEXT: vzeroupper			; AVX-SLOW-NEXT: vzeroupper
	; AVX-SLOW-NEXT: retq			; AVX-SLOW-NEXT: retq
	;			;
	; AVX-FAST-LABEL: partial_reduction_add_v8i32:			; AVX-FAST-LABEL: partial_reduction_add_v8i32:
	; AVX-FAST: # %bb.0:			; AVX-FAST: # %bb.0:
	; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]			; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
	; AVX-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
	; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0			; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
	; AVX-FAST-NEXT: vmovd %xmm0, %eax			; AVX-FAST-NEXT: vmovd %xmm0, %eax
	; AVX-FAST-NEXT: vzeroupper			; AVX-FAST-NEXT: vzeroupper
	; AVX-FAST-NEXT: retq			; AVX-FAST-NEXT: retq
	%x23 = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>			%x23 = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
	%x0213 = add <8 x i32> %x, %x23			%x0213 = add <8 x i32> %x, %x23
	%x13 = shufflevector <8 x i32> %x0213, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>			%x13 = shufflevector <8 x i32> %x0213, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
	%x0123 = add <8 x i32> %x0213, %x13			%x0123 = add <8 x i32> %x0213, %x13
	Show All 24 Lines
	; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]			; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0			; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
	; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]			; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
	; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0			; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
	; AVX-SLOW-NEXT: vmovd %xmm0, %eax			; AVX-SLOW-NEXT: vmovd %xmm0, %eax
	; AVX-SLOW-NEXT: vzeroupper			; AVX-SLOW-NEXT: vzeroupper
	; AVX-SLOW-NEXT: retq			; AVX-SLOW-NEXT: retq
	;			;
	; AVX1-FAST-LABEL: partial_reduction_add_v16i32:			; AVX-FAST-LABEL: partial_reduction_add_v16i32:
	; AVX1-FAST: # %bb.0:			; AVX-FAST: # %bb.0:
	; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]			; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
	; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0			; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
	; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0			; AVX-FAST-NEXT: vmovd %xmm0, %eax
	; AVX1-FAST-NEXT: vmovd %xmm0, %eax			; AVX-FAST-NEXT: vzeroupper
	; AVX1-FAST-NEXT: vzeroupper			; AVX-FAST-NEXT: retq
	; AVX1-FAST-NEXT: retq
	;
	; AVX2-FAST-LABEL: partial_reduction_add_v16i32:
	; AVX2-FAST: # %bb.0:
	; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
	; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
	; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
	; AVX2-FAST-NEXT: vmovd %xmm0, %eax
	; AVX2-FAST-NEXT: vzeroupper
	; AVX2-FAST-NEXT: retq
	;
	; AVX512-FAST-LABEL: partial_reduction_add_v16i32:
	; AVX512-FAST: # %bb.0:
	; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; AVX512-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
	; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
	; AVX512-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
	; AVX512-FAST-NEXT: vmovd %xmm0, %eax
	; AVX512-FAST-NEXT: vzeroupper
	; AVX512-FAST-NEXT: retq
	%x23 = shufflevector <16 x i32> %x, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>			%x23 = shufflevector <16 x i32> %x, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
	%x0213 = add <16 x i32> %x, %x23			%x0213 = add <16 x i32> %x, %x23
	%x13 = shufflevector <16 x i32> %x0213, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>			%x13 = shufflevector <16 x i32> %x0213, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
	%x0123 = add <16 x i32> %x0213, %x13			%x0123 = add <16 x i32> %x0213, %x13
	%r = extractelement <16 x i32> %x0123, i32 0			%r = extractelement <16 x i32> %x0123, i32 0
	ret i32 %r			ret i32 %r
	}			}

	▲ Show 20 Lines • Show All 225 Lines • ▼ Show 20 Lines
	; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]			; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
	; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0			; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
	; AVX-SLOW-NEXT: vmovd %xmm0, %eax			; AVX-SLOW-NEXT: vmovd %xmm0, %eax
	; AVX-SLOW-NEXT: vzeroupper			; AVX-SLOW-NEXT: vzeroupper
	; AVX-SLOW-NEXT: retq			; AVX-SLOW-NEXT: retq
	;			;
	; AVX-FAST-LABEL: hadd32_8:			; AVX-FAST-LABEL: hadd32_8:
	; AVX-FAST: # %bb.0:			; AVX-FAST: # %bb.0:
	; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]			; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
	; AVX-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
	; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0			; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
	; AVX-FAST-NEXT: vmovd %xmm0, %eax			; AVX-FAST-NEXT: vmovd %xmm0, %eax
	; AVX-FAST-NEXT: vzeroupper			; AVX-FAST-NEXT: vzeroupper
	; AVX-FAST-NEXT: retq			; AVX-FAST-NEXT: retq
	%x226 = shufflevector <8 x i32> %x225, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>			%x226 = shufflevector <8 x i32> %x225, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
	%x227 = add <8 x i32> %x225, %x226			%x227 = add <8 x i32> %x225, %x226
	%x228 = shufflevector <8 x i32> %x227, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>			%x228 = shufflevector <8 x i32> %x227, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
	%x229 = add <8 x i32> %x227, %x228			%x229 = add <8 x i32> %x227, %x228
	Show All 24 Lines
	; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]			; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0			; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
	; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]			; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
	; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0			; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
	; AVX-SLOW-NEXT: vmovd %xmm0, %eax			; AVX-SLOW-NEXT: vmovd %xmm0, %eax
	; AVX-SLOW-NEXT: vzeroupper			; AVX-SLOW-NEXT: vzeroupper
	; AVX-SLOW-NEXT: retq			; AVX-SLOW-NEXT: retq
	;			;
	; AVX1-FAST-LABEL: hadd32_16:			; AVX-FAST-LABEL: hadd32_16:
	; AVX1-FAST: # %bb.0:			; AVX-FAST: # %bb.0:
	; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]			; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
	; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0			; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
	; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0			; AVX-FAST-NEXT: vmovd %xmm0, %eax
	; AVX1-FAST-NEXT: vmovd %xmm0, %eax			; AVX-FAST-NEXT: vzeroupper
	; AVX1-FAST-NEXT: vzeroupper			; AVX-FAST-NEXT: retq
	; AVX1-FAST-NEXT: retq
	;
	; AVX2-FAST-LABEL: hadd32_16:
	; AVX2-FAST: # %bb.0:
	; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
	; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
	; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
	; AVX2-FAST-NEXT: vmovd %xmm0, %eax
	; AVX2-FAST-NEXT: vzeroupper
	; AVX2-FAST-NEXT: retq
	;
	; AVX512-FAST-LABEL: hadd32_16:
	; AVX512-FAST: # %bb.0:
	; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; AVX512-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
	; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
	; AVX512-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
	; AVX512-FAST-NEXT: vmovd %xmm0, %eax
	; AVX512-FAST-NEXT: vzeroupper
	; AVX512-FAST-NEXT: retq
	%x226 = shufflevector <16 x i32> %x225, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>			%x226 = shufflevector <16 x i32> %x225, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
	%x227 = add <16 x i32> %x225, %x226			%x227 = add <16 x i32> %x225, %x226
	%x228 = shufflevector <16 x i32> %x227, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>			%x228 = shufflevector <16 x i32> %x227, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
	%x229 = add <16 x i32> %x227, %x228			%x229 = add <16 x i32> %x227, %x228
	%x230 = extractelement <16 x i32> %x229, i32 0			%x230 = extractelement <16 x i32> %x229, i32 0
	ret i32 %x230			ret i32 %x230
	}			}

	▲ Show 20 Lines • Show All 53 Lines • ▼ Show 20 Lines
	; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]			; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; SSE3-NEXT: paddd %xmm0, %xmm1			; SSE3-NEXT: paddd %xmm0, %xmm1
	; SSE3-NEXT: phaddd %xmm1, %xmm1			; SSE3-NEXT: phaddd %xmm1, %xmm1
	; SSE3-NEXT: movd %xmm1, %eax			; SSE3-NEXT: movd %xmm1, %eax
	; SSE3-NEXT: retq			; SSE3-NEXT: retq
	;			;
	; AVX-LABEL: hadd32_8_optsize:			; AVX-LABEL: hadd32_8_optsize:
	; AVX: # %bb.0:			; AVX: # %bb.0:
	; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]			; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
	; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
	; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0			; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
	; AVX-NEXT: vmovd %xmm0, %eax			; AVX-NEXT: vmovd %xmm0, %eax
	; AVX-NEXT: vzeroupper			; AVX-NEXT: vzeroupper
	; AVX-NEXT: retq			; AVX-NEXT: retq
	%x226 = shufflevector <8 x i32> %x225, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>			%x226 = shufflevector <8 x i32> %x225, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
	%x227 = add <8 x i32> %x225, %x226			%x227 = add <8 x i32> %x225, %x226
	%x228 = shufflevector <8 x i32> %x227, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>			%x228 = shufflevector <8 x i32> %x227, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
	%x229 = add <8 x i32> %x227, %x228			%x229 = add <8 x i32> %x227, %x228
	%x230 = extractelement <8 x i32> %x229, i32 0			%x230 = extractelement <8 x i32> %x229, i32 0
	ret i32 %x230			ret i32 %x230
	}			}

	define i32 @hadd32_16_optsize(<16 x i32> %x225) optsize {			define i32 @hadd32_16_optsize(<16 x i32> %x225) optsize {
	; SSE3-LABEL: hadd32_16_optsize:			; SSE3-LABEL: hadd32_16_optsize:
	; SSE3: # %bb.0:			; SSE3: # %bb.0:
	; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]			; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; SSE3-NEXT: paddd %xmm0, %xmm1			; SSE3-NEXT: paddd %xmm0, %xmm1
	; SSE3-NEXT: phaddd %xmm1, %xmm1			; SSE3-NEXT: phaddd %xmm1, %xmm1
	; SSE3-NEXT: movd %xmm1, %eax			; SSE3-NEXT: movd %xmm1, %eax
	; SSE3-NEXT: retq			; SSE3-NEXT: retq
	;			;
	; AVX1-SLOW-LABEL: hadd32_16_optsize:			; AVX-LABEL: hadd32_16_optsize:
	; AVX1-SLOW: # %bb.0:			; AVX: # %bb.0:
	; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]			; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
	; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0			; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
	; AVX1-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm0			; AVX-NEXT: vmovd %xmm0, %eax
	; AVX1-SLOW-NEXT: vmovd %xmm0, %eax			; AVX-NEXT: vzeroupper
	; AVX1-SLOW-NEXT: vzeroupper			; AVX-NEXT: retq
	; AVX1-SLOW-NEXT: retq
	;
	; AVX1-FAST-LABEL: hadd32_16_optsize:
	; AVX1-FAST: # %bb.0:
	; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
	; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
	; AVX1-FAST-NEXT: vmovd %xmm0, %eax
	; AVX1-FAST-NEXT: vzeroupper
	; AVX1-FAST-NEXT: retq
	;
	; AVX2-SLOW-LABEL: hadd32_16_optsize:
	; AVX2-SLOW: # %bb.0:
	; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
	; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
	; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
	; AVX2-SLOW-NEXT: vmovd %xmm0, %eax
	; AVX2-SLOW-NEXT: vzeroupper
	; AVX2-SLOW-NEXT: retq
	;
	; AVX2-FAST-LABEL: hadd32_16_optsize:
	; AVX2-FAST: # %bb.0:
	; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
	; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
	; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
	; AVX2-FAST-NEXT: vmovd %xmm0, %eax
	; AVX2-FAST-NEXT: vzeroupper
	; AVX2-FAST-NEXT: retq
	;
	; AVX512-SLOW-LABEL: hadd32_16_optsize:
	; AVX512-SLOW: # %bb.0:
	; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; AVX512-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
	; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
	; AVX512-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
	; AVX512-SLOW-NEXT: vmovd %xmm0, %eax
	; AVX512-SLOW-NEXT: vzeroupper
	; AVX512-SLOW-NEXT: retq
	;
	; AVX512-FAST-LABEL: hadd32_16_optsize:
	; AVX512-FAST: # %bb.0:
	; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; AVX512-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
	; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
	; AVX512-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
	; AVX512-FAST-NEXT: vmovd %xmm0, %eax
	; AVX512-FAST-NEXT: vzeroupper
	; AVX512-FAST-NEXT: retq
	%x226 = shufflevector <16 x i32> %x225, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>			%x226 = shufflevector <16 x i32> %x225, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
	%x227 = add <16 x i32> %x225, %x226			%x227 = add <16 x i32> %x225, %x226
	%x228 = shufflevector <16 x i32> %x227, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>			%x228 = shufflevector <16 x i32> %x227, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
	%x229 = add <16 x i32> %x227, %x228			%x229 = add <16 x i32> %x227, %x228
	%x230 = extractelement <16 x i32> %x229, i32 0			%x230 = extractelement <16 x i32> %x229, i32 0
	ret i32 %x230			ret i32 %x230
	}			}

This is an archive of the discontinued LLVM Phabricator instance.

[DAGCombine] matchBinOpReduction - add partial reduction matching
Closed · Public

Details

Diff Detail

Event Timeline

Revision Contents

Diff 211556

llvm/trunk/include/llvm/CodeGen/SelectionDAG.h

llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

llvm/trunk/lib/Target/X86/X86ISelLowering.cpp

llvm/trunk/test/CodeGen/X86/phaddsub-extract.ll

This is an archive of the discontinued LLVM Phabricator instance.

[DAGCombine] matchBinOpReduction - add partial reduction matching
Closed · Public

Details

Diff Detail

Event Timeline

Revision Contents

Diff 211556

llvm/trunk/include/llvm/CodeGen/SelectionDAG.h

llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

llvm/trunk/lib/Target/X86/X86ISelLowering.cpp

llvm/trunk/test/CodeGen/X86/phaddsub-extract.ll

[DAGCombine] matchBinOpReduction - add partial reduction matching
Closed · Public