Diff 94033

llvm/trunk/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

	Show First 20 Lines • Show All 29,138 Lines • ▼ Show 20 Lines
	// This is useful as it is the input into a SAD pattern.			// This is useful as it is the input into a SAD pattern.
	static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,			static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
	SDValue &Op1) {			SDValue &Op1) {
	// Check the condition of the select instruction is greater-than.			// Check the condition of the select instruction is greater-than.
	SDValue SetCC = Select->getOperand(0);			SDValue SetCC = Select->getOperand(0);
	if (SetCC.getOpcode() != ISD::SETCC)			if (SetCC.getOpcode() != ISD::SETCC)
	return false;			return false;
	ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();			ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
	if (CC != ISD::SETGT)			if (CC != ISD::SETGT && CC != ISD::SETLT)
	return false;			return false;

	SDValue SelectOp1 = Select->getOperand(1);			SDValue SelectOp1 = Select->getOperand(1);
	SDValue SelectOp2 = Select->getOperand(2);			SDValue SelectOp2 = Select->getOperand(2);

				// The following instructions assume SelectOp1 is the subtraction operand
				// and SelectOp2 is the negation operand.
				// In the case of SETLT this is the other way around.
				if (CC == ISD::SETLT)
				std::swap(SelectOp1, SelectOp2);

	// The second operand of the select should be the negation of the first			// The second operand of the select should be the negation of the first
	// operand, which is implemented as 0 - SelectOp1.			// operand, which is implemented as 0 - SelectOp1.
	if (!(SelectOp2.getOpcode() == ISD::SUB &&			if (!(SelectOp2.getOpcode() == ISD::SUB &&
	ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&			ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
	SelectOp2.getOperand(1) == SelectOp1))			SelectOp2.getOperand(1) == SelectOp1))
	return false;			return false;

	// The first operand of SetCC is the first operand of the select, which is the			// The first operand of SetCC is the first operand of the select, which is the
	// difference between the two input vectors.			// difference between the two input vectors.
	if (SetCC.getOperand(0) != SelectOp1)			if (SetCC.getOperand(0) != SelectOp1)
	return false;			return false;

	// The second operand of the comparison can be either -1 or 0.			// In the SETLT case, the second operand of the comparison can be either 1 or 0.
	if (!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||			APInt SplatVal;
				if ((CC == ISD::SETLT) &&
				!((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
				SplatVal == 1) ||
				(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
				return false;

				// In the SETGT case, the second operand of the comparison can be either -1 or 0.
				if ((CC == ISD::SETGT) &&
				!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
	ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))			ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
	return false;			return false;

	// The first operand of the select is the difference between the two input			// The first operand of the select is the difference between the two input
	// vectors.			// vectors.
	if (SelectOp1.getOpcode() != ISD::SUB)			if (SelectOp1.getOpcode() != ISD::SUB)
	return false;			return false;

	▲ Show 20 Lines • Show All 112 Lines • ▼ Show 20 Lines
	}			}

	static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,			static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {			const X86Subtarget &Subtarget) {
	// PSADBW is only supported on SSE2 and up.			// PSADBW is only supported on SSE2 and up.
	if (!Subtarget.hasSSE2())			if (!Subtarget.hasSSE2())
	return SDValue();			return SDValue();

	// Verify the type we're extracting from is appropriate			// Verify the type we're extracting from is any integer type above i16.
	// TODO: There's nothing special about i32, any integer type above i16 should
	// work just as well.
	EVT VT = Extract->getOperand(0).getValueType();			EVT VT = Extract->getOperand(0).getValueType();
	if (!VT.isSimple() || !(VT.getVectorElementType() == MVT::i32))			if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
	return SDValue();			return SDValue();

	unsigned RegSize = 128;			unsigned RegSize = 128;
	if (Subtarget.hasBWI())			if (Subtarget.hasBWI())
	RegSize = 512;			RegSize = 512;
	else if (Subtarget.hasAVX2())			else if (Subtarget.hasAVX2())
	RegSize = 256;			RegSize = 256;

	// We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.			// We handle up to v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
	// TODO: We should be able to handle larger vectors by splitting them before			// TODO: We should be able to handle larger vectors by splitting them before
	// feeding them into several SADs, and then reducing over those.			// feeding them into several SADs, and then reducing over those.
	if (VT.getSizeInBits() / 4 > RegSize)			if (RegSize / VT.getVectorNumElements() < 8)
	return SDValue();			return SDValue();

	// Match shuffle + add pyramid.			// Match shuffle + add pyramid.
	SDValue Root = matchBinOpReduction(Extract, ISD::ADD);			SDValue Root = matchBinOpReduction(Extract, ISD::ADD);

				// The operand is expected to be zero extended from i8
				// (verified in detectZextAbsDiff).
				// In order to convert to i64 and above, additional any/zero/sign
				// extend is expected.
				// The zero extend from 32 bit has no mathematical effect on the result.
				// Also the sign extend is basically zero extend
				// (extends the sign bit which is zero).
				// So it is correct to skip the sign/zero extend instruction.
				if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
				Root.getOpcode() == ISD::ZERO_EXTEND ||
				Root.getOpcode() == ISD::ANY_EXTEND))
				Root = Root.getOperand(0);

	// If there was a match, we want Root to be a select that is the root of an			// If there was a match, we want Root to be a select that is the root of an
	// abs-diff pattern.			// abs-diff pattern.
	if (!Root || (Root.getOpcode() != ISD::VSELECT))			if (!Root || (Root.getOpcode() != ISD::VSELECT))
	return SDValue();			return SDValue();

	// Check whether we have an abs-diff pattern feeding into the select.			// Check whether we have an abs-diff pattern feeding into the select.
	SDValue Zext0, Zext1;			SDValue Zext0, Zext1;
	if (!detectZextAbsDiff(Root, Zext0, Zext1))			if (!detectZextAbsDiff(Root, Zext0, Zext1))
	return SDValue();			return SDValue();

	// Create the SAD instruction			// Create the SAD instruction.
	SDLoc DL(Extract);			SDLoc DL(Extract);
	SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);			SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);

	// If the original vector was wider than 8 elements, sum over the results			// If the original vector was wider than 8 elements, sum over the results
	// in the SAD vector.			// in the SAD vector.
	unsigned Stages = Log2_32(VT.getVectorNumElements());			unsigned Stages = Log2_32(VT.getVectorNumElements());
	MVT SadVT = SAD.getSimpleValueType();			MVT SadVT = SAD.getSimpleValueType();
	if (Stages > 3) {			if (Stages > 3) {
	unsigned SadElems = SadVT.getVectorNumElements();			unsigned SadElems = SadVT.getVectorNumElements();

	for(unsigned i = Stages - 3; i > 0; --i) {			for(unsigned i = Stages - 3; i > 0; --i) {
	SmallVector<int, 16> Mask(SadElems, -1);			SmallVector<int, 16> Mask(SadElems, -1);
	for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)			for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
	Mask[j] = MaskEnd + j;			Mask[j] = MaskEnd + j;

	SDValue Shuffle =			SDValue Shuffle =
	DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);			DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
	SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);			SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
	}			}
	}			}

	// Return the lowest i32.			MVT Type = Extract->getSimpleValueType(0);
	MVT ResVT = MVT::getVectorVT(MVT::i32, SadVT.getSizeInBits() / 32);			unsigned TypeSizeInBits = Type.getSizeInBits();
				// Return the lowest TypeSizeInBits bits.
				MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
	SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);			SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SAD,			return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
	Extract->getOperand(1));			Extract->getOperand(1));
	}			}

	// Attempt to peek through a target shuffle and extract the scalar from the			// Attempt to peek through a target shuffle and extract the scalar from the
	// source.			// source.
	static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,			static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,			TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {			const X86Subtarget &Subtarget) {
	▲ Show 20 Lines • Show All 6,707 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/sad_variations.ll

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F

				; Sum of absolute differences over two <8 x i8> loads, widened to <8 x i32>.
				; The absolute value is formed with `icmp sgt %diff, splat(-1)` (i.e. diff >= 0):
				; the select keeps the difference when the compare is true and its negation
				; otherwise. All three RUN prefixes expect this to collapse to one psadbw.
				define i32 @sad8_32bit_icmp_sge(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #0 {
				; SSE2-LABEL: sad8_32bit_icmp_sge:
				; SSE2: # BB#0: # %entry
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %eax
				; SSE2-NEXT: retq
				;
				; AVX2-LABEL: sad8_32bit_icmp_sge:
				; AVX2: # BB#0: # %entry
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %eax
				; AVX2-NEXT: retq
				;
				; AVX512F-LABEL: sad8_32bit_icmp_sge:
				; AVX512F: # BB#0: # %entry
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %eax
				; AVX512F-NEXT: retq

				entry:
				; NOTE: %idx.ext is not referenced by any later instruction in this function.
				%idx.ext = zext i32 %stride to i64
				br label %for.body

				for.body: ; preds = %entry
				; Load eight bytes from each pointer and zero-extend them to i32 lanes.
				%0 = bitcast i8* %cur to <8 x i8>*
				%1 = load <8 x i8>, <8 x i8>* %0, align 1
				%2 = zext <8 x i8> %1 to <8 x i32>
				%3 = bitcast i8* %ref to <8 x i8>*
				%4 = load <8 x i8>, <8 x i8>* %3, align 1
				%5 = zext <8 x i8> %4 to <8 x i32>
				; Abs-diff idiom: difference, compare against -1, negation, select.
				%6 = sub nsw <8 x i32> %2, %5
				%7 = icmp sgt <8 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
				%8 = sub nsw <8 x i32> zeroinitializer, %6
				%9 = select <8 x i1> %7, <8 x i32> %6, <8 x i32> %8
				; Three shuffle+add steps (lanes 4-7, then 2-3, then 1) fold the sum into lane 0.
				%rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx = add <8 x i32> %9, %rdx.shuf
				%rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229
				%rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231
				%10 = extractelement <8 x i32> %bin.rdx232, i32 0
				ret i32 %10
				}

				; Sum of absolute differences over two <8 x i8> loads, widened to <8 x i32>.
				; The absolute value is formed with `icmp sgt %diff, zeroinitializer`: the
				; select keeps the difference when it is strictly positive and its negation
				; otherwise. All three RUN prefixes expect this to collapse to one psadbw.
				define i32 @sad8_32bit_icmp_sgt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #1 {
				; SSE2-LABEL: sad8_32bit_icmp_sgt:
				; SSE2: # BB#0: # %entry
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %eax
				; SSE2-NEXT: retq
				;
				; AVX2-LABEL: sad8_32bit_icmp_sgt:
				; AVX2: # BB#0: # %entry
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %eax
				; AVX2-NEXT: retq
				;
				; AVX512F-LABEL: sad8_32bit_icmp_sgt:
				; AVX512F: # BB#0: # %entry
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %eax
				; AVX512F-NEXT: retq
				entry:
				; NOTE: %idx.ext is not referenced by any later instruction in this function.
				%idx.ext = zext i32 %stride to i64
				br label %for.body

				for.body: ; preds = %entry
				; Load eight bytes from each pointer and zero-extend them to i32 lanes.
				%0 = bitcast i8* %cur to <8 x i8>*
				%1 = load <8 x i8>, <8 x i8>* %0, align 1
				%2 = zext <8 x i8> %1 to <8 x i32>
				%3 = bitcast i8* %ref to <8 x i8>*
				%4 = load <8 x i8>, <8 x i8>* %3, align 1
				%5 = zext <8 x i8> %4 to <8 x i32>
				; Abs-diff idiom: difference, compare against 0, negation, select.
				%6 = sub nsw <8 x i32> %2, %5
				%7 = icmp sgt <8 x i32> %6, zeroinitializer
				%8 = sub nsw <8 x i32> zeroinitializer, %6
				%9 = select <8 x i1> %7, <8 x i32> %6, <8 x i32> %8
				; Three shuffle+add steps (lanes 4-7, then 2-3, then 1) fold the sum into lane 0.
				%rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx = add <8 x i32> %9, %rdx.shuf
				%rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229
				%rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231
				%10 = extractelement <8 x i32> %bin.rdx232, i32 0
				ret i32 %10
				}

				; Sum of absolute differences over two <8 x i8> loads, widened to <8 x i32>.
				; The absolute value is formed with `icmp slt %diff, splat(1)` (i.e. diff <= 0).
				; Compared with the sgt variants the select operands are swapped: the negation
				; is chosen when the compare is true. All three RUN prefixes expect one psadbw.
				define i32 @sad8_32bit_icmp_sle(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #2 {
				; SSE2-LABEL: sad8_32bit_icmp_sle:
				; SSE2: # BB#0: # %entry
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %eax
				; SSE2-NEXT: retq
				;
				; AVX2-LABEL: sad8_32bit_icmp_sle:
				; AVX2: # BB#0: # %entry
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %eax
				; AVX2-NEXT: retq
				;
				; AVX512F-LABEL: sad8_32bit_icmp_sle:
				; AVX512F: # BB#0: # %entry
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %eax
				; AVX512F-NEXT: retq
				entry:
				; NOTE: %idx.ext is not referenced by any later instruction in this function.
				%idx.ext = zext i32 %stride to i64
				br label %for.body

				for.body: ; preds = %entry
				; Load eight bytes from each pointer and zero-extend them to i32 lanes.
				%0 = bitcast i8* %cur to <8 x i8>*
				%1 = load <8 x i8>, <8 x i8>* %0, align 1
				%2 = zext <8 x i8> %1 to <8 x i32>
				%3 = bitcast i8* %ref to <8 x i8>*
				%4 = load <8 x i8>, <8 x i8>* %3, align 1
				%5 = zext <8 x i8> %4 to <8 x i32>
				; Abs-diff idiom with a less-than compare against 1; negation is the true operand.
				%6 = sub nsw <8 x i32> %2, %5
				%7 = icmp slt <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
				%8 = sub nsw <8 x i32> zeroinitializer, %6
				%9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
				; Three shuffle+add steps (lanes 4-7, then 2-3, then 1) fold the sum into lane 0.
				%rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx = add <8 x i32> %9, %rdx.shuf
				%rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229
				%rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231
				%10 = extractelement <8 x i32> %bin.rdx232, i32 0
				ret i32 %10
				}

				; Sum of absolute differences over two <8 x i8> loads, widened to <8 x i32>.
				; The absolute value is formed with `icmp slt %diff, zeroinitializer`; the
				; select picks the negation when the difference is negative and the difference
				; itself otherwise. All three RUN prefixes expect this to collapse to one psadbw.
				define i32 @sad8_32bit_icmp_slt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #3 {
				; SSE2-LABEL: sad8_32bit_icmp_slt:
				; SSE2: # BB#0: # %entry
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %eax
				; SSE2-NEXT: retq
				;
				; AVX2-LABEL: sad8_32bit_icmp_slt:
				; AVX2: # BB#0: # %entry
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %eax
				; AVX2-NEXT: retq
				;
				; AVX512F-LABEL: sad8_32bit_icmp_slt:
				; AVX512F: # BB#0: # %entry
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %eax
				; AVX512F-NEXT: retq
				entry:
				; NOTE: %idx.ext is not referenced by any later instruction in this function.
				%idx.ext = zext i32 %stride to i64
				br label %for.body

				for.body: ; preds = %entry
				; Load eight bytes from each pointer and zero-extend them to i32 lanes.
				%0 = bitcast i8* %cur to <8 x i8>*
				%1 = load <8 x i8>, <8 x i8>* %0, align 1
				%2 = zext <8 x i8> %1 to <8 x i32>
				%3 = bitcast i8* %ref to <8 x i8>*
				%4 = load <8 x i8>, <8 x i8>* %3, align 1
				%5 = zext <8 x i8> %4 to <8 x i32>
				; Abs-diff idiom with a less-than compare against 0; negation is the true operand.
				%6 = sub nsw <8 x i32> %2, %5
				%7 = icmp slt <8 x i32> %6, zeroinitializer
				%8 = sub nsw <8 x i32> zeroinitializer, %6
				%9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
				; Three shuffle+add steps (lanes 4-7, then 2-3, then 1) fold the sum into lane 0.
				%rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx = add <8 x i32> %9, %rdx.shuf
				%rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229
				%rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231
				%10 = extractelement <8 x i32> %bin.rdx232, i32 0
				ret i32 %10
				}

				; 64-bit result variant: the absolute difference is computed in <8 x i32>
				; (slt-against-zero form), then sign-extended to <8 x i64> before the
				; shuffle+add reduction. All three RUN prefixes still expect one psadbw,
				; with the scalar result moved into %rax.
				define i64 @sad8_64bit_icmp_sext_slt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i64 %stride) local_unnamed_addr #4 {
				; SSE2-LABEL: sad8_64bit_icmp_sext_slt:
				; SSE2: # BB#0: # %entry
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %rax
				; SSE2-NEXT: retq
				;
				; AVX2-LABEL: sad8_64bit_icmp_sext_slt:
				; AVX2: # BB#0: # %entry
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovq %xmm0, %rax
				; AVX2-NEXT: retq
				;
				; AVX512F-LABEL: sad8_64bit_icmp_sext_slt:
				; AVX512F: # BB#0: # %entry
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovq %xmm0, %rax
				; AVX512F-NEXT: retq
				entry:
				br label %for.body

				for.body: ; preds = %entry
				; Load eight bytes from each pointer and zero-extend them to i32 lanes.
				%0 = bitcast i8* %cur to <8 x i8>*
				%1 = load <8 x i8>, <8 x i8>* %0, align 1
				%2 = zext <8 x i8> %1 to <8 x i32>
				%3 = bitcast i8* %ref to <8 x i8>*
				%4 = load <8 x i8>, <8 x i8>* %3, align 1
				%5 = zext <8 x i8> %4 to <8 x i32>
				; Abs-diff idiom in i32 with a less-than compare against 0.
				%6 = sub nsw <8 x i32> %2, %5
				%7 = icmp slt <8 x i32> %6, zeroinitializer
				%8 = sub nsw <8 x i32> zeroinitializer, %6
				%9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
				; Widen the abs-diff to i64 with a sign extend before reducing.
				%10 = sext <8 x i32> %9 to <8 x i64>
				; Three shuffle+add steps (lanes 4-7, then 2-3, then 1) fold the sum into lane 0.
				%rdx.shuf = shufflevector <8 x i64> %10, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx = add <8 x i64> %rdx.shuf, %10
				%rdx.shuf236 = shufflevector <8 x i64> %bin.rdx, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx237 = add <8 x i64> %bin.rdx, %rdx.shuf236
				%rdx.shuf238 = shufflevector <8 x i64> %bin.rdx237, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx239 = add <8 x i64> %bin.rdx237, %rdx.shuf238
				%11 = extractelement <8 x i64> %bin.rdx239, i32 0
				ret i64 %11
				}

				; 64-bit result variant: the absolute difference is computed in <8 x i32>
				; (slt-against-zero form), then zero-extended to <8 x i64> before the
				; shuffle+add reduction. All three RUN prefixes still expect one psadbw,
				; with the scalar result moved into %rax.
				define i64 @sad8_64bit_icmp_zext_slt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i64 %stride) local_unnamed_addr #4 {
				; SSE2-LABEL: sad8_64bit_icmp_zext_slt:
				; SSE2: # BB#0: # %entry
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %rax
				; SSE2-NEXT: retq
				;
				; AVX2-LABEL: sad8_64bit_icmp_zext_slt:
				; AVX2: # BB#0: # %entry
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovq %xmm0, %rax
				; AVX2-NEXT: retq
				;
				; AVX512F-LABEL: sad8_64bit_icmp_zext_slt:
				; AVX512F: # BB#0: # %entry
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovq %xmm0, %rax
				; AVX512F-NEXT: retq
				entry:
				br label %for.body

				for.body: ; preds = %entry
				; Load eight bytes from each pointer and zero-extend them to i32 lanes.
				%0 = bitcast i8* %cur to <8 x i8>*
				%1 = load <8 x i8>, <8 x i8>* %0, align 1
				%2 = zext <8 x i8> %1 to <8 x i32>
				%3 = bitcast i8* %ref to <8 x i8>*
				%4 = load <8 x i8>, <8 x i8>* %3, align 1
				%5 = zext <8 x i8> %4 to <8 x i32>
				; Abs-diff idiom in i32 with a less-than compare against 0.
				%6 = sub nsw <8 x i32> %2, %5
				%7 = icmp slt <8 x i32> %6, zeroinitializer
				%8 = sub nsw <8 x i32> zeroinitializer, %6
				%9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
				; Widen the abs-diff to i64 with a zero extend before reducing.
				%10 = zext <8 x i32> %9 to <8 x i64>
				; Three shuffle+add steps (lanes 4-7, then 2-3, then 1) fold the sum into lane 0.
				%rdx.shuf = shufflevector <8 x i64> %10, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx = add <8 x i64> %rdx.shuf, %10
				%rdx.shuf236 = shufflevector <8 x i64> %bin.rdx, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx237 = add <8 x i64> %bin.rdx, %rdx.shuf236
				%rdx.shuf238 = shufflevector <8 x i64> %bin.rdx237, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx239 = add <8 x i64> %bin.rdx237, %rdx.shuf238
				%11 = extractelement <8 x i64> %bin.rdx239, i32 0
				ret i64 %11
				}

				; "Early" 64-bit variant: the <8 x i8> loads are zero-extended straight to
				; <8 x i64>, so the subtract, compare, negate and select all operate on i64
				; lanes and no extend sits between the select and the reduction. All three
				; RUN prefixes expect one psadbw, with the scalar result moved into %rax.
				define i64 @sad8_early_64bit_icmp_zext_slt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i64 %stride) local_unnamed_addr #4 {
				; SSE2-LABEL: sad8_early_64bit_icmp_zext_slt:
				; SSE2: # BB#0: # %entry
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %rax
				; SSE2-NEXT: retq
				;
				; AVX2-LABEL: sad8_early_64bit_icmp_zext_slt:
				; AVX2: # BB#0: # %entry
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovq %xmm0, %rax
				; AVX2-NEXT: retq
				;
				; AVX512F-LABEL: sad8_early_64bit_icmp_zext_slt:
				; AVX512F: # BB#0: # %entry
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovq %xmm0, %rax
				; AVX512F-NEXT: retq
				entry:
				br label %for.body

				for.body: ; preds = %entry
				; Load eight bytes from each pointer and zero-extend them directly to i64 lanes.
				%0 = bitcast i8* %cur to <8 x i8>*
				%1 = load <8 x i8>, <8 x i8>* %0, align 1
				%2 = zext <8 x i8> %1 to <8 x i64>
				%3 = bitcast i8* %ref to <8 x i8>*
				%4 = load <8 x i8>, <8 x i8>* %3, align 1
				%5 = zext <8 x i8> %4 to <8 x i64>
				; Abs-diff idiom in i64 with a less-than compare against 0.
				%6 = sub nsw <8 x i64> %2, %5
				%7 = icmp slt <8 x i64> %6, zeroinitializer
				%8 = sub nsw <8 x i64> zeroinitializer, %6
				%9 = select <8 x i1> %7, <8 x i64> %8, <8 x i64> %6
				; Three shuffle+add steps (lanes 4-7, then 2-3, then 1) fold the sum into lane 0.
				%rdx.shuf = shufflevector <8 x i64> %9, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx = add <8 x i64> %rdx.shuf, %9
				%rdx.shuf236 = shufflevector <8 x i64> %bin.rdx, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx237 = add <8 x i64> %bin.rdx, %rdx.shuf236
				%rdx.shuf238 = shufflevector <8 x i64> %bin.rdx237, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx239 = add <8 x i64> %bin.rdx237, %rdx.shuf238
				%10 = extractelement <8 x i64> %bin.rdx239, i32 0
				ret i64 %10
				}

This is an archive of the discontinued LLVM Phabricator instance.

Add 64 bit pattern matching for PSADBW
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 94033

llvm/trunk/lib/Target/X86/X86ISelLowering.cpp

llvm/trunk/test/CodeGen/X86/sad_variations.ll

This is an archive of the discontinued LLVM Phabricator instance.

Add 64 bit pattern matching for PSADBW
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 94033

llvm/trunk/lib/Target/X86/X86ISelLowering.cpp

llvm/trunk/test/CodeGen/X86/sad_variations.ll

Add 64 bit pattern matching for PSADBW
ClosedPublic