Diff 93776

lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

	Show First 20 Lines • Show All 29,030 Lines • ▼ Show 20 Lines
	// This is useful as it is the input into a SAD pattern.			// This is useful as it is the input into a SAD pattern.
	static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,			static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
	SDValue &Op1) {			SDValue &Op1) {
	// Check the condition of the select instruction is greater-than.			// Check the condition of the select instruction is greater-than.
	SDValue SetCC = Select->getOperand(0);			SDValue SetCC = Select->getOperand(0);
	if (SetCC.getOpcode() != ISD::SETCC)			if (SetCC.getOpcode() != ISD::SETCC)
	return false;			return false;
	ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();			ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
	if (CC != ISD::SETGT)			if (CC != ISD::SETGT && CC != ISD::SETLT)
	return false;			return false;

	SDValue SelectOp1 = Select->getOperand(1);			SDValue SelectOp1 = Select->getOperand(1);
	SDValue SelectOp2 = Select->getOperand(2);			SDValue SelectOp2 = Select->getOperand(2);

				// The following instructions assume SelectOp1 is the subtraction operand
				mkuperUnsubmitted Done Reply Inline Actions instrcutions -> instructions mkuper: instrcutions -> instructions
				// and SelectOp2 is the negation operand.
				// In the case of SETLT this is the other way around.
				if (CC == ISD::SETLT)
				std::swap(SelectOp1, SelectOp2);

	// The second operand of the select should be the negation of the first			// The second operand of the select should be the negation of the first
	// operand, which is implemented as 0 - SelectOp1.			// operand, which is implemented as 0 - SelectOp1.
	if (!(SelectOp2.getOpcode() == ISD::SUB &&			if (!(SelectOp2.getOpcode() == ISD::SUB &&
	ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&			ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
	SelectOp2.getOperand(1) == SelectOp1))			SelectOp2.getOperand(1) == SelectOp1))
	return false;			return false;

	// The first operand of SetCC is the first operand of the select, which is the			// The first operand of SetCC is the first operand of the select, which is the
	// difference between the two input vectors.			// difference between the two input vectors.
	if (SetCC.getOperand(0) != SelectOp1)			if (SetCC.getOperand(0) != SelectOp1)
	return false;			return false;

	// The second operand of the comparison can be either -1 or 0.			// In the SETLT case, the second operand of the comparison can be either 1 or 0.
	if (!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) \|\|			APInt SplatVal;
				if ((CC == ISD::SETLT) &&
				!((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
				SplatVal == 1) \|\|
				(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
				return false;

				// In the SETGT case, the second operand of the comparison can be either -1 or 0.
				if ((CC == ISD::SETGT) &&
				!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) \|\|
	ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))			ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
	return false;			return false;

	// The first operand of the select is the difference between the two input			// The first operand of the select is the difference between the two input
	// vectors.			// vectors.
	if (SelectOp1.getOpcode() != ISD::SUB)			if (SelectOp1.getOpcode() != ISD::SUB)
	return false;			return false;

	▲ Show 20 Lines • Show All 112 Lines • ▼ Show 20 Lines
	}			}

	static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,			static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {			const X86Subtarget &Subtarget) {
	// PSADBW is only supported on SSE2 and up.			// PSADBW is only supported on SSE2 and up.
	if (!Subtarget.hasSSE2())			if (!Subtarget.hasSSE2())
	return SDValue();			return SDValue();

	// Verify the type we're extracting from is appropriate			// Verify the type we're extracting from is any integer type above i16.
	// TODO: There's nothing special about i32, any integer type above i16 should
	// work just as well.
	EVT VT = Extract->getOperand(0).getValueType();			EVT VT = Extract->getOperand(0).getValueType();
	if (!VT.isSimple() \|\| !(VT.getVectorElementType() == MVT::i32))			if (!VT.isSimple() \|\| !(VT.getVectorElementType().getSizeInBits() > 16))
	return SDValue();			return SDValue();

	unsigned RegSize = 128;			unsigned RegSize = 128;
	if (Subtarget.hasBWI())			if (Subtarget.hasBWI())
	RegSize = 512;			RegSize = 512;
	else if (Subtarget.hasAVX2())			else if (Subtarget.hasAVX2())
	RegSize = 256;			RegSize = 256;

	// We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.			// We handle up to v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
	// TODO: We should be able to handle larger vectors by splitting them before			// TODO: We should be able to handle larger vectors by splitting them before
	// feeding them into several SADs, and then reducing over those.			// feeding them into several SADs, and then reducing over those.
	if (VT.getSizeInBits() / 4 > RegSize)			if (RegSize / VT.getVectorNumElements() < 8)
	return SDValue();			return SDValue();

	// Match shuffle + add pyramid.			// Match shuffle + add pyramid.
	SDValue Root = matchBinOpReduction(Extract, ISD::ADD);			SDValue Root = matchBinOpReduction(Extract, ISD::ADD);

				// The operand is expected to be zero extended from i8
				// (verified in detectZextAbsDiff). As a result the sign extend is
				// basically zero extend (extends the sign bit which is zero).
				// So it is correct to skip the sign extend instruction.
				if (Root && Root.getOpcode() == ISD::SIGN_EXTEND)
				mkuperUnsubmitted Not Done Reply Inline Actions This looks a bit suspicious. I suggest you split the patch in two - one to extend to arbitrary integer types, the other for this. The larger integer types part is basically good to go, but I'd like to understand how the sign extend case happens better. mkuper: This looks a bit suspicious. I suggest you split the patch in two - one to extend to arbitrary…
				oren_ben_simhonAuthorUnsubmitted Not Done Reply Inline Actions Unfortunately, the larger integer types change will have no effect if I don't skip the sign extend instruction, so I prefer to keep it together. I attached a ppt vpsadbw.pptx209 KBDownload that explains the mathematical proof behind skipping the sign extend. Basically the logic I try to explain is: The only difference between the 64 bit IR version and 32 bit IR version is that the 64 bit IR version has sign extend. I explain that the sign extend is actually a zero extend. I explain that there is no mathematical difference between 64/32 bit results. Since the 32 bit version of the psadbw pattern is already proven to be mathematically correct, I conclude that the pattern with sign extend matches the psadbw instruction. The last function in the .ll file demonstrates the pattern. Let me know if it makes sense to you. oren_ben_simhon:* Unfortunately, the larger integer types change will have no effect if I don't skip the sign…
				mkuperUnsubmitted Not Done Reply Inline Actions I haven't really looked at the slides yet, but regardless - I don't understand why you say the change will have no effect. It'll have an effect if the final extension is a zext, not a sext. I don't see any reason why you'd have, specifically, sexts for 64-bit. I mean, your lit tests happen to have a sext, but you can just as well replace it with a zext. Can you explain why the two changes (32 -> 64 and zext -> sext) are not orthogonal? mkuper: I haven't really looked at the slides yet, but regardless - I don't understand why you say the…
				oren_ben_simhonAuthorUnsubmitted Not Done Reply Inline Actions You are correct. I can just as well replace it with a zext (I will update the code). The zext/sext only happens when moving to 64 bit. That is why I consider it a holistic solution and not orthogonal. If you still feel it is necessary to break the solution into a two-step review, let me know and I will create the two reviews. oren_ben_simhon: You are correct. I can just as well replace it with a zext (I will update the code). The…
				mkuperUnsubmitted Done Reply Inline Actions I think you'll need an extra check for the zext as well. In fact, you probably just want to match ZEXT/SEXT/ANYEXT here (assuming it's correct). IIUC, the case that's really equivalent to the i32 one is something that extends to i64 directly. without any extension after the select - something like: %0 = bitcast i8* %cur to <8 x i8>* %1 = load <8 x i8>, <8 x i8>* %0, align 1 %2 = zext <8 x i8> %1 to <8 x i64> %3 = bitcast i8* %ref to <8 x i8>* %4 = load <8 x i8>, <8 x i8>* %3, align 1 %5 = zext <8 x i8> %4 to <8 x i64> %6 = sub nsw <8 x i64> %2, %5 %7 = icmp slt <8 x i64> %6, zeroinitializer %8 = sub nsw <8 x i64> zeroinitializer, %6 %9 = select <8 x i1> %7, <8 x i64> %8, <8 x i64> %6 %rdx.shuf = shufflevector <8 x i64> %9, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> Right? mkuper: I think you'll need an extra check for the zext as well. In fact, you probably just want to…
				oren_ben_simhonAuthorUnsubmitted Not Done Reply Inline Actions I will add any_extend as you suggested. I have to agree with you that the pattern that you specified is the more expected pattern. However, for some reason, the vectorizer chooses to do a second zero extend to i64 after the icmp. Anyway, the pattern you suggested will be matched and I added the test case to the test file. This is the example of the C code which is the base for all my lit tests: #define ABS_GT(X) (((X)>0)?(X):-(X)) unsigned int sad8_c(const unsigned char * const cur, const unsigned char * const ref, const unsigned int stride) { unsigned int sad = 0; unsigned int j; unsigned char const *ptr_cur = cur; unsigned char const *ptr_ref = ref; for (j = 0; j < 8; j++) { sad += ABS_GT(ptr_cur[0] - ptr_ref[0]); sad += ABS_GT(ptr_cur[1] - ptr_ref[1]); sad += ABS_GT(ptr_cur[2] - ptr_ref[2]); sad += ABS_GT(ptr_cur[3] - ptr_ref[3]); sad += ABS_GT(ptr_cur[4] - ptr_ref[4]); sad += ABS_GT(ptr_cur[5] - ptr_ref[5]); sad += ABS_GT(ptr_cur[6] - ptr_ref[6]); sad += ABS_GT(ptr_cur[7] - ptr_ref[7]); ptr_cur += stride; ptr_ref += stride; } return sad; } oren_ben_simhon: I will add any_extend as you suggested. I have to agree with you that the pattern that you…
				Root = Root.getOperand(0);

	// If there was a match, we want Root to be a select that is the root of an			// If there was a match, we want Root to be a select that is the root of an
	// abs-diff pattern.			// abs-diff pattern.
	if (!Root \|\| (Root.getOpcode() != ISD::VSELECT))			if (!Root \|\| (Root.getOpcode() != ISD::VSELECT))
	return SDValue();			return SDValue();

	// Check whether we have an abs-diff pattern feeding into the select.			// Check whether we have an abs-diff pattern feeding into the select.
	SDValue Zext0, Zext1;			SDValue Zext0, Zext1;
	if (!detectZextAbsDiff(Root, Zext0, Zext1))			if (!detectZextAbsDiff(Root, Zext0, Zext1))
	return SDValue();			return SDValue();

	// Create the SAD instruction			// Create the SAD instruction.
	SDLoc DL(Extract);			SDLoc DL(Extract);
	SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);			SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);

	// If the original vector was wider than 8 elements, sum over the results			// If the original vector was wider than 8 elements, sum over the results
	// in the SAD vector.			// in the SAD vector.
	unsigned Stages = Log2_32(VT.getVectorNumElements());			unsigned Stages = Log2_32(VT.getVectorNumElements());
	MVT SadVT = SAD.getSimpleValueType();			MVT SadVT = SAD.getSimpleValueType();
	if (Stages > 3) {			if (Stages > 3) {
	unsigned SadElems = SadVT.getVectorNumElements();			unsigned SadElems = SadVT.getVectorNumElements();

	for(unsigned i = Stages - 3; i > 0; --i) {			for(unsigned i = Stages - 3; i > 0; --i) {
	SmallVector<int, 16> Mask(SadElems, -1);			SmallVector<int, 16> Mask(SadElems, -1);
	for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)			for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
	Mask[j] = MaskEnd + j;			Mask[j] = MaskEnd + j;

	SDValue Shuffle =			SDValue Shuffle =
	DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);			DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
	SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);			SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
	}			}
	}			}

	// Return the lowest i32.			MVT Type = Extract->getSimpleValueType(0);
	MVT ResVT = MVT::getVectorVT(MVT::i32, SadVT.getSizeInBits() / 32);			unsigned TypeSizeInBits = Type.getSizeInBits();
				// Return the lowest TypeSizeInBits bits.
				MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
	SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);			SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SAD,			return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
	Extract->getOperand(1));			Extract->getOperand(1));
	}			}

	// Attempt to peek through a target shuffle and extract the scalar from the			// Attempt to peek through a target shuffle and extract the scalar from the
	// source.			// source.
	static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,			static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,			TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {			const X86Subtarget &Subtarget) {
	▲ Show 20 Lines • Show All 6,707 Lines • Show Last 20 Lines

test/CodeGen/X86/sad_variations.ll

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 \| FileCheck %s --check-prefix=SSE2
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 \| FileCheck %s --check-prefix=AVX2
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f \| FileCheck %s --check-prefix=AVX512F

				define i32 @sad8_64bit_icmp_sge(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #0 {
				; SSE2-LABEL: sad8_64bit_icmp_sge:
				mkuperUnsubmitted Done Reply Inline Actions Any reason the test cases have to be so large? Can you match a single psadbw instead of several? mkuper: Any reason the test cases have to be so large? Can you match a single psadbw instead of several?
				; SSE2: # BB#0: # %entry
				; SSE2-NEXT: movl %edx, %eax
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %ecx
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rsi
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rdi
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %edx
				; SSE2-NEXT: addl %ecx, %edx
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rsi
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rdi
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %ecx
				; SSE2-NEXT: addl %edx, %ecx
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rsi
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rdi
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %edx
				; SSE2-NEXT: addl %ecx, %edx
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rsi
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rdi
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %ecx
				; SSE2-NEXT: addl %edx, %ecx
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rsi
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rdi
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %edx
				; SSE2-NEXT: addl %ecx, %edx
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rsi
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rdi
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %ecx
				; SSE2-NEXT: addl %edx, %ecx
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %eax
				; SSE2-NEXT: addl %ecx, %eax
				; SSE2-NEXT: retq
				;
				; AVX2-LABEL: sad8_64bit_icmp_sge:
				; AVX2: # BB#0: # %entry
				; AVX2-NEXT: movl %edx, %eax
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %ecx
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rsi
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rdi
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %edx
				; AVX2-NEXT: addl %ecx, %edx
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rsi
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rdi
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %ecx
				; AVX2-NEXT: addl %edx, %ecx
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rsi
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rdi
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %edx
				; AVX2-NEXT: addl %ecx, %edx
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rsi
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rdi
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %ecx
				; AVX2-NEXT: addl %edx, %ecx
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rsi
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rdi
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %edx
				; AVX2-NEXT: addl %ecx, %edx
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rsi
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rdi
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %ecx
				; AVX2-NEXT: addl %edx, %ecx
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %eax
				; AVX2-NEXT: addl %ecx, %eax
				; AVX2-NEXT: retq
				;
				; AVX512F-LABEL: sad8_64bit_icmp_sge:
				; AVX512F: # BB#0: # %entry
				; AVX512F-NEXT: movl %edx, %eax
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %ecx
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rsi
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rdi
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %edx
				; AVX512F-NEXT: addl %ecx, %edx
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rsi
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rdi
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %ecx
				; AVX512F-NEXT: addl %edx, %ecx
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rsi
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rdi
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %edx
				; AVX512F-NEXT: addl %ecx, %edx
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rsi
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rdi
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %ecx
				; AVX512F-NEXT: addl %edx, %ecx
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rsi
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rdi
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %edx
				; AVX512F-NEXT: addl %ecx, %edx
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rsi
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rdi
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %ecx
				; AVX512F-NEXT: addl %edx, %ecx
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %eax
				; AVX512F-NEXT: addl %ecx, %eax
				; AVX512F-NEXT: retq

				entry:
				%idx.ext = zext i32 %stride to i64
				br label %for.body

				for.body: ; preds = %entry
				%0 = bitcast i8* %cur to <8 x i8>*
				%1 = load <8 x i8>, <8 x i8>* %0, align 1
				%2 = zext <8 x i8> %1 to <8 x i32>
				%3 = bitcast i8* %ref to <8 x i8>*
				%4 = load <8 x i8>, <8 x i8>* %3, align 1
				%5 = zext <8 x i8> %4 to <8 x i32>
				%6 = sub nsw <8 x i32> %2, %5
				%7 = icmp sgt <8 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
				%8 = sub nsw <8 x i32> zeroinitializer, %6
				%9 = select <8 x i1> %7, <8 x i32> %6, <8 x i32> %8
				%rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx = add <8 x i32> %9, %rdx.shuf
				%rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229
				%rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231
				%10 = extractelement <8 x i32> %bin.rdx232, i32 0
				%add.ptr = getelementptr inbounds i8, i8* %cur, i64 %idx.ext
				%add.ptr178 = getelementptr inbounds i8, i8* %ref, i64 %idx.ext
				%11 = bitcast i8* %add.ptr to <8 x i8>*
				%12 = load <8 x i8>, <8 x i8>* %11, align 1
				%13 = zext <8 x i8> %12 to <8 x i32>
				%14 = bitcast i8* %add.ptr178 to <8 x i8>*
				%15 = load <8 x i8>, <8 x i8>* %14, align 1
				%16 = zext <8 x i8> %15 to <8 x i32>
				%17 = sub nsw <8 x i32> %13, %16
				%18 = icmp sgt <8 x i32> %17, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
				%19 = sub nsw <8 x i32> zeroinitializer, %17
				%20 = select <8 x i1> %18, <8 x i32> %17, <8 x i32> %19
				%rdx.shuf.1 = shufflevector <8 x i32> %20, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx.1 = add <8 x i32> %20, %rdx.shuf.1
				%rdx.shuf229.1 = shufflevector <8 x i32> %bin.rdx.1, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230.1 = add <8 x i32> %bin.rdx.1, %rdx.shuf229.1
				%rdx.shuf231.1 = shufflevector <8 x i32> %bin.rdx230.1, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232.1 = add <8 x i32> %bin.rdx230.1, %rdx.shuf231.1
				%21 = extractelement <8 x i32> %bin.rdx232.1, i32 0
				%bin.extra.1 = add i32 %21, %10
				%add.ptr.1 = getelementptr inbounds i8, i8* %add.ptr, i64 %idx.ext
				%add.ptr178.1 = getelementptr inbounds i8, i8* %add.ptr178, i64 %idx.ext
				%22 = bitcast i8* %add.ptr.1 to <8 x i8>*
				%23 = load <8 x i8>, <8 x i8>* %22
				%24 = zext <8 x i8> %23 to <8 x i32>
				%25 = bitcast i8* %add.ptr178.1 to <8 x i8>*
				%26 = load <8 x i8>, <8 x i8>* %25
				%27 = zext <8 x i8> %26 to <8 x i32>
				%28 = sub nsw <8 x i32> %24, %27
				%29 = icmp sgt <8 x i32> %28, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
				%30 = sub nsw <8 x i32> zeroinitializer, %28
				%31 = select <8 x i1> %29, <8 x i32> %28, <8 x i32> %30
				%rdx.shuf.2 = shufflevector <8 x i32> %31, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx.2 = add <8 x i32> %31, %rdx.shuf.2
				%rdx.shuf229.2 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230.2 = add <8 x i32> %bin.rdx.2, %rdx.shuf229.2
				%rdx.shuf231.2 = shufflevector <8 x i32> %bin.rdx230.2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232.2 = add <8 x i32> %bin.rdx230.2, %rdx.shuf231.2
				%32 = extractelement <8 x i32> %bin.rdx232.2, i32 0
				%bin.extra.2 = add i32 %32, %bin.extra.1
				%add.ptr.2 = getelementptr inbounds i8, i8* %add.ptr.1, i64 %idx.ext
				%add.ptr178.2 = getelementptr inbounds i8, i8* %add.ptr178.1, i64 %idx.ext
				%33 = bitcast i8* %add.ptr.2 to <8 x i8>*
				%34 = load <8 x i8>, <8 x i8>* %33, align 1
				%35 = zext <8 x i8> %34 to <8 x i32>
				%36 = bitcast i8* %add.ptr178.2 to <8 x i8>*
				%37 = load <8 x i8>, <8 x i8>* %36, align 1
				%38 = zext <8 x i8> %37 to <8 x i32>
				%39 = sub nsw <8 x i32> %35, %38
				%40 = icmp sgt <8 x i32> %39, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
				%41 = sub nsw <8 x i32> zeroinitializer, %39
				%42 = select <8 x i1> %40, <8 x i32> %39, <8 x i32> %41
				%rdx.shuf.3 = shufflevector <8 x i32> %42, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx.3 = add <8 x i32> %42, %rdx.shuf.3
				%rdx.shuf229.3 = shufflevector <8 x i32> %bin.rdx.3, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230.3 = add <8 x i32> %bin.rdx.3, %rdx.shuf229.3
				%rdx.shuf231.3 = shufflevector <8 x i32> %bin.rdx230.3, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232.3 = add <8 x i32> %bin.rdx230.3, %rdx.shuf231.3
				%43 = extractelement <8 x i32> %bin.rdx232.3, i32 0
				%bin.extra.3 = add i32 %43, %bin.extra.2
				%add.ptr.3 = getelementptr inbounds i8, i8* %add.ptr.2, i64 %idx.ext
				%add.ptr178.3 = getelementptr inbounds i8, i8* %add.ptr178.2, i64 %idx.ext
				%44 = bitcast i8* %add.ptr.3 to <8 x i8>*
				%45 = load <8 x i8>, <8 x i8>* %44, align 1
				%46 = zext <8 x i8> %45 to <8 x i32>
				%47 = bitcast i8* %add.ptr178.3 to <8 x i8>*
				%48 = load <8 x i8>, <8 x i8>* %47, align 1
				%49 = zext <8 x i8> %48 to <8 x i32>
				%50 = sub nsw <8 x i32> %46, %49
				%51 = icmp sgt <8 x i32> %50, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
				%52 = sub nsw <8 x i32> zeroinitializer, %50
				%53 = select <8 x i1> %51, <8 x i32> %50, <8 x i32> %52
				%rdx.shuf.4 = shufflevector <8 x i32> %53, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx.4 = add <8 x i32> %53, %rdx.shuf.4
				%rdx.shuf229.4 = shufflevector <8 x i32> %bin.rdx.4, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230.4 = add <8 x i32> %bin.rdx.4, %rdx.shuf229.4
				%rdx.shuf231.4 = shufflevector <8 x i32> %bin.rdx230.4, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232.4 = add <8 x i32> %bin.rdx230.4, %rdx.shuf231.4
				%54 = extractelement <8 x i32> %bin.rdx232.4, i32 0
				%bin.extra.4 = add i32 %54, %bin.extra.3
				%add.ptr.4 = getelementptr inbounds i8, i8* %add.ptr.3, i64 %idx.ext
				%add.ptr178.4 = getelementptr inbounds i8, i8* %add.ptr178.3, i64 %idx.ext
				%55 = bitcast i8* %add.ptr.4 to <8 x i8>*
				%56 = load <8 x i8>, <8 x i8>* %55, align 1
				%57 = zext <8 x i8> %56 to <8 x i32>
				%58 = bitcast i8* %add.ptr178.4 to <8 x i8>*
				%59 = load <8 x i8>, <8 x i8>* %58, align 1
				%60 = zext <8 x i8> %59 to <8 x i32>
				%61 = sub nsw <8 x i32> %57, %60
				%62 = icmp sgt <8 x i32> %61, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
				%63 = sub nsw <8 x i32> zeroinitializer, %61
				%64 = select <8 x i1> %62, <8 x i32> %61, <8 x i32> %63
				%rdx.shuf.5 = shufflevector <8 x i32> %64, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx.5 = add <8 x i32> %64, %rdx.shuf.5
				%rdx.shuf229.5 = shufflevector <8 x i32> %bin.rdx.5, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230.5 = add <8 x i32> %bin.rdx.5, %rdx.shuf229.5
				%rdx.shuf231.5 = shufflevector <8 x i32> %bin.rdx230.5, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232.5 = add <8 x i32> %bin.rdx230.5, %rdx.shuf231.5
				%65 = extractelement <8 x i32> %bin.rdx232.5, i32 0
				%bin.extra.5 = add i32 %65, %bin.extra.4
				%add.ptr.5 = getelementptr inbounds i8, i8* %add.ptr.4, i64 %idx.ext
				%add.ptr178.5 = getelementptr inbounds i8, i8* %add.ptr178.4, i64 %idx.ext
				%66 = bitcast i8* %add.ptr.5 to <8 x i8>*
				%67 = load <8 x i8>, <8 x i8>* %66, align 1
				%68 = zext <8 x i8> %67 to <8 x i32>
				%69 = bitcast i8* %add.ptr178.5 to <8 x i8>*
				%70 = load <8 x i8>, <8 x i8>* %69, align 1
				%71 = zext <8 x i8> %70 to <8 x i32>
				%72 = sub nsw <8 x i32> %68, %71
				%73 = icmp sgt <8 x i32> %72, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
				%74 = sub nsw <8 x i32> zeroinitializer, %72
				%75 = select <8 x i1> %73, <8 x i32> %72, <8 x i32> %74
				%rdx.shuf.6 = shufflevector <8 x i32> %75, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx.6 = add <8 x i32> %75, %rdx.shuf.6
				%rdx.shuf229.6 = shufflevector <8 x i32> %bin.rdx.6, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230.6 = add <8 x i32> %bin.rdx.6, %rdx.shuf229.6
				%rdx.shuf231.6 = shufflevector <8 x i32> %bin.rdx230.6, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232.6 = add <8 x i32> %bin.rdx230.6, %rdx.shuf231.6
				%76 = extractelement <8 x i32> %bin.rdx232.6, i32 0
				%bin.extra.6 = add i32 %76, %bin.extra.5
				%add.ptr.6 = getelementptr inbounds i8, i8* %add.ptr.5, i64 %idx.ext
				%add.ptr178.6 = getelementptr inbounds i8, i8* %add.ptr178.5, i64 %idx.ext
				%77 = bitcast i8* %add.ptr.6 to <8 x i8>*
				%78 = load <8 x i8>, <8 x i8>* %77, align 1
				%79 = zext <8 x i8> %78 to <8 x i32>
				%80 = bitcast i8* %add.ptr178.6 to <8 x i8>*
				%81 = load <8 x i8>, <8 x i8>* %80, align 1
				%82 = zext <8 x i8> %81 to <8 x i32>
				%83 = sub nsw <8 x i32> %79, %82
				%84 = icmp sgt <8 x i32> %83, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
				%85 = sub nsw <8 x i32> zeroinitializer, %83
				%86 = select <8 x i1> %84, <8 x i32> %83, <8 x i32> %85
				%rdx.shuf.7 = shufflevector <8 x i32> %86, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx.7 = add <8 x i32> %86, %rdx.shuf.7
				%rdx.shuf229.7 = shufflevector <8 x i32> %bin.rdx.7, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230.7 = add <8 x i32> %bin.rdx.7, %rdx.shuf229.7
				%rdx.shuf231.7 = shufflevector <8 x i32> %bin.rdx230.7, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232.7 = add <8 x i32> %bin.rdx230.7, %rdx.shuf231.7
				%87 = extractelement <8 x i32> %bin.rdx232.7, i32 0
				%bin.extra.7 = add i32 %87, %bin.extra.6
				ret i32 %bin.extra.7
				}

				; Test function: sums the absolute differences of 8 rows of 8 bytes taken
				; from %cur and %ref, stepping both pointers by %stride (zero-extended to
				; i64) between rows. Each row's absolute difference is written as
				; select(icmp sgt(d, 0), d, 0 - d) with d = zext(cur) - zext(ref); when
				; d == 0 the negated arm is chosen, which is still 0.
				; The assertions below expect one psadbw/vpsadbw per row (8 in total) for
				; each of the three check prefixes, with the row sums combined by scalar addl.
				define i32 @sad8_64bit_icmp_sgt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #1 {
				; SSE2-LABEL: sad8_64bit_icmp_sgt:
				; SSE2: # BB#0: # %entry
				; SSE2-NEXT: movl %edx, %eax
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %ecx
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rsi
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rdi
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %edx
				; SSE2-NEXT: addl %ecx, %edx
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rsi
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rdi
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %ecx
				; SSE2-NEXT: addl %edx, %ecx
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rsi
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rdi
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %edx
				; SSE2-NEXT: addl %ecx, %edx
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rsi
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rdi
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %ecx
				; SSE2-NEXT: addl %edx, %ecx
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rsi
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rdi
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %edx
				; SSE2-NEXT: addl %ecx, %edx
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rsi
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rdi
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %ecx
				; SSE2-NEXT: addl %edx, %ecx
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %eax
				; SSE2-NEXT: addl %ecx, %eax
				; SSE2-NEXT: retq
				;
				; AVX2-LABEL: sad8_64bit_icmp_sgt:
				; AVX2: # BB#0: # %entry
				; AVX2-NEXT: movl %edx, %eax
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %ecx
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rsi
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rdi
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %edx
				; AVX2-NEXT: addl %ecx, %edx
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rsi
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rdi
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %ecx
				; AVX2-NEXT: addl %edx, %ecx
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rsi
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rdi
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %edx
				; AVX2-NEXT: addl %ecx, %edx
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rsi
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rdi
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %ecx
				; AVX2-NEXT: addl %edx, %ecx
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rsi
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rdi
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %edx
				; AVX2-NEXT: addl %ecx, %edx
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rsi
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rdi
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %ecx
				; AVX2-NEXT: addl %edx, %ecx
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %eax
				; AVX2-NEXT: addl %ecx, %eax
				; AVX2-NEXT: retq
				;
				; AVX512F-LABEL: sad8_64bit_icmp_sgt:
				; AVX512F: # BB#0: # %entry
				; AVX512F-NEXT: movl %edx, %eax
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %ecx
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rsi
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rdi
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %edx
				; AVX512F-NEXT: addl %ecx, %edx
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rsi
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rdi
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %ecx
				; AVX512F-NEXT: addl %edx, %ecx
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rsi
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rdi
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %edx
				; AVX512F-NEXT: addl %ecx, %edx
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rsi
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rdi
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %ecx
				; AVX512F-NEXT: addl %edx, %ecx
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rsi
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rdi
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %edx
				; AVX512F-NEXT: addl %ecx, %edx
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rsi
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rdi
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %ecx
				; AVX512F-NEXT: addl %edx, %ecx
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %eax
				; AVX512F-NEXT: addl %ecx, %eax
				; AVX512F-NEXT: retq
				entry:
				; Widen the 32-bit stride once; it is the byte offset added to both
				; pointers between consecutive rows.
				%idx.ext = zext i32 %stride to i64
				br label %for.body

				; Straight-line block with a single predecessor. The eight row computations
				; below are identical apart from value names (presumably a fully unrolled
				; 8-iteration loop - TODO confirm against the original source).
				for.body: ; preds = %entry
				; Row 0 - load 8 bytes from each buffer and zero-extend them to <8 x i32>.
				%0 = bitcast i8* %cur to <8 x i8>*
				%1 = load <8 x i8>, <8 x i8>* %0, align 1
				%2 = zext <8 x i8> %1 to <8 x i32>
				%3 = bitcast i8* %ref to <8 x i8>*
				%4 = load <8 x i8>, <8 x i8>* %3, align 1
				%5 = zext <8 x i8> %4 to <8 x i32>
				; Absolute difference per lane: d = cur - ref, keep d where d > 0, else 0 - d.
				%6 = sub nsw <8 x i32> %2, %5
				%7 = icmp sgt <8 x i32> %6, zeroinitializer
				%8 = sub nsw <8 x i32> zeroinitializer, %6
				%9 = select <8 x i1> %7, <8 x i32> %6, <8 x i32> %8
				; Horizontal add: fold lanes 4-7 onto 0-3, then 2-3 onto 0-1, then 1 onto 0,
				; so lane 0 holds the sum of all 8 lanes (the other lanes are not used).
				%rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx = add <8 x i32> %9, %rdx.shuf
				%rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229
				%rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231
				%10 = extractelement <8 x i32> %bin.rdx232, i32 0
				; Row 1 - advance both pointers by the stride, repeat the abs-diff and
				; reduction, then add the row sum to the row 0 sum.
				%add.ptr = getelementptr inbounds i8, i8* %cur, i64 %idx.ext
				%add.ptr178 = getelementptr inbounds i8, i8* %ref, i64 %idx.ext
				%11 = bitcast i8* %add.ptr to <8 x i8>*
				%12 = load <8 x i8>, <8 x i8>* %11, align 1
				%13 = zext <8 x i8> %12 to <8 x i32>
				%14 = bitcast i8* %add.ptr178 to <8 x i8>*
				%15 = load <8 x i8>, <8 x i8>* %14, align 1
				%16 = zext <8 x i8> %15 to <8 x i32>
				%17 = sub nsw <8 x i32> %13, %16
				%18 = icmp sgt <8 x i32> %17, zeroinitializer
				%19 = sub nsw <8 x i32> zeroinitializer, %17
				%20 = select <8 x i1> %18, <8 x i32> %17, <8 x i32> %19
				%rdx.shuf.1 = shufflevector <8 x i32> %20, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx.1 = add <8 x i32> %20, %rdx.shuf.1
				%rdx.shuf229.1 = shufflevector <8 x i32> %bin.rdx.1, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230.1 = add <8 x i32> %bin.rdx.1, %rdx.shuf229.1
				%rdx.shuf231.1 = shufflevector <8 x i32> %bin.rdx230.1, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232.1 = add <8 x i32> %bin.rdx230.1, %rdx.shuf231.1
				%21 = extractelement <8 x i32> %bin.rdx232.1, i32 0
				%bin.extra.1 = add i32 %21, %10
				; Row 2 - same pattern, accumulated onto %bin.extra.1.
				%add.ptr.1 = getelementptr inbounds i8, i8* %add.ptr, i64 %idx.ext
				%add.ptr178.1 = getelementptr inbounds i8, i8* %add.ptr178, i64 %idx.ext
				%22 = bitcast i8* %add.ptr.1 to <8 x i8>*
				%23 = load <8 x i8>, <8 x i8>* %22, align 1
				%24 = zext <8 x i8> %23 to <8 x i32>
				%25 = bitcast i8* %add.ptr178.1 to <8 x i8>*
				%26 = load <8 x i8>, <8 x i8>* %25, align 1
				%27 = zext <8 x i8> %26 to <8 x i32>
				%28 = sub nsw <8 x i32> %24, %27
				%29 = icmp sgt <8 x i32> %28, zeroinitializer
				%30 = sub nsw <8 x i32> zeroinitializer, %28
				%31 = select <8 x i1> %29, <8 x i32> %28, <8 x i32> %30
				%rdx.shuf.2 = shufflevector <8 x i32> %31, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx.2 = add <8 x i32> %31, %rdx.shuf.2
				%rdx.shuf229.2 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230.2 = add <8 x i32> %bin.rdx.2, %rdx.shuf229.2
				%rdx.shuf231.2 = shufflevector <8 x i32> %bin.rdx230.2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232.2 = add <8 x i32> %bin.rdx230.2, %rdx.shuf231.2
				%32 = extractelement <8 x i32> %bin.rdx232.2, i32 0
				%bin.extra.2 = add i32 %32, %bin.extra.1
				; Row 3 - same pattern, accumulated onto %bin.extra.2.
				%add.ptr.2 = getelementptr inbounds i8, i8* %add.ptr.1, i64 %idx.ext
				%add.ptr178.2 = getelementptr inbounds i8, i8* %add.ptr178.1, i64 %idx.ext
				%33 = bitcast i8* %add.ptr.2 to <8 x i8>*
				%34 = load <8 x i8>, <8 x i8>* %33, align 1
				%35 = zext <8 x i8> %34 to <8 x i32>
				%36 = bitcast i8* %add.ptr178.2 to <8 x i8>*
				%37 = load <8 x i8>, <8 x i8>* %36, align 1
				%38 = zext <8 x i8> %37 to <8 x i32>
				%39 = sub nsw <8 x i32> %35, %38
				%40 = icmp sgt <8 x i32> %39, zeroinitializer
				%41 = sub nsw <8 x i32> zeroinitializer, %39
				%42 = select <8 x i1> %40, <8 x i32> %39, <8 x i32> %41
				%rdx.shuf.3 = shufflevector <8 x i32> %42, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx.3 = add <8 x i32> %42, %rdx.shuf.3
				%rdx.shuf229.3 = shufflevector <8 x i32> %bin.rdx.3, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230.3 = add <8 x i32> %bin.rdx.3, %rdx.shuf229.3
				%rdx.shuf231.3 = shufflevector <8 x i32> %bin.rdx230.3, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232.3 = add <8 x i32> %bin.rdx230.3, %rdx.shuf231.3
				%43 = extractelement <8 x i32> %bin.rdx232.3, i32 0
				%bin.extra.3 = add i32 %43, %bin.extra.2
				; Row 4 - same pattern, accumulated onto %bin.extra.3.
				%add.ptr.3 = getelementptr inbounds i8, i8* %add.ptr.2, i64 %idx.ext
				%add.ptr178.3 = getelementptr inbounds i8, i8* %add.ptr178.2, i64 %idx.ext
				%44 = bitcast i8* %add.ptr.3 to <8 x i8>*
				%45 = load <8 x i8>, <8 x i8>* %44, align 1
				%46 = zext <8 x i8> %45 to <8 x i32>
				%47 = bitcast i8* %add.ptr178.3 to <8 x i8>*
				%48 = load <8 x i8>, <8 x i8>* %47, align 1
				%49 = zext <8 x i8> %48 to <8 x i32>
				%50 = sub nsw <8 x i32> %46, %49
				%51 = icmp sgt <8 x i32> %50, zeroinitializer
				%52 = sub nsw <8 x i32> zeroinitializer, %50
				%53 = select <8 x i1> %51, <8 x i32> %50, <8 x i32> %52
				%rdx.shuf.4 = shufflevector <8 x i32> %53, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx.4 = add <8 x i32> %53, %rdx.shuf.4
				%rdx.shuf229.4 = shufflevector <8 x i32> %bin.rdx.4, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230.4 = add <8 x i32> %bin.rdx.4, %rdx.shuf229.4
				%rdx.shuf231.4 = shufflevector <8 x i32> %bin.rdx230.4, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232.4 = add <8 x i32> %bin.rdx230.4, %rdx.shuf231.4
				%54 = extractelement <8 x i32> %bin.rdx232.4, i32 0
				%bin.extra.4 = add i32 %54, %bin.extra.3
				; Row 5 - same pattern, accumulated onto %bin.extra.4.
				%add.ptr.4 = getelementptr inbounds i8, i8* %add.ptr.3, i64 %idx.ext
				%add.ptr178.4 = getelementptr inbounds i8, i8* %add.ptr178.3, i64 %idx.ext
				%55 = bitcast i8* %add.ptr.4 to <8 x i8>*
				%56 = load <8 x i8>, <8 x i8>* %55, align 1
				%57 = zext <8 x i8> %56 to <8 x i32>
				%58 = bitcast i8* %add.ptr178.4 to <8 x i8>*
				%59 = load <8 x i8>, <8 x i8>* %58, align 1
				%60 = zext <8 x i8> %59 to <8 x i32>
				%61 = sub nsw <8 x i32> %57, %60
				%62 = icmp sgt <8 x i32> %61, zeroinitializer
				%63 = sub nsw <8 x i32> zeroinitializer, %61
				%64 = select <8 x i1> %62, <8 x i32> %61, <8 x i32> %63
				%rdx.shuf.5 = shufflevector <8 x i32> %64, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx.5 = add <8 x i32> %64, %rdx.shuf.5
				%rdx.shuf229.5 = shufflevector <8 x i32> %bin.rdx.5, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230.5 = add <8 x i32> %bin.rdx.5, %rdx.shuf229.5
				%rdx.shuf231.5 = shufflevector <8 x i32> %bin.rdx230.5, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232.5 = add <8 x i32> %bin.rdx230.5, %rdx.shuf231.5
				%65 = extractelement <8 x i32> %bin.rdx232.5, i32 0
				%bin.extra.5 = add i32 %65, %bin.extra.4
				; Row 6 - same pattern, accumulated onto %bin.extra.5.
				%add.ptr.5 = getelementptr inbounds i8, i8* %add.ptr.4, i64 %idx.ext
				%add.ptr178.5 = getelementptr inbounds i8, i8* %add.ptr178.4, i64 %idx.ext
				%66 = bitcast i8* %add.ptr.5 to <8 x i8>*
				%67 = load <8 x i8>, <8 x i8>* %66, align 1
				%68 = zext <8 x i8> %67 to <8 x i32>
				%69 = bitcast i8* %add.ptr178.5 to <8 x i8>*
				%70 = load <8 x i8>, <8 x i8>* %69, align 1
				%71 = zext <8 x i8> %70 to <8 x i32>
				%72 = sub nsw <8 x i32> %68, %71
				%73 = icmp sgt <8 x i32> %72, zeroinitializer
				%74 = sub nsw <8 x i32> zeroinitializer, %72
				%75 = select <8 x i1> %73, <8 x i32> %72, <8 x i32> %74
				%rdx.shuf.6 = shufflevector <8 x i32> %75, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx.6 = add <8 x i32> %75, %rdx.shuf.6
				%rdx.shuf229.6 = shufflevector <8 x i32> %bin.rdx.6, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230.6 = add <8 x i32> %bin.rdx.6, %rdx.shuf229.6
				%rdx.shuf231.6 = shufflevector <8 x i32> %bin.rdx230.6, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232.6 = add <8 x i32> %bin.rdx230.6, %rdx.shuf231.6
				%76 = extractelement <8 x i32> %bin.rdx232.6, i32 0
				%bin.extra.6 = add i32 %76, %bin.extra.5
				; Row 7 (last) - same pattern, accumulated onto %bin.extra.6.
				%add.ptr.6 = getelementptr inbounds i8, i8* %add.ptr.5, i64 %idx.ext
				%add.ptr178.6 = getelementptr inbounds i8, i8* %add.ptr178.5, i64 %idx.ext
				%77 = bitcast i8* %add.ptr.6 to <8 x i8>*
				%78 = load <8 x i8>, <8 x i8>* %77, align 1
				%79 = zext <8 x i8> %78 to <8 x i32>
				%80 = bitcast i8* %add.ptr178.6 to <8 x i8>*
				%81 = load <8 x i8>, <8 x i8>* %80, align 1
				%82 = zext <8 x i8> %81 to <8 x i32>
				%83 = sub nsw <8 x i32> %79, %82
				%84 = icmp sgt <8 x i32> %83, zeroinitializer
				%85 = sub nsw <8 x i32> zeroinitializer, %83
				%86 = select <8 x i1> %84, <8 x i32> %83, <8 x i32> %85
				%rdx.shuf.7 = shufflevector <8 x i32> %86, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx.7 = add <8 x i32> %86, %rdx.shuf.7
				%rdx.shuf229.7 = shufflevector <8 x i32> %bin.rdx.7, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230.7 = add <8 x i32> %bin.rdx.7, %rdx.shuf229.7
				%rdx.shuf231.7 = shufflevector <8 x i32> %bin.rdx230.7, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232.7 = add <8 x i32> %bin.rdx230.7, %rdx.shuf231.7
				%87 = extractelement <8 x i32> %bin.rdx232.7, i32 0
				%bin.extra.7 = add i32 %87, %bin.extra.6
				; Return the total over all 8 rows.
				ret i32 %bin.extra.7
				}

				define i32 @sad8_64bit_icmp_sle(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #2 {
				; SSE2-LABEL: sad8_64bit_icmp_sle:
				; SSE2: # BB#0: # %entry
				; SSE2-NEXT: movl %edx, %eax
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %ecx
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rsi
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rdi
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %edx
				; SSE2-NEXT: addl %ecx, %edx
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rsi
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rdi
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %ecx
				; SSE2-NEXT: addl %edx, %ecx
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rsi
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rdi
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %edx
				; SSE2-NEXT: addl %ecx, %edx
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rsi
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rdi
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %ecx
				; SSE2-NEXT: addl %edx, %ecx
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rsi
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rdi
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %edx
				; SSE2-NEXT: addl %ecx, %edx
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rsi
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rdi
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %ecx
				; SSE2-NEXT: addl %edx, %ecx
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %eax
				; SSE2-NEXT: addl %ecx, %eax
				; SSE2-NEXT: retq
				;
				; AVX2-LABEL: sad8_64bit_icmp_sle:
				; AVX2: # BB#0: # %entry
				; AVX2-NEXT: movl %edx, %eax
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %ecx
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rsi
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rdi
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %edx
				; AVX2-NEXT: addl %ecx, %edx
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rsi
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rdi
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %ecx
				; AVX2-NEXT: addl %edx, %ecx
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rsi
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rdi
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %edx
				; AVX2-NEXT: addl %ecx, %edx
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rsi
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rdi
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %ecx
				; AVX2-NEXT: addl %edx, %ecx
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rsi
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rdi
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %edx
				; AVX2-NEXT: addl %ecx, %edx
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rsi
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rdi
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %ecx
				; AVX2-NEXT: addl %edx, %ecx
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %eax
				; AVX2-NEXT: addl %ecx, %eax
				; AVX2-NEXT: retq
				;
				; AVX512F-LABEL: sad8_64bit_icmp_sle:
				; AVX512F: # BB#0: # %entry
				; AVX512F-NEXT: movl %edx, %eax
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %ecx
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rsi
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rdi
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %edx
				; AVX512F-NEXT: addl %ecx, %edx
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rsi
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rdi
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %ecx
				; AVX512F-NEXT: addl %edx, %ecx
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rsi
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rdi
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %edx
				; AVX512F-NEXT: addl %ecx, %edx
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rsi
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rdi
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %ecx
				; AVX512F-NEXT: addl %edx, %ecx
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rsi
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rdi
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %edx
				; AVX512F-NEXT: addl %ecx, %edx
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rsi
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rdi
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %ecx
				; AVX512F-NEXT: addl %edx, %ecx
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %eax
				; AVX512F-NEXT: addl %ecx, %eax
				; AVX512F-NEXT: retq
				entry:
				%idx.ext = zext i32 %stride to i64
				br label %for.body

				for.body: ; preds = %entry
				%0 = bitcast i8* %cur to <8 x i8>*
				%1 = load <8 x i8>, <8 x i8>* %0, align 1
				%2 = zext <8 x i8> %1 to <8 x i32>
				%3 = bitcast i8* %ref to <8 x i8>*
				%4 = load <8 x i8>, <8 x i8>* %3, align 1
				%5 = zext <8 x i8> %4 to <8 x i32>
				%6 = sub nsw <8 x i32> %2, %5
				%7 = icmp slt <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
				%8 = sub nsw <8 x i32> zeroinitializer, %6
				%9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
				%rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx = add <8 x i32> %9, %rdx.shuf
				%rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229
				%rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231
				%10 = extractelement <8 x i32> %bin.rdx232, i32 0
				%add.ptr = getelementptr inbounds i8, i8* %cur, i64 %idx.ext
				%add.ptr178 = getelementptr inbounds i8, i8* %ref, i64 %idx.ext
				%11 = bitcast i8* %add.ptr to <8 x i8>*
				%12 = load <8 x i8>, <8 x i8>* %11, align 1
				%13 = zext <8 x i8> %12 to <8 x i32>
				%14 = bitcast i8* %add.ptr178 to <8 x i8>*
				%15 = load <8 x i8>, <8 x i8>* %14, align 1
				%16 = zext <8 x i8> %15 to <8 x i32>
				%17 = sub nsw <8 x i32> %13, %16
				%18 = icmp slt <8 x i32> %17, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
				%19 = sub nsw <8 x i32> zeroinitializer, %17
				%20 = select <8 x i1> %18, <8 x i32> %19, <8 x i32> %17
				%rdx.shuf.1 = shufflevector <8 x i32> %20, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx.1 = add <8 x i32> %20, %rdx.shuf.1
				%rdx.shuf229.1 = shufflevector <8 x i32> %bin.rdx.1, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230.1 = add <8 x i32> %bin.rdx.1, %rdx.shuf229.1
				%rdx.shuf231.1 = shufflevector <8 x i32> %bin.rdx230.1, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232.1 = add <8 x i32> %bin.rdx230.1, %rdx.shuf231.1
				%21 = extractelement <8 x i32> %bin.rdx232.1, i32 0
				%bin.extra.1 = add i32 %21, %10
				%add.ptr.1 = getelementptr inbounds i8, i8* %add.ptr, i64 %idx.ext
				%add.ptr178.1 = getelementptr inbounds i8, i8* %add.ptr178, i64 %idx.ext
				%22 = bitcast i8* %add.ptr.1 to <8 x i8>*
				%23 = load <8 x i8>, <8 x i8>* %22, align 1
				%24 = zext <8 x i8> %23 to <8 x i32>
				%25 = bitcast i8* %add.ptr178.1 to <8 x i8>*
				%26 = load <8 x i8>, <8 x i8>* %25, align 1
				%27 = zext <8 x i8> %26 to <8 x i32>
				%28 = sub nsw <8 x i32> %24, %27
				%29 = icmp slt <8 x i32> %28, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
				%30 = sub nsw <8 x i32> zeroinitializer, %28
				%31 = select <8 x i1> %29, <8 x i32> %30, <8 x i32> %28
				%rdx.shuf.2 = shufflevector <8 x i32> %31, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx.2 = add <8 x i32> %31, %rdx.shuf.2
				%rdx.shuf229.2 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230.2 = add <8 x i32> %bin.rdx.2, %rdx.shuf229.2
				%rdx.shuf231.2 = shufflevector <8 x i32> %bin.rdx230.2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232.2 = add <8 x i32> %bin.rdx230.2, %rdx.shuf231.2
				%32 = extractelement <8 x i32> %bin.rdx232.2, i32 0
				%bin.extra.2 = add i32 %32, %bin.extra.1
				%add.ptr.2 = getelementptr inbounds i8, i8* %add.ptr.1, i64 %idx.ext
				%add.ptr178.2 = getelementptr inbounds i8, i8* %add.ptr178.1, i64 %idx.ext
				%33 = bitcast i8* %add.ptr.2 to <8 x i8>*
				%34 = load <8 x i8>, <8 x i8>* %33, align 1
				%35 = zext <8 x i8> %34 to <8 x i32>
				%36 = bitcast i8* %add.ptr178.2 to <8 x i8>*
				%37 = load <8 x i8>, <8 x i8>* %36, align 1
				%38 = zext <8 x i8> %37 to <8 x i32>
				%39 = sub nsw <8 x i32> %35, %38
				%40 = icmp slt <8 x i32> %39, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
				%41 = sub nsw <8 x i32> zeroinitializer, %39
				%42 = select <8 x i1> %40, <8 x i32> %41, <8 x i32> %39
				%rdx.shuf.3 = shufflevector <8 x i32> %42, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx.3 = add <8 x i32> %42, %rdx.shuf.3
				%rdx.shuf229.3 = shufflevector <8 x i32> %bin.rdx.3, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230.3 = add <8 x i32> %bin.rdx.3, %rdx.shuf229.3
				%rdx.shuf231.3 = shufflevector <8 x i32> %bin.rdx230.3, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232.3 = add <8 x i32> %bin.rdx230.3, %rdx.shuf231.3
				%43 = extractelement <8 x i32> %bin.rdx232.3, i32 0
				%bin.extra.3 = add i32 %43, %bin.extra.2
				%add.ptr.3 = getelementptr inbounds i8, i8* %add.ptr.2, i64 %idx.ext
				%add.ptr178.3 = getelementptr inbounds i8, i8* %add.ptr178.2, i64 %idx.ext
				%44 = bitcast i8* %add.ptr.3 to <8 x i8>*
				%45 = load <8 x i8>, <8 x i8>* %44, align 1
				%46 = zext <8 x i8> %45 to <8 x i32>
				%47 = bitcast i8* %add.ptr178.3 to <8 x i8>*
				%48 = load <8 x i8>, <8 x i8>* %47, align 1
				%49 = zext <8 x i8> %48 to <8 x i32>
				%50 = sub nsw <8 x i32> %46, %49
				%51 = icmp slt <8 x i32> %50, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
				%52 = sub nsw <8 x i32> zeroinitializer, %50
				%53 = select <8 x i1> %51, <8 x i32> %52, <8 x i32> %50
				%rdx.shuf.4 = shufflevector <8 x i32> %53, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx.4 = add <8 x i32> %53, %rdx.shuf.4
				%rdx.shuf229.4 = shufflevector <8 x i32> %bin.rdx.4, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230.4 = add <8 x i32> %bin.rdx.4, %rdx.shuf229.4
				%rdx.shuf231.4 = shufflevector <8 x i32> %bin.rdx230.4, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232.4 = add <8 x i32> %bin.rdx230.4, %rdx.shuf231.4
				%54 = extractelement <8 x i32> %bin.rdx232.4, i32 0
				%bin.extra.4 = add i32 %54, %bin.extra.3
				%add.ptr.4 = getelementptr inbounds i8, i8* %add.ptr.3, i64 %idx.ext
				%add.ptr178.4 = getelementptr inbounds i8, i8* %add.ptr178.3, i64 %idx.ext
				%55 = bitcast i8* %add.ptr.4 to <8 x i8>*
				%56 = load <8 x i8>, <8 x i8>* %55, align 1
				%57 = zext <8 x i8> %56 to <8 x i32>
				%58 = bitcast i8* %add.ptr178.4 to <8 x i8>*
				%59 = load <8 x i8>, <8 x i8>* %58, align 1
				%60 = zext <8 x i8> %59 to <8 x i32>
				%61 = sub nsw <8 x i32> %57, %60
				%62 = icmp slt <8 x i32> %61, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
				%63 = sub nsw <8 x i32> zeroinitializer, %61
				%64 = select <8 x i1> %62, <8 x i32> %63, <8 x i32> %61
				%rdx.shuf.5 = shufflevector <8 x i32> %64, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx.5 = add <8 x i32> %64, %rdx.shuf.5
				%rdx.shuf229.5 = shufflevector <8 x i32> %bin.rdx.5, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230.5 = add <8 x i32> %bin.rdx.5, %rdx.shuf229.5
				%rdx.shuf231.5 = shufflevector <8 x i32> %bin.rdx230.5, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232.5 = add <8 x i32> %bin.rdx230.5, %rdx.shuf231.5
				%65 = extractelement <8 x i32> %bin.rdx232.5, i32 0
				%bin.extra.5 = add i32 %65, %bin.extra.4
				%add.ptr.5 = getelementptr inbounds i8, i8* %add.ptr.4, i64 %idx.ext
				%add.ptr178.5 = getelementptr inbounds i8, i8* %add.ptr178.4, i64 %idx.ext
				%66 = bitcast i8* %add.ptr.5 to <8 x i8>*
				%67 = load <8 x i8>, <8 x i8>* %66, align 1
				%68 = zext <8 x i8> %67 to <8 x i32>
				%69 = bitcast i8* %add.ptr178.5 to <8 x i8>*
				%70 = load <8 x i8>, <8 x i8>* %69, align 1
				%71 = zext <8 x i8> %70 to <8 x i32>
				%72 = sub nsw <8 x i32> %68, %71
				%73 = icmp slt <8 x i32> %72, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
				%74 = sub nsw <8 x i32> zeroinitializer, %72
				%75 = select <8 x i1> %73, <8 x i32> %74, <8 x i32> %72
				%rdx.shuf.6 = shufflevector <8 x i32> %75, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx.6 = add <8 x i32> %75, %rdx.shuf.6
				%rdx.shuf229.6 = shufflevector <8 x i32> %bin.rdx.6, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230.6 = add <8 x i32> %bin.rdx.6, %rdx.shuf229.6
				%rdx.shuf231.6 = shufflevector <8 x i32> %bin.rdx230.6, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232.6 = add <8 x i32> %bin.rdx230.6, %rdx.shuf231.6
				%76 = extractelement <8 x i32> %bin.rdx232.6, i32 0
				%bin.extra.6 = add i32 %76, %bin.extra.5
				%add.ptr.6 = getelementptr inbounds i8, i8* %add.ptr.5, i64 %idx.ext
				%add.ptr178.6 = getelementptr inbounds i8, i8* %add.ptr178.5, i64 %idx.ext
				%77 = bitcast i8* %add.ptr.6 to <8 x i8>*
				%78 = load <8 x i8>, <8 x i8>* %77, align 1
				%79 = zext <8 x i8> %78 to <8 x i32>
				%80 = bitcast i8* %add.ptr178.6 to <8 x i8>*
				%81 = load <8 x i8>, <8 x i8>* %80, align 1
				%82 = zext <8 x i8> %81 to <8 x i32>
				%83 = sub nsw <8 x i32> %79, %82
				%84 = icmp slt <8 x i32> %83, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
				%85 = sub nsw <8 x i32> zeroinitializer, %83
				%86 = select <8 x i1> %84, <8 x i32> %85, <8 x i32> %83
				%rdx.shuf.7 = shufflevector <8 x i32> %86, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx.7 = add <8 x i32> %86, %rdx.shuf.7
				%rdx.shuf229.7 = shufflevector <8 x i32> %bin.rdx.7, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230.7 = add <8 x i32> %bin.rdx.7, %rdx.shuf229.7
				%rdx.shuf231.7 = shufflevector <8 x i32> %bin.rdx230.7, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232.7 = add <8 x i32> %bin.rdx230.7, %rdx.shuf231.7
				%87 = extractelement <8 x i32> %bin.rdx232.7, i32 0
				%bin.extra.7 = add i32 %87, %bin.extra.6
				ret i32 %bin.extra.7
				}

				; sad8_64bit_icmp_slt: sums the absolute differences of eight 8-byte rows read
				; from %cur and %ref, advancing both pointers by %stride bytes between rows.
				; Each absolute value is spelled select(icmp slt(d, 0), 0 - d, d): the compare
				; is "signed less than" and the negation is the select's true operand.
				; The assertions below expect each of the eight rows to become one (v)psadbw
				; on two 8-byte loads, with the result moved to a GPR and added to the total.
				define i32 @sad8_64bit_icmp_slt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #3 {
				; SSE2-LABEL: sad8_64bit_icmp_slt:
				; SSE2: # BB#0: # %entry
				; SSE2-NEXT: movl %edx, %eax
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %ecx
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rsi
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rdi
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %edx
				; SSE2-NEXT: addl %ecx, %edx
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rsi
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rdi
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %ecx
				; SSE2-NEXT: addl %edx, %ecx
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rsi
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rdi
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %edx
				; SSE2-NEXT: addl %ecx, %edx
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rsi
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rdi
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %ecx
				; SSE2-NEXT: addl %edx, %ecx
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rsi
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rdi
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %edx
				; SSE2-NEXT: addl %ecx, %edx
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rsi
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: addq %rax, %rdi
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %ecx
				; SSE2-NEXT: addl %edx, %ecx
				; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; SSE2-NEXT: psadbw %xmm0, %xmm1
				; SSE2-NEXT: movd %xmm1, %eax
				; SSE2-NEXT: addl %ecx, %eax
				; SSE2-NEXT: retq
				;
				; AVX2-LABEL: sad8_64bit_icmp_slt:
				; AVX2: # BB#0: # %entry
				; AVX2-NEXT: movl %edx, %eax
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %ecx
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rsi
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rdi
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %edx
				; AVX2-NEXT: addl %ecx, %edx
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rsi
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rdi
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %ecx
				; AVX2-NEXT: addl %edx, %ecx
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rsi
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rdi
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %edx
				; AVX2-NEXT: addl %ecx, %edx
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rsi
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rdi
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %ecx
				; AVX2-NEXT: addl %edx, %ecx
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rsi
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rdi
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %edx
				; AVX2-NEXT: addl %ecx, %edx
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rsi
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: addq %rax, %rdi
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %ecx
				; AVX2-NEXT: addl %edx, %ecx
				; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vmovd %xmm0, %eax
				; AVX2-NEXT: addl %ecx, %eax
				; AVX2-NEXT: retq
				;
				; AVX512F-LABEL: sad8_64bit_icmp_slt:
				; AVX512F: # BB#0: # %entry
				; AVX512F-NEXT: movl %edx, %eax
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %ecx
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rsi
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rdi
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %edx
				; AVX512F-NEXT: addl %ecx, %edx
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rsi
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rdi
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %ecx
				; AVX512F-NEXT: addl %edx, %ecx
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rsi
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rdi
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %edx
				; AVX512F-NEXT: addl %ecx, %edx
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rsi
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rdi
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %ecx
				; AVX512F-NEXT: addl %edx, %ecx
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rsi
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rdi
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %edx
				; AVX512F-NEXT: addl %ecx, %edx
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rsi
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: addq %rax, %rdi
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %ecx
				; AVX512F-NEXT: addl %edx, %ecx
				; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
				; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
				; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
				; AVX512F-NEXT: vmovd %xmm0, %eax
				; AVX512F-NEXT: addl %ecx, %eax
				; AVX512F-NEXT: retq
				entry:
				; Widen the 32-bit stride once; it is the i64 index of every pointer step below.
				%idx.ext = zext i32 %stride to i64
				br label %for.body

				; Despite its name this block has no back edge: the eight rows are laid out
				; one after another and the block ends in the function's ret.
				for.body: ; preds = %entry
				; Row 0: load 8 bytes from each buffer and zero-extend them to i32 lanes.
				%0 = bitcast i8* %cur to <8 x i8>*
				%1 = load <8 x i8>, <8 x i8>* %0, align 1
				%2 = zext <8 x i8> %1 to <8 x i32>
				%3 = bitcast i8* %ref to <8 x i8>*
				%4 = load <8 x i8>, <8 x i8>* %3, align 1
				%5 = zext <8 x i8> %4 to <8 x i32>
				; Per-lane absolute difference: pick the negated value where d < 0, else d.
				%6 = sub nsw <8 x i32> %2, %5
				%7 = icmp slt <8 x i32> %6, zeroinitializer
				%8 = sub nsw <8 x i32> zeroinitializer, %6
				%9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
				; Horizontal add: fold lanes 4-7 onto 0-3, then 2-3 onto 0-1, then 1 onto 0,
				; so lane 0 ends up holding the sum of all eight lanes.
				%rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx = add <8 x i32> %9, %rdx.shuf
				%rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229
				%rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231
				%10 = extractelement <8 x i32> %bin.rdx232, i32 0
				; Row 1: step both pointers by the stride and repeat the same pattern;
				; %bin.extra.1 starts the running total (row 1 sum + row 0 sum).
				%add.ptr = getelementptr inbounds i8, i8* %cur, i64 %idx.ext
				%add.ptr178 = getelementptr inbounds i8, i8* %ref, i64 %idx.ext
				%11 = bitcast i8* %add.ptr to <8 x i8>*
				%12 = load <8 x i8>, <8 x i8>* %11, align 1
				%13 = zext <8 x i8> %12 to <8 x i32>
				%14 = bitcast i8* %add.ptr178 to <8 x i8>*
				%15 = load <8 x i8>, <8 x i8>* %14, align 1
				%16 = zext <8 x i8> %15 to <8 x i32>
				%17 = sub nsw <8 x i32> %13, %16
				%18 = icmp slt <8 x i32> %17, zeroinitializer
				%19 = sub nsw <8 x i32> zeroinitializer, %17
				%20 = select <8 x i1> %18, <8 x i32> %19, <8 x i32> %17
				%rdx.shuf.1 = shufflevector <8 x i32> %20, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx.1 = add <8 x i32> %20, %rdx.shuf.1
				%rdx.shuf229.1 = shufflevector <8 x i32> %bin.rdx.1, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230.1 = add <8 x i32> %bin.rdx.1, %rdx.shuf229.1
				%rdx.shuf231.1 = shufflevector <8 x i32> %bin.rdx230.1, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232.1 = add <8 x i32> %bin.rdx230.1, %rdx.shuf231.1
				%21 = extractelement <8 x i32> %bin.rdx232.1, i32 0
				%bin.extra.1 = add i32 %21, %10
				; Row 2: same pattern; the row sum is added into %bin.extra.2.
				%add.ptr.1 = getelementptr inbounds i8, i8* %add.ptr, i64 %idx.ext
				%add.ptr178.1 = getelementptr inbounds i8, i8* %add.ptr178, i64 %idx.ext
				%22 = bitcast i8* %add.ptr.1 to <8 x i8>*
				%23 = load <8 x i8>, <8 x i8>* %22, align 1
				%24 = zext <8 x i8> %23 to <8 x i32>
				%25 = bitcast i8* %add.ptr178.1 to <8 x i8>*
				%26 = load <8 x i8>, <8 x i8>* %25, align 1
				%27 = zext <8 x i8> %26 to <8 x i32>
				%28 = sub nsw <8 x i32> %24, %27
				%29 = icmp slt <8 x i32> %28, zeroinitializer
				%30 = sub nsw <8 x i32> zeroinitializer, %28
				%31 = select <8 x i1> %29, <8 x i32> %30, <8 x i32> %28
				%rdx.shuf.2 = shufflevector <8 x i32> %31, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx.2 = add <8 x i32> %31, %rdx.shuf.2
				%rdx.shuf229.2 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230.2 = add <8 x i32> %bin.rdx.2, %rdx.shuf229.2
				%rdx.shuf231.2 = shufflevector <8 x i32> %bin.rdx230.2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232.2 = add <8 x i32> %bin.rdx230.2, %rdx.shuf231.2
				%32 = extractelement <8 x i32> %bin.rdx232.2, i32 0
				%bin.extra.2 = add i32 %32, %bin.extra.1
				; Row 3: same pattern; the row sum is added into %bin.extra.3.
				%add.ptr.2 = getelementptr inbounds i8, i8* %add.ptr.1, i64 %idx.ext
				%add.ptr178.2 = getelementptr inbounds i8, i8* %add.ptr178.1, i64 %idx.ext
				%33 = bitcast i8* %add.ptr.2 to <8 x i8>*
				%34 = load <8 x i8>, <8 x i8>* %33, align 1
				%35 = zext <8 x i8> %34 to <8 x i32>
				%36 = bitcast i8* %add.ptr178.2 to <8 x i8>*
				%37 = load <8 x i8>, <8 x i8>* %36, align 1
				%38 = zext <8 x i8> %37 to <8 x i32>
				%39 = sub nsw <8 x i32> %35, %38
				%40 = icmp slt <8 x i32> %39, zeroinitializer
				%41 = sub nsw <8 x i32> zeroinitializer, %39
				%42 = select <8 x i1> %40, <8 x i32> %41, <8 x i32> %39
				%rdx.shuf.3 = shufflevector <8 x i32> %42, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx.3 = add <8 x i32> %42, %rdx.shuf.3
				%rdx.shuf229.3 = shufflevector <8 x i32> %bin.rdx.3, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230.3 = add <8 x i32> %bin.rdx.3, %rdx.shuf229.3
				%rdx.shuf231.3 = shufflevector <8 x i32> %bin.rdx230.3, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232.3 = add <8 x i32> %bin.rdx230.3, %rdx.shuf231.3
				%43 = extractelement <8 x i32> %bin.rdx232.3, i32 0
				%bin.extra.3 = add i32 %43, %bin.extra.2
				; Row 4: same pattern; the row sum is added into %bin.extra.4.
				%add.ptr.3 = getelementptr inbounds i8, i8* %add.ptr.2, i64 %idx.ext
				%add.ptr178.3 = getelementptr inbounds i8, i8* %add.ptr178.2, i64 %idx.ext
				%44 = bitcast i8* %add.ptr.3 to <8 x i8>*
				%45 = load <8 x i8>, <8 x i8>* %44, align 1
				%46 = zext <8 x i8> %45 to <8 x i32>
				%47 = bitcast i8* %add.ptr178.3 to <8 x i8>*
				%48 = load <8 x i8>, <8 x i8>* %47, align 1
				%49 = zext <8 x i8> %48 to <8 x i32>
				%50 = sub nsw <8 x i32> %46, %49
				%51 = icmp slt <8 x i32> %50, zeroinitializer
				%52 = sub nsw <8 x i32> zeroinitializer, %50
				%53 = select <8 x i1> %51, <8 x i32> %52, <8 x i32> %50
				%rdx.shuf.4 = shufflevector <8 x i32> %53, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx.4 = add <8 x i32> %53, %rdx.shuf.4
				%rdx.shuf229.4 = shufflevector <8 x i32> %bin.rdx.4, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230.4 = add <8 x i32> %bin.rdx.4, %rdx.shuf229.4
				%rdx.shuf231.4 = shufflevector <8 x i32> %bin.rdx230.4, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232.4 = add <8 x i32> %bin.rdx230.4, %rdx.shuf231.4
				%54 = extractelement <8 x i32> %bin.rdx232.4, i32 0
				%bin.extra.4 = add i32 %54, %bin.extra.3
				; Row 5: same pattern; the row sum is added into %bin.extra.5.
				%add.ptr.4 = getelementptr inbounds i8, i8* %add.ptr.3, i64 %idx.ext
				%add.ptr178.4 = getelementptr inbounds i8, i8* %add.ptr178.3, i64 %idx.ext
				%55 = bitcast i8* %add.ptr.4 to <8 x i8>*
				%56 = load <8 x i8>, <8 x i8>* %55, align 1
				%57 = zext <8 x i8> %56 to <8 x i32>
				%58 = bitcast i8* %add.ptr178.4 to <8 x i8>*
				%59 = load <8 x i8>, <8 x i8>* %58, align 1
				%60 = zext <8 x i8> %59 to <8 x i32>
				%61 = sub nsw <8 x i32> %57, %60
				%62 = icmp slt <8 x i32> %61, zeroinitializer
				%63 = sub nsw <8 x i32> zeroinitializer, %61
				%64 = select <8 x i1> %62, <8 x i32> %63, <8 x i32> %61
				%rdx.shuf.5 = shufflevector <8 x i32> %64, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx.5 = add <8 x i32> %64, %rdx.shuf.5
				%rdx.shuf229.5 = shufflevector <8 x i32> %bin.rdx.5, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230.5 = add <8 x i32> %bin.rdx.5, %rdx.shuf229.5
				%rdx.shuf231.5 = shufflevector <8 x i32> %bin.rdx230.5, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232.5 = add <8 x i32> %bin.rdx230.5, %rdx.shuf231.5
				%65 = extractelement <8 x i32> %bin.rdx232.5, i32 0
				%bin.extra.5 = add i32 %65, %bin.extra.4
				; Row 6: same pattern; the row sum is added into %bin.extra.6.
				%add.ptr.5 = getelementptr inbounds i8, i8* %add.ptr.4, i64 %idx.ext
				%add.ptr178.5 = getelementptr inbounds i8, i8* %add.ptr178.4, i64 %idx.ext
				%66 = bitcast i8* %add.ptr.5 to <8 x i8>*
				%67 = load <8 x i8>, <8 x i8>* %66, align 1
				%68 = zext <8 x i8> %67 to <8 x i32>
				%69 = bitcast i8* %add.ptr178.5 to <8 x i8>*
				%70 = load <8 x i8>, <8 x i8>* %69, align 1
				%71 = zext <8 x i8> %70 to <8 x i32>
				%72 = sub nsw <8 x i32> %68, %71
				%73 = icmp slt <8 x i32> %72, zeroinitializer
				%74 = sub nsw <8 x i32> zeroinitializer, %72
				%75 = select <8 x i1> %73, <8 x i32> %74, <8 x i32> %72
				%rdx.shuf.6 = shufflevector <8 x i32> %75, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx.6 = add <8 x i32> %75, %rdx.shuf.6
				%rdx.shuf229.6 = shufflevector <8 x i32> %bin.rdx.6, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230.6 = add <8 x i32> %bin.rdx.6, %rdx.shuf229.6
				%rdx.shuf231.6 = shufflevector <8 x i32> %bin.rdx230.6, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232.6 = add <8 x i32> %bin.rdx230.6, %rdx.shuf231.6
				%76 = extractelement <8 x i32> %bin.rdx232.6, i32 0
				%bin.extra.6 = add i32 %76, %bin.extra.5
				; Row 7 (last): same pattern; %bin.extra.7 is the final total that is returned.
				%add.ptr.6 = getelementptr inbounds i8, i8* %add.ptr.5, i64 %idx.ext
				%add.ptr178.6 = getelementptr inbounds i8, i8* %add.ptr178.5, i64 %idx.ext
				%77 = bitcast i8* %add.ptr.6 to <8 x i8>*
				%78 = load <8 x i8>, <8 x i8>* %77, align 1
				%79 = zext <8 x i8> %78 to <8 x i32>
				%80 = bitcast i8* %add.ptr178.6 to <8 x i8>*
				%81 = load <8 x i8>, <8 x i8>* %80, align 1
				%82 = zext <8 x i8> %81 to <8 x i32>
				%83 = sub nsw <8 x i32> %79, %82
				%84 = icmp slt <8 x i32> %83, zeroinitializer
				%85 = sub nsw <8 x i32> zeroinitializer, %83
				%86 = select <8 x i1> %84, <8 x i32> %85, <8 x i32> %83
				%rdx.shuf.7 = shufflevector <8 x i32> %86, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx.7 = add <8 x i32> %86, %rdx.shuf.7
				%rdx.shuf229.7 = shufflevector <8 x i32> %bin.rdx.7, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx230.7 = add <8 x i32> %bin.rdx.7, %rdx.shuf229.7
				%rdx.shuf231.7 = shufflevector <8 x i32> %bin.rdx230.7, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				%bin.rdx232.7 = add <8 x i32> %bin.rdx230.7, %rdx.shuf231.7
				%87 = extractelement <8 x i32> %bin.rdx232.7, i32 0
				%bin.extra.7 = add i32 %87, %bin.extra.6
				ret i32 %bin.extra.7
				}

This is an archive of the discontinued LLVM Phabricator instance.

Add 64 bit pattern matching for PSADBW
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 93776

lib/Target/X86/X86ISelLowering.cpp

test/CodeGen/X86/sad_variations.ll

This is an archive of the discontinued LLVM Phabricator instance.

Add 64 bit pattern matching for PSADBW
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 93776

lib/Target/X86/X86ISelLowering.cpp

test/CodeGen/X86/sad_variations.ll

Add 64 bit pattern matching for PSADBW
ClosedPublic