Details
- Reviewers: craig.topper

Diff Detail
- Repository: rG LLVM Github Monorepo

Event Timeline
test/CodeGen/X86/memcmp.ll:1015 ↗ (On Diff #223186)

I'm not sure this is better than the AVX code. vpcmpeq to a mask register and kortest both have 3 cycle latency. vpcmpeqb to xmm is 1 cycle, vpmovmskb is 1 cycle, and cmpl is 1 cycle.
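For readers following the thread, here is a minimal sketch of the two lowerings being compared for a 16-byte memcmp-for-equality. The function name is hypothetical, and the assembly in the comments is roughly what the two code paths emit, not a verbatim copy of the patch's output:

```c
#include <string.h>

/* 16-byte equality test; the memcmp expansion pass turns this into
 * straight-line vector code (function name is hypothetical). */
int eq16(const char *a, const char *b) {
  return memcmp(a, b, 16) == 0;
}

/* AVX512 opmask lowering under discussion (roughly):
 *   vmovdqu    (%rdi), %xmm0
 *   vpcmpneqb  (%rsi), %xmm0, %k0   # byte compare into a mask register
 *   kortestw   %k0, %k0             # ZF=1 iff no byte differed
 *   sete       %al
 *
 * Existing AVX lowering through a GPR (roughly):
 *   vmovdqu    (%rdi), %xmm0
 *   vpcmpeqb   (%rsi), %xmm0, %xmm0
 *   vpmovmskb  %xmm0, %eax          # compare mask lands in a GPR
 *   cmpl       $0xFFFF, %eax        # all 16 bytes equal?
 *   sete       %al
 */
```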
That's a reasonable point. From my perspective:
- Staying in the opmask registers as long as possible relieves general-purpose register pressure.
- When/if MaxLoadsPerMemcmp becomes greater than 2, keeping the intermediate results in the opmask registers is the right thing to do (see the sketch below).
- I have a pending patch to increase MaxLoadsPerMemcmp to 4 if-and-only-if the memcmp() result is compared against zero. Raising it to 8 (again, only for compares against zero) will require more work, I think.
- In the 512-bit case, using AVX512BW instead of just AVX512F better matches people's mental model: byte-wise memcmp() should generate byte-wise vector instructions.
What do you think?
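To make the multi-load argument concrete, here is a scalar C model of the shape a multi-load expansion of memcmp(a, b, n) == 0 takes: each chunk's "difference" is OR-accumulated and tested once at the end. In the vector analogue, each chunk's compare result stays in an opmask register and the masks are combined with korw/korq, so no intermediate value round-trips through a GPR. The function name and the 4x8-byte chunking are illustrative assumptions:

```c
#include <stdint.h>
#include <string.h>

/* Scalar model of a 4-load expansion of memcmp(a, b, 32) == 0.
 * Each chunk contributes a difference value; the differences are
 * OR'd together and tested once at the end. */
int eq32_expanded(const char *a, const char *b) {
  uint64_t acc = 0;
  for (int i = 0; i < 32; i += 8) {
    uint64_t x, y;
    memcpy(&x, a + i, sizeof x);  /* unaligned 8-byte loads */
    memcpy(&y, b + i, sizeof y);
    acc |= x ^ y;                 /* nonzero iff this chunk differs */
  }
  return acc == 0;
}
```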
I'm fine with the 512-bit change to use byte compare with AVX512BW.
For 128/256, I wonder about the PTEST suggestion in the FIXME above this code. Unfortunately PTEST has 3 cycle latency on recent Intel CPUs. But it would avoid the GPR use entirely.
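For reference, the PTEST variant that the FIXME hints at would look roughly like the comment below; this is an assumption about what such a lowering would emit, not what the patch currently produces:

```c
#include <string.h>

/* 16-byte equality via PTEST (hypothetical lowering):
 *   vmovdqu  (%rdi), %xmm0
 *   vpxor    (%rsi), %xmm0, %xmm0  # nonzero bytes mark mismatches
 *   vptest   %xmm0, %xmm0          # ZF=1 iff xmm0 is all zeros
 *   sete     %al
 * No vector compare mask ever lands in a GPR. */
int eq16_ptest(const char *a, const char *b) {
  return memcmp(a, b, 16) == 0;
}
```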
Simplified to just enabling AVX512BW for 512 and letting 128/256 keep using AVX/AVX2.
PS: I trust your knowledge of instruction latencies, but then I wonder what Agner Fog did wrong, since he measured PMOVMSKB as having 2-3 cycles of latency.
I think I just misremembered the PMOVMSKB latency; it probably is 2-3 cycles. I should have looked it up. What I did remember from earlier reading is that VPCMPEQ/KORTEST are both 3 cycles.