Skip to content

Commit

Permalink
[X86] Add AVX512 support to combineVectorSizedSetCCEquality.
Browse files Browse the repository at this point in the history
Reviewers: spatel, RKSimon

Reviewed By: spatel

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D52424

llvm-svn: 342989
  • Loading branch information
topperc committed Sep 25, 2018
1 parent 69ed471 commit 6fb1358
Showing 2 changed files with 163 additions and 290 deletions.
21 changes: 14 additions & 7 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
@@ -38653,12 +38653,15 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
return SDValue();

// TODO: Use PXOR + PTEST for SSE4.1 or later?
// TODO: Add support for AVX-512.
EVT VT = SetCC->getValueType(0);
SDLoc DL(SetCC);
if ((OpSize == 128 && Subtarget.hasSSE2()) ||
(OpSize == 256 && Subtarget.hasAVX2())) {
EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
(OpSize == 256 && Subtarget.hasAVX2()) ||
(OpSize == 512 && Subtarget.useAVX512Regs())) {
EVT VecVT = OpSize == 512 ? MVT::v16i32 :
OpSize == 256 ? MVT::v32i8 :
MVT::v16i8;
EVT CmpVT = OpSize == 512 ? MVT::v16i1 : VecVT;
SDValue Cmp;
if (IsOrXorXorCCZero) {
// This is a bitwise-combined equality comparison of 2 pairs of vectors:
@@ -38669,14 +38672,18 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1));
SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0));
SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1));
SDValue Cmp1 = DAG.getSetCC(DL, VecVT, A, B, ISD::SETEQ);
SDValue Cmp2 = DAG.getSetCC(DL, VecVT, C, D, ISD::SETEQ);
Cmp = DAG.getNode(ISD::AND, DL, VecVT, Cmp1, Cmp2);
SDValue Cmp1 = DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
SDValue Cmp2 = DAG.getSetCC(DL, CmpVT, C, D, ISD::SETEQ);
Cmp = DAG.getNode(ISD::AND, DL, CmpVT, Cmp1, Cmp2);
} else {
SDValue VecX = DAG.getBitcast(VecVT, X);
SDValue VecY = DAG.getBitcast(VecVT, Y);
Cmp = DAG.getSetCC(DL, VecVT, VecX, VecY, ISD::SETEQ);
Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
}
// For 512-bit operands we want to emit a setcc that will lower to kortest.
if (OpSize == 512)
return DAG.getSetCC(DL, VT, DAG.getBitcast(MVT::i16, Cmp),
DAG.getConstant(0xFFFF, DL, MVT::i16), CC);
// If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
// setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
// setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
432 changes: 149 additions & 283 deletions llvm/test/CodeGen/X86/setcc-wide-types.ll
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefix=ANY --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX256 --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX256 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX256 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefix=ANY --check-prefix=NO512 --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=ANY --check-prefix=NO512 --check-prefix=AVXANY --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=ANY --check-prefix=NO512 --check-prefix=AVXANY --check-prefix=AVX256 --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX256 --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX256 --check-prefix=AVX512 --check-prefix=AVX512BW

; Equality checks of 128/256-bit values can use PMOVMSK or PTEST to avoid scalarization; with AVX-512, 512-bit values can use VPCMPEQD + KORTEST.

@@ -319,93 +319,14 @@ define i32 @ne_i512(<8 x i64> %x, <8 x i64> %y) {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: ne_i512:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT: vmovq %xmm2, %rdx
; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm3
; AVX512F-NEXT: vmovq %xmm3, %rsi
; AVX512F-NEXT: vmovq %xmm0, %rdi
; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm4
; AVX512F-NEXT: vmovq %xmm4, %rax
; AVX512F-NEXT: vpextrq $1, %xmm2, %r11
; AVX512F-NEXT: vpextrq $1, %xmm3, %r10
; AVX512F-NEXT: vpextrq $1, %xmm0, %r9
; AVX512F-NEXT: vpextrq $1, %xmm4, %r8
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512F-NEXT: vmovq %xmm0, %rcx
; AVX512F-NEXT: xorq %rdx, %rcx
; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm2
; AVX512F-NEXT: vmovq %xmm2, %rdx
; AVX512F-NEXT: xorq %rsi, %rdx
; AVX512F-NEXT: orq %rcx, %rdx
; AVX512F-NEXT: vmovq %xmm1, %rcx
; AVX512F-NEXT: xorq %rdi, %rcx
; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm3
; AVX512F-NEXT: vmovq %xmm3, %rsi
; AVX512F-NEXT: xorq %rax, %rsi
; AVX512F-NEXT: orq %rdx, %rsi
; AVX512F-NEXT: orq %rcx, %rsi
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: xorq %r11, %rax
; AVX512F-NEXT: vpextrq $1, %xmm2, %rcx
; AVX512F-NEXT: xorq %r10, %rcx
; AVX512F-NEXT: orq %rax, %rcx
; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
; AVX512F-NEXT: xorq %r9, %rax
; AVX512F-NEXT: vpextrq $1, %xmm3, %rdx
; AVX512F-NEXT: xorq %r8, %rdx
; AVX512F-NEXT: orq %rcx, %rdx
; AVX512F-NEXT: orq %rax, %rdx
; AVX512F-NEXT: xorl %eax, %eax
; AVX512F-NEXT: orq %rsi, %rdx
; AVX512F-NEXT: setne %al
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: ne_i512:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT: vmovq %xmm2, %rdx
; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm3
; AVX512BW-NEXT: vmovq %xmm3, %rsi
; AVX512BW-NEXT: vmovq %xmm0, %rdi
; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm4
; AVX512BW-NEXT: vmovq %xmm4, %rax
; AVX512BW-NEXT: vpextrq $1, %xmm2, %r11
; AVX512BW-NEXT: vpextrq $1, %xmm3, %r10
; AVX512BW-NEXT: vpextrq $1, %xmm0, %r9
; AVX512BW-NEXT: vpextrq $1, %xmm4, %r8
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, %rcx
; AVX512BW-NEXT: xorq %rdx, %rcx
; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm2
; AVX512BW-NEXT: vmovq %xmm2, %rdx
; AVX512BW-NEXT: xorq %rsi, %rdx
; AVX512BW-NEXT: orq %rcx, %rdx
; AVX512BW-NEXT: vmovq %xmm1, %rcx
; AVX512BW-NEXT: xorq %rdi, %rcx
; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm3
; AVX512BW-NEXT: vmovq %xmm3, %rsi
; AVX512BW-NEXT: xorq %rax, %rsi
; AVX512BW-NEXT: orq %rdx, %rsi
; AVX512BW-NEXT: orq %rcx, %rsi
; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
; AVX512BW-NEXT: xorq %r11, %rax
; AVX512BW-NEXT: vpextrq $1, %xmm2, %rcx
; AVX512BW-NEXT: xorq %r10, %rcx
; AVX512BW-NEXT: orq %rax, %rcx
; AVX512BW-NEXT: vpextrq $1, %xmm1, %rax
; AVX512BW-NEXT: xorq %r9, %rax
; AVX512BW-NEXT: vpextrq $1, %xmm3, %rdx
; AVX512BW-NEXT: xorq %r8, %rdx
; AVX512BW-NEXT: orq %rcx, %rdx
; AVX512BW-NEXT: orq %rax, %rdx
; AVX512BW-NEXT: xorl %eax, %eax
; AVX512BW-NEXT: orq %rsi, %rdx
; AVX512BW-NEXT: setne %al
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; AVX512-LABEL: ne_i512:
; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; AVX512-NEXT: xorl %eax, %eax
; AVX512-NEXT: kortestw %k0, %k0
; AVX512-NEXT: setae %al
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%bcx = bitcast <8 x i64> %x to i512
%bcy = bitcast <8 x i64> %y to i512
%cmp = icmp ne i512 %bcx, %bcy
@@ -543,93 +464,14 @@ define i32 @eq_i512(<8 x i64> %x, <8 x i64> %y) {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: eq_i512:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT: vmovq %xmm2, %rdx
; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm3
; AVX512F-NEXT: vmovq %xmm3, %rsi
; AVX512F-NEXT: vmovq %xmm0, %rdi
; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm4
; AVX512F-NEXT: vmovq %xmm4, %rax
; AVX512F-NEXT: vpextrq $1, %xmm2, %r11
; AVX512F-NEXT: vpextrq $1, %xmm3, %r10
; AVX512F-NEXT: vpextrq $1, %xmm0, %r9
; AVX512F-NEXT: vpextrq $1, %xmm4, %r8
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512F-NEXT: vmovq %xmm0, %rcx
; AVX512F-NEXT: xorq %rdx, %rcx
; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm2
; AVX512F-NEXT: vmovq %xmm2, %rdx
; AVX512F-NEXT: xorq %rsi, %rdx
; AVX512F-NEXT: orq %rcx, %rdx
; AVX512F-NEXT: vmovq %xmm1, %rcx
; AVX512F-NEXT: xorq %rdi, %rcx
; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm3
; AVX512F-NEXT: vmovq %xmm3, %rsi
; AVX512F-NEXT: xorq %rax, %rsi
; AVX512F-NEXT: orq %rdx, %rsi
; AVX512F-NEXT: orq %rcx, %rsi
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: xorq %r11, %rax
; AVX512F-NEXT: vpextrq $1, %xmm2, %rcx
; AVX512F-NEXT: xorq %r10, %rcx
; AVX512F-NEXT: orq %rax, %rcx
; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
; AVX512F-NEXT: xorq %r9, %rax
; AVX512F-NEXT: vpextrq $1, %xmm3, %rdx
; AVX512F-NEXT: xorq %r8, %rdx
; AVX512F-NEXT: orq %rcx, %rdx
; AVX512F-NEXT: orq %rax, %rdx
; AVX512F-NEXT: xorl %eax, %eax
; AVX512F-NEXT: orq %rsi, %rdx
; AVX512F-NEXT: sete %al
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: eq_i512:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT: vmovq %xmm2, %rdx
; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm3
; AVX512BW-NEXT: vmovq %xmm3, %rsi
; AVX512BW-NEXT: vmovq %xmm0, %rdi
; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm4
; AVX512BW-NEXT: vmovq %xmm4, %rax
; AVX512BW-NEXT: vpextrq $1, %xmm2, %r11
; AVX512BW-NEXT: vpextrq $1, %xmm3, %r10
; AVX512BW-NEXT: vpextrq $1, %xmm0, %r9
; AVX512BW-NEXT: vpextrq $1, %xmm4, %r8
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, %rcx
; AVX512BW-NEXT: xorq %rdx, %rcx
; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm2
; AVX512BW-NEXT: vmovq %xmm2, %rdx
; AVX512BW-NEXT: xorq %rsi, %rdx
; AVX512BW-NEXT: orq %rcx, %rdx
; AVX512BW-NEXT: vmovq %xmm1, %rcx
; AVX512BW-NEXT: xorq %rdi, %rcx
; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm3
; AVX512BW-NEXT: vmovq %xmm3, %rsi
; AVX512BW-NEXT: xorq %rax, %rsi
; AVX512BW-NEXT: orq %rdx, %rsi
; AVX512BW-NEXT: orq %rcx, %rsi
; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
; AVX512BW-NEXT: xorq %r11, %rax
; AVX512BW-NEXT: vpextrq $1, %xmm2, %rcx
; AVX512BW-NEXT: xorq %r10, %rcx
; AVX512BW-NEXT: orq %rax, %rcx
; AVX512BW-NEXT: vpextrq $1, %xmm1, %rax
; AVX512BW-NEXT: xorq %r9, %rax
; AVX512BW-NEXT: vpextrq $1, %xmm3, %rdx
; AVX512BW-NEXT: xorq %r8, %rdx
; AVX512BW-NEXT: orq %rcx, %rdx
; AVX512BW-NEXT: orq %rax, %rdx
; AVX512BW-NEXT: xorl %eax, %eax
; AVX512BW-NEXT: orq %rsi, %rdx
; AVX512BW-NEXT: sete %al
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; AVX512-LABEL: eq_i512:
; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; AVX512-NEXT: xorl %eax, %eax
; AVX512-NEXT: kortestw %k0, %k0
; AVX512-NEXT: setb %al
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%bcx = bitcast <8 x i64> %x to i512
%bcy = bitcast <8 x i64> %y to i512
%cmp = icmp eq i512 %bcx, %bcy
@@ -909,58 +751,70 @@ define i32 @eq_i256_pair(i256* %a, i256* %b) {
; if we allowed 2 pairs of 64-byte loads per block.

define i32 @ne_i512_pair(i512* %a, i512* %b) {
; ANY-LABEL: ne_i512_pair:
; ANY: # %bb.0:
; ANY-NEXT: movq 32(%rdi), %r8
; ANY-NEXT: movq 48(%rdi), %r9
; ANY-NEXT: movq 40(%rdi), %rdx
; ANY-NEXT: movq 56(%rdi), %rcx
; ANY-NEXT: xorq 56(%rsi), %rcx
; ANY-NEXT: movq 120(%rdi), %rax
; ANY-NEXT: xorq 120(%rsi), %rax
; ANY-NEXT: orq %rcx, %rax
; ANY-NEXT: movq 88(%rdi), %rcx
; ANY-NEXT: xorq 88(%rsi), %rcx
; ANY-NEXT: orq %rcx, %rax
; ANY-NEXT: movq 24(%rdi), %rcx
; ANY-NEXT: xorq 24(%rsi), %rcx
; ANY-NEXT: xorq 40(%rsi), %rdx
; ANY-NEXT: orq %rcx, %rax
; ANY-NEXT: movq 104(%rdi), %rcx
; ANY-NEXT: xorq 104(%rsi), %rcx
; ANY-NEXT: orq %rdx, %rcx
; ANY-NEXT: movq 72(%rdi), %rdx
; ANY-NEXT: xorq 72(%rsi), %rdx
; ANY-NEXT: orq %rdx, %rcx
; ANY-NEXT: movq 16(%rdi), %r10
; ANY-NEXT: orq %rax, %rcx
; ANY-NEXT: movq 8(%rdi), %rax
; ANY-NEXT: xorq 8(%rsi), %rax
; ANY-NEXT: xorq 48(%rsi), %r9
; ANY-NEXT: orq %rax, %rcx
; ANY-NEXT: movq 112(%rdi), %rax
; ANY-NEXT: xorq 112(%rsi), %rax
; ANY-NEXT: orq %r9, %rax
; ANY-NEXT: movq 80(%rdi), %rdx
; ANY-NEXT: xorq 80(%rsi), %rdx
; ANY-NEXT: orq %rdx, %rax
; ANY-NEXT: movq (%rdi), %r9
; ANY-NEXT: xorq 16(%rsi), %r10
; ANY-NEXT: xorq (%rsi), %r9
; ANY-NEXT: xorq 32(%rsi), %r8
; ANY-NEXT: orq %r10, %rax
; ANY-NEXT: movq 96(%rdi), %rdx
; ANY-NEXT: movq 64(%rdi), %rdi
; ANY-NEXT: xorq 64(%rsi), %rdi
; ANY-NEXT: xorq 96(%rsi), %rdx
; ANY-NEXT: orq %r8, %rdx
; ANY-NEXT: orq %rdi, %rdx
; ANY-NEXT: orq %rax, %rdx
; ANY-NEXT: orq %r9, %rdx
; ANY-NEXT: xorl %eax, %eax
; ANY-NEXT: orq %rcx, %rdx
; ANY-NEXT: setne %al
; ANY-NEXT: retq
; NO512-LABEL: ne_i512_pair:
; NO512: # %bb.0:
; NO512-NEXT: movq 32(%rdi), %r8
; NO512-NEXT: movq 48(%rdi), %r9
; NO512-NEXT: movq 40(%rdi), %rdx
; NO512-NEXT: movq 56(%rdi), %rcx
; NO512-NEXT: xorq 56(%rsi), %rcx
; NO512-NEXT: movq 120(%rdi), %rax
; NO512-NEXT: xorq 120(%rsi), %rax
; NO512-NEXT: orq %rcx, %rax
; NO512-NEXT: movq 88(%rdi), %rcx
; NO512-NEXT: xorq 88(%rsi), %rcx
; NO512-NEXT: orq %rcx, %rax
; NO512-NEXT: movq 24(%rdi), %rcx
; NO512-NEXT: xorq 24(%rsi), %rcx
; NO512-NEXT: xorq 40(%rsi), %rdx
; NO512-NEXT: orq %rcx, %rax
; NO512-NEXT: movq 104(%rdi), %rcx
; NO512-NEXT: xorq 104(%rsi), %rcx
; NO512-NEXT: orq %rdx, %rcx
; NO512-NEXT: movq 72(%rdi), %rdx
; NO512-NEXT: xorq 72(%rsi), %rdx
; NO512-NEXT: orq %rdx, %rcx
; NO512-NEXT: movq 16(%rdi), %r10
; NO512-NEXT: orq %rax, %rcx
; NO512-NEXT: movq 8(%rdi), %rax
; NO512-NEXT: xorq 8(%rsi), %rax
; NO512-NEXT: xorq 48(%rsi), %r9
; NO512-NEXT: orq %rax, %rcx
; NO512-NEXT: movq 112(%rdi), %rax
; NO512-NEXT: xorq 112(%rsi), %rax
; NO512-NEXT: orq %r9, %rax
; NO512-NEXT: movq 80(%rdi), %rdx
; NO512-NEXT: xorq 80(%rsi), %rdx
; NO512-NEXT: orq %rdx, %rax
; NO512-NEXT: movq (%rdi), %r9
; NO512-NEXT: xorq 16(%rsi), %r10
; NO512-NEXT: xorq (%rsi), %r9
; NO512-NEXT: xorq 32(%rsi), %r8
; NO512-NEXT: orq %r10, %rax
; NO512-NEXT: movq 96(%rdi), %rdx
; NO512-NEXT: movq 64(%rdi), %rdi
; NO512-NEXT: xorq 64(%rsi), %rdi
; NO512-NEXT: xorq 96(%rsi), %rdx
; NO512-NEXT: orq %r8, %rdx
; NO512-NEXT: orq %rdi, %rdx
; NO512-NEXT: orq %rax, %rdx
; NO512-NEXT: orq %r9, %rdx
; NO512-NEXT: xorl %eax, %eax
; NO512-NEXT: orq %rcx, %rdx
; NO512-NEXT: setne %al
; NO512-NEXT: retq
;
; AVX512-LABEL: ne_i512_pair:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm1
; AVX512-NEXT: vpcmpeqd (%rsi), %zmm0, %k1
; AVX512-NEXT: vpcmpeqd 64(%rsi), %zmm1, %k0 {%k1}
; AVX512-NEXT: xorl %eax, %eax
; AVX512-NEXT: kortestw %k0, %k0
; AVX512-NEXT: setae %al
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%a0 = load i512, i512* %a
%b0 = load i512, i512* %b
%xor1 = xor i512 %a0, %b0
@@ -979,58 +833,70 @@ define i32 @ne_i512_pair(i512* %a, i512* %b) {
; if we allowed 2 pairs of 64-byte loads per block.

define i32 @eq_i512_pair(i512* %a, i512* %b) {
; ANY-LABEL: eq_i512_pair:
; ANY: # %bb.0:
; ANY-NEXT: movq 32(%rdi), %r8
; ANY-NEXT: movq 48(%rdi), %r9
; ANY-NEXT: movq 40(%rdi), %rdx
; ANY-NEXT: movq 56(%rdi), %rcx
; ANY-NEXT: xorq 56(%rsi), %rcx
; ANY-NEXT: movq 120(%rdi), %rax
; ANY-NEXT: xorq 120(%rsi), %rax
; ANY-NEXT: orq %rcx, %rax
; ANY-NEXT: movq 88(%rdi), %rcx
; ANY-NEXT: xorq 88(%rsi), %rcx
; ANY-NEXT: orq %rcx, %rax
; ANY-NEXT: movq 24(%rdi), %rcx
; ANY-NEXT: xorq 24(%rsi), %rcx
; ANY-NEXT: xorq 40(%rsi), %rdx
; ANY-NEXT: orq %rcx, %rax
; ANY-NEXT: movq 104(%rdi), %rcx
; ANY-NEXT: xorq 104(%rsi), %rcx
; ANY-NEXT: orq %rdx, %rcx
; ANY-NEXT: movq 72(%rdi), %rdx
; ANY-NEXT: xorq 72(%rsi), %rdx
; ANY-NEXT: orq %rdx, %rcx
; ANY-NEXT: movq 16(%rdi), %r10
; ANY-NEXT: orq %rax, %rcx
; ANY-NEXT: movq 8(%rdi), %rax
; ANY-NEXT: xorq 8(%rsi), %rax
; ANY-NEXT: xorq 48(%rsi), %r9
; ANY-NEXT: orq %rax, %rcx
; ANY-NEXT: movq 112(%rdi), %rax
; ANY-NEXT: xorq 112(%rsi), %rax
; ANY-NEXT: orq %r9, %rax
; ANY-NEXT: movq 80(%rdi), %rdx
; ANY-NEXT: xorq 80(%rsi), %rdx
; ANY-NEXT: orq %rdx, %rax
; ANY-NEXT: movq (%rdi), %r9
; ANY-NEXT: xorq 16(%rsi), %r10
; ANY-NEXT: xorq (%rsi), %r9
; ANY-NEXT: xorq 32(%rsi), %r8
; ANY-NEXT: orq %r10, %rax
; ANY-NEXT: movq 96(%rdi), %rdx
; ANY-NEXT: movq 64(%rdi), %rdi
; ANY-NEXT: xorq 64(%rsi), %rdi
; ANY-NEXT: xorq 96(%rsi), %rdx
; ANY-NEXT: orq %r8, %rdx
; ANY-NEXT: orq %rdi, %rdx
; ANY-NEXT: orq %rax, %rdx
; ANY-NEXT: orq %r9, %rdx
; ANY-NEXT: xorl %eax, %eax
; ANY-NEXT: orq %rcx, %rdx
; ANY-NEXT: sete %al
; ANY-NEXT: retq
; NO512-LABEL: eq_i512_pair:
; NO512: # %bb.0:
; NO512-NEXT: movq 32(%rdi), %r8
; NO512-NEXT: movq 48(%rdi), %r9
; NO512-NEXT: movq 40(%rdi), %rdx
; NO512-NEXT: movq 56(%rdi), %rcx
; NO512-NEXT: xorq 56(%rsi), %rcx
; NO512-NEXT: movq 120(%rdi), %rax
; NO512-NEXT: xorq 120(%rsi), %rax
; NO512-NEXT: orq %rcx, %rax
; NO512-NEXT: movq 88(%rdi), %rcx
; NO512-NEXT: xorq 88(%rsi), %rcx
; NO512-NEXT: orq %rcx, %rax
; NO512-NEXT: movq 24(%rdi), %rcx
; NO512-NEXT: xorq 24(%rsi), %rcx
; NO512-NEXT: xorq 40(%rsi), %rdx
; NO512-NEXT: orq %rcx, %rax
; NO512-NEXT: movq 104(%rdi), %rcx
; NO512-NEXT: xorq 104(%rsi), %rcx
; NO512-NEXT: orq %rdx, %rcx
; NO512-NEXT: movq 72(%rdi), %rdx
; NO512-NEXT: xorq 72(%rsi), %rdx
; NO512-NEXT: orq %rdx, %rcx
; NO512-NEXT: movq 16(%rdi), %r10
; NO512-NEXT: orq %rax, %rcx
; NO512-NEXT: movq 8(%rdi), %rax
; NO512-NEXT: xorq 8(%rsi), %rax
; NO512-NEXT: xorq 48(%rsi), %r9
; NO512-NEXT: orq %rax, %rcx
; NO512-NEXT: movq 112(%rdi), %rax
; NO512-NEXT: xorq 112(%rsi), %rax
; NO512-NEXT: orq %r9, %rax
; NO512-NEXT: movq 80(%rdi), %rdx
; NO512-NEXT: xorq 80(%rsi), %rdx
; NO512-NEXT: orq %rdx, %rax
; NO512-NEXT: movq (%rdi), %r9
; NO512-NEXT: xorq 16(%rsi), %r10
; NO512-NEXT: xorq (%rsi), %r9
; NO512-NEXT: xorq 32(%rsi), %r8
; NO512-NEXT: orq %r10, %rax
; NO512-NEXT: movq 96(%rdi), %rdx
; NO512-NEXT: movq 64(%rdi), %rdi
; NO512-NEXT: xorq 64(%rsi), %rdi
; NO512-NEXT: xorq 96(%rsi), %rdx
; NO512-NEXT: orq %r8, %rdx
; NO512-NEXT: orq %rdi, %rdx
; NO512-NEXT: orq %rax, %rdx
; NO512-NEXT: orq %r9, %rdx
; NO512-NEXT: xorl %eax, %eax
; NO512-NEXT: orq %rcx, %rdx
; NO512-NEXT: sete %al
; NO512-NEXT: retq
;
; AVX512-LABEL: eq_i512_pair:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm1
; AVX512-NEXT: vpcmpeqd (%rsi), %zmm0, %k1
; AVX512-NEXT: vpcmpeqd 64(%rsi), %zmm1, %k0 {%k1}
; AVX512-NEXT: xorl %eax, %eax
; AVX512-NEXT: kortestw %k0, %k0
; AVX512-NEXT: setb %al
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%a0 = load i512, i512* %a
%b0 = load i512, i512* %b
%xor1 = xor i512 %a0, %b0

0 comments on commit 6fb1358

Please sign in to comment.