Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -38653,12 +38653,15 @@ return SDValue(); // TODO: Use PXOR + PTEST for SSE4.1 or later? - // TODO: Add support for AVX-512. EVT VT = SetCC->getValueType(0); SDLoc DL(SetCC); if ((OpSize == 128 && Subtarget.hasSSE2()) || - (OpSize == 256 && Subtarget.hasAVX2())) { - EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8; + (OpSize == 256 && Subtarget.hasAVX2()) || + (OpSize == 512 && Subtarget.useAVX512Regs())) { + EVT VecVT = OpSize == 512 ? MVT::v16i32 : + OpSize == 256 ? MVT::v32i8 : + MVT::v16i8; + EVT CmpVT = OpSize == 512 ? MVT::v16i1 : VecVT; SDValue Cmp; if (IsOrXorXorCCZero) { // This is a bitwise-combined equality comparison of 2 pairs of vectors: @@ -38669,14 +38672,18 @@ SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1)); SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0)); SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1)); - SDValue Cmp1 = DAG.getSetCC(DL, VecVT, A, B, ISD::SETEQ); - SDValue Cmp2 = DAG.getSetCC(DL, VecVT, C, D, ISD::SETEQ); - Cmp = DAG.getNode(ISD::AND, DL, VecVT, Cmp1, Cmp2); + SDValue Cmp1 = DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ); + SDValue Cmp2 = DAG.getSetCC(DL, CmpVT, C, D, ISD::SETEQ); + Cmp = DAG.getNode(ISD::AND, DL, CmpVT, Cmp1, Cmp2); } else { SDValue VecX = DAG.getBitcast(VecVT, X); SDValue VecY = DAG.getBitcast(VecVT, Y); - Cmp = DAG.getSetCC(DL, VecVT, VecX, VecY, ISD::SETEQ); + Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ); } + // For 512-bits we want to emit a setcc that will lower to kortest. + if (OpSize == 512) + return DAG.getSetCC(DL, VT, DAG.getBitcast(MVT::i16, Cmp), + DAG.getConstant(0xFFFF, DL, MVT::i16), CC); // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality. // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne Index: llvm/trunk/test/CodeGen/X86/setcc-wide-types.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/setcc-wide-types.ll +++ llvm/trunk/test/CodeGen/X86/setcc-wide-types.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefix=ANY --check-prefix=SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX256 --check-prefix=AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX256 --check-prefix=AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX256 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefix=ANY --check-prefix=NO512 --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=ANY --check-prefix=NO512 --check-prefix=AVXANY --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=ANY --check-prefix=NO512 --check-prefix=AVXANY --check-prefix=AVX256 --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX256 --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX256 --check-prefix=AVX512 --check-prefix=AVX512BW ; Equality checks of 128/256-bit values can use PMOVMSK or PTEST to avoid scalarization. @@ -319,93 +319,14 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: ne_i512: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-NEXT: vmovq %xmm2, %rdx -; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm3 -; AVX512F-NEXT: vmovq %xmm3, %rsi -; AVX512F-NEXT: vmovq %xmm0, %rdi -; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm4 -; AVX512F-NEXT: vmovq %xmm4, %rax -; AVX512F-NEXT: vpextrq $1, %xmm2, %r11 -; AVX512F-NEXT: vpextrq $1, %xmm3, %r10 -; AVX512F-NEXT: vpextrq $1, %xmm0, %r9 -; AVX512F-NEXT: vpextrq $1, %xmm4, %r8 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, %rcx -; AVX512F-NEXT: xorq %rdx, %rcx -; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm2 -; AVX512F-NEXT: vmovq %xmm2, %rdx -; AVX512F-NEXT: xorq %rsi, %rdx -; AVX512F-NEXT: orq %rcx, %rdx -; AVX512F-NEXT: vmovq %xmm1, %rcx -; AVX512F-NEXT: xorq %rdi, %rcx -; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm3 -; AVX512F-NEXT: vmovq %xmm3, %rsi -; AVX512F-NEXT: xorq %rax, %rsi -; AVX512F-NEXT: orq %rdx, %rsi -; AVX512F-NEXT: orq %rcx, %rsi -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: xorq %r11, %rax -; AVX512F-NEXT: vpextrq $1, %xmm2, %rcx -; AVX512F-NEXT: xorq %r10, %rcx -; AVX512F-NEXT: orq %rax, %rcx -; AVX512F-NEXT: vpextrq $1, %xmm1, %rax -; AVX512F-NEXT: xorq %r9, %rax -; AVX512F-NEXT: vpextrq $1, %xmm3, %rdx -; AVX512F-NEXT: xorq %r8, %rdx -; AVX512F-NEXT: orq %rcx, %rdx -; AVX512F-NEXT: orq %rax, %rdx -; AVX512F-NEXT: xorl %eax, %eax -; AVX512F-NEXT: orq %rsi, %rdx -; AVX512F-NEXT: setne %al -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: ne_i512: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512BW-NEXT: vmovq %xmm2, %rdx -; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm3 -; AVX512BW-NEXT: vmovq %xmm3, %rsi -; AVX512BW-NEXT: vmovq %xmm0, %rdi -; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm4 -; AVX512BW-NEXT: vmovq %xmm4, %rax -; AVX512BW-NEXT: vpextrq $1, %xmm2, %r11 -; AVX512BW-NEXT: vpextrq $1, %xmm3, %r10 -; AVX512BW-NEXT: vpextrq $1, %xmm0, %r9 -; AVX512BW-NEXT: vpextrq $1, %xmm4, %r8 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, %rcx -; AVX512BW-NEXT: xorq %rdx, %rcx -; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm2 -; AVX512BW-NEXT: vmovq %xmm2, %rdx -; AVX512BW-NEXT: xorq %rsi, %rdx -; AVX512BW-NEXT: orq %rcx, %rdx -; AVX512BW-NEXT: vmovq %xmm1, %rcx -; AVX512BW-NEXT: xorq %rdi, %rcx -; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm3 -; AVX512BW-NEXT: vmovq %xmm3, %rsi -; AVX512BW-NEXT: xorq %rax, %rsi -; AVX512BW-NEXT: orq %rdx, %rsi -; AVX512BW-NEXT: orq %rcx, %rsi -; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax -; AVX512BW-NEXT: xorq %r11, %rax -; AVX512BW-NEXT: vpextrq $1, %xmm2, %rcx -; AVX512BW-NEXT: xorq %r10, %rcx -; AVX512BW-NEXT: orq %rax, %rcx -; AVX512BW-NEXT: vpextrq $1, %xmm1, %rax -; AVX512BW-NEXT: xorq %r9, %rax -; AVX512BW-NEXT: vpextrq $1, %xmm3, %rdx -; AVX512BW-NEXT: xorq %r8, %rdx -; AVX512BW-NEXT: orq %rcx, %rdx -; AVX512BW-NEXT: orq %rax, %rdx -; AVX512BW-NEXT: xorl %eax, %eax -; AVX512BW-NEXT: orq %rsi, %rdx -; AVX512BW-NEXT: setne %al -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512-LABEL: ne_i512: +; AVX512: # %bb.0: +; AVX512-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: setae %al +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %bcx = bitcast <8 x i64> %x to i512 %bcy = bitcast <8 x i64> %y to i512 %cmp = icmp ne i512 %bcx, %bcy @@ -543,93 +464,14 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: eq_i512: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-NEXT: vmovq %xmm2, %rdx -; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm3 -; AVX512F-NEXT: vmovq %xmm3, %rsi -; AVX512F-NEXT: vmovq %xmm0, %rdi -; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm4 -; AVX512F-NEXT: vmovq %xmm4, %rax -; AVX512F-NEXT: vpextrq $1, %xmm2, %r11 -; AVX512F-NEXT: vpextrq $1, %xmm3, %r10 -; AVX512F-NEXT: vpextrq $1, %xmm0, %r9 -; AVX512F-NEXT: vpextrq $1, %xmm4, %r8 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, %rcx -; AVX512F-NEXT: xorq %rdx, %rcx -; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm2 -; AVX512F-NEXT: vmovq %xmm2, %rdx -; AVX512F-NEXT: xorq %rsi, %rdx -; AVX512F-NEXT: orq %rcx, %rdx -; AVX512F-NEXT: vmovq %xmm1, %rcx -; AVX512F-NEXT: xorq %rdi, %rcx -; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm3 -; AVX512F-NEXT: vmovq %xmm3, %rsi -; AVX512F-NEXT: xorq %rax, %rsi -; AVX512F-NEXT: orq %rdx, %rsi -; AVX512F-NEXT: orq %rcx, %rsi -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: xorq %r11, %rax -; AVX512F-NEXT: vpextrq $1, %xmm2, %rcx -; AVX512F-NEXT: xorq %r10, %rcx -; AVX512F-NEXT: orq %rax, %rcx -; AVX512F-NEXT: vpextrq $1, %xmm1, %rax -; AVX512F-NEXT: xorq %r9, %rax -; AVX512F-NEXT: vpextrq $1, %xmm3, %rdx -; AVX512F-NEXT: xorq %r8, %rdx -; AVX512F-NEXT: orq %rcx, %rdx -; AVX512F-NEXT: orq %rax, %rdx -; AVX512F-NEXT: xorl %eax, %eax -; AVX512F-NEXT: orq %rsi, %rdx -; AVX512F-NEXT: sete %al -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: eq_i512: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512BW-NEXT: vmovq %xmm2, %rdx -; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm3 -; AVX512BW-NEXT: vmovq %xmm3, %rsi -; AVX512BW-NEXT: vmovq %xmm0, %rdi -; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm4 -; AVX512BW-NEXT: vmovq %xmm4, %rax -; AVX512BW-NEXT: vpextrq $1, %xmm2, %r11 -; AVX512BW-NEXT: vpextrq $1, %xmm3, %r10 -; AVX512BW-NEXT: vpextrq $1, %xmm0, %r9 -; AVX512BW-NEXT: vpextrq $1, %xmm4, %r8 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, %rcx -; AVX512BW-NEXT: xorq %rdx, %rcx -; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm2 -; AVX512BW-NEXT: vmovq %xmm2, %rdx -; AVX512BW-NEXT: xorq %rsi, %rdx -; AVX512BW-NEXT: orq %rcx, %rdx -; AVX512BW-NEXT: vmovq %xmm1, %rcx -; AVX512BW-NEXT: xorq %rdi, %rcx -; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm3 -; AVX512BW-NEXT: vmovq %xmm3, %rsi -; AVX512BW-NEXT: xorq %rax, %rsi -; AVX512BW-NEXT: orq %rdx, %rsi -; AVX512BW-NEXT: orq %rcx, %rsi -; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax -; AVX512BW-NEXT: xorq %r11, %rax -; AVX512BW-NEXT: vpextrq $1, %xmm2, %rcx -; AVX512BW-NEXT: xorq %r10, %rcx -; AVX512BW-NEXT: orq %rax, %rcx -; AVX512BW-NEXT: vpextrq $1, %xmm1, %rax -; AVX512BW-NEXT: xorq %r9, %rax -; AVX512BW-NEXT: vpextrq $1, %xmm3, %rdx -; AVX512BW-NEXT: xorq %r8, %rdx -; AVX512BW-NEXT: orq %rcx, %rdx -; AVX512BW-NEXT: orq %rax, %rdx -; AVX512BW-NEXT: xorl %eax, %eax -; AVX512BW-NEXT: orq %rsi, %rdx -; AVX512BW-NEXT: sete %al -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512-LABEL: eq_i512: +; AVX512: # %bb.0: +; AVX512-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: setb %al +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %bcx = bitcast <8 x i64> %x to i512 %bcy = bitcast <8 x i64> %y to i512 %cmp = icmp eq i512 %bcx, %bcy @@ -909,58 +751,70 @@ ; if we allowed 2 pairs of 64-byte loads per block. define i32 @ne_i512_pair(i512* %a, i512* %b) { -; ANY-LABEL: ne_i512_pair: -; ANY: # %bb.0: -; ANY-NEXT: movq 32(%rdi), %r8 -; ANY-NEXT: movq 48(%rdi), %r9 -; ANY-NEXT: movq 40(%rdi), %rdx -; ANY-NEXT: movq 56(%rdi), %rcx -; ANY-NEXT: xorq 56(%rsi), %rcx -; ANY-NEXT: movq 120(%rdi), %rax -; ANY-NEXT: xorq 120(%rsi), %rax -; ANY-NEXT: orq %rcx, %rax -; ANY-NEXT: movq 88(%rdi), %rcx -; ANY-NEXT: xorq 88(%rsi), %rcx -; ANY-NEXT: orq %rcx, %rax -; ANY-NEXT: movq 24(%rdi), %rcx -; ANY-NEXT: xorq 24(%rsi), %rcx -; ANY-NEXT: xorq 40(%rsi), %rdx -; ANY-NEXT: orq %rcx, %rax -; ANY-NEXT: movq 104(%rdi), %rcx -; ANY-NEXT: xorq 104(%rsi), %rcx -; ANY-NEXT: orq %rdx, %rcx -; ANY-NEXT: movq 72(%rdi), %rdx -; ANY-NEXT: xorq 72(%rsi), %rdx -; ANY-NEXT: orq %rdx, %rcx -; ANY-NEXT: movq 16(%rdi), %r10 -; ANY-NEXT: orq %rax, %rcx -; ANY-NEXT: movq 8(%rdi), %rax -; ANY-NEXT: xorq 8(%rsi), %rax -; ANY-NEXT: xorq 48(%rsi), %r9 -; ANY-NEXT: orq %rax, %rcx -; ANY-NEXT: movq 112(%rdi), %rax -; ANY-NEXT: xorq 112(%rsi), %rax -; ANY-NEXT: orq %r9, %rax -; ANY-NEXT: movq 80(%rdi), %rdx -; ANY-NEXT: xorq 80(%rsi), %rdx -; ANY-NEXT: orq %rdx, %rax -; ANY-NEXT: movq (%rdi), %r9 -; ANY-NEXT: xorq 16(%rsi), %r10 -; ANY-NEXT: xorq (%rsi), %r9 -; ANY-NEXT: xorq 32(%rsi), %r8 -; ANY-NEXT: orq %r10, %rax -; ANY-NEXT: movq 96(%rdi), %rdx -; ANY-NEXT: movq 64(%rdi), %rdi -; ANY-NEXT: xorq 64(%rsi), %rdi -; ANY-NEXT: xorq 96(%rsi), %rdx -; ANY-NEXT: orq %r8, %rdx -; ANY-NEXT: orq %rdi, %rdx -; ANY-NEXT: orq %rax, %rdx -; ANY-NEXT: orq %r9, %rdx -; ANY-NEXT: xorl %eax, %eax -; ANY-NEXT: orq %rcx, %rdx -; ANY-NEXT: setne %al -; ANY-NEXT: retq +; NO512-LABEL: ne_i512_pair: +; NO512: # %bb.0: +; NO512-NEXT: movq 32(%rdi), %r8 +; NO512-NEXT: movq 48(%rdi), %r9 +; NO512-NEXT: movq 40(%rdi), %rdx +; NO512-NEXT: movq 56(%rdi), %rcx +; NO512-NEXT: xorq 56(%rsi), %rcx +; NO512-NEXT: movq 120(%rdi), %rax +; NO512-NEXT: xorq 120(%rsi), %rax +; NO512-NEXT: orq %rcx, %rax +; NO512-NEXT: movq 88(%rdi), %rcx +; NO512-NEXT: xorq 88(%rsi), %rcx +; NO512-NEXT: orq %rcx, %rax +; NO512-NEXT: movq 24(%rdi), %rcx +; NO512-NEXT: xorq 24(%rsi), %rcx +; NO512-NEXT: xorq 40(%rsi), %rdx +; NO512-NEXT: orq %rcx, %rax +; NO512-NEXT: movq 104(%rdi), %rcx +; NO512-NEXT: xorq 104(%rsi), %rcx +; NO512-NEXT: orq %rdx, %rcx +; NO512-NEXT: movq 72(%rdi), %rdx +; NO512-NEXT: xorq 72(%rsi), %rdx +; NO512-NEXT: orq %rdx, %rcx +; NO512-NEXT: movq 16(%rdi), %r10 +; NO512-NEXT: orq %rax, %rcx +; NO512-NEXT: movq 8(%rdi), %rax +; NO512-NEXT: xorq 8(%rsi), %rax +; NO512-NEXT: xorq 48(%rsi), %r9 +; NO512-NEXT: orq %rax, %rcx +; NO512-NEXT: movq 112(%rdi), %rax +; NO512-NEXT: xorq 112(%rsi), %rax +; NO512-NEXT: orq %r9, %rax +; NO512-NEXT: movq 80(%rdi), %rdx +; NO512-NEXT: xorq 80(%rsi), %rdx +; NO512-NEXT: orq %rdx, %rax +; NO512-NEXT: movq (%rdi), %r9 +; NO512-NEXT: xorq 16(%rsi), %r10 +; NO512-NEXT: xorq (%rsi), %r9 +; NO512-NEXT: xorq 32(%rsi), %r8 +; NO512-NEXT: orq %r10, %rax +; NO512-NEXT: movq 96(%rdi), %rdx +; NO512-NEXT: movq 64(%rdi), %rdi +; NO512-NEXT: xorq 64(%rsi), %rdi +; NO512-NEXT: xorq 96(%rsi), %rdx +; NO512-NEXT: orq %r8, %rdx +; NO512-NEXT: orq %rdi, %rdx +; NO512-NEXT: orq %rax, %rdx +; NO512-NEXT: orq %r9, %rdx +; NO512-NEXT: xorl %eax, %eax +; NO512-NEXT: orq %rcx, %rdx +; NO512-NEXT: setne %al +; NO512-NEXT: retq +; +; AVX512-LABEL: ne_i512_pair: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm1 +; AVX512-NEXT: vpcmpeqd (%rsi), %zmm0, %k1 +; AVX512-NEXT: vpcmpeqd 64(%rsi), %zmm1, %k0 {%k1} +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: setae %al +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %a0 = load i512, i512* %a %b0 = load i512, i512* %b %xor1 = xor i512 %a0, %b0 @@ -979,58 +833,70 @@ ; if we allowed 2 pairs of 64-byte loads per block. define i32 @eq_i512_pair(i512* %a, i512* %b) { -; ANY-LABEL: eq_i512_pair: -; ANY: # %bb.0: -; ANY-NEXT: movq 32(%rdi), %r8 -; ANY-NEXT: movq 48(%rdi), %r9 -; ANY-NEXT: movq 40(%rdi), %rdx -; ANY-NEXT: movq 56(%rdi), %rcx -; ANY-NEXT: xorq 56(%rsi), %rcx -; ANY-NEXT: movq 120(%rdi), %rax -; ANY-NEXT: xorq 120(%rsi), %rax -; ANY-NEXT: orq %rcx, %rax -; ANY-NEXT: movq 88(%rdi), %rcx -; ANY-NEXT: xorq 88(%rsi), %rcx -; ANY-NEXT: orq %rcx, %rax -; ANY-NEXT: movq 24(%rdi), %rcx -; ANY-NEXT: xorq 24(%rsi), %rcx -; ANY-NEXT: xorq 40(%rsi), %rdx -; ANY-NEXT: orq %rcx, %rax -; ANY-NEXT: movq 104(%rdi), %rcx -; ANY-NEXT: xorq 104(%rsi), %rcx -; ANY-NEXT: orq %rdx, %rcx -; ANY-NEXT: movq 72(%rdi), %rdx -; ANY-NEXT: xorq 72(%rsi), %rdx -; ANY-NEXT: orq %rdx, %rcx -; ANY-NEXT: movq 16(%rdi), %r10 -; ANY-NEXT: orq %rax, %rcx -; ANY-NEXT: movq 8(%rdi), %rax -; ANY-NEXT: xorq 8(%rsi), %rax -; ANY-NEXT: xorq 48(%rsi), %r9 -; ANY-NEXT: orq %rax, %rcx -; ANY-NEXT: movq 112(%rdi), %rax -; ANY-NEXT: xorq 112(%rsi), %rax -; ANY-NEXT: orq %r9, %rax -; ANY-NEXT: movq 80(%rdi), %rdx -; ANY-NEXT: xorq 80(%rsi), %rdx -; ANY-NEXT: orq %rdx, %rax -; ANY-NEXT: movq (%rdi), %r9 -; ANY-NEXT: xorq 16(%rsi), %r10 -; ANY-NEXT: xorq (%rsi), %r9 -; ANY-NEXT: xorq 32(%rsi), %r8 -; ANY-NEXT: orq %r10, %rax -; ANY-NEXT: movq 96(%rdi), %rdx -; ANY-NEXT: movq 64(%rdi), %rdi -; ANY-NEXT: xorq 64(%rsi), %rdi -; ANY-NEXT: xorq 96(%rsi), %rdx -; ANY-NEXT: orq %r8, %rdx -; ANY-NEXT: orq %rdi, %rdx -; ANY-NEXT: orq %rax, %rdx -; ANY-NEXT: orq %r9, %rdx -; ANY-NEXT: xorl %eax, %eax -; ANY-NEXT: orq %rcx, %rdx -; ANY-NEXT: sete %al -; ANY-NEXT: retq +; NO512-LABEL: eq_i512_pair: +; NO512: # %bb.0: +; NO512-NEXT: movq 32(%rdi), %r8 +; NO512-NEXT: movq 48(%rdi), %r9 +; NO512-NEXT: movq 40(%rdi), %rdx +; NO512-NEXT: movq 56(%rdi), %rcx +; NO512-NEXT: xorq 56(%rsi), %rcx +; NO512-NEXT: movq 120(%rdi), %rax +; NO512-NEXT: xorq 120(%rsi), %rax +; NO512-NEXT: orq %rcx, %rax +; NO512-NEXT: movq 88(%rdi), %rcx +; NO512-NEXT: xorq 88(%rsi), %rcx +; NO512-NEXT: orq %rcx, %rax +; NO512-NEXT: movq 24(%rdi), %rcx +; NO512-NEXT: xorq 24(%rsi), %rcx +; NO512-NEXT: xorq 40(%rsi), %rdx +; NO512-NEXT: orq %rcx, %rax +; NO512-NEXT: movq 104(%rdi), %rcx +; NO512-NEXT: xorq 104(%rsi), %rcx +; NO512-NEXT: orq %rdx, %rcx +; NO512-NEXT: movq 72(%rdi), %rdx +; NO512-NEXT: xorq 72(%rsi), %rdx +; NO512-NEXT: orq %rdx, %rcx +; NO512-NEXT: movq 16(%rdi), %r10 +; NO512-NEXT: orq %rax, %rcx +; NO512-NEXT: movq 8(%rdi), %rax +; NO512-NEXT: xorq 8(%rsi), %rax +; NO512-NEXT: xorq 48(%rsi), %r9 +; NO512-NEXT: orq %rax, %rcx +; NO512-NEXT: movq 112(%rdi), %rax +; NO512-NEXT: xorq 112(%rsi), %rax +; NO512-NEXT: orq %r9, %rax +; NO512-NEXT: movq 80(%rdi), %rdx +; NO512-NEXT: xorq 80(%rsi), %rdx +; NO512-NEXT: orq %rdx, %rax +; NO512-NEXT: movq (%rdi), %r9 +; NO512-NEXT: xorq 16(%rsi), %r10 +; NO512-NEXT: xorq (%rsi), %r9 +; NO512-NEXT: xorq 32(%rsi), %r8 +; NO512-NEXT: orq %r10, %rax +; NO512-NEXT: movq 96(%rdi), %rdx +; NO512-NEXT: movq 64(%rdi), %rdi +; NO512-NEXT: xorq 64(%rsi), %rdi +; NO512-NEXT: xorq 96(%rsi), %rdx +; NO512-NEXT: orq %r8, %rdx +; NO512-NEXT: orq %rdi, %rdx +; NO512-NEXT: orq %rax, %rdx +; NO512-NEXT: orq %r9, %rdx +; NO512-NEXT: xorl %eax, %eax +; NO512-NEXT: orq %rcx, %rdx +; NO512-NEXT: sete %al +; NO512-NEXT: retq +; +; AVX512-LABEL: eq_i512_pair: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm1 +; AVX512-NEXT: vpcmpeqd (%rsi), %zmm0, %k1 +; AVX512-NEXT: vpcmpeqd 64(%rsi), %zmm1, %k0 {%k1} +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: setb %al +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %a0 = load i512, i512* %a %b0 = load i512, i512* %b %xor1 = xor i512 %a0, %b0