Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -38654,12 +38654,15 @@ return SDValue(); // TODO: Use PXOR + PTEST for SSE4.1 or later? - // TODO: Add support for AVX-512. EVT VT = SetCC->getValueType(0); SDLoc DL(SetCC); if ((OpSize == 128 && Subtarget.hasSSE2()) || - (OpSize == 256 && Subtarget.hasAVX2())) { - EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8; + (OpSize == 256 && Subtarget.hasAVX2()) || + (OpSize == 512 && Subtarget.useAVX512Regs())) { + EVT VecVT = OpSize == 512 ? MVT::v16i32 : + OpSize == 256 ? MVT::v32i8 : + MVT::v16i8; + EVT CmpVT = OpSize == 512 ? MVT::v16i1 : VecVT; SDValue Cmp; if (IsOrXorXorCCZero) { // This is a bitwise-combined equality comparison of 2 pairs of vectors: @@ -38670,14 +38673,18 @@ SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1)); SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0)); SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1)); - SDValue Cmp1 = DAG.getSetCC(DL, VecVT, A, B, ISD::SETEQ); - SDValue Cmp2 = DAG.getSetCC(DL, VecVT, C, D, ISD::SETEQ); - Cmp = DAG.getNode(ISD::AND, DL, VecVT, Cmp1, Cmp2); + SDValue Cmp1 = DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ); + SDValue Cmp2 = DAG.getSetCC(DL, CmpVT, C, D, ISD::SETEQ); + Cmp = DAG.getNode(ISD::AND, DL, CmpVT, Cmp1, Cmp2); } else { SDValue VecX = DAG.getBitcast(VecVT, X); SDValue VecY = DAG.getBitcast(VecVT, Y); - Cmp = DAG.getSetCC(DL, VecVT, VecX, VecY, ISD::SETEQ); + Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ); } + // For 512-bits we want to emit a setcc that will lower to kortest. + if (OpSize == 512) + return DAG.getSetCC(DL, VT, DAG.getBitcast(MVT::i16, Cmp), + DAG.getConstant(0xFFFF, DL, MVT::i16), CC); // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality. // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne Index: test/CodeGen/X86/setcc-wide-types.ll =================================================================== --- test/CodeGen/X86/setcc-wide-types.ll +++ test/CodeGen/X86/setcc-wide-types.ll @@ -321,89 +321,19 @@ ; ; AVX512F-LABEL: ne_i512: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-NEXT: vmovq %xmm2, %rdx -; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm3 -; AVX512F-NEXT: vmovq %xmm3, %rsi -; AVX512F-NEXT: vmovq %xmm0, %rdi -; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm4 -; AVX512F-NEXT: vmovq %xmm4, %rax -; AVX512F-NEXT: vpextrq $1, %xmm2, %r11 -; AVX512F-NEXT: vpextrq $1, %xmm3, %r10 -; AVX512F-NEXT: vpextrq $1, %xmm0, %r9 -; AVX512F-NEXT: vpextrq $1, %xmm4, %r8 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, %rcx -; AVX512F-NEXT: xorq %rdx, %rcx -; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm2 -; AVX512F-NEXT: vmovq %xmm2, %rdx -; AVX512F-NEXT: xorq %rsi, %rdx -; AVX512F-NEXT: orq %rcx, %rdx -; AVX512F-NEXT: vmovq %xmm1, %rcx -; AVX512F-NEXT: xorq %rdi, %rcx -; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm3 -; AVX512F-NEXT: vmovq %xmm3, %rsi -; AVX512F-NEXT: xorq %rax, %rsi -; AVX512F-NEXT: orq %rdx, %rsi -; AVX512F-NEXT: orq %rcx, %rsi -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: xorq %r11, %rax -; AVX512F-NEXT: vpextrq $1, %xmm2, %rcx -; AVX512F-NEXT: xorq %r10, %rcx -; AVX512F-NEXT: orq %rax, %rcx -; AVX512F-NEXT: vpextrq $1, %xmm1, %rax -; AVX512F-NEXT: xorq %r9, %rax -; AVX512F-NEXT: vpextrq $1, %xmm3, %rdx -; AVX512F-NEXT: xorq %r8, %rdx -; AVX512F-NEXT: orq %rcx, %rdx -; AVX512F-NEXT: orq %rax, %rdx +; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; AVX512F-NEXT: xorl %eax, %eax -; AVX512F-NEXT: orq %rsi, %rdx -; AVX512F-NEXT: setne %al +; AVX512F-NEXT: kortestw %k0, %k0 +; AVX512F-NEXT: setae %al ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ne_i512: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512BW-NEXT: vmovq %xmm2, %rdx -; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm3 -; AVX512BW-NEXT: vmovq %xmm3, %rsi -; AVX512BW-NEXT: vmovq %xmm0, %rdi -; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm4 -; AVX512BW-NEXT: vmovq %xmm4, %rax -; AVX512BW-NEXT: vpextrq $1, %xmm2, %r11 -; AVX512BW-NEXT: vpextrq $1, %xmm3, %r10 -; AVX512BW-NEXT: vpextrq $1, %xmm0, %r9 -; AVX512BW-NEXT: vpextrq $1, %xmm4, %r8 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, %rcx -; AVX512BW-NEXT: xorq %rdx, %rcx -; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm2 -; AVX512BW-NEXT: vmovq %xmm2, %rdx -; AVX512BW-NEXT: xorq %rsi, %rdx -; AVX512BW-NEXT: orq %rcx, %rdx -; AVX512BW-NEXT: vmovq %xmm1, %rcx -; AVX512BW-NEXT: xorq %rdi, %rcx -; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm3 -; AVX512BW-NEXT: vmovq %xmm3, %rsi -; AVX512BW-NEXT: xorq %rax, %rsi -; AVX512BW-NEXT: orq %rdx, %rsi -; AVX512BW-NEXT: orq %rcx, %rsi -; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax -; AVX512BW-NEXT: xorq %r11, %rax -; AVX512BW-NEXT: vpextrq $1, %xmm2, %rcx -; AVX512BW-NEXT: xorq %r10, %rcx -; AVX512BW-NEXT: orq %rax, %rcx -; AVX512BW-NEXT: vpextrq $1, %xmm1, %rax -; AVX512BW-NEXT: xorq %r9, %rax -; AVX512BW-NEXT: vpextrq $1, %xmm3, %rdx -; AVX512BW-NEXT: xorq %r8, %rdx -; AVX512BW-NEXT: orq %rcx, %rdx -; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: xorl %eax, %eax -; AVX512BW-NEXT: orq %rsi, %rdx -; AVX512BW-NEXT: setne %al +; AVX512BW-NEXT: kortestw %k0, %k0 +; AVX512BW-NEXT: setae %al ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %bcx = bitcast <8 x i64> %x to i512 @@ -545,89 +475,19 @@ ; ; AVX512F-LABEL: eq_i512: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-NEXT: vmovq %xmm2, %rdx -; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm3 -; AVX512F-NEXT: vmovq %xmm3, %rsi -; AVX512F-NEXT: vmovq %xmm0, %rdi -; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm4 -; AVX512F-NEXT: vmovq %xmm4, %rax -; AVX512F-NEXT: vpextrq $1, %xmm2, %r11 -; AVX512F-NEXT: vpextrq $1, %xmm3, %r10 -; AVX512F-NEXT: vpextrq $1, %xmm0, %r9 -; AVX512F-NEXT: vpextrq $1, %xmm4, %r8 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, %rcx -; AVX512F-NEXT: xorq %rdx, %rcx -; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm2 -; AVX512F-NEXT: vmovq %xmm2, %rdx -; AVX512F-NEXT: xorq %rsi, %rdx -; AVX512F-NEXT: orq %rcx, %rdx -; AVX512F-NEXT: vmovq %xmm1, %rcx -; AVX512F-NEXT: xorq %rdi, %rcx -; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm3 -; AVX512F-NEXT: vmovq %xmm3, %rsi -; AVX512F-NEXT: xorq %rax, %rsi -; AVX512F-NEXT: orq %rdx, %rsi -; AVX512F-NEXT: orq %rcx, %rsi -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: xorq %r11, %rax -; AVX512F-NEXT: vpextrq $1, %xmm2, %rcx -; AVX512F-NEXT: xorq %r10, %rcx -; AVX512F-NEXT: orq %rax, %rcx -; AVX512F-NEXT: vpextrq $1, %xmm1, %rax -; AVX512F-NEXT: xorq %r9, %rax -; AVX512F-NEXT: vpextrq $1, %xmm3, %rdx -; AVX512F-NEXT: xorq %r8, %rdx -; AVX512F-NEXT: orq %rcx, %rdx -; AVX512F-NEXT: orq %rax, %rdx +; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; AVX512F-NEXT: xorl %eax, %eax -; AVX512F-NEXT: orq %rsi, %rdx -; AVX512F-NEXT: sete %al +; AVX512F-NEXT: kortestw %k0, %k0 +; AVX512F-NEXT: setb %al ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: eq_i512: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512BW-NEXT: vmovq %xmm2, %rdx -; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm3 -; AVX512BW-NEXT: vmovq %xmm3, %rsi -; AVX512BW-NEXT: vmovq %xmm0, %rdi -; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm4 -; AVX512BW-NEXT: vmovq %xmm4, %rax -; AVX512BW-NEXT: vpextrq $1, %xmm2, %r11 -; AVX512BW-NEXT: vpextrq $1, %xmm3, %r10 -; AVX512BW-NEXT: vpextrq $1, %xmm0, %r9 -; AVX512BW-NEXT: vpextrq $1, %xmm4, %r8 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, %rcx -; AVX512BW-NEXT: xorq %rdx, %rcx -; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm2 -; AVX512BW-NEXT: vmovq %xmm2, %rdx -; AVX512BW-NEXT: xorq %rsi, %rdx -; AVX512BW-NEXT: orq %rcx, %rdx -; AVX512BW-NEXT: vmovq %xmm1, %rcx -; AVX512BW-NEXT: xorq %rdi, %rcx -; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm3 -; AVX512BW-NEXT: vmovq %xmm3, %rsi -; AVX512BW-NEXT: xorq %rax, %rsi -; AVX512BW-NEXT: orq %rdx, %rsi -; AVX512BW-NEXT: orq %rcx, %rsi -; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax -; AVX512BW-NEXT: xorq %r11, %rax -; AVX512BW-NEXT: vpextrq $1, %xmm2, %rcx -; AVX512BW-NEXT: xorq %r10, %rcx -; AVX512BW-NEXT: orq %rax, %rcx -; AVX512BW-NEXT: vpextrq $1, %xmm1, %rax -; AVX512BW-NEXT: xorq %r9, %rax -; AVX512BW-NEXT: vpextrq $1, %xmm3, %rdx -; AVX512BW-NEXT: xorq %r8, %rdx -; AVX512BW-NEXT: orq %rcx, %rdx -; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: xorl %eax, %eax -; AVX512BW-NEXT: orq %rsi, %rdx -; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: kortestw %k0, %k0 +; AVX512BW-NEXT: setb %al ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %bcx = bitcast <8 x i64> %x to i512 @@ -909,58 +769,188 @@ ; if we allowed 2 pairs of 64-byte loads per block. define i32 @ne_i512_pair(i512* %a, i512* %b) { -; ANY-LABEL: ne_i512_pair: -; ANY: # %bb.0: -; ANY-NEXT: movq 32(%rdi), %r8 -; ANY-NEXT: movq 48(%rdi), %r9 -; ANY-NEXT: movq 40(%rdi), %rdx -; ANY-NEXT: movq 56(%rdi), %rcx -; ANY-NEXT: xorq 56(%rsi), %rcx -; ANY-NEXT: movq 120(%rdi), %rax -; ANY-NEXT: xorq 120(%rsi), %rax -; ANY-NEXT: orq %rcx, %rax -; ANY-NEXT: movq 88(%rdi), %rcx -; ANY-NEXT: xorq 88(%rsi), %rcx -; ANY-NEXT: orq %rcx, %rax -; ANY-NEXT: movq 24(%rdi), %rcx -; ANY-NEXT: xorq 24(%rsi), %rcx -; ANY-NEXT: xorq 40(%rsi), %rdx -; ANY-NEXT: orq %rcx, %rax -; ANY-NEXT: movq 104(%rdi), %rcx -; ANY-NEXT: xorq 104(%rsi), %rcx -; ANY-NEXT: orq %rdx, %rcx -; ANY-NEXT: movq 72(%rdi), %rdx -; ANY-NEXT: xorq 72(%rsi), %rdx -; ANY-NEXT: orq %rdx, %rcx -; ANY-NEXT: movq 16(%rdi), %r10 -; ANY-NEXT: orq %rax, %rcx -; ANY-NEXT: movq 8(%rdi), %rax -; ANY-NEXT: xorq 8(%rsi), %rax -; ANY-NEXT: xorq 48(%rsi), %r9 -; ANY-NEXT: orq %rax, %rcx -; ANY-NEXT: movq 112(%rdi), %rax -; ANY-NEXT: xorq 112(%rsi), %rax -; ANY-NEXT: orq %r9, %rax -; ANY-NEXT: movq 80(%rdi), %rdx -; ANY-NEXT: xorq 80(%rsi), %rdx -; ANY-NEXT: orq %rdx, %rax -; ANY-NEXT: movq (%rdi), %r9 -; ANY-NEXT: xorq 16(%rsi), %r10 -; ANY-NEXT: xorq (%rsi), %r9 -; ANY-NEXT: xorq 32(%rsi), %r8 -; ANY-NEXT: orq %r10, %rax -; ANY-NEXT: movq 96(%rdi), %rdx -; ANY-NEXT: movq 64(%rdi), %rdi -; ANY-NEXT: xorq 64(%rsi), %rdi -; ANY-NEXT: xorq 96(%rsi), %rdx -; ANY-NEXT: orq %r8, %rdx -; ANY-NEXT: orq %rdi, %rdx -; ANY-NEXT: orq %rax, %rdx -; ANY-NEXT: orq %r9, %rdx -; ANY-NEXT: xorl %eax, %eax -; ANY-NEXT: orq %rcx, %rdx -; ANY-NEXT: setne %al -; ANY-NEXT: retq +; SSE2-LABEL: ne_i512_pair: +; SSE2: # %bb.0: +; SSE2-NEXT: movq 32(%rdi), %r8 +; SSE2-NEXT: movq 48(%rdi), %r9 +; SSE2-NEXT: movq 40(%rdi), %rdx +; SSE2-NEXT: movq 56(%rdi), %rcx +; SSE2-NEXT: xorq 56(%rsi), %rcx +; SSE2-NEXT: movq 120(%rdi), %rax +; SSE2-NEXT: xorq 120(%rsi), %rax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: movq 88(%rdi), %rcx +; SSE2-NEXT: xorq 88(%rsi), %rcx +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: movq 24(%rdi), %rcx +; SSE2-NEXT: xorq 24(%rsi), %rcx +; SSE2-NEXT: xorq 40(%rsi), %rdx +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: movq 104(%rdi), %rcx +; SSE2-NEXT: xorq 104(%rsi), %rcx +; SSE2-NEXT: orq %rdx, %rcx +; SSE2-NEXT: movq 72(%rdi), %rdx +; SSE2-NEXT: xorq 72(%rsi), %rdx +; SSE2-NEXT: orq %rdx, %rcx +; SSE2-NEXT: movq 16(%rdi), %r10 +; SSE2-NEXT: orq %rax, %rcx +; SSE2-NEXT: movq 8(%rdi), %rax +; SSE2-NEXT: xorq 8(%rsi), %rax +; SSE2-NEXT: xorq 48(%rsi), %r9 +; SSE2-NEXT: orq %rax, %rcx +; SSE2-NEXT: movq 112(%rdi), %rax +; SSE2-NEXT: xorq 112(%rsi), %rax +; SSE2-NEXT: orq %r9, %rax +; SSE2-NEXT: movq 80(%rdi), %rdx +; SSE2-NEXT: xorq 80(%rsi), %rdx +; SSE2-NEXT: orq %rdx, %rax +; SSE2-NEXT: movq (%rdi), %r9 +; SSE2-NEXT: xorq 16(%rsi), %r10 +; SSE2-NEXT: xorq (%rsi), %r9 +; SSE2-NEXT: xorq 32(%rsi), %r8 +; SSE2-NEXT: orq %r10, %rax +; SSE2-NEXT: movq 96(%rdi), %rdx +; SSE2-NEXT: movq 64(%rdi), %rdi +; SSE2-NEXT: xorq 64(%rsi), %rdi +; SSE2-NEXT: xorq 96(%rsi), %rdx +; SSE2-NEXT: orq %r8, %rdx +; SSE2-NEXT: orq %rdi, %rdx +; SSE2-NEXT: orq %rax, %rdx +; SSE2-NEXT: orq %r9, %rdx +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: orq %rcx, %rdx +; SSE2-NEXT: setne %al +; SSE2-NEXT: retq +; +; AVX1-LABEL: ne_i512_pair: +; AVX1: # %bb.0: +; AVX1-NEXT: movq 32(%rdi), %r8 +; AVX1-NEXT: movq 48(%rdi), %r9 +; AVX1-NEXT: movq 40(%rdi), %rdx +; AVX1-NEXT: movq 56(%rdi), %rcx +; AVX1-NEXT: xorq 56(%rsi), %rcx +; AVX1-NEXT: movq 120(%rdi), %rax +; AVX1-NEXT: xorq 120(%rsi), %rax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: movq 88(%rdi), %rcx +; AVX1-NEXT: xorq 88(%rsi), %rcx +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: movq 24(%rdi), %rcx +; AVX1-NEXT: xorq 24(%rsi), %rcx +; AVX1-NEXT: xorq 40(%rsi), %rdx +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: movq 104(%rdi), %rcx +; AVX1-NEXT: xorq 104(%rsi), %rcx +; AVX1-NEXT: orq %rdx, %rcx +; AVX1-NEXT: movq 72(%rdi), %rdx +; AVX1-NEXT: xorq 72(%rsi), %rdx +; AVX1-NEXT: orq %rdx, %rcx +; AVX1-NEXT: movq 16(%rdi), %r10 +; AVX1-NEXT: orq %rax, %rcx +; AVX1-NEXT: movq 8(%rdi), %rax +; AVX1-NEXT: xorq 8(%rsi), %rax +; AVX1-NEXT: xorq 48(%rsi), %r9 +; AVX1-NEXT: orq %rax, %rcx +; AVX1-NEXT: movq 112(%rdi), %rax +; AVX1-NEXT: xorq 112(%rsi), %rax +; AVX1-NEXT: orq %r9, %rax +; AVX1-NEXT: movq 80(%rdi), %rdx +; AVX1-NEXT: xorq 80(%rsi), %rdx +; AVX1-NEXT: orq %rdx, %rax +; AVX1-NEXT: movq (%rdi), %r9 +; AVX1-NEXT: xorq 16(%rsi), %r10 +; AVX1-NEXT: xorq (%rsi), %r9 +; AVX1-NEXT: xorq 32(%rsi), %r8 +; AVX1-NEXT: orq %r10, %rax +; AVX1-NEXT: movq 96(%rdi), %rdx +; AVX1-NEXT: movq 64(%rdi), %rdi +; AVX1-NEXT: xorq 64(%rsi), %rdi +; AVX1-NEXT: xorq 96(%rsi), %rdx +; AVX1-NEXT: orq %r8, %rdx +; AVX1-NEXT: orq %rdi, %rdx +; AVX1-NEXT: orq %rax, %rdx +; AVX1-NEXT: orq %r9, %rdx +; AVX1-NEXT: xorl %eax, %eax +; AVX1-NEXT: orq %rcx, %rdx +; AVX1-NEXT: setne %al +; AVX1-NEXT: retq +; +; AVX2-LABEL: ne_i512_pair: +; AVX2: # %bb.0: +; AVX2-NEXT: movq 32(%rdi), %r8 +; AVX2-NEXT: movq 48(%rdi), %r9 +; AVX2-NEXT: movq 40(%rdi), %rdx +; AVX2-NEXT: movq 56(%rdi), %rcx +; AVX2-NEXT: xorq 56(%rsi), %rcx +; AVX2-NEXT: movq 120(%rdi), %rax +; AVX2-NEXT: xorq 120(%rsi), %rax +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: movq 88(%rdi), %rcx +; AVX2-NEXT: xorq 88(%rsi), %rcx +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: movq 24(%rdi), %rcx +; AVX2-NEXT: xorq 24(%rsi), %rcx +; AVX2-NEXT: xorq 40(%rsi), %rdx +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: movq 104(%rdi), %rcx +; AVX2-NEXT: xorq 104(%rsi), %rcx +; AVX2-NEXT: orq %rdx, %rcx +; AVX2-NEXT: movq 72(%rdi), %rdx +; AVX2-NEXT: xorq 72(%rsi), %rdx +; AVX2-NEXT: orq %rdx, %rcx +; AVX2-NEXT: movq 16(%rdi), %r10 +; AVX2-NEXT: orq %rax, %rcx +; AVX2-NEXT: movq 8(%rdi), %rax +; AVX2-NEXT: xorq 8(%rsi), %rax +; AVX2-NEXT: xorq 48(%rsi), %r9 +; AVX2-NEXT: orq %rax, %rcx +; AVX2-NEXT: movq 112(%rdi), %rax +; AVX2-NEXT: xorq 112(%rsi), %rax +; AVX2-NEXT: orq %r9, %rax +; AVX2-NEXT: movq 80(%rdi), %rdx +; AVX2-NEXT: xorq 80(%rsi), %rdx +; AVX2-NEXT: orq %rdx, %rax +; AVX2-NEXT: movq (%rdi), %r9 +; AVX2-NEXT: xorq 16(%rsi), %r10 +; AVX2-NEXT: xorq (%rsi), %r9 +; AVX2-NEXT: xorq 32(%rsi), %r8 +; AVX2-NEXT: orq %r10, %rax +; AVX2-NEXT: movq 96(%rdi), %rdx +; AVX2-NEXT: movq 64(%rdi), %rdi +; AVX2-NEXT: xorq 64(%rsi), %rdi +; AVX2-NEXT: xorq 96(%rsi), %rdx +; AVX2-NEXT: orq %r8, %rdx +; AVX2-NEXT: orq %rdi, %rdx +; AVX2-NEXT: orq %rax, %rdx +; AVX2-NEXT: orq %r9, %rdx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: setne %al +; AVX2-NEXT: retq +; +; AVX512F-LABEL: ne_i512_pair: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1 +; AVX512F-NEXT: vpcmpeqd (%rsi), %zmm0, %k1 +; AVX512F-NEXT: vpcmpeqd 64(%rsi), %zmm1, %k0 {%k1} +; AVX512F-NEXT: xorl %eax, %eax +; AVX512F-NEXT: kortestw %k0, %k0 +; AVX512F-NEXT: setae %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: ne_i512_pair: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqu64 64(%rdi), %zmm1 +; AVX512BW-NEXT: vpcmpeqd (%rsi), %zmm0, %k1 +; AVX512BW-NEXT: vpcmpeqd 64(%rsi), %zmm1, %k0 {%k1} +; AVX512BW-NEXT: xorl %eax, %eax +; AVX512BW-NEXT: kortestw %k0, %k0 +; AVX512BW-NEXT: setae %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %a0 = load i512, i512* %a %b0 = load i512, i512* %b %xor1 = xor i512 %a0, %b0 @@ -979,58 +969,188 @@ ; if we allowed 2 pairs of 64-byte loads per block. define i32 @eq_i512_pair(i512* %a, i512* %b) { -; ANY-LABEL: eq_i512_pair: -; ANY: # %bb.0: -; ANY-NEXT: movq 32(%rdi), %r8 -; ANY-NEXT: movq 48(%rdi), %r9 -; ANY-NEXT: movq 40(%rdi), %rdx -; ANY-NEXT: movq 56(%rdi), %rcx -; ANY-NEXT: xorq 56(%rsi), %rcx -; ANY-NEXT: movq 120(%rdi), %rax -; ANY-NEXT: xorq 120(%rsi), %rax -; ANY-NEXT: orq %rcx, %rax -; ANY-NEXT: movq 88(%rdi), %rcx -; ANY-NEXT: xorq 88(%rsi), %rcx -; ANY-NEXT: orq %rcx, %rax -; ANY-NEXT: movq 24(%rdi), %rcx -; ANY-NEXT: xorq 24(%rsi), %rcx -; ANY-NEXT: xorq 40(%rsi), %rdx -; ANY-NEXT: orq %rcx, %rax -; ANY-NEXT: movq 104(%rdi), %rcx -; ANY-NEXT: xorq 104(%rsi), %rcx -; ANY-NEXT: orq %rdx, %rcx -; ANY-NEXT: movq 72(%rdi), %rdx -; ANY-NEXT: xorq 72(%rsi), %rdx -; ANY-NEXT: orq %rdx, %rcx -; ANY-NEXT: movq 16(%rdi), %r10 -; ANY-NEXT: orq %rax, %rcx -; ANY-NEXT: movq 8(%rdi), %rax -; ANY-NEXT: xorq 8(%rsi), %rax -; ANY-NEXT: xorq 48(%rsi), %r9 -; ANY-NEXT: orq %rax, %rcx -; ANY-NEXT: movq 112(%rdi), %rax -; ANY-NEXT: xorq 112(%rsi), %rax -; ANY-NEXT: orq %r9, %rax -; ANY-NEXT: movq 80(%rdi), %rdx -; ANY-NEXT: xorq 80(%rsi), %rdx -; ANY-NEXT: orq %rdx, %rax -; ANY-NEXT: movq (%rdi), %r9 -; ANY-NEXT: xorq 16(%rsi), %r10 -; ANY-NEXT: xorq (%rsi), %r9 -; ANY-NEXT: xorq 32(%rsi), %r8 -; ANY-NEXT: orq %r10, %rax -; ANY-NEXT: movq 96(%rdi), %rdx -; ANY-NEXT: movq 64(%rdi), %rdi -; ANY-NEXT: xorq 64(%rsi), %rdi -; ANY-NEXT: xorq 96(%rsi), %rdx -; ANY-NEXT: orq %r8, %rdx -; ANY-NEXT: orq %rdi, %rdx -; ANY-NEXT: orq %rax, %rdx -; ANY-NEXT: orq %r9, %rdx -; ANY-NEXT: xorl %eax, %eax -; ANY-NEXT: orq %rcx, %rdx -; ANY-NEXT: sete %al -; ANY-NEXT: retq +; SSE2-LABEL: eq_i512_pair: +; SSE2: # %bb.0: +; SSE2-NEXT: movq 32(%rdi), %r8 +; SSE2-NEXT: movq 48(%rdi), %r9 +; SSE2-NEXT: movq 40(%rdi), %rdx +; SSE2-NEXT: movq 56(%rdi), %rcx +; SSE2-NEXT: xorq 56(%rsi), %rcx +; SSE2-NEXT: movq 120(%rdi), %rax +; SSE2-NEXT: xorq 120(%rsi), %rax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: movq 88(%rdi), %rcx +; SSE2-NEXT: xorq 88(%rsi), %rcx +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: movq 24(%rdi), %rcx +; SSE2-NEXT: xorq 24(%rsi), %rcx +; SSE2-NEXT: xorq 40(%rsi), %rdx +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: movq 104(%rdi), %rcx +; SSE2-NEXT: xorq 104(%rsi), %rcx +; SSE2-NEXT: orq %rdx, %rcx +; SSE2-NEXT: movq 72(%rdi), %rdx +; SSE2-NEXT: xorq 72(%rsi), %rdx +; SSE2-NEXT: orq %rdx, %rcx +; SSE2-NEXT: movq 16(%rdi), %r10 +; SSE2-NEXT: orq %rax, %rcx +; SSE2-NEXT: movq 8(%rdi), %rax +; SSE2-NEXT: xorq 8(%rsi), %rax +; SSE2-NEXT: xorq 48(%rsi), %r9 +; SSE2-NEXT: orq %rax, %rcx +; SSE2-NEXT: movq 112(%rdi), %rax +; SSE2-NEXT: xorq 112(%rsi), %rax +; SSE2-NEXT: orq %r9, %rax +; SSE2-NEXT: movq 80(%rdi), %rdx +; SSE2-NEXT: xorq 80(%rsi), %rdx +; SSE2-NEXT: orq %rdx, %rax +; SSE2-NEXT: movq (%rdi), %r9 +; SSE2-NEXT: xorq 16(%rsi), %r10 +; SSE2-NEXT: xorq (%rsi), %r9 +; SSE2-NEXT: xorq 32(%rsi), %r8 +; SSE2-NEXT: orq %r10, %rax +; SSE2-NEXT: movq 96(%rdi), %rdx +; SSE2-NEXT: movq 64(%rdi), %rdi +; SSE2-NEXT: xorq 64(%rsi), %rdi +; SSE2-NEXT: xorq 96(%rsi), %rdx +; SSE2-NEXT: orq %r8, %rdx +; SSE2-NEXT: orq %rdi, %rdx +; SSE2-NEXT: orq %rax, %rdx +; SSE2-NEXT: orq %r9, %rdx +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: orq %rcx, %rdx +; SSE2-NEXT: sete %al +; SSE2-NEXT: retq +; +; AVX1-LABEL: eq_i512_pair: +; AVX1: # %bb.0: +; AVX1-NEXT: movq 32(%rdi), %r8 +; AVX1-NEXT: movq 48(%rdi), %r9 +; AVX1-NEXT: movq 40(%rdi), %rdx +; AVX1-NEXT: movq 56(%rdi), %rcx +; AVX1-NEXT: xorq 56(%rsi), %rcx +; AVX1-NEXT: movq 120(%rdi), %rax +; AVX1-NEXT: xorq 120(%rsi), %rax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: movq 88(%rdi), %rcx +; AVX1-NEXT: xorq 88(%rsi), %rcx +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: movq 24(%rdi), %rcx +; AVX1-NEXT: xorq 24(%rsi), %rcx +; AVX1-NEXT: xorq 40(%rsi), %rdx +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: movq 104(%rdi), %rcx +; AVX1-NEXT: xorq 104(%rsi), %rcx +; AVX1-NEXT: orq %rdx, %rcx +; AVX1-NEXT: movq 72(%rdi), %rdx +; AVX1-NEXT: xorq 72(%rsi), %rdx +; AVX1-NEXT: orq %rdx, %rcx +; AVX1-NEXT: movq 16(%rdi), %r10 +; AVX1-NEXT: orq %rax, %rcx +; AVX1-NEXT: movq 8(%rdi), %rax +; AVX1-NEXT: xorq 8(%rsi), %rax +; AVX1-NEXT: xorq 48(%rsi), %r9 +; AVX1-NEXT: orq %rax, %rcx +; AVX1-NEXT: movq 112(%rdi), %rax +; AVX1-NEXT: xorq 112(%rsi), %rax +; AVX1-NEXT: orq %r9, %rax +; AVX1-NEXT: movq 80(%rdi), %rdx +; AVX1-NEXT: xorq 80(%rsi), %rdx +; AVX1-NEXT: orq %rdx, %rax +; AVX1-NEXT: movq (%rdi), %r9 +; AVX1-NEXT: xorq 16(%rsi), %r10 +; AVX1-NEXT: xorq (%rsi), %r9 +; AVX1-NEXT: xorq 32(%rsi), %r8 +; AVX1-NEXT: orq %r10, %rax +; AVX1-NEXT: movq 96(%rdi), %rdx +; AVX1-NEXT: movq 64(%rdi), %rdi +; AVX1-NEXT: xorq 64(%rsi), %rdi +; AVX1-NEXT: xorq 96(%rsi), %rdx +; AVX1-NEXT: orq %r8, %rdx +; AVX1-NEXT: orq %rdi, %rdx +; AVX1-NEXT: orq %rax, %rdx +; AVX1-NEXT: orq %r9, %rdx +; AVX1-NEXT: xorl %eax, %eax +; AVX1-NEXT: orq %rcx, %rdx +; AVX1-NEXT: sete %al +; AVX1-NEXT: retq +; +; AVX2-LABEL: eq_i512_pair: +; AVX2: # %bb.0: +; AVX2-NEXT: movq 32(%rdi), %r8 +; AVX2-NEXT: movq 48(%rdi), %r9 +; AVX2-NEXT: movq 40(%rdi), %rdx +; AVX2-NEXT: movq 56(%rdi), %rcx +; AVX2-NEXT: xorq 56(%rsi), %rcx +; AVX2-NEXT: movq 120(%rdi), %rax +; AVX2-NEXT: xorq 120(%rsi), %rax +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: movq 88(%rdi), %rcx +; AVX2-NEXT: xorq 88(%rsi), %rcx +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: movq 24(%rdi), %rcx +; AVX2-NEXT: xorq 24(%rsi), %rcx +; AVX2-NEXT: xorq 40(%rsi), %rdx +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: movq 104(%rdi), %rcx +; AVX2-NEXT: xorq 104(%rsi), %rcx +; AVX2-NEXT: orq %rdx, %rcx +; AVX2-NEXT: movq 72(%rdi), %rdx +; AVX2-NEXT: xorq 72(%rsi), %rdx +; AVX2-NEXT: orq %rdx, %rcx +; AVX2-NEXT: movq 16(%rdi), %r10 +; AVX2-NEXT: orq %rax, %rcx +; AVX2-NEXT: movq 8(%rdi), %rax +; AVX2-NEXT: xorq 8(%rsi), %rax +; AVX2-NEXT: xorq 48(%rsi), %r9 +; AVX2-NEXT: orq %rax, %rcx +; AVX2-NEXT: movq 112(%rdi), %rax +; AVX2-NEXT: xorq 112(%rsi), %rax +; AVX2-NEXT: orq %r9, %rax +; AVX2-NEXT: movq 80(%rdi), %rdx +; AVX2-NEXT: xorq 80(%rsi), %rdx +; AVX2-NEXT: orq %rdx, %rax +; AVX2-NEXT: movq (%rdi), %r9 +; AVX2-NEXT: xorq 16(%rsi), %r10 +; AVX2-NEXT: xorq (%rsi), %r9 +; AVX2-NEXT: xorq 32(%rsi), %r8 +; AVX2-NEXT: orq %r10, %rax +; AVX2-NEXT: movq 96(%rdi), %rdx +; AVX2-NEXT: movq 64(%rdi), %rdi +; AVX2-NEXT: xorq 64(%rsi), %rdi +; AVX2-NEXT: xorq 96(%rsi), %rdx +; AVX2-NEXT: orq %r8, %rdx +; AVX2-NEXT: orq %rdi, %rdx +; AVX2-NEXT: orq %rax, %rdx +; AVX2-NEXT: orq %r9, %rdx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: sete %al +; AVX2-NEXT: retq +; +; AVX512F-LABEL: eq_i512_pair: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1 +; AVX512F-NEXT: vpcmpeqd (%rsi), %zmm0, %k1 +; AVX512F-NEXT: vpcmpeqd 64(%rsi), %zmm1, %k0 {%k1} +; AVX512F-NEXT: xorl %eax, %eax +; AVX512F-NEXT: kortestw %k0, %k0 +; AVX512F-NEXT: setb %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: eq_i512_pair: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqu64 64(%rdi), %zmm1 +; AVX512BW-NEXT: vpcmpeqd (%rsi), %zmm0, %k1 +; AVX512BW-NEXT: vpcmpeqd 64(%rsi), %zmm1, %k0 {%k1} +; AVX512BW-NEXT: xorl %eax, %eax +; AVX512BW-NEXT: kortestw %k0, %k0 +; AVX512BW-NEXT: setb %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %a0 = load i512, i512* %a %b0 = load i512, i512* %b %xor1 = xor i512 %a0, %b0