Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -36143,13 +36143,21 @@ ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get(); assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate"); - // We're looking for an oversized integer equality comparison, but ignore a - // comparison with zero because that gets special treatment in EmitTest(). + // We're looking for an oversized integer equality comparison. SDValue X = SetCC->getOperand(0); SDValue Y = SetCC->getOperand(1); EVT OpVT = X.getValueType(); unsigned OpSize = OpVT.getSizeInBits(); - if (!OpVT.isScalarInteger() || OpSize < 128 || isNullConstant(Y)) + if (!OpVT.isScalarInteger() || OpSize < 128) + return SDValue(); + + // Ignore a comparison with zero because that gets special treatment in + // EmitTest(). The only exception is (or (xor A, B), (xor C, D)) compared + // to zero; the flag requires Y == 0 since the rewrite below drops Y. + bool IsOrXorXorCCZero = isNullConstant(Y) && X.getOpcode() == ISD::OR && + X.getOperand(0).getOpcode() == ISD::XOR && + X.getOperand(1).getOpcode() == ISD::XOR; + if (isNullConstant(Y) && !IsOrXorXorCCZero) return SDValue(); // Bail out if we know that this is not really just an oversized integer. @@ -36164,15 +36172,29 @@ if ((OpSize == 128 && Subtarget.hasSSE2()) || (OpSize == 256 && Subtarget.hasAVX2())) { EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8; - SDValue VecX = DAG.getBitcast(VecVT, X); - SDValue VecY = DAG.getBitcast(VecVT, Y); - + SDValue Cmp; + if (IsOrXorXorCCZero) { + // This is a bitwise-combined equality comparison of 2 pairs of vectors: + // setcc i128 (or (xor A, B), (xor C, D)), 0, eq + // Use 2 vector equality compares and 'and' the results before doing a + // MOVMSK. 
+ SDValue A = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(0)); + SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1)); + SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0)); + SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1)); + SDValue Cmp1 = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, A, B); + SDValue Cmp2 = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, C, D); + Cmp = DAG.getNode(ISD::AND, DL, VecVT, Cmp1, Cmp2); + } else { + SDValue VecX = DAG.getBitcast(VecVT, X); + SDValue VecY = DAG.getBitcast(VecVT, Y); + Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY); + } // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality. // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne - SDValue Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY); SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp); SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL, MVT::i32); Index: test/CodeGen/X86/setcc-wide-types.ll =================================================================== --- test/CodeGen/X86/setcc-wide-types.ll +++ test/CodeGen/X86/setcc-wide-types.ll @@ -138,41 +138,35 @@ ret i32 %zext } -; This test models the expansion of 'memcmp(a, b, 32) != 0' +; This test models the expansion of 'memcmp(a, b, 32) != 0' ; if we allowed 2 pairs of 16-byte loads per block. 
define i32 @ne_i128_pair(i128* %a, i128* %b) { ; SSE2-LABEL: ne_i128_pair: ; SSE2: # %bb.0: -; SSE2-NEXT: movq (%rdi), %rax -; SSE2-NEXT: movq 8(%rdi), %rcx -; SSE2-NEXT: xorq (%rsi), %rax -; SSE2-NEXT: xorq 8(%rsi), %rcx -; SSE2-NEXT: movq 24(%rdi), %rdx -; SSE2-NEXT: movq 16(%rdi), %rdi -; SSE2-NEXT: xorq 16(%rsi), %rdi -; SSE2-NEXT: orq %rax, %rdi -; SSE2-NEXT: xorq 24(%rsi), %rdx -; SSE2-NEXT: orq %rcx, %rdx +; SSE2-NEXT: movdqu (%rdi), %xmm0 +; SSE2-NEXT: movdqu 16(%rdi), %xmm1 +; SSE2-NEXT: movdqu (%rsi), %xmm2 +; SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; SSE2-NEXT: movdqu 16(%rsi), %xmm0 +; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %ecx ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rdi, %rdx +; SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; ; AVX2-LABEL: ne_i128_pair: ; AVX2: # %bb.0: -; AVX2-NEXT: movq (%rdi), %rax -; AVX2-NEXT: movq 8(%rdi), %rcx -; AVX2-NEXT: xorq (%rsi), %rax -; AVX2-NEXT: xorq 8(%rsi), %rcx -; AVX2-NEXT: movq 24(%rdi), %rdx -; AVX2-NEXT: movq 16(%rdi), %rdi -; AVX2-NEXT: xorq 16(%rsi), %rdi -; AVX2-NEXT: orq %rax, %rdi -; AVX2-NEXT: xorq 24(%rsi), %rdx -; AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; AVX2-NEXT: vmovdqu 16(%rdi), %xmm1 +; AVX2-NEXT: vpcmpeqb 16(%rsi), %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmovmskb %xmm0, %ecx ; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: orq %rdi, %rdx +; AVX2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; AVX2-NEXT: setne %al ; AVX2-NEXT: retq %a0 = load i128, i128* %a @@ -189,41 +183,35 @@ ret i32 %z } -; This test models the expansion of 'memcmp(a, b, 32) == 0' +; This test models the expansion of 'memcmp(a, b, 32) == 0' ; if we allowed 2 pairs of 16-byte loads per block. 
define i32 @eq_i128_pair(i128* %a, i128* %b) { ; SSE2-LABEL: eq_i128_pair: ; SSE2: # %bb.0: -; SSE2-NEXT: movq (%rdi), %rax -; SSE2-NEXT: movq 8(%rdi), %rcx -; SSE2-NEXT: xorq (%rsi), %rax -; SSE2-NEXT: xorq 8(%rsi), %rcx -; SSE2-NEXT: movq 24(%rdi), %rdx -; SSE2-NEXT: movq 16(%rdi), %rdi -; SSE2-NEXT: xorq 16(%rsi), %rdi -; SSE2-NEXT: orq %rax, %rdi -; SSE2-NEXT: xorq 24(%rsi), %rdx -; SSE2-NEXT: orq %rcx, %rdx +; SSE2-NEXT: movdqu (%rdi), %xmm0 +; SSE2-NEXT: movdqu 16(%rdi), %xmm1 +; SSE2-NEXT: movdqu (%rsi), %xmm2 +; SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; SSE2-NEXT: movdqu 16(%rsi), %xmm0 +; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %ecx ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rdi, %rdx +; SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; ; AVX2-LABEL: eq_i128_pair: ; AVX2: # %bb.0: -; AVX2-NEXT: movq (%rdi), %rax -; AVX2-NEXT: movq 8(%rdi), %rcx -; AVX2-NEXT: xorq (%rsi), %rax -; AVX2-NEXT: xorq 8(%rsi), %rcx -; AVX2-NEXT: movq 24(%rdi), %rdx -; AVX2-NEXT: movq 16(%rdi), %rdi -; AVX2-NEXT: xorq 16(%rsi), %rdi -; AVX2-NEXT: orq %rax, %rdi -; AVX2-NEXT: xorq 24(%rsi), %rdx -; AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; AVX2-NEXT: vmovdqu 16(%rdi), %xmm1 +; AVX2-NEXT: vpcmpeqb 16(%rsi), %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmovmskb %xmm0, %ecx ; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: orq %rdi, %rdx +; AVX2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; AVX2-NEXT: sete %al ; AVX2-NEXT: retq %a0 = load i128, i128* %a @@ -240,7 +228,7 @@ ret i32 %z } -; This test models the expansion of 'memcmp(a, b, 64) != 0' +; This test models the expansion of 'memcmp(a, b, 64) != 0' ; if we allowed 2 pairs of 32-byte loads per block. 
define i32 @ne_i256_pair(i256* %a, i256* %b) { @@ -275,31 +263,16 @@ ; ; AVX2-LABEL: ne_i256_pair: ; AVX2: # %bb.0: -; AVX2-NEXT: movq 16(%rdi), %r9 -; AVX2-NEXT: movq 24(%rdi), %r11 -; AVX2-NEXT: movq (%rdi), %r8 -; AVX2-NEXT: movq 8(%rdi), %r10 -; AVX2-NEXT: xorq 8(%rsi), %r10 -; AVX2-NEXT: xorq 24(%rsi), %r11 -; AVX2-NEXT: xorq (%rsi), %r8 -; AVX2-NEXT: xorq 16(%rsi), %r9 -; AVX2-NEXT: movq 48(%rdi), %rdx -; AVX2-NEXT: movq 32(%rdi), %rax -; AVX2-NEXT: movq 56(%rdi), %rcx -; AVX2-NEXT: movq 40(%rdi), %rdi -; AVX2-NEXT: xorq 40(%rsi), %rdi -; AVX2-NEXT: xorq 56(%rsi), %rcx -; AVX2-NEXT: orq %r11, %rcx -; AVX2-NEXT: orq %rdi, %rcx -; AVX2-NEXT: orq %r10, %rcx -; AVX2-NEXT: xorq 32(%rsi), %rax -; AVX2-NEXT: xorq 48(%rsi), %rdx -; AVX2-NEXT: orq %r9, %rdx -; AVX2-NEXT: orq %rax, %rdx -; AVX2-NEXT: orq %r8, %rdx +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 +; AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %ecx ; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: cmpl $-1, %ecx ; AVX2-NEXT: setne %al +; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq %a0 = load i256, i256* %a %b0 = load i256, i256* %b @@ -315,7 +288,7 @@ ret i32 %z } -; This test models the expansion of 'memcmp(a, b, 64) == 0' +; This test models the expansion of 'memcmp(a, b, 64) == 0' ; if we allowed 2 pairs of 32-byte loads per block. 
define i32 @eq_i256_pair(i256* %a, i256* %b) { @@ -350,31 +323,16 @@ ; ; AVX2-LABEL: eq_i256_pair: ; AVX2: # %bb.0: -; AVX2-NEXT: movq 16(%rdi), %r9 -; AVX2-NEXT: movq 24(%rdi), %r11 -; AVX2-NEXT: movq (%rdi), %r8 -; AVX2-NEXT: movq 8(%rdi), %r10 -; AVX2-NEXT: xorq 8(%rsi), %r10 -; AVX2-NEXT: xorq 24(%rsi), %r11 -; AVX2-NEXT: xorq (%rsi), %r8 -; AVX2-NEXT: xorq 16(%rsi), %r9 -; AVX2-NEXT: movq 48(%rdi), %rdx -; AVX2-NEXT: movq 32(%rdi), %rax -; AVX2-NEXT: movq 56(%rdi), %rcx -; AVX2-NEXT: movq 40(%rdi), %rdi -; AVX2-NEXT: xorq 40(%rsi), %rdi -; AVX2-NEXT: xorq 56(%rsi), %rcx -; AVX2-NEXT: orq %r11, %rcx -; AVX2-NEXT: orq %rdi, %rcx -; AVX2-NEXT: orq %r10, %rcx -; AVX2-NEXT: xorq 32(%rsi), %rax -; AVX2-NEXT: xorq 48(%rsi), %rdx -; AVX2-NEXT: orq %r9, %rdx -; AVX2-NEXT: orq %rax, %rdx -; AVX2-NEXT: orq %r8, %rdx +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 +; AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %ecx ; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: cmpl $-1, %ecx ; AVX2-NEXT: sete %al +; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq %a0 = load i256, i256* %a %b0 = load i256, i256* %b