Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -34022,6 +34022,47 @@ return SDValue(); } +/// Try to map a 128-bit or larger integer comparison to vector instructions +/// before type legalization splits it up into chunks. +static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get(); + assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate"); + + // We're looking for an oversized integer equality comparison, but ignore a + // comparison with zero because that gets special treatment in EmitTest(). + SDValue X = SetCC->getOperand(0); + SDValue Y = SetCC->getOperand(1); + EVT OpVT = X.getValueType(); + unsigned OpSize = OpVT.getSizeInBits(); + if (!OpVT.isScalarInteger() || OpSize < 128 || isNullConstant(Y)) + return SDValue(); + + // TODO: Use PXOR + PTEST for SSE4.1 or later? + // TODO: Add support for AVX-512. + EVT VT = SetCC->getValueType(0); + SDLoc DL(SetCC); + if ((OpSize == 128 && Subtarget.hasSSE2()) || + (OpSize == 256 && Subtarget.hasAVX2())) { + EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8; + SDValue VecX = DAG.getBitcast(VecVT, X); + SDValue VecY = DAG.getBitcast(VecVT, Y); + + // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality. + // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq + // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne + // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq + // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne + SDValue Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY); + SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp); + SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 
0xFFFF : 0xFFFFFFFF, DL, + MVT::i32); + return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC); + } + + return SDValue(); +} + static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); @@ -34046,6 +34087,9 @@ SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1)); return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC); } + + if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget)) + return V; } if (VT.getScalarType() == MVT::i1 && Index: llvm/trunk/test/CodeGen/X86/setcc-wide-types.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/setcc-wide-types.ll +++ llvm/trunk/test/CodeGen/X86/setcc-wide-types.ll @@ -2,34 +2,24 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefix=SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=AVX2 -; FIXME: Equality checks of 128/256-bit values can use PMOVMSK or PTEST to avoid scalarization. +; Equality checks of 128/256-bit values can use PMOVMSK or PTEST to avoid scalarization. 
define i32 @ne_i128(<2 x i64> %x, <2 x i64> %y) { ; SSE2-LABEL: ne_i128: ; SSE2: # BB#0: -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm2, %rax -; SSE2-NEXT: movd %xmm0, %rcx -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rdx -; SSE2-NEXT: movd %xmm1, %rsi -; SSE2-NEXT: xorq %rcx, %rsi -; SSE2-NEXT: xorq %rax, %rdx +; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %ecx ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rsi, %rdx +; SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; ; AVX2-LABEL: ne_i128: ; AVX2: # BB#0: -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vpextrq $1, %xmm0, %rcx -; AVX2-NEXT: vmovq %xmm1, %rdx -; AVX2-NEXT: vpextrq $1, %xmm1, %rsi -; AVX2-NEXT: xorq %rcx, %rsi -; AVX2-NEXT: xorq %rax, %rdx +; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmovmskb %xmm0, %ecx ; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: orq %rsi, %rdx +; AVX2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; AVX2-NEXT: setne %al ; AVX2-NEXT: retq %bcx = bitcast <2 x i64> %x to i128 @@ -42,29 +32,19 @@ define i32 @eq_i128(<2 x i64> %x, <2 x i64> %y) { ; SSE2-LABEL: eq_i128: ; SSE2: # BB#0: -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm2, %rax -; SSE2-NEXT: movd %xmm0, %rcx -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rdx -; SSE2-NEXT: movd %xmm1, %rsi -; SSE2-NEXT: xorq %rcx, %rsi -; SSE2-NEXT: xorq %rax, %rdx +; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %ecx ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rsi, %rdx +; SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; ; AVX2-LABEL: eq_i128: ; AVX2: # BB#0: -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vpextrq $1, %xmm0, %rcx -; AVX2-NEXT: vmovq %xmm1, %rdx -; AVX2-NEXT: vpextrq $1, %xmm1, %rsi -; AVX2-NEXT: xorq %rcx, %rsi -; AVX2-NEXT: xorq %rax, %rdx +; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; 
AVX2-NEXT: vpmovmskb %xmm0, %ecx ; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: orq %rsi, %rdx +; AVX2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; AVX2-NEXT: sete %al ; AVX2-NEXT: retq %bcx = bitcast <2 x i64> %x to i128 @@ -102,24 +82,10 @@ ; ; AVX2-LABEL: ne_i256: ; AVX2: # BB#0: -; AVX2-NEXT: vmovq %xmm0, %r8 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vmovq %xmm2, %r9 -; AVX2-NEXT: vpextrq $1, %xmm0, %r10 -; AVX2-NEXT: vpextrq $1, %xmm2, %rsi -; AVX2-NEXT: vmovq %xmm1, %rdi -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vpextrq $1, %xmm1, %rcx -; AVX2-NEXT: vpextrq $1, %xmm0, %rdx -; AVX2-NEXT: xorq %rsi, %rdx -; AVX2-NEXT: xorq %r10, %rcx -; AVX2-NEXT: orq %rdx, %rcx -; AVX2-NEXT: xorq %r9, %rax -; AVX2-NEXT: xorq %r8, %rdi -; AVX2-NEXT: orq %rax, %rdi +; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %ecx ; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: orq %rcx, %rdi +; AVX2-NEXT: cmpl $-1, %ecx ; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -158,24 +124,10 @@ ; ; AVX2-LABEL: eq_i256: ; AVX2: # BB#0: -; AVX2-NEXT: vmovq %xmm0, %r8 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vmovq %xmm2, %r9 -; AVX2-NEXT: vpextrq $1, %xmm0, %r10 -; AVX2-NEXT: vpextrq $1, %xmm2, %rsi -; AVX2-NEXT: vmovq %xmm1, %rdi -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vpextrq $1, %xmm1, %rcx -; AVX2-NEXT: vpextrq $1, %xmm0, %rdx -; AVX2-NEXT: xorq %rsi, %rdx -; AVX2-NEXT: xorq %r10, %rcx -; AVX2-NEXT: orq %rdx, %rcx -; AVX2-NEXT: xorq %r9, %rax -; AVX2-NEXT: xorq %r8, %rdi -; AVX2-NEXT: orq %rax, %rdi +; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %ecx ; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: orq %rcx, %rdi +; AVX2-NEXT: cmpl $-1, %ecx ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq