Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -28129,6 +28129,39 @@
   return SDValue();
 }
 
+/// If this is a PCMPEQ or PCMPGT result that is bitwise-anded with 1 (this is
+/// the x86 lowering of a SETCC + ZEXT), replace the 'and' with a shift-right to
+/// eliminate loading the vector constant mask value. This relies on the fact
+/// that a PCMP always creates an all-ones or all-zeros bitmask per element.
+static SDValue combinePCMPAnd1(SDNode *N, SelectionDAG &DAG) {
+  SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
+  SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
+  if (Op0.getOpcode() != X86ISD::PCMPEQ && Op0.getOpcode() != X86ISD::PCMPGT)
+    return SDValue();
+
+  // The existence of the PCMP node guarantees that we have the required SSE2 or
+  // AVX2 for a shift of this vector type, but there is no vector shift by
+  // immediate for a vector with byte elements (PSRLB).
+  EVT VT0 = Op0.getValueType();
+  EVT VT1 = Op1.getValueType();
+  unsigned EltBitWidth = VT0.getScalarType().getSizeInBits();
+  if (VT0 != VT1 || EltBitWidth == 8)
+    return SDValue();
+
+  // TODO: Is this possible or worthwhile for AVX-512 variants?
+  if (VT0.getSizeInBits() != 128 && VT0.getSizeInBits() != 256)
+    return SDValue();
+
+  APInt SplatVal;
+  if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) || SplatVal != 1)
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue ShAmt = DAG.getConstant(EltBitWidth - 1, DL, MVT::i8);
+  SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
+  return DAG.getBitcast(N->getValueType(0), Shift);
+}
+
 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
@@ -28147,6 +28180,9 @@
   if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
     return R;
 
+  if (SDValue ShiftRight = combinePCMPAnd1(N, DAG))
+    return ShiftRight;
+
   EVT VT = N->getValueType(0);
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
Index: test/CodeGen/X86/avx512-ext.ll
===================================================================
--- test/CodeGen/X86/avx512-ext.ll
+++ test/CodeGen/X86/avx512-ext.ll
@@ -1919,10 +1919,9 @@
 ; KNL-LABEL: zext_32xi1_to_32xi16:
 ; KNL: ## BB#0:
 ; KNL-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
-; KNL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; KNL-NEXT: vpand %ymm2, %ymm0, %ymm0
+; KNL-NEXT: vpsrlw $15, %ymm0, %ymm0
 ; KNL-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1
-; KNL-NEXT: vpand %ymm2, %ymm1, %ymm1
+; KNL-NEXT: vpsrlw $15, %ymm1, %ymm1
 ; KNL-NEXT: retq
 ;
 ; SKX-LABEL: zext_32xi1_to_32xi16:
@@ -1939,7 +1938,7 @@
 ; KNL-LABEL: zext_16xi1_to_16xi16:
 ; KNL: ## BB#0:
 ; KNL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
-; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT: vpsrlw $15, %ymm0, %ymm0
 ; KNL-NEXT: retq
 ;
 ; SKX-LABEL: zext_16xi1_to_16xi16:
@@ -1983,8 +1982,7 @@
 ; KNL-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; KNL-NEXT: vpand %xmm2, %xmm0, %xmm0
 ; KNL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
-; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; KNL-NEXT: vpsrld $31, %xmm0, %xmm0
 ; KNL-NEXT: retq
 ;
 ; SKX-LABEL: zext_4xi1_to_4x32:
@@ -2007,7 +2005,7 @@
 ; KNL-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; KNL-NEXT: vpand %xmm2, %xmm0, %xmm0
 ; KNL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
-; KNL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; KNL-NEXT: vpsrlq $63, %xmm0, %xmm0
 ; KNL-NEXT: retq
 ;
 ; SKX-LABEL: zext_2xi1_to_2xi64:
Index: test/CodeGen/X86/avx512-vec-cmp.ll
===================================================================
--- test/CodeGen/X86/avx512-vec-cmp.ll
+++ test/CodeGen/X86/avx512-vec-cmp.ll
@@ -1215,7 +1215,7 @@
 ; KNL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
 ; KNL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
 ; KNL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
-; KNL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; KNL-NEXT: vpsrlq $63, %xmm0, %xmm0
 ; KNL-NEXT: retq
 ;
 ; SKX-LABEL: test45:
Index: test/CodeGen/X86/shift-pcmp.ll
===================================================================
--- test/CodeGen/X86/shift-pcmp.ll
+++ test/CodeGen/X86/shift-pcmp.ll
@@ -26,14 +26,14 @@
 ; SSE-LABEL: bar:
 ; SSE: # BB#0:
 ; SSE-NEXT: pcmpeqw %xmm1, %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: psrlw $15, %xmm0
 ; SSE-NEXT: psllw $5, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: bar:
 ; AVX: # BB#0:
 ; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $15, %xmm0, %xmm0
 ; AVX-NEXT: vpsllw $5, %xmm0, %xmm0
 ; AVX-NEXT: retq
 ;
Index: test/CodeGen/X86/vector-pcmp.ll
===================================================================
--- test/CodeGen/X86/vector-pcmp.ll
+++ test/CodeGen/X86/vector-pcmp.ll
@@ -294,10 +294,9 @@
 ; SSE-LABEL: cmpeq_zext_v16i16:
 ; SSE: # BB#0:
 ; SSE-NEXT: pcmpeqw %xmm2, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1]
-; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: psrlw $15, %xmm0
 ; SSE-NEXT: pcmpeqw %xmm3, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: psrlw $15, %xmm1
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: cmpeq_zext_v16i16:
@@ -313,7 +312,7 @@
 ; AVX2-LABEL: cmpeq_zext_v16i16:
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $15, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
   %cmp = icmp eq <16 x i16> %a, %b
@@ -325,21 +324,14 @@
 ; SSE-LABEL: cmpeq_zext_v4i32:
 ; SSE: # BB#0:
 ; SSE-NEXT: pcmpeqd %xmm1, %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: psrld $31, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX1-LABEL: cmpeq_zext_v4i32:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: cmpeq_zext_v4i32:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
+; AVX-LABEL: cmpeq_zext_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrld $31, %xmm0, %xmm0
+; AVX-NEXT: retq
 ;
   %cmp = icmp eq <4 x i32> %a, %b
   %zext = zext <4 x i1> %cmp to <4 x i32>
@@ -363,10 +355,9 @@
 ; SSE42-LABEL: cmpeq_zext_v4i64:
 ; SSE42: # BB#0:
 ; SSE42-NEXT: pcmpeqq %xmm2, %xmm0
-; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [1,1]
-; SSE42-NEXT: pand %xmm2, %xmm0
+; SSE42-NEXT: psrlq $63, %xmm0
 ; SSE42-NEXT: pcmpeqq %xmm3, %xmm1
-; SSE42-NEXT: pand %xmm2, %xmm1
+; SSE42-NEXT: psrlq $63, %xmm1
 ; SSE42-NEXT: retq
 ;
 ; AVX1-LABEL: cmpeq_zext_v4i64:
@@ -382,8 +373,7 @@
 ; AVX2-LABEL: cmpeq_zext_v4i64:
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlq $63, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
   %cmp = icmp eq <4 x i64> %a, %b
@@ -426,13 +416,13 @@
 ; SSE-LABEL: cmpgt_zext_v8i16:
 ; SSE: # BB#0:
 ; SSE-NEXT: pcmpgtw %xmm1, %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: psrlw $15, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: cmpgt_zext_v8i16:
 ; AVX: # BB#0:
 ; AVX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $15, %xmm0, %xmm0
 ; AVX-NEXT: retq
 ;
   %cmp = icmp sgt <8 x i16> %a, %b
@@ -444,10 +434,9 @@
 ; SSE-LABEL: cmpgt_zext_v8i32:
 ; SSE: # BB#0:
 ; SSE-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
-; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: psrld $31, %xmm0
 ; SSE-NEXT: pcmpgtd %xmm3, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: psrld $31, %xmm1
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: cmpgt_zext_v8i32:
@@ -463,8 +452,7 @@
 ; AVX2-LABEL: cmpgt_zext_v8i32:
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $31, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
   %cmp = icmp sgt <8 x i32> %a, %b
@@ -492,13 +480,13 @@
 ; SSE42-LABEL: cmpgt_zext_v2i64:
 ; SSE42: # BB#0:
 ; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE42-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE42-NEXT: psrlq $63, %xmm0
 ; SSE42-NEXT: retq
 ;
 ; AVX-LABEL: cmpgt_zext_v2i64:
 ; AVX: # BB#0:
 ; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpsrlq $63, %xmm0, %xmm0
 ; AVX-NEXT: retq
 ;
   %cmp = icmp sgt <2 x i64> %a, %b
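
For reference, a minimal IR sketch (illustrative only, not part of the patch; the function name is hypothetical) of the pattern combinePCMPAnd1 targets: zext of a vector compare, which x86 lowering turns into PCMPEQ/PCMPGT followed by an 'and' with a splat of 1. Because the compare leaves every element all-ones or all-zeros, a logical shift right by EltBits-1 yields the same 0/1 value without loading a constant mask, so this should now emit PCMPEQD + PSRLD $31 instead of PCMPEQD + PAND with a memory operand.

; Assumed reduced example mirroring the updated tests above.
define <4 x i32> @cmpeq_zext_sketch(<4 x i32> %a, <4 x i32> %b) {
  %cmp = icmp eq <4 x i32> %a, %b            ; -> pcmpeqd (all-ones/all-zeros per element)
  %zext = zext <4 x i1> %cmp to <4 x i32>    ; -> was pand with splat(1); now psrld $31
  ret <4 x i32> %zext
}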