Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -32370,6 +32370,29 @@ return SDValue(); } +/// Try to convert a vector select into a SHRUNKBLEND and eliminate a compare +/// that is only testing the sign-bits of its operand. +static SDValue combineVSelectSignBitCmp(SDNode *N, SelectionDAG &DAG) { + SDValue Cond = N->getOperand(0); + if (Cond.getOpcode() != X86ISD::PCMPGT) + return SDValue(); + + assert(N->getOpcode() == ISD::VSELECT && "PCMPGT with scalar select?"); + SDValue Cond0 = Cond.getOperand(0); + if (!ISD::isBuildVectorAllZeros(Cond0.getNode())) + return SDValue(); + + EVT CondOpVT = Cond0.getValueType(); + EVT VT = N->getValueType(0); + if (CondOpVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) + return SDValue(); + + // vselect (pcmpgt 0, X), Y, Z --> shrunkblend X, Y, Z + SDValue Cond1 = Cond.getOperand(1); + return DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), VT, Cond1, N->getOperand(1), + N->getOperand(2)); +} + /// Do target-specific dag combines on SELECT and VSELECT nodes. static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -32781,6 +32804,9 @@ return DAG.getBitcast(VT, newSelect); } + if (SDValue V = combineVSelectSignBitCmp(N, DAG)) + return V; + return SDValue(); } Index: test/CodeGen/X86/vsel-cmp-load.ll =================================================================== --- test/CodeGen/X86/vsel-cmp-load.ll +++ test/CodeGen/X86/vsel-cmp-load.ll @@ -128,8 +128,6 @@ ; AVX2-LABEL: slt_zero: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovsxbd (%rdi), %ymm2 -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpcmpgtd %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -251,8 +249,6 @@ ret <4 x double> %sel } -; FIXME: The compare with 0 for AVX2 should be eliminated. 
- define <8 x float> @slt_zero_fp_select(<8 x i16>* %p, <8 x float> %x, <8 x float> %y) { ; AVX1-LABEL: slt_zero_fp_select: ; AVX1: # %bb.0: @@ -265,8 +261,6 @@ ; AVX2-LABEL: slt_zero_fp_select: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovsxwd (%rdi), %ymm2 -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpcmpgtd %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; Index: test/CodeGen/X86/vselect-pcmp.ll =================================================================== --- test/CodeGen/X86/vselect-pcmp.ll +++ test/CodeGen/X86/vselect-pcmp.ll @@ -9,20 +9,11 @@ ; Test 128-bit vectors for all legal element types. -; FIXME: Why didn't AVX-512 optimize too? - define <16 x i8> @signbit_sel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) { -; AVX12-LABEL: signbit_sel_v16i8: -; AVX12: # %bb.0: -; AVX12-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 -; AVX12-NEXT: retq -; -; AVX512-LABEL: signbit_sel_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2 -; AVX512-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX-LABEL: signbit_sel_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq %tr = icmp slt <16 x i8> %mask, zeroinitializer %z = select <16 x i1> %tr, <16 x i8> %x, <16 x i8> %y ret <16 x i8> %z @@ -180,8 +171,6 @@ ; ; AVX512-LABEL: signbit_sel_v32i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm2 ; AVX512-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: retq %tr = icmp slt <32 x i8> %mask, zeroinitializer Index: test/CodeGen/X86/vselect.ll =================================================================== --- test/CodeGen/X86/vselect.ll +++ test/CodeGen/X86/vselect.ll @@ -522,7 +522,8 @@ ret <2 x i64> %z } -; Similar to above, but condition has a use that isn't a condition of a vselect so we can't optimize. 
+; Similar to above, but condition has a use that isn't a condition of a vselect. +; The blend does not require a cmp, but we may produce one anyway for the math op. define <2 x i64> @shrunkblend_nonvselectuse(<2 x i1> %cond, <2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) { ; SSE2-LABEL: shrunkblend_nonvselectuse: ; SSE2: # %bb.0: @@ -548,9 +549,9 @@ ; AVX-LABEL: shrunkblend_nonvselectuse: ; AVX: # %bb.0: ; AVX-NEXT: vpsllq $63, %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 ; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0 ; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %x = select <2 x i1> %cond, <2 x i64> %a, <2 x i64> %b