Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34410,6 +34410,76 @@
   return SDValue();
 }
 
+/// Given an extract vector element instruction with a setcc operand, look for
+/// other extracts of that same vector comparison and convert all of the
+/// extracts into MOVMSK followed by scalar ops. This eliminates multiple
+/// potentially expensive transfers from a vector to scalar registers.
+static SDValue foldExtractEltsToMovMsk(SDNode *ExtElt, SelectionDAG &DAG,
+                                       TargetLowering::DAGCombinerInfo &DCI,
+                                       const X86Subtarget &Subtarget) {
+  // First, match this as an extract element from a setcc with constant index.
+  assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
+  SDValue Setcc = ExtElt->getOperand(0);
+  SDValue Index = ExtElt->getOperand(1);
+  EVT VT = ExtElt->getValueType(0);
+  EVT SetccVT = Setcc.getValueType();
+  if (VT != MVT::i1 || Setcc.getOpcode() != ISD::SETCC || Setcc.hasOneUse() ||
+      !isa<ConstantSDNode>(Index) || SetccVT.getScalarType() != MVT::i1)
+    return SDValue();
+
+  // Make sure that we have movmsk-ability. The setcc type is converted to
+  // integer from FP if needed. AVX512 with writemask will have a vXi1 type,
+  // so we bail out on that.
+  // TODO: Allow 256-bit with AVX1/2.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT SetccOpVT = Setcc.getOperand(0).getValueType();
+  EVT NewSetccVT = TLI.getSetCCResultType(DAG.getDataLayout(),
+                                          *DAG.getContext(), SetccOpVT);
+  bool CanUseMOVMSKPS = NewSetccVT == MVT::v4i32 && Subtarget.hasSSE1();
+  bool CanUseMOVMSKPD = NewSetccVT == MVT::v2i64 && Subtarget.hasSSE2();
+  bool CanUsePMOVMSKB = NewSetccVT == MVT::v16i8 && Subtarget.hasSSE2();
+  if (!(CanUseMOVMSKPS || CanUseMOVMSKPD || CanUsePMOVMSKB))
+    return SDValue();
+
+  // Collect all similar extract element uses of the setcc.
+  // Use a set because duplicates may be present in the uses list.
+  SmallSetVector<SDNode *, 8> ExtEltSet;
+  for (SDNode *Use : Setcc->uses()) {
+    if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+        isa<ConstantSDNode>(Use->getOperand(1)) &&
+        Use->getValueType(0) == MVT::i1)
+      ExtEltSet.insert(Use);
+  }
+
+  // We need to replace at least 2 extracts for this to be profitable.
+  // TODO: We might want to allow this for a single extract if MOVMSK is the
+  // fastest way to get a vector compare result over to a GPR.
+  if (ExtEltSet.size() < 2)
+    return SDValue();
+
+  // Convert the vXi1 setcc to the x86-legal type. Then, MOVMSK the bits to a
+  // scalar value.
+  SDLoc DL(ExtElt);
+  SDValue NewCmp = DAG.getNode(ISD::SETCC, DL, NewSetccVT, Setcc.getOperand(0),
+                               Setcc.getOperand(1), Setcc.getOperand(2));
+  SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, NewCmp);
+
+  // For each extracted bit of the original boolean vector setcc, convert to
+  // extract of a scalar bit from the MOVMSK. We are still early in combining,
+  // so create generic and/setcc nodes that may get folded with other ops.
+  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
+  for (SDNode *ExtUse : ExtEltSet) {
+    // extelt vXi1 (setcc X, Y, CC), IndexC -->
+    // ((movmsk (setcc X, Y, CC)) & (1 << IndexC)) != 0
+    SDValue BitIndex = DAG.getConstant(1 << ExtUse->getConstantOperandVal(1),
+                                       DL, MVT::i32);
+    SDValue MaskedBit = DAG.getNode(ISD::AND, DL, MVT::i32, MovMsk, BitIndex);
+    SDValue MovMskBit = DAG.getSetCC(DL, MVT::i1, MaskedBit, Zero, ISD::SETNE);
+    DCI.CombineTo(ExtUse, MovMskBit);
+  }
+  return SDValue(ExtElt, 0); // ExtElt was replaced.
+}
+
 /// Extracting a scalar FP value from vector element 0 is free, so extract each
 /// operand first, then perform the math as a scalar op.
static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) { @@ -34583,6 +34653,9 @@ if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget)) return MinMax; + if (SDValue MovMsk = foldExtractEltsToMovMsk(N, DAG, DCI, Subtarget)) + return MovMsk; + if (SDValue V = scalarizeExtEltFP(N, DAG)) return V; Index: llvm/test/CodeGen/X86/movmsk-cmp.ll =================================================================== --- llvm/test/CodeGen/X86/movmsk-cmp.ll +++ llvm/test/CodeGen/X86/movmsk-cmp.ll @@ -4777,19 +4777,31 @@ ; SSE2-LABEL: movmsk_v16i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: xorb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: andb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $15, %ecx +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: shrl $8, %edx +; SSE2-NEXT: andl $1, %edx +; SSE2-NEXT: andl $8, %eax +; SSE2-NEXT: shrl $3, %eax +; SSE2-NEXT: xorl %edx, %eax +; SSE2-NEXT: andl %ecx, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; ; AVX-LABEL: movmsk_v16i8: ; AVX: # %bb.0: ; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $3, %xmm0, %eax -; AVX-NEXT: vpextrb $8, %xmm0, %ecx -; AVX-NEXT: xorl %eax, %ecx -; AVX-NEXT: vpextrb $15, %xmm0, %eax +; AVX-NEXT: vpmovmskb %xmm0, %eax +; AVX-NEXT: movl %eax, %ecx +; AVX-NEXT: shrl $15, %ecx +; AVX-NEXT: movl %eax, %edx +; AVX-NEXT: shrl $8, %edx +; AVX-NEXT: andl $1, %edx +; AVX-NEXT: andl $8, %eax +; AVX-NEXT: shrl $3, %eax +; AVX-NEXT: xorl %edx, %eax ; AVX-NEXT: andl %ecx, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq @@ -4797,18 +4809,17 @@ ; KNL-LABEL: movmsk_v16i8: ; KNL: # %bb.0: ; KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k1 -; KNL-NEXT: kmovw %k1, %ecx -; KNL-NEXT: 
kshiftrw $8, %k0, %k1 -; KNL-NEXT: kmovw %k1, %edx -; KNL-NEXT: kshiftrw $3, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: xorb %dl, %al -; KNL-NEXT: andb %cl, %al +; KNL-NEXT: vpmovmskb %xmm0, %eax +; KNL-NEXT: movl %eax, %ecx +; KNL-NEXT: shrl $15, %ecx +; KNL-NEXT: movl %eax, %edx +; KNL-NEXT: shrl $8, %edx +; KNL-NEXT: andl $1, %edx +; KNL-NEXT: andl $8, %eax +; KNL-NEXT: shrl $3, %eax +; KNL-NEXT: xorl %edx, %eax +; KNL-NEXT: andl %ecx, %eax ; KNL-NEXT: # kill: def $al killed $al killed $eax -; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: movmsk_v16i8: @@ -4909,10 +4920,11 @@ ; SSE2-LABEL: movmsk_v4i32: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %ecx -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3] -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movmskps %xmm1, %eax +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $3, %ecx +; SSE2-NEXT: andl $4, %eax +; SSE2-NEXT: shrl $2, %eax ; SSE2-NEXT: xorl %ecx, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq @@ -4920,24 +4932,25 @@ ; AVX-LABEL: movmsk_v4i32: ; AVX: # %bb.0: ; AVX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpextrd $2, %xmm0, %ecx -; AVX-NEXT: vpextrd $3, %xmm0, %eax +; AVX-NEXT: vmovmskps %xmm0, %eax +; AVX-NEXT: movl %eax, %ecx +; AVX-NEXT: shrl $3, %ecx +; AVX-NEXT: andl $4, %eax +; AVX-NEXT: shrl $2, %eax ; AVX-NEXT: xorl %ecx, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq ; ; KNL-LABEL: movmsk_v4i32: ; KNL: # %bb.0: -; KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 -; KNL-NEXT: kshiftrw $3, %k0, %k1 -; KNL-NEXT: kmovw %k1, %ecx -; KNL-NEXT: kshiftrw $2, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: xorb %cl, %al +; KNL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; KNL-NEXT: vmovmskps %xmm0, %eax +; KNL-NEXT: movl %eax, %ecx +; KNL-NEXT: shrl $3, %ecx +; 
KNL-NEXT: andl $4, %eax +; KNL-NEXT: shrl $2, %eax +; KNL-NEXT: xorl %ecx, %eax ; KNL-NEXT: # kill: def $al killed $al killed $eax -; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: movmsk_v4i32: @@ -4965,9 +4978,9 @@ ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %ecx -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movmskpd %xmm0, %ecx +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: shrl %eax ; SSE2-NEXT: andl %ecx, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq @@ -4977,21 +4990,21 @@ ; AVX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrd $2, %xmm0, %ecx -; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vmovmskpd %xmm0, %ecx +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: shrl %eax ; AVX-NEXT: andl %ecx, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq ; ; KNL-LABEL: movmsk_v2i64: ; KNL: # %bb.0: -; KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL-NEXT: vpcmpneqq %zmm1, %zmm0, %k0 -; KNL-NEXT: kshiftrw $1, %k0, %k1 -; KNL-NEXT: kmovw %k1, %ecx -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: andb %cl, %al +; KNL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; KNL-NEXT: vmovmskpd %xmm0, %ecx +; KNL-NEXT: movl %ecx, %eax +; KNL-NEXT: shrl %eax +; KNL-NEXT: andl %ecx, %eax ; KNL-NEXT: # kill: def $al killed $al killed $eax ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq @@ -5019,42 +5032,25 @@ ; SSE2-NEXT: cmpeqps %xmm1, %xmm2 ; SSE2-NEXT: cmpunordps %xmm1, %xmm0 ; SSE2-NEXT: orps %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: movd %xmm1, %ecx -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm1, %edx -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: orl %edx, %eax -; SSE2-NEXT: 
orl %ecx, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: movmskps %xmm0, %eax +; SSE2-NEXT: testb $14, %al +; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; ; AVX-LABEL: movmsk_v4f32: ; AVX: # %bb.0: ; AVX-NEXT: vcmpeq_uqps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vextractps $1, %xmm0, %ecx -; AVX-NEXT: vextractps $2, %xmm0, %edx -; AVX-NEXT: vpextrb $12, %xmm0, %eax -; AVX-NEXT: orl %edx, %eax -; AVX-NEXT: orl %ecx, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: vmovmskps %xmm0, %eax +; AVX-NEXT: testb $14, %al +; AVX-NEXT: setne %al ; AVX-NEXT: retq ; ; KNL-LABEL: movmsk_v4f32: ; KNL: # %bb.0: -; KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL-NEXT: vcmpeq_uqps %zmm1, %zmm0, %k0 -; KNL-NEXT: kshiftrw $3, %k0, %k1 -; KNL-NEXT: kmovw %k1, %ecx -; KNL-NEXT: kshiftrw $2, %k0, %k1 -; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: kshiftrw $1, %k0, %k0 -; KNL-NEXT: kmovw %k0, %edx -; KNL-NEXT: orb %cl, %al -; KNL-NEXT: orb %dl, %al -; KNL-NEXT: # kill: def $al killed $al killed $eax -; KNL-NEXT: vzeroupper +; KNL-NEXT: vcmpeq_uqps %xmm1, %xmm0, %xmm0 +; KNL-NEXT: vmovmskps %xmm0, %eax +; KNL-NEXT: testb $14, %al +; KNL-NEXT: setne %al ; KNL-NEXT: retq ; ; SKX-LABEL: movmsk_v4f32: @@ -5083,9 +5079,9 @@ ; SSE2-LABEL: movmsk_v2f64: ; SSE2: # %bb.0: ; SSE2-NEXT: cmplepd %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %ecx -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movmskpd %xmm1, %ecx +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: shrl %eax ; SSE2-NEXT: andl %ecx, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq @@ -5093,23 +5089,21 @@ ; AVX-LABEL: movmsk_v2f64: ; AVX: # %bb.0: ; AVX-NEXT: vcmplepd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vextractps $2, %xmm0, %ecx -; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vmovmskpd %xmm0, %ecx +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: shrl %eax ; AVX-NEXT: andl %ecx, %eax ; AVX-NEXT: # 
kill: def $al killed $al killed $eax ; AVX-NEXT: retq ; ; KNL-LABEL: movmsk_v2f64: ; KNL: # %bb.0: -; KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL-NEXT: vcmplepd %zmm0, %zmm1, %k0 -; KNL-NEXT: kshiftrw $1, %k0, %k1 -; KNL-NEXT: kmovw %k1, %ecx -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: andb %cl, %al +; KNL-NEXT: vcmplepd %xmm0, %xmm1, %xmm0 +; KNL-NEXT: vmovmskpd %xmm0, %ecx +; KNL-NEXT: movl %ecx, %eax +; KNL-NEXT: shrl %eax +; KNL-NEXT: andl %ecx, %eax ; KNL-NEXT: # kill: def $al killed $al killed $eax -; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: movmsk_v2f64: @@ -5132,21 +5126,20 @@ ; SSE2-LABEL: PR39665_c_ray: ; SSE2: # %bb.0: ; SSE2-NEXT: cmpltpd %xmm0, %xmm1 -; SSE2-NEXT: movapd %xmm1, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: testb $1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movmskpd %xmm1, %ecx +; SSE2-NEXT: testb $2, %cl ; SSE2-NEXT: movl $42, %eax -; SSE2-NEXT: movl $99, %ecx -; SSE2-NEXT: cmovel %ecx, %eax -; SSE2-NEXT: testb $1, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: cmovel %ecx, %eax +; SSE2-NEXT: movl $99, %edx +; SSE2-NEXT: cmovel %edx, %eax +; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: cmovel %edx, %eax ; SSE2-NEXT: retq ; ; AVX-LABEL: PR39665_c_ray: ; AVX: # %bb.0: ; AVX-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %ecx -; AVX-NEXT: vpextrb $8, %xmm0, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: vmovmskpd %xmm0, %ecx +; AVX-NEXT: testb $2, %cl ; AVX-NEXT: movl $42, %eax ; AVX-NEXT: movl $99, %edx ; AVX-NEXT: cmovel %edx, %eax @@ -5156,19 +5149,14 @@ ; ; KNL-LABEL: PR39665_c_ray: ; KNL: # %bb.0: -; KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL-NEXT: vcmpltpd %zmm0, %zmm1, %k0 -; KNL-NEXT: kshiftrw $1, %k0, %k1 -; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: kmovw %k0, %ecx -; KNL-NEXT: testb $1, %al +; KNL-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 +; KNL-NEXT: vmovmskpd %xmm0, %ecx +; KNL-NEXT: testb $2, %cl ; KNL-NEXT: movl 
$42, %eax ; KNL-NEXT: movl $99, %edx ; KNL-NEXT: cmovel %edx, %eax ; KNL-NEXT: testb $1, %cl ; KNL-NEXT: cmovel %edx, %eax -; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: PR39665_c_ray: