Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -34710,6 +34710,79 @@
   return SDValue();
 }
 
+// Recognize (and (ctpop X), 1), the IR form of __builtin_parity, on i32 and
+// i64, and turn it into a series of XORs and a setnp. Nothing is done when
+// the type is legal and has a legal ctpop, or when N does not match; an empty
+// SDValue is returned in those cases.
+static SDValue combineParity(SDNode *N, SelectionDAG &DAG,
+                             const X86Subtarget &Subtarget) {
+  const EVT ResultVT = N->getValueType(0);
+  const bool Is64Bit = ResultVT == MVT::i64;
+
+  // Only 32-bit and 64-bit results are supported.
+  if (!Is64Bit && ResultVT != MVT::i32)
+    return SDValue();
+
+  // Leave the node alone when the type is legal and has a legal ctpop. 64-bit
+  // requires special handling unless the 64-bit popcnt instruction is legal.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  const bool HasLegalCtpop =
+      TLI.isTypeLegal(ResultVT) && TLI.isOperationLegal(ISD::CTPOP, ResultVT);
+  if (HasLegalCtpop)
+    return SDValue();
+
+  // The LHS must be a single-use CTPOP and the RHS must be the constant 1.
+  SDValue PopCount = N->getOperand(0);
+  SDValue Mask = N->getOperand(1);
+  const bool IsParityIdiom = PopCount.getOpcode() == ISD::CTPOP &&
+                             PopCount.hasOneUse() && isOneConstant(Mask);
+  if (!IsParityIdiom)
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue Src = PopCount.getOperand(0);
+
+  if (Is64Bit) {
+    // For 64-bit it's always best to xor the two 32-bit pieces together,
+    // even if we have popcnt.
+    SDValue Shift32 = DAG.getConstant(32, DL, MVT::i8);
+    SDValue Shifted = DAG.getNode(ISD::SRL, DL, ResultVT, Src, Shift32);
+    SDValue HiHalf = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Shifted);
+    SDValue LoHalf = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
+    SDValue Folded = DAG.getNode(ISD::XOR, DL, MVT::i32, LoHalf, HiHalf);
+
+    // Emit a 32-bit parity idiom. This will bring us back here if that one
+    // needs to be expanded too.
+    SDValue PopCount32 = DAG.getNode(ISD::CTPOP, DL, MVT::i32, Folded);
+    SDValue One = DAG.getConstant(1, DL, MVT::i32);
+    SDValue Parity32 = DAG.getNode(ISD::AND, DL, MVT::i32, PopCount32, One);
+    return DAG.getNode(ISD::ZERO_EXTEND, DL, ResultVT, Parity32);
+  }
+  assert(ResultVT == MVT::i32 && "Unexpected VT!");
+
+  // Xor the high and low 16 bits together using a 32-bit operation.
+  SDValue Shift16 = DAG.getConstant(16, DL, MVT::i8);
+  SDValue HiWord = DAG.getNode(ISD::SRL, DL, ResultVT, Src, Shift16);
+  SDValue Word = DAG.getNode(ISD::XOR, DL, ResultVT, Src, HiWord);
+
+  // Finally xor the low 2 bytes together with an 8-bit flag-setting xor.
+  // This should allow an h-reg to be used to save a shift.
+  // FIXME: We only get an h-reg in 32-bit mode.
+  SDValue Shift8 = DAG.getConstant(8, DL, MVT::i8);
+  SDValue ShiftedWord = DAG.getNode(ISD::SRL, DL, ResultVT, Word, Shift8);
+  SDValue HiByte = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShiftedWord);
+  SDValue LoByte = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Word);
+  SDValue ByteXor = DAG.getNode(X86ISD::XOR, DL,
+                                DAG.getVTList(MVT::i8, MVT::i32), LoByte,
+                                HiByte);
+  SDValue Flags = ByteXor.getValue(1);
+
+  // Copy the inverse of the parity flag into a register with setcc, then
+  // zero extend it to the original type.
+  SDValue NotParityFlag = getSETCC(X86::COND_NP, Flags, DL, DAG);
+  return DAG.getNode(ISD::ZERO_EXTEND, DL, ResultVT, NotParityFlag);
+}
+
 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
@@ -34737,6 +34810,10 @@
     }
   }
 
+  // This must be done before legalization has expanded the ctpop.
+ if (SDValue V = combineParity(N, DAG, Subtarget)) + return V; + if (DCI.isBeforeLegalizeOps()) return SDValue(); Index: llvm/trunk/test/CodeGen/X86/parity.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/parity.ll +++ llvm/trunk/test/CodeGen/X86/parity.ll @@ -9,41 +9,23 @@ ; X86-NOPOPCNT: # %bb.0: ; X86-NOPOPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOPOPCNT-NEXT: movl %eax, %ecx -; X86-NOPOPCNT-NEXT: shrl %ecx -; X86-NOPOPCNT-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; X86-NOPOPCNT-NEXT: subl %ecx, %eax -; X86-NOPOPCNT-NEXT: movl %eax, %ecx -; X86-NOPOPCNT-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; X86-NOPOPCNT-NEXT: shrl $2, %eax -; X86-NOPOPCNT-NEXT: andl $858993459, %eax # imm = 0x33333333 -; X86-NOPOPCNT-NEXT: addl %ecx, %eax -; X86-NOPOPCNT-NEXT: movl %eax, %ecx -; X86-NOPOPCNT-NEXT: shrl $4, %ecx -; X86-NOPOPCNT-NEXT: addl %eax, %ecx -; X86-NOPOPCNT-NEXT: andl $17764111, %ecx # imm = 0x10F0F0F -; X86-NOPOPCNT-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101 -; X86-NOPOPCNT-NEXT: shrl $24, %eax -; X86-NOPOPCNT-NEXT: andl $1, %eax +; X86-NOPOPCNT-NEXT: shrl $16, %ecx +; X86-NOPOPCNT-NEXT: xorl %eax, %ecx +; X86-NOPOPCNT-NEXT: xorl %eax, %eax +; X86-NOPOPCNT-NEXT: xorb %ch, %cl +; X86-NOPOPCNT-NEXT: setnp %al ; X86-NOPOPCNT-NEXT: retl ; ; X64-NOPOPCNT-LABEL: parity_32: ; X64-NOPOPCNT: # %bb.0: -; X64-NOPOPCNT-NEXT: movl %edi, %eax -; X64-NOPOPCNT-NEXT: shrl %eax -; X64-NOPOPCNT-NEXT: andl $1431655765, %eax # imm = 0x55555555 -; X64-NOPOPCNT-NEXT: subl %eax, %edi -; X64-NOPOPCNT-NEXT: movl %edi, %eax -; X64-NOPOPCNT-NEXT: andl $858993459, %eax # imm = 0x33333333 -; X64-NOPOPCNT-NEXT: shrl $2, %edi -; X64-NOPOPCNT-NEXT: andl $858993459, %edi # imm = 0x33333333 -; X64-NOPOPCNT-NEXT: addl %eax, %edi -; X64-NOPOPCNT-NEXT: movl %edi, %eax -; X64-NOPOPCNT-NEXT: shrl $4, %eax -; X64-NOPOPCNT-NEXT: addl %edi, %eax -; X64-NOPOPCNT-NEXT: andl $17764111, %eax # imm = 0x10F0F0F -; X64-NOPOPCNT-NEXT: 
imull $16843009, %eax, %eax # imm = 0x1010101 -; X64-NOPOPCNT-NEXT: shrl $24, %eax -; X64-NOPOPCNT-NEXT: andl $1, %eax +; X64-NOPOPCNT-NEXT: movl %edi, %ecx +; X64-NOPOPCNT-NEXT: shrl $16, %ecx +; X64-NOPOPCNT-NEXT: xorl %edi, %ecx +; X64-NOPOPCNT-NEXT: movl %ecx, %edx +; X64-NOPOPCNT-NEXT: shrl $8, %edx +; X64-NOPOPCNT-NEXT: xorl %eax, %eax +; X64-NOPOPCNT-NEXT: xorb %cl, %dl +; X64-NOPOPCNT-NEXT: setnp %al ; X64-NOPOPCNT-NEXT: retq ; ; X86-POPCNT-LABEL: parity_32: @@ -66,71 +48,36 @@ ; X86-NOPOPCNT-LABEL: parity_64: ; X86-NOPOPCNT: # %bb.0: ; X86-NOPOPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOPOPCNT-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOPOPCNT-NEXT: movl %ecx, %edx -; X86-NOPOPCNT-NEXT: shrl %edx -; X86-NOPOPCNT-NEXT: andl $1431655765, %edx # imm = 0x55555555 -; X86-NOPOPCNT-NEXT: subl %edx, %ecx -; X86-NOPOPCNT-NEXT: movl %ecx, %edx -; X86-NOPOPCNT-NEXT: andl $858993459, %edx # imm = 0x33333333 -; X86-NOPOPCNT-NEXT: shrl $2, %ecx -; X86-NOPOPCNT-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; X86-NOPOPCNT-NEXT: addl %edx, %ecx -; X86-NOPOPCNT-NEXT: movl %ecx, %edx -; X86-NOPOPCNT-NEXT: shrl $4, %edx -; X86-NOPOPCNT-NEXT: addl %ecx, %edx -; X86-NOPOPCNT-NEXT: andl $17764111, %edx # imm = 0x10F0F0F -; X86-NOPOPCNT-NEXT: imull $16843009, %edx, %ecx # imm = 0x1010101 -; X86-NOPOPCNT-NEXT: shrl $24, %ecx -; X86-NOPOPCNT-NEXT: movl %eax, %edx -; X86-NOPOPCNT-NEXT: shrl %edx -; X86-NOPOPCNT-NEXT: andl $1431655765, %edx # imm = 0x55555555 -; X86-NOPOPCNT-NEXT: subl %edx, %eax -; X86-NOPOPCNT-NEXT: movl %eax, %edx -; X86-NOPOPCNT-NEXT: andl $858993459, %edx # imm = 0x33333333 -; X86-NOPOPCNT-NEXT: shrl $2, %eax -; X86-NOPOPCNT-NEXT: andl $858993459, %eax # imm = 0x33333333 -; X86-NOPOPCNT-NEXT: addl %edx, %eax -; X86-NOPOPCNT-NEXT: movl %eax, %edx -; X86-NOPOPCNT-NEXT: shrl $4, %edx -; X86-NOPOPCNT-NEXT: addl %eax, %edx -; X86-NOPOPCNT-NEXT: andl $17764111, %edx # imm = 0x10F0F0F -; X86-NOPOPCNT-NEXT: imull $16843009, %edx, %eax # imm = 0x1010101 -; 
X86-NOPOPCNT-NEXT: shrl $24, %eax -; X86-NOPOPCNT-NEXT: addl %ecx, %eax -; X86-NOPOPCNT-NEXT: andl $1, %eax +; X86-NOPOPCNT-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X86-NOPOPCNT-NEXT: movl %eax, %ecx +; X86-NOPOPCNT-NEXT: shrl $16, %ecx +; X86-NOPOPCNT-NEXT: xorl %eax, %ecx +; X86-NOPOPCNT-NEXT: xorl %eax, %eax +; X86-NOPOPCNT-NEXT: xorb %ch, %cl +; X86-NOPOPCNT-NEXT: setnp %al ; X86-NOPOPCNT-NEXT: xorl %edx, %edx ; X86-NOPOPCNT-NEXT: retl ; ; X64-NOPOPCNT-LABEL: parity_64: ; X64-NOPOPCNT: # %bb.0: ; X64-NOPOPCNT-NEXT: movq %rdi, %rax -; X64-NOPOPCNT-NEXT: shrq %rax -; X64-NOPOPCNT-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 -; X64-NOPOPCNT-NEXT: andq %rax, %rcx -; X64-NOPOPCNT-NEXT: subq %rcx, %rdi -; X64-NOPOPCNT-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 -; X64-NOPOPCNT-NEXT: movq %rdi, %rcx -; X64-NOPOPCNT-NEXT: andq %rax, %rcx -; X64-NOPOPCNT-NEXT: shrq $2, %rdi -; X64-NOPOPCNT-NEXT: andq %rax, %rdi -; X64-NOPOPCNT-NEXT: addq %rcx, %rdi -; X64-NOPOPCNT-NEXT: movq %rdi, %rax -; X64-NOPOPCNT-NEXT: shrq $4, %rax -; X64-NOPOPCNT-NEXT: leaq (%rax,%rdi), %rax -; X64-NOPOPCNT-NEXT: movabsq $76296276040158991, %rcx # imm = 0x10F0F0F0F0F0F0F -; X64-NOPOPCNT-NEXT: andq %rax, %rcx -; X64-NOPOPCNT-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101 -; X64-NOPOPCNT-NEXT: imulq %rcx, %rax -; X64-NOPOPCNT-NEXT: shrq $56, %rax -; X64-NOPOPCNT-NEXT: andl $1, %eax +; X64-NOPOPCNT-NEXT: shrq $32, %rax +; X64-NOPOPCNT-NEXT: xorl %edi, %eax +; X64-NOPOPCNT-NEXT: movl %eax, %ecx +; X64-NOPOPCNT-NEXT: shrl $16, %ecx +; X64-NOPOPCNT-NEXT: xorl %eax, %ecx +; X64-NOPOPCNT-NEXT: movl %ecx, %edx +; X64-NOPOPCNT-NEXT: shrl $8, %edx +; X64-NOPOPCNT-NEXT: xorl %eax, %eax +; X64-NOPOPCNT-NEXT: xorb %cl, %dl +; X64-NOPOPCNT-NEXT: setnp %al ; X64-NOPOPCNT-NEXT: retq ; ; X86-POPCNT-LABEL: parity_64: ; X86-POPCNT: # %bb.0: -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax -; 
X86-POPCNT-NEXT: addl %ecx, %eax +; X86-POPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-POPCNT-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X86-POPCNT-NEXT: popcntl %eax, %eax ; X86-POPCNT-NEXT: andl $1, %eax ; X86-POPCNT-NEXT: xorl %edx, %edx ; X86-POPCNT-NEXT: retl