Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -34707,6 +34707,77 @@
   return SDValue();
 }
 
+// Look for (and (ctpop X), 1) which is the IR form of __builtin_parity.
+// Turn it into a series of XORs and a setnp.
+//
+// The i32 form folds X down to two bytes with shift+xor, xors those bytes with
+// a flag-setting 8-bit X86ISD::XOR and reads the result back with a COND_NP
+// setcc. The i64 form first xors the two 32-bit halves and re-emits the i32
+// idiom. Returns an empty SDValue when N does not match or when the type is
+// not handled here.
+static SDValue combineParity(SDNode *N, SelectionDAG &DAG,
+                             const X86Subtarget &Subtarget) {
+  EVT VT = N->getValueType(0);
+
+  // We only support 64-bit and 32-bit. i32 is handled only without POPCNT;
+  // i64 is handled unless we are in 64-bit mode with POPCNT available.
+  if ((VT != MVT::i32 || Subtarget.hasPOPCNT()) &&
+      (VT != MVT::i64 || (Subtarget.is64Bit() && Subtarget.hasPOPCNT())))
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  // LHS needs to be a single use CTPOP.
+  if (N0.getOpcode() != ISD::CTPOP || !N0.hasOneUse())
+    return SDValue();
+
+  // RHS needs to be 1.
+  auto *N1C = dyn_cast<ConstantSDNode>(N1);
+  if (!N1C || !N1C->getAPIntValue().isOneValue())
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue X = N0.getOperand(0);
+
+  // If this is 64-bit, it's always best to xor the two 32-bit pieces together
+  // even if we have popcnt.
+  if (VT == MVT::i64) {
+    SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
+                             DAG.getNode(ISD::SRL, DL, VT, X,
+                                         DAG.getConstant(32, DL, MVT::i8)));
+    SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
+    X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
+    // Generate a 32-bit parity idiom. This will bring us back here if we need
+    // to expand it too.
+    SDValue Parity = DAG.getNode(ISD::AND, DL, MVT::i32,
+                                 DAG.getNode(ISD::CTPOP, DL, MVT::i32, X),
+                                 DAG.getConstant(1, DL, MVT::i32));
+    return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Parity);
+  }
+  assert(VT == MVT::i32 && "Unexpected VT!");
+
+  // Xor the high and low 16-bits together using a 32-bit operation.
+  SDValue Hi16 = DAG.getNode(ISD::SRL, DL, VT, X,
+                             DAG.getConstant(16, DL, MVT::i8));
+  X = DAG.getNode(ISD::XOR, DL, VT, X, Hi16);
+
+  // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
+  // This should allow an h-reg to be used to save a shift.
+  // FIXME: We only get an h-reg in 32-bit mode.
+  SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
+                           DAG.getNode(ISD::SRL, DL, VT, X,
+                                       DAG.getConstant(8, DL, MVT::i8)));
+  SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
+  SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
+  SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
+
+  // Copy the inverse of the parity flag into a register with setcc.
+  SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
+  // Zero extend to original type.
+  return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
+}
+
 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
@@ -34734,6 +34805,10 @@
     }
   }
 
+  // This must be done before legalization has expanded the ctpop.
+  if (SDValue V = combineParity(N, DAG, Subtarget))
+    return V;
+
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
Index: test/CodeGen/X86/parity.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/parity.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=-popcnt | FileCheck %s --check-prefix=X86-NOPOPCNT
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-popcnt | FileCheck %s --check-prefix=X64-NOPOPCNT
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+popcnt | FileCheck %s --check-prefix=X86-POPCNT
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+popcnt | FileCheck %s --check-prefix=X64-POPCNT
+
+define i32 @parity_32(i32 %x) {
+; X86-NOPOPCNT-LABEL: parity_32:
+; X86-NOPOPCNT:       # %bb.0:
+; X86-NOPOPCNT-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOPOPCNT-NEXT:    movl %eax, %ecx
+; X86-NOPOPCNT-NEXT:    shrl $16, %ecx
+; X86-NOPOPCNT-NEXT:    xorl %eax, %ecx
+; X86-NOPOPCNT-NEXT:    xorl %eax, %eax
+; X86-NOPOPCNT-NEXT:    xorb %ch, %cl
+; X86-NOPOPCNT-NEXT:    setnp %al
+; X86-NOPOPCNT-NEXT:    retl
+;
+; X64-NOPOPCNT-LABEL: parity_32:
+; X64-NOPOPCNT:       # %bb.0:
+; X64-NOPOPCNT-NEXT:    movl %edi, %ecx
+; X64-NOPOPCNT-NEXT:    shrl $16, %ecx
+; X64-NOPOPCNT-NEXT:    xorl %edi, %ecx
+; X64-NOPOPCNT-NEXT:    movl %ecx, %edx
+; X64-NOPOPCNT-NEXT:    shrl $8, %edx
+; X64-NOPOPCNT-NEXT:    xorl %eax, %eax
+; X64-NOPOPCNT-NEXT:    xorb %cl, %dl
+; X64-NOPOPCNT-NEXT:    setnp %al
+; X64-NOPOPCNT-NEXT:    retq
+;
+; X86-POPCNT-LABEL: parity_32:
+; X86-POPCNT:       # %bb.0:
+; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %eax
+; X86-POPCNT-NEXT:    andl $1, %eax
+; X86-POPCNT-NEXT:    retl
+;
+; X64-POPCNT-LABEL: parity_32:
+; X64-POPCNT:       # %bb.0:
+; X64-POPCNT-NEXT:    popcntl %edi, %eax
+; X64-POPCNT-NEXT:    andl $1, %eax
+; X64-POPCNT-NEXT:    retq
+  %1 = tail call i32 @llvm.ctpop.i32(i32 %x)
+  %2 = and i32 %1, 1
+  ret i32 %2
+}
+
+define i64 @parity_64(i64 %x) {
+; X86-NOPOPCNT-LABEL: parity_64:
+; X86-NOPOPCNT:       # %bb.0:
+; X86-NOPOPCNT-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOPOPCNT-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; X86-NOPOPCNT-NEXT:    movl %eax, %ecx
+; X86-NOPOPCNT-NEXT:    shrl $16, %ecx
+; X86-NOPOPCNT-NEXT:    xorl %eax, %ecx
+; X86-NOPOPCNT-NEXT:    xorl %eax, %eax
+; X86-NOPOPCNT-NEXT:    xorb %ch, %cl
+; X86-NOPOPCNT-NEXT:    setnp %al
+; X86-NOPOPCNT-NEXT:    xorl %edx, %edx
+; X86-NOPOPCNT-NEXT:    retl
+;
+; X64-NOPOPCNT-LABEL: parity_64:
+; X64-NOPOPCNT:       # %bb.0:
+; X64-NOPOPCNT-NEXT:    movq %rdi, %rax
+; X64-NOPOPCNT-NEXT:    shrq $32, %rax
+; X64-NOPOPCNT-NEXT:    xorl %edi, %eax
+; X64-NOPOPCNT-NEXT:    movl %eax, %ecx
+; X64-NOPOPCNT-NEXT:    shrl $16, %ecx
+; X64-NOPOPCNT-NEXT:    xorl %eax, %ecx
+; X64-NOPOPCNT-NEXT:    movl %ecx, %edx
+; X64-NOPOPCNT-NEXT:    shrl $8, %edx
+; X64-NOPOPCNT-NEXT:    xorl %eax, %eax
+; X64-NOPOPCNT-NEXT:    xorb %cl, %dl
+; X64-NOPOPCNT-NEXT:    setnp %al
+; X64-NOPOPCNT-NEXT:    retq
+;
+; X86-POPCNT-LABEL: parity_64:
+; X86-POPCNT:       # %bb.0:
+; X86-POPCNT-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-POPCNT-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; X86-POPCNT-NEXT:    popcntl %eax, %eax
+; X86-POPCNT-NEXT:    andl $1, %eax
+; X86-POPCNT-NEXT:    xorl %edx, %edx
+; X86-POPCNT-NEXT:    retl
+;
+; X64-POPCNT-LABEL: parity_64:
+; X64-POPCNT:       # %bb.0:
+; X64-POPCNT-NEXT:    popcntq %rdi, %rax
+; X64-POPCNT-NEXT:    andl $1, %eax
+; X64-POPCNT-NEXT:    retq
+  %1 = tail call i64 @llvm.ctpop.i64(i64 %x)
+  %2 = and i64 %1, 1
+  ret i64 %2
+}
+
+declare i32 @llvm.ctpop.i32(i32 %x)
+declare i64 @llvm.ctpop.i64(i64 %x)