diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -48798,6 +48798,80 @@
   return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
 }
 
+static SDValue GetBMIMatchingOp(unsigned OPC, SelectionDAG &DAG,
+                                SDValue OpMustEq, SDValue Op, unsigned Depth) {
+  // We don't want to go crazy with the recursion here. This isn't a super
+  // important optimization.
+  static constexpr unsigned kMaxDepth = 2;
+
+  // Only do this re-ordering if Op has one use and the same type as OpMustEq.
+  if (!Op.hasOneUse() || Op.getValueType() != OpMustEq.getValueType())
+    return SDValue();
+
+  SDLoc DL(Op.getNode());
+  // If we hit another commutable op, recurse further.
+  if (Op.getOpcode() == OPC) {
+    // Done recursing.
+    if (Depth++ >= kMaxDepth)
+      return SDValue();
+
+    for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx) {
+      if (SDValue R = GetBMIMatchingOp(OPC, DAG, OpMustEq, Op.getOperand(OpIdx),
+                                       Depth)) {
+        return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R,
+                           Op.getOperand(1 - OpIdx));
+      }
+    }
+  } else if (Op.getOpcode() == ISD::SUB) {
+    if (OPC == ISD::AND) {
+      // BLSI: (and x, (sub 0, x))
+      auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(0));
+      if (C && C->isZero() && Op.getOperand(1) == OpMustEq) {
+        return DAG.getNode(OPC, DL, Op.getValueType(), OpMustEq, Op);
+      }
+    }
+    // OPC must be ISD::AND or ISD::XOR.
+    // BLSR: (and x, (sub x, 1))
+    // BLSMSK: (xor x, (sub x, 1))
+    auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+    if (C && C->isOne() && Op.getOperand(0) == OpMustEq) {
+      return DAG.getNode(OPC, DL, Op.getValueType(), OpMustEq, Op);
+    }
+  } else if (Op.getOpcode() == ISD::ADD) {
+    // OPC must be ISD::AND or ISD::XOR.
+    // BLSR: (and x, (add x, -1))
+    // BLSMSK: (xor x, (add x, -1))
+    for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx) {
+      auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(OpIdx));
+      if (C && C->isAllOnes() && Op.getOperand(1 - OpIdx) == OpMustEq) {
+        return DAG.getNode(OPC, DL, Op.getValueType(), OpMustEq, Op);
+      }
+    }
+  }
+  return SDValue();
+}
+
+static SDValue CombineBMILogicOp(SDNode *N, SelectionDAG &DAG,
+                                 const X86Subtarget &Subtarget) {
+  EVT VT = N->getValueType(0);
+  // Make sure this node is a candidate for BMI instructions.
+  if (!Subtarget.hasBMI() || !VT.isScalarInteger() ||
+      VT.getSizeInBits() < 32 || VT.getSizeInBits() > 64)
+    return SDValue();
+
+  assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
+
+  // Try to match LHS and RHS.
+  for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx) {
+    if (SDValue OpMatch =
+            GetBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
+                             N->getOperand(1 - OpIdx), 0)) {
+      return OpMatch;
+    }
+  }
+  return SDValue();
+}
+
 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
@@ -49019,6 +49093,9 @@
     }
   }
 
+  if (SDValue R = CombineBMILogicOp(N, DAG, Subtarget))
+    return R;
+
   return SDValue();
 }
 
@@ -51933,6 +52010,9 @@
     }
   }
 
+  if (SDValue R = CombineBMILogicOp(N, DAG, Subtarget))
+    return R;
+
   return combineFneg(N, DAG, DCI, Subtarget);
 }
 
diff --git a/llvm/test/CodeGen/X86/bmi-out-of-order.ll b/llvm/test/CodeGen/X86/bmi-out-of-order.ll
--- a/llvm/test/CodeGen/X86/bmi-out-of-order.ll
+++ b/llvm/test/CodeGen/X86/bmi-out-of-order.ll
@@ -48,8 +48,7 @@
 ;
 ; X64-LABEL: blsmask_through1:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    xorq %rsi, %rdi
-; X64-NEXT:    leaq -1(%rsi), %rax
+; X64-NEXT:    blsmskq %rsi, %rax
 ; X64-NEXT:    xorq %rdi, %rax
 ; X64-NEXT:    retq
 entry:
@@ -62,19 +61,15 @@
 define i32 @blsmask_through2(i32 %a, i32 %b, i32 %c) nounwind {
 ; X86-LABEL: blsmask_through2:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    leal -1(%ecx), %eax
+; X86-NEXT:    blsmskl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: blsmask_through2:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    blsmskl %esi, %eax
 ; X64-NEXT:    xorl %edx, %edi
-; X64-NEXT:    xorl %esi, %edi
-; X64-NEXT:    leal -1(%rsi), %eax
 ; X64-NEXT:    xorl %edi, %eax
 ; X64-NEXT:    retq
 entry:
@@ -242,9 +237,7 @@
 ;
 ; X64-LABEL: blsi_through1:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    andq %rsi, %rdi
-; X64-NEXT:    negq %rax
+; X64-NEXT:    blsiq %rsi, %rax
 ; X64-NEXT:    andq %rdi, %rax
 ; X64-NEXT:    retq
 entry:
@@ -257,20 +250,15 @@
 define i32 @blsi_through2(i32 %a, i32 %b, i32 %c) nounwind {
 ; X86-LABEL: blsi_through2:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    negl %eax
+; X86-NEXT:    blsil {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: blsi_through2:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    blsil %esi, %eax
 ; X64-NEXT:    andl %edx, %edi
-; X64-NEXT:    andl %esi, %edi
-; X64-NEXT:    negl %eax
 ; X64-NEXT:    andl %edi, %eax
 ; X64-NEXT:    retq
 entry:
@@ -302,10 +290,8 @@
 ;
 ; X64-LABEL: blsi_through3:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    blsiq %rsi, %rax
 ; X64-NEXT:    andq %rdx, %rdi
-; X64-NEXT:    andq %rsi, %rdi
-; X64-NEXT:    negq %rax
 ; X64-NEXT:    andq %rdi, %rax
 ; X64-NEXT:    retq
 entry:
@@ -436,8 +422,7 @@
 ;
 ; X64-LABEL: blsr_through1:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    andq %rsi, %rdi
-; X64-NEXT:    leaq -1(%rsi), %rax
+; X64-NEXT:    blsrq %rsi, %rax
 ; X64-NEXT:    andq %rdi, %rax
 ; X64-NEXT:    retq
 entry:
@@ -450,19 +435,15 @@
 define i32 @blsr_through2(i32 %a, i32 %b, i32 %c) nounwind {
 ; X86-LABEL: blsr_through2:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    leal -1(%ecx), %eax
+; X86-NEXT:    blsrl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: blsr_through2:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    blsrl %esi, %eax
 ; X64-NEXT:    andl %edx, %edi
-; X64-NEXT:    andl %esi, %edi
-; X64-NEXT:    leal -1(%rsi), %eax
 ; X64-NEXT:    andl %edi, %eax
 ; X64-NEXT:    retq
 entry: