diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -47278,6 +47278,80 @@
   return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
 }
 
+// Recognize the std::bit_ceil pattern and drop the cmov.
+// (cmov 1 (shl 1 (neg (trunc (ctlz (X - 1))))) (uge X 2))
+// =>
+// (shl 1 (and (neg (trunc (ctlz (X - 1)))) 31))
+static SDValue combineCMovBitCeil(SDValue FalseOp, SDValue TrueOp,
+                                  X86::CondCode CC, SDValue Cond,
+                                  SelectionDAG &DAG,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const X86Subtarget &Subtarget) {
+  auto IsConstant = [](SDValue V, uint64_t C) {
+    auto *VC = dyn_cast<ConstantSDNode>(V);
+    return VC && VC->getZExtValue() == C;
+  };
+
+  if (DCI.isBeforeLegalize())
+    return SDValue();
+
+  // Limit to i32 and i64.
+  EVT VT = FalseOp.getValueType();
+  if (VT == MVT::i32)
+    ; // OK
+  else if (VT == MVT::i64 && Subtarget.is64Bit())
+    ; // OK
+  else
+    return SDValue();
+
+  // Check the condition.
+  if (CC != X86::COND_AE || Cond.getOpcode() != X86ISD::SUB ||
+      !IsConstant(Cond.getOperand(1), 2) || !Cond.hasOneUse())
+    return SDValue();
+
+  // Check FalseOp.
+  if (!isOneConstant(FalseOp))
+    return SDValue();
+
+  // Check TrueOp.
+  if (TrueOp.getOpcode() != ISD::SHL || !isOneConstant(TrueOp.getOperand(0)) ||
+      !TrueOp.hasOneUse())
+    return SDValue();
+
+  SDValue ShiftCount = TrueOp.getOperand(1);
+  if (ShiftCount.getOpcode() != ISD::SUB || !ShiftCount.hasOneUse())
+    return SDValue();
+
+  unsigned Size = VT.getSizeInBits();
+  if (!IsConstant(ShiftCount.getOperand(0), Size) ||
+      ShiftCount.getOperand(1).getOpcode() != ISD::TRUNCATE)
+    return SDValue();
+
+  SDValue Trunc = ShiftCount.getOperand(1);
+  if (Trunc.getOperand(0).getOpcode() != ISD::CTLZ)
+    return SDValue();
+
+  SDValue CTLZ = Trunc.getOperand(0);
+  if (CTLZ.getOperand(0).getOpcode() != ISD::ADD)
+    return SDValue();
+
+  SDValue Add = CTLZ.getOperand(0);
+  if (Add.getOperand(0) != Cond.getOperand(0) ||
+      !isAllOnesConstant(Add.getOperand(1)))
+    return SDValue();
+
+  // Construct (shl 1 (and (neg (trunc ...)) 31)).
+  SDLoc DL(TrueOp);
+  EVT ShiftCountVT = ShiftCount.getValueType();
+  SDValue Neg = DAG.getNode(ISD::SUB, DL, ShiftCountVT,
+                            DAG.getConstant(0, DL, ShiftCountVT), Trunc);
+  SDValue MaskedShiftCount =
+      DAG.getNode(ISD::AND, DL, ShiftCountVT, Neg,
+                  DAG.getConstant(Size - 1, DL, ShiftCountVT));
+  return DAG.getNode(ISD::SHL, DL, VT, DAG.getConstant(1, DL, VT),
+                     MaskedShiftCount);
+}
+
 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
                            TargetLowering::DAGCombinerInfo &DCI,
@@ -47499,6 +47573,10 @@
     }
   }
 
+  if (SDValue R =
+          combineCMovBitCeil(FalseOp, TrueOp, CC, Cond, DAG, DCI, Subtarget))
+    return R;
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/X86/bit_ceil.ll b/llvm/test/CodeGen/X86/bit_ceil.ll
--- a/llvm/test/CodeGen/X86/bit_ceil.ll
+++ b/llvm/test/CodeGen/X86/bit_ceil.ll
@@ -6,14 +6,11 @@
 define i32 @bit_ceil_i32(i32 %x) {
 ; CHECK-LABEL: bit_ceil_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
-; CHECK-NEXT:    leal -1(%rdi), %eax
-; CHECK-NEXT:    lzcntl %eax, %eax
+; CHECK-NEXT:    decl %edi
+; CHECK-NEXT:    lzcntl %edi, %eax
 ; CHECK-NEXT:    negb %al
 ; CHECK-NEXT:    movl $1, %ecx
 ; CHECK-NEXT:    shlxl %eax, %ecx, %eax
-; CHECK-NEXT:    cmpl $2, %edi
-; CHECK-NEXT:    cmovbl %ecx, %eax
 ; CHECK-NEXT:    retq
   %dec = add i32 %x, -1
   %lz = tail call i32 @llvm.ctlz.i32(i32 %dec, i1 false)
@@ -27,13 +24,11 @@
 define i64 @bit_ceil_i64(i64 %x) {
 ; CHECK-LABEL: bit_ceil_i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    leaq -1(%rdi), %rax
-; CHECK-NEXT:    lzcntq %rax, %rax
+; CHECK-NEXT:    decq %rdi
+; CHECK-NEXT:    lzcntq %rdi, %rax
 ; CHECK-NEXT:    negb %al
 ; CHECK-NEXT:    movl $1, %ecx
 ; CHECK-NEXT:    shlxq %rax, %rcx, %rax
-; CHECK-NEXT:    cmpq $2, %rdi
-; CHECK-NEXT:    cmovbq %rcx, %rax
 ; CHECK-NEXT:    retq
   %dec = add i64 %x, -1
   %lz = tail call i64 @llvm.ctlz.i64(i64 %dec, i1 false)
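
For context (this note is not part of the patch): the combine is sound because SHLX on x86-64 masks its shift count to the operand width, so an explicit `& 31` (`& 63` for i64) reproduces exactly the value the cmov used to select when X < 2. Below is a minimal standalone C++ sketch of the before/after semantics for i32; the helper names (`Ctlz32`, `BitCeilBefore`, `BitCeilAfter`) are hypothetical and exist only for illustration.

// Hedged sketch, not part of the patch: models the DAG before and after
// the combine. Build with any GCC/Clang-compatible compiler.
#include <cassert>
#include <cstdint>

// ctlz with zero defined to 32, matching `llvm.ctlz.i32(%dec, i1 false)`.
static unsigned Ctlz32(uint32_t V) { return V == 0 ? 32 : __builtin_clz(V); }

// Before: 1 << (32 - ctlz(X - 1)), with a cmov selecting 1 when X < 2.
static uint32_t BitCeilBefore(uint32_t X) {
  return X < 2 ? 1 : 1u << (32 - Ctlz32(X - 1));
}

// After: no cmov. For X <= 1, X - 1 is 0 or ~0u, so ctlz yields 32 or 0,
// and (-ctlz) & 31 == 0 either way; 1 << 0 == 1 recovers the value the
// cmov used to pick. The explicit `& 31` is free in hardware because
// SHLX already masks its count to 5 bits.
static uint32_t BitCeilAfter(uint32_t X) {
  return 1u << ((0u - Ctlz32(X - 1)) & 31);
}

int main() {
  for (uint32_t X : {0u, 1u, 2u, 3u, 4u, 5u, 255u, 256u, 0x40000000u})
    assert(BitCeilBefore(X) == BitCeilAfter(X));
}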