diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6339,6 +6339,9 @@
                                        SDValue Neg, SDValue InnerPos,
                                        SDValue InnerNeg, unsigned PosOpcode,
                                        unsigned NegOpcode, const SDLoc &DL) {
+  EVT VT = N0.getValueType();
+  unsigned EltBits = VT.getScalarSizeInBits();
+
   // fold (or (shl x0, (*ext y)),
   //          (srl x1, (*ext (sub 32, y)))) ->
   //   (fshl x0, x1, y) or (fshr x0, x1, (sub 32, y))
@@ -6346,13 +6349,52 @@
   // fold (or (shl x0, (*ext (sub 32, y))),
   //          (srl x1, (*ext y))) ->
   //   (fshr x0, x1, y) or (fshl x0, x1, (sub 32, y))
-  EVT VT = N0.getValueType();
-  if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits(), DAG)) {
+  if (matchRotateSub(InnerPos, InnerNeg, EltBits, DAG)) {
     bool HasPos = TLI.isOperationLegalOrCustom(PosOpcode, VT);
     return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, N0, N1,
                        HasPos ? Pos : Neg);
   }
 
+  // Matching the shift+xor cases, we can't easily use the xor'd shift amount
+  // so for now just use the PosOpcode case if it's legal.
+  // TODO: When can we use the NegOpcode case?
+  if (PosOpcode == ISD::FSHL && isPowerOf2_32(EltBits)) {
+    auto IsBinOpImm = [](SDValue Op, unsigned BinOpc, unsigned Imm) {
+      if (Op.getOpcode() != BinOpc)
+        return false;
+      ConstantSDNode *Cst = isConstOrConstSplat(Op.getOperand(1));
+      return Cst && (Cst->getAPIntValue() == Imm);
+    };
+
+    // fold (or (shl x0, y), (srl (srl x1, 1), (xor y, 31)))
+    //   -> (fshl x0, x1, y)
+    if (IsBinOpImm(N1, ISD::SRL, 1) &&
+        IsBinOpImm(InnerNeg, ISD::XOR, EltBits - 1) &&
+        InnerPos == InnerNeg.getOperand(0) &&
+        TLI.isOperationLegalOrCustom(ISD::FSHL, VT)) {
+      return DAG.getNode(ISD::FSHL, DL, VT, N0, N1.getOperand(0), Pos);
+    }
+
+    // fold (or (shl (shl x0, 1), (xor y, 31)), (srl x1, y))
+    //   -> (fshr x0, x1, y)
+    if (IsBinOpImm(N0, ISD::SHL, 1) &&
+        IsBinOpImm(InnerPos, ISD::XOR, EltBits - 1) &&
+        InnerNeg == InnerPos.getOperand(0) &&
+        TLI.isOperationLegalOrCustom(ISD::FSHR, VT)) {
+      return DAG.getNode(ISD::FSHR, DL, VT, N0.getOperand(0), N1, Neg);
+    }
+
+    // fold (or (shl (add x0, x0), (xor y, 31)), (srl x1, y))
+    //   -> (fshr x0, x1, y)
+    // TODO: Should add(x,x) -> shl(x,1) be a general DAG canonicalization?
+    if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N0.getOperand(1) &&
+        IsBinOpImm(InnerPos, ISD::XOR, EltBits - 1) &&
+        InnerNeg == InnerPos.getOperand(0) &&
+        TLI.isOperationLegalOrCustom(ISD::FSHR, VT)) {
+      return DAG.getNode(ISD::FSHR, DL, VT, N0.getOperand(0), N1, Neg);
+    }
+  }
+
   return SDValue();
 }
 
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -42045,114 +42045,6 @@
   return Ret;
 }
 
-static SDValue combineOrShiftToFunnelShift(SDNode *N, SelectionDAG &DAG,
-                                           const X86Subtarget &Subtarget) {
-  assert(N->getOpcode() == ISD::OR && "Expected ISD::OR node");
-  SDValue N0 = N->getOperand(0);
-  SDValue N1 = N->getOperand(1);
-  EVT VT = N->getValueType(0);
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-
-  if (!TLI.isOperationLegalOrCustom(ISD::FSHL, VT) ||
-      !TLI.isOperationLegalOrCustom(ISD::FSHR, VT))
-    return SDValue();
-
-  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
-  bool OptForSize = DAG.shouldOptForSize();
-  unsigned Bits = VT.getScalarSizeInBits();
-
-  // SHLD/SHRD instructions have lower register pressure, but on some
-  // platforms they have higher latency than the equivalent
-  // series of shifts/or that would otherwise be generated.
-  // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
-  // have higher latencies and we are not optimizing for size.
-  if (!OptForSize && Subtarget.isSHLDSlow())
-    return SDValue();
-
-  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
-    std::swap(N0, N1);
-  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
-    return SDValue();
-  if (!N0.hasOneUse() || !N1.hasOneUse())
-    return SDValue();
-
-  EVT ShiftVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
-
-  SDValue ShAmt0 = N0.getOperand(1);
-  if (ShAmt0.getValueType() != ShiftVT)
-    return SDValue();
-  SDValue ShAmt1 = N1.getOperand(1);
-  if (ShAmt1.getValueType() != ShiftVT)
-    return SDValue();
-
-  // Peek through any modulo shift masks.
-  SDValue ShMsk0;
-  if (ShAmt0.getOpcode() == ISD::AND &&
-      isa<ConstantSDNode>(ShAmt0.getOperand(1)) &&
-      ShAmt0.getConstantOperandAPInt(1) == (Bits - 1)) {
-    ShMsk0 = ShAmt0;
-    ShAmt0 = ShAmt0.getOperand(0);
-  }
-  SDValue ShMsk1;
-  if (ShAmt1.getOpcode() == ISD::AND &&
-      isa<ConstantSDNode>(ShAmt1.getOperand(1)) &&
-      ShAmt1.getConstantOperandAPInt(1) == (Bits - 1)) {
-    ShMsk1 = ShAmt1;
-    ShAmt1 = ShAmt1.getOperand(0);
-  }
-
-  if (ShAmt0.getOpcode() == ISD::TRUNCATE)
-    ShAmt0 = ShAmt0.getOperand(0);
-  if (ShAmt1.getOpcode() == ISD::TRUNCATE)
-    ShAmt1 = ShAmt1.getOperand(0);
-
-  SDLoc DL(N);
-  unsigned Opc = ISD::FSHL;
-  SDValue Op0 = N0.getOperand(0);
-  SDValue Op1 = N1.getOperand(0);
-  if (ShAmt0.getOpcode() == ISD::SUB || ShAmt0.getOpcode() == ISD::XOR) {
-    Opc = ISD::FSHR;
-    std::swap(Op0, Op1);
-    std::swap(ShAmt0, ShAmt1);
-    std::swap(ShMsk0, ShMsk1);
-  }
-
-  auto GetFunnelShift = [&DAG, &DL, VT, Opc, &ShiftVT](SDValue Op0, SDValue Op1,
-                                                       SDValue Amt) {
-    if (Opc == ISD::FSHR)
-      std::swap(Op0, Op1);
-    return DAG.getNode(Opc, DL, VT, Op0, Op1,
-                       DAG.getNode(ISD::TRUNCATE, DL, ShiftVT, Amt));
-  };
-
-  // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHL( X, Y, C )
-  // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHR( Y, X, C )
-  if (ShAmt1.getOpcode() == ISD::XOR) {
-    SDValue Mask = ShAmt1.getOperand(1);
-    if (auto *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
-      unsigned InnerShift = (ISD::FSHL == Opc ? ISD::SRL : ISD::SHL);
-      SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
-      if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
-        ShAmt1Op0 = ShAmt1Op0.getOperand(0);
-      if (MaskC->getSExtValue() == (Bits - 1) &&
-          (ShAmt1Op0 == ShAmt0 || ShAmt1Op0 == ShMsk0)) {
-        if (Op1.getOpcode() == InnerShift &&
-            isa<ConstantSDNode>(Op1.getOperand(1)) &&
-            Op1.getConstantOperandAPInt(1).isOneValue()) {
-          return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0);
-        }
-        // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
-        if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
-            Op1.getOperand(0) == Op1.getOperand(1)) {
-          return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0);
-        }
-      }
-    }
-  }
-
-  return SDValue();
-}
-
 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
@@ -42208,9 +42100,6 @@
   if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
     return R;
 
-  if (SDValue R = combineOrShiftToFunnelShift(N, DAG, Subtarget))
-    return R;
-
   // Attempt to recursively combine an OR of shuffles.
   if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
     SDValue Op(N, 0);
diff --git a/llvm/test/CodeGen/X86/pr32282.ll b/llvm/test/CodeGen/X86/pr32282.ll
--- a/llvm/test/CodeGen/X86/pr32282.ll
+++ b/llvm/test/CodeGen/X86/pr32282.ll
@@ -13,18 +13,18 @@
 ; X86-LABEL: foo:
 ; X86:         # %bb.0:
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl d+4, %eax
+; X86-NEXT:    movl d, %eax
 ; X86-NEXT:    notl %eax
-; X86-NEXT:    movl d, %ecx
+; X86-NEXT:    movl d+4, %ecx
 ; X86-NEXT:    notl %ecx
-; X86-NEXT:    andl $-566231040, %ecx # imm = 0xDE400000
-; X86-NEXT:    andl $701685459, %eax # imm = 0x29D2DED3
-; X86-NEXT:    shrdl $21, %eax, %ecx
-; X86-NEXT:    shrl $21, %eax
-; X86-NEXT:    addl $7, %ecx
-; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    andl $701685459, %ecx # imm = 0x29D2DED3
+; X86-NEXT:    andl $-566231040, %eax # imm = 0xDE400000
+; X86-NEXT:    shrdl $21, %ecx, %eax
+; X86-NEXT:    shrl $21, %ecx
+; X86-NEXT:    addl $7, %eax
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __divdi3
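
Note: a minimal C sketch (illustrative only, not part of the patch) of the source-level idiom that the new shift+xor matching in MatchFunnelPosNeg targets. The function name and the 32-bit type are assumptions for the example; the key point is that the UB-free variable funnel shift is written as ((x1 >> 1) >> (y ^ 31)) rather than (x1 >> (32 - y)), which is the (srl (srl x1, 1), (xor y, 31)) DAG pattern folded to ISD::FSHL above.

  /* Hypothetical example: funnel-shift-left of the concatenated pair x0:x1
     by a variable amount y, written so no shift count ever reaches 32.
     After this patch the generic DAGCombiner folds the whole OR into
     fshl(x0, x1, y); on x86 that typically selects SHLD. */
  unsigned funnel_shift_left(unsigned x0, unsigned x1, unsigned y) {
    y &= 31;                                  /* modulo shift mask */
    return (x0 << y) | ((x1 >> 1) >> (y ^ 31));
  }

For y in [1,31], (x1 >> 1) >> (y ^ 31) equals x1 >> (32 - y); for y == 0 it evaluates to 0, so the result is just x0, matching fshl semantics.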