diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3452,6 +3452,22 @@ return true; } + /// Return true if it's profitable to replace + /// + /// shift x, non-constant + /// + /// with two instances of + /// + /// shift x, constant + /// + /// where `shift` is a shift or rotate operation (not including funnel shift + /// ops). + virtual bool + shiftOrRotateIsFasterWithConstantShiftAmount(const SDNode *N, + CombineLevel Level) const { + return false; + } + /// Return true if the target has native support for the specified value type /// and it is 'desirable' to use the type for the given node type. e.g. On x86 /// i16 is legal, but undesirable since i16 instruction encodings are longer diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -498,6 +498,9 @@ SDValue reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0, SDValue N1, SDNodeFlags Flags); + // SHL, SRA, SRL, ROTL, ROTR, but not FSHL or FSHR. + SDValue visitShiftOrRotate(SDNode *N); + SDValue visitShiftByConstant(SDNode *N); SDValue foldSelectOfConstants(SDNode *N); @@ -7375,6 +7378,32 @@ return DAG.getNode(LogicOpcode, DL, VT, NewShift1, NewShift2); } +SDValue DAGCombiner::visitShiftOrRotate(SDNode *N) { + auto ShiftOpcode = N->getOpcode(); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + // On some targets, shifting/rotating by a constant is faster than + // shifting/rotating by a register, so we fold: + // + // shift lhs, (select cond, constant1, constant2) --> + // select cond, (shift lhs, constant1), (shift lhs, constant2) + // + // TODO: This logic could be extended to ops other than shift/rotate.
+ if (OptLevel != CodeGenOpt::None && RHS.getOpcode() == ISD::SELECT && + RHS.hasOneUse() && isa<ConstantSDNode>(RHS.getOperand(1)) && + isa<ConstantSDNode>(RHS.getOperand(2)) && + TLI.shiftOrRotateIsFasterWithConstantShiftAmount(N, Level)) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + return DAG.getNode( + ISD::SELECT, DL, VT, RHS.getOperand(0), + DAG.getNode(ShiftOpcode, DL, VT, LHS, RHS.getOperand(1)), + DAG.getNode(ShiftOpcode, DL, VT, LHS, RHS.getOperand(2))); + } + return SDValue(); +} + /// Handle transforms common to the three shifts, when the shift amount is a /// constant. /// We are looking for: (shift being one of shl/sra/srl) @@ -7537,6 +7566,10 @@ } } } + + if (SDValue V = visitShiftOrRotate(N)) + return V; + return SDValue(); } @@ -7793,6 +7826,9 @@ return DAG.getVScale(DL, VT, C0 << C1); } + if (SDValue V = visitShiftOrRotate(N)) + return V; + return SDValue(); } @@ -7982,6 +8018,9 @@ if (SDValue NewSRA = visitShiftByConstant(N)) return NewSRA; + if (SDValue V = visitShiftOrRotate(N)) + return V; + return SDValue(); } @@ -8207,6 +8246,9 @@ } } + if (SDValue V = visitShiftOrRotate(N)) + return V; + return SDValue(); } diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -796,6 +796,9 @@ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + bool shiftOrRotateIsFasterWithConstantShiftAmount( + const SDNode *N, CombineLevel Level) const override; + /// Return true if the target has native support for /// the specified value type and it is 'desirable' to use the type for the /// given node type. e.g. 
On x86 i16 is legal, but undesirable since i16 diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -47165,6 +47165,19 @@ return true; } +bool X86TargetLowering::shiftOrRotateIsFasterWithConstantShiftAmount( + const SDNode *N, CombineLevel Level) const { + // On most x86 chips, shifts/rotates by a constant are faster than + // shifts/rotates by a register. + unsigned Opcode = N->getOpcode(); + (void)Opcode; + assert(Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL || + Opcode == ISD::ROTL || Opcode == ISD::ROTR); + // Scalar shifts by an immediate are faster than scalar shifts by a register. + // But vector shifts have no such preference. + return !N->getValueType(0).isVector(); + } + //===----------------------------------------------------------------------===// // X86 Inline Assembly Support //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/X86/dagcombine-shifts.ll b/llvm/test/CodeGen/X86/dagcombine-shifts.ll --- a/llvm/test/CodeGen/X86/dagcombine-shifts.ll +++ b/llvm/test/CodeGen/X86/dagcombine-shifts.ll @@ -226,12 +226,11 @@ ; CHECK-LABEL: shl_select: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: shrl $3, %ecx +; CHECK-NEXT: shrl $6, %eax ; CHECK-NEXT: testb $1, %sil -; CHECK-NEXT: sete %cl -; CHECK-NEXT: leal 3(%rcx,%rcx,2), %ecx -; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-NEXT: shrl %cl, %eax +; CHECK-NEXT: cmovnel %ecx, %eax ; CHECK-NEXT: retq %shift_amnt = select i1 %cond, i32 3, i32 6 %ret = lshr i32 %x, %shift_amnt @@ -242,12 +241,11 @@ ; CHECK-LABEL: ashr_select: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: sarl $3, %ecx +; CHECK-NEXT: sarl $6, %eax ; CHECK-NEXT: testb 
$1, %sil -; CHECK-NEXT: sete %cl -; CHECK-NEXT: leal 3(%rcx,%rcx,2), %ecx -; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-NEXT: sarl %cl, %eax +; CHECK-NEXT: cmovnel %ecx, %eax ; CHECK-NEXT: retq %shift_amnt = select i1 %cond, i32 3, i32 6 %ret = ashr i32 %x, %shift_amnt @@ -258,12 +256,11 @@ ; CHECK-LABEL: lshr_select: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: shrl $3, %ecx +; CHECK-NEXT: shrl $6, %eax ; CHECK-NEXT: testb $1, %sil -; CHECK-NEXT: sete %cl -; CHECK-NEXT: leal 3(%rcx,%rcx,2), %ecx -; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-NEXT: shrl %cl, %eax +; CHECK-NEXT: cmovnel %ecx, %eax ; CHECK-NEXT: retq %shift_amnt = select i1 %cond, i32 3, i32 6 %ret = lshr i32 %x, %shift_amnt @@ -274,12 +271,11 @@ ; CHECK-LABEL: rot_select: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: roll $3, %ecx +; CHECK-NEXT: roll $6, %eax ; CHECK-NEXT: testb $1, %sil -; CHECK-NEXT: sete %cl -; CHECK-NEXT: leal 3(%rcx,%rcx,2), %ecx -; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-NEXT: roll %cl, %eax +; CHECK-NEXT: cmovnel %ecx, %eax ; CHECK-NEXT: retq %amnt = select i1 %cond, i32 3, i32 6 %amnt2 = sub i32 32, %amnt