diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3327,6 +3327,21 @@
     return true;
   }
 
+  /// Return true if it's profitable to replace
+  ///
+  ///   shift x, non-constant
+  ///
+  /// with two instances of
+  ///
+  ///   shift x, constant
+  ///
+  /// where `shift` is a shift or rotate operation of the given opcode.
+  virtual bool
+  shiftOrRotateIsFasterWithConstantShiftAmount(unsigned opcode,
+                                               CombineLevel level) const {
+    return false;
+  }
+
   // Return true if it is profitable to combine a BUILD_VECTOR with a stride-pattern
   // to a shuffle and a truncate.
   // Example of such a combine:
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -488,6 +488,9 @@
     SDValue reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
                            SDValue N1, SDNodeFlags Flags);
 
+    // SHL, SRA, SRL, ROTL, ROTR, but not FSHL or FSHR.
+    SDValue visitShiftOrRotate(SDNode *N);
+
     SDValue visitShiftByConstant(SDNode *N);
 
     SDValue foldSelectOfConstants(SDNode *N);
@@ -7120,6 +7123,32 @@
   return DAG.getNode(LogicOpcode, DL, VT, NewShift1, NewShift2);
 }
 
+SDValue DAGCombiner::visitShiftOrRotate(SDNode *N) {
+  auto ShiftOpcode = N->getOpcode();
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  // On some targets, shifting/rotating by a constant is faster than
+  // shifting/rotating by a register, so we fold:
+  //
+  //   shift lhs, (select cond, constant1, constant2) -->
+  //   select cond, (shift lhs, constant1), (shift lhs, constant2)
+  //
+  // TODO: This logic could be extended to ops other than shift/rotate.
+  if (RHS.getOpcode() == ISD::SELECT && RHS.hasOneUse() &&
+      isa<ConstantSDNode>(RHS.getOperand(1)) &&
+      isa<ConstantSDNode>(RHS.getOperand(2)) &&
+      TLI.shiftOrRotateIsFasterWithConstantShiftAmount(ShiftOpcode, Level)) {
+    SDLoc DL(N);
+    EVT VT = N->getValueType(0);
+    return DAG.getNode(
+        ISD::SELECT, DL, VT, RHS.getOperand(0),
+        DAG.getNode(ShiftOpcode, DL, VT, LHS, RHS.getOperand(1)),
+        DAG.getNode(ShiftOpcode, DL, VT, LHS, RHS.getOperand(2)));
+  }
+  return SDValue();
+}
+
 /// Handle transforms common to the three shifts, when the shift amount is a
 /// constant.
 /// We are looking for: (shift being one of shl/sra/srl)
@@ -7227,6 +7256,9 @@
   EVT VT = N->getValueType(0);
   unsigned Bitsize = VT.getScalarSizeInBits();
 
+  if (SDValue V = visitShiftOrRotate(N))
+    return V;
+
   // fold (rot x, 0) -> x
   if (isNullOrNullSplat(N1))
     return N0;
@@ -7284,6 +7316,9 @@
   if (SDValue V = DAG.simplifyShift(N0, N1))
     return V;
 
+  if (SDValue V = visitShiftOrRotate(N))
+    return V;
+
   EVT VT = N0.getValueType();
   EVT ShiftVT = N1.getValueType();
   unsigned OpSizeInBits = VT.getScalarSizeInBits();
@@ -7534,6 +7569,9 @@
   if (SDValue V = DAG.simplifyShift(N0, N1))
     return V;
 
+  if (SDValue V = visitShiftOrRotate(N))
+    return V;
+
   EVT VT = N0.getValueType();
   unsigned OpSizeInBits = VT.getScalarSizeInBits();
@@ -7725,6 +7763,9 @@
   if (SDValue V = DAG.simplifyShift(N0, N1))
     return V;
 
+  if (SDValue V = visitShiftOrRotate(N))
+    return V;
+
   EVT VT = N0.getValueType();
   unsigned OpSizeInBits = VT.getScalarSizeInBits();
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -777,6 +777,9 @@
     SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
 
+    bool shiftOrRotateIsFasterWithConstantShiftAmount(
+        unsigned Opcode, CombineLevel Level) const override;
+
     // Return true if it is profitable to combine a BUILD_VECTOR with a
     // stride-pattern to a shuffle and a truncate.
     // Example of such a combine:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -45265,6 +45265,15 @@
   return true;
 }
 
+bool X86TargetLowering::shiftOrRotateIsFasterWithConstantShiftAmount(
+    unsigned Opcode, CombineLevel /*Level*/) const {
+  // On most x86 chips, shifts/rotates by a constant are faster than
+  // shifts/rotates by a register.
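+  // For example, `shll $3, %eax` encodes the shift amount as an immediate,
+  // whereas a variable amount must first be placed in %cl (`shll %cl, %eax`).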
+  assert(Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL ||
+         Opcode == ISD::ROTL || Opcode == ISD::ROTR);
+  return true;
+}
+
 bool X86TargetLowering::
 isDesirableToCombineBuildVectorToShuffleTruncate(
     ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const {
diff --git a/llvm/test/CodeGen/X86/dagcombine-select.ll b/llvm/test/CodeGen/X86/dagcombine-select.ll
--- a/llvm/test/CodeGen/X86/dagcombine-select.ll
+++ b/llvm/test/CodeGen/X86/dagcombine-select.ll
@@ -194,12 +194,10 @@
 define i32 @shl_constant_sel_constants(i1 %cond) {
 ; CHECK-LABEL: shl_constant_sel_constants:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %ecx
-; CHECK-NEXT:    andb $1, %cl
-; CHECK-NEXT:    xorb $3, %cl
-; CHECK-NEXT:    movl $1, %eax
-; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT:    shll %cl, %eax
+; CHECK-NEXT:    notb %dil
+; CHECK-NEXT:    movzbl %dil, %eax
+; CHECK-NEXT:    andl $1, %eax
+; CHECK-NEXT:    leal 4(,%rax,4), %eax
 ; CHECK-NEXT:    retq
   %sel = select i1 %cond, i32 2, i32 3
   %bo = shl i32 1, %sel
@@ -209,12 +207,9 @@
 define i32 @lshr_constant_sel_constants(i1 %cond) {
 ; CHECK-LABEL: lshr_constant_sel_constants:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %ecx
-; CHECK-NEXT:    andb $1, %cl
-; CHECK-NEXT:    xorb $3, %cl
-; CHECK-NEXT:    movl $64, %eax
-; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT:    shrl %cl, %eax
+; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT:    andl $1, %edi
+; CHECK-NEXT:    leal 8(,%rdi,8), %eax
 ; CHECK-NEXT:    retq
   %sel = select i1 %cond, i32 2, i32 3
   %bo = lshr i32 64, %sel
@@ -224,12 +219,10 @@
 define i32 @ashr_constant_sel_constants(i1 %cond) {
 ; CHECK-LABEL: ashr_constant_sel_constants:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %ecx
-; CHECK-NEXT:    andb $1, %cl
-; CHECK-NEXT:    xorb $3, %cl
-; CHECK-NEXT:    movl $128, %eax
-; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT:    shrl %cl, %eax
+; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT:    andl $1, %edi
+; CHECK-NEXT:    shll $4, %edi
+; CHECK-NEXT:    leal 16(%rdi), %eax
 ; CHECK-NEXT:    retq
   %sel = select i1 %cond, i32 2, i32 3
   %bo = ashr i32 128, %sel
diff --git a/llvm/test/CodeGen/X86/dagcombine-shifts.ll b/llvm/test/CodeGen/X86/dagcombine-shifts.ll
--- a/llvm/test/CodeGen/X86/dagcombine-shifts.ll
+++ b/llvm/test/CodeGen/X86/dagcombine-shifts.ll
@@ -215,3 +215,143 @@
 declare void @f(i64)
 
+; The *_select tests below check that we do the following transformation:
+;
+;   shift lhs, (select cond, constant1, constant2) -->
+;   select cond, (shift lhs, constant1), (shift lhs, constant2)
+;
+; When updating these testcases, ensure that there are two shift instructions
+; in the result and that they take immediates rather than registers.
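+;
+; As an illustrative sketch only (hand-written IR with made-up value names,
+; not FileCheck output), the fold rewrites
+;
+;   %amt = select i1 %cond, i32 3, i32 6
+;   %ret = lshr i32 %x, %amt
+;
+; into the equivalent of
+;
+;   %a = lshr i32 %x, 3
+;   %b = lshr i32 %x, 6
+;   %ret = select i1 %cond, i32 %a, i32 %b
+;
+; so that both shifts take immediate amounts.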
+define i32 @shl_select(i32 %x, i1 %cond) {
+; CHECK-LABEL: shl_select:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    movl %edi, %ecx
+; CHECK-NEXT:    shrl $3, %ecx
+; CHECK-NEXT:    shrl $6, %eax
+; CHECK-NEXT:    testb $1, %sil
+; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    retq
+  %shift_amnt = select i1 %cond, i32 3, i32 6
+  %ret = lshr i32 %x, %shift_amnt
+  ret i32 %ret
+}
+
+define i32 @ashr_select(i32 %x, i1 %cond) {
+; CHECK-LABEL: ashr_select:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    movl %edi, %ecx
+; CHECK-NEXT:    sarl $3, %ecx
+; CHECK-NEXT:    sarl $6, %eax
+; CHECK-NEXT:    testb $1, %sil
+; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    retq
+  %shift_amnt = select i1 %cond, i32 3, i32 6
+  %ret = ashr i32 %x, %shift_amnt
+  ret i32 %ret
+}
+
+define i32 @lshr_select(i32 %x, i1 %cond) {
+; CHECK-LABEL: lshr_select:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    movl %edi, %ecx
+; CHECK-NEXT:    shrl $3, %ecx
+; CHECK-NEXT:    shrl $6, %eax
+; CHECK-NEXT:    testb $1, %sil
+; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    retq
+  %shift_amnt = select i1 %cond, i32 3, i32 6
+  %ret = lshr i32 %x, %shift_amnt
+  ret i32 %ret
+}
+
+; Check that we don't perform the folding described in shl_select when the
+; shift amount is used other than as an input to the shift instruction.
+;
+; When updating this testcase, check that there's exactly one shrl instruction
+; generated.
+declare void @i32_foo(i32)
+define i32 @shl_select_not_folded_if_shift_amnt_is_used(i32 %x, i1 %cond) {
+; CHECK-LABEL: shl_select_not_folded_if_shift_amnt_is_used:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset %rbx, -24
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movl %edi, %ebx
+; CHECK-NEXT:    notb %sil
+; CHECK-NEXT:    movzbl %sil, %eax
+; CHECK-NEXT:    andl $1, %eax
+; CHECK-NEXT:    leal 3(%rax,%rax,2), %ebp
+; CHECK-NEXT:    movl %ebp, %edi
+; CHECK-NEXT:    callq i32_foo
+; CHECK-NEXT:    movl %ebp, %ecx
+; CHECK-NEXT:    shrl %cl, %ebx
+; CHECK-NEXT:    movl %ebx, %eax
+; CHECK-NEXT:    addq $8, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+  %shift_amnt = select i1 %cond, i32 3, i32 6
+  call void @i32_foo(i32 %shift_amnt)
+  %ret = lshr i32 %x, %shift_amnt
+  ret i32 %ret
+}
+
+; Check that we don't perform the folding described in shl_select when one of
+; the shift amounts is not a constant.
+;
+; When updating these testcases, check that there's exactly one shrl
+; instruction generated in each.
+define i32 @shl_select_not_folded_if_shift_amnt_is_nonconstant_1(i32 %x, i32 %a, i1 %cond) {
+; CHECK-LABEL: shl_select_not_folded_if_shift_amnt_is_nonconstant_1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    testb $1, %dl
+; CHECK-NEXT:    movl $6, %ecx
+; CHECK-NEXT:    cmovnel %esi, %ecx
+; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT:    shrl %cl, %eax
+; CHECK-NEXT:    retq
+  %shift_amnt = select i1 %cond, i32 %a, i32 6
+  %ret = lshr i32 %x, %shift_amnt
+  ret i32 %ret
+}
+
+define i32 @shl_select_not_folded_if_shift_amnt_is_nonconstant_2(i32 %x, i32 %a, i1 %cond) {
+; CHECK-LABEL: shl_select_not_folded_if_shift_amnt_is_nonconstant_2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    testb $1, %dl
+; CHECK-NEXT:    movl $3, %ecx
+; CHECK-NEXT:    cmovel %esi, %ecx
+; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT:    shrl %cl, %eax
+; CHECK-NEXT:    retq
+  %shift_amnt = select i1 %cond, i32 3, i32 %a
+  %ret = lshr i32 %x, %shift_amnt
+  ret i32 %ret
+}
+
+define i32 @shl_select_not_folded_if_shift_amnt_is_nonconstant_3(i32 %x, i32 %a, i32 %b, i1 %cond) {
+; CHECK-LABEL: shl_select_not_folded_if_shift_amnt_is_nonconstant_3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    testb $1, %cl
+; CHECK-NEXT:    cmovel %edx, %esi
+; CHECK-NEXT:    movl %esi, %ecx
+; CHECK-NEXT:    shrl %cl, %eax
+; CHECK-NEXT:    retq
+  %shift_amnt = select i1 %cond, i32 %a, i32 %b
+  %ret = lshr i32 %x, %shift_amnt
+  ret i32 %ret
+}
diff --git a/llvm/test/CodeGen/X86/pr22338.ll b/llvm/test/CodeGen/X86/pr22338.ll
--- a/llvm/test/CodeGen/X86/pr22338.ll
+++ b/llvm/test/CodeGen/X86/pr22338.ll
@@ -5,51 +5,52 @@
 define i32 @fn(i32 %a0, i32 %a1) {
 ; X86-LABEL: fn:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %ebx, -8
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl $1, {{[0-9]+}}(%esp)
-; X86-NEXT:    sete %cl
-; X86-NEXT:    setne %al
-; X86-NEXT:    cmpl $1, {{[0-9]+}}(%esp)
-; X86-NEXT:    sete %dl
-; X86-NEXT:    negl %eax
-; X86-NEXT:    addb %cl, %cl
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    shll %cl, %ebx
-; X86-NEXT:    addb %dl, %dl
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    .cfi_offset %esi, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    cmpl $1, %edx
+; X86-NEXT:    setne %cl
+; X86-NEXT:    negl %ecx
+; X86-NEXT:    leal (,%ecx,4), %eax
+; X86-NEXT:    cmpl $1, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    jne .LBB0_2
+; X86-NEXT:  # %bb.1: # %entry
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:  .LBB0_2: # %entry
+; X86-NEXT:    cmpl $1, %esi
+; X86-NEXT:    je .LBB0_4
+; X86-NEXT:  # %bb.3: # %entry
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB0_1: # %bb1
+; X86-NEXT:  .LBB0_4: # %bb1
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    testl %ebx, %ebx
-; X86-NEXT:    je .LBB0_1
-; X86-NEXT:  # %bb.2: # %bb2
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    je .LBB0_4
+; X86-NEXT:  # %bb.5: # %bb2
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: fn:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    cmpl $1, %edi
-; X64-NEXT:    sete %cl
-; X64-NEXT:    setne %al
+; X64-NEXT:    setne %dl
+; X64-NEXT:    negl %edx
+; X64-NEXT:    leal (,%rdx,4), %eax
+; X64-NEXT:    cmpl $1, %edi
+; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    cmovnel %edx, %ecx
 ; X64-NEXT:    cmpl $1, %esi
-; X64-NEXT:    sete %dl
-; X64-NEXT:    negl %eax
-; X64-NEXT:    addb %cl, %cl
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    shll %cl, %esi
-; X64-NEXT:    addb %dl, %dl
-; X64-NEXT:    movl %edx, %ecx
-; X64-NEXT:    shll %cl, %eax
+; X64-NEXT:    cmovnel %edx, %eax
 ; X64-NEXT:    .p2align 4, 0x90
 ; X64-NEXT:  .LBB0_1: # %bb1
 ; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    testl %esi, %esi
+; X64-NEXT:    testl %ecx, %ecx
 ; X64-NEXT:    je .LBB0_1
 ; X64-NEXT:  # %bb.2: # %bb2
 ; X64-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/select.ll b/llvm/test/CodeGen/X86/select.ll
--- a/llvm/test/CodeGen/X86/select.ll
+++ b/llvm/test/CodeGen/X86/select.ll
@@ -1094,40 +1094,33 @@
 }
 
 define i32 @trunc_select_miscompile(i32 %a, i1 zeroext %cc) {
-; GENERIC-LABEL: trunc_select_miscompile:
-; GENERIC:       ## %bb.0:
-; GENERIC-NEXT:    ## kill: def $esi killed $esi def $rsi
-; GENERIC-NEXT:    movl %edi, %eax
-; GENERIC-NEXT:    leal 2(%rsi), %ecx
-; GENERIC-NEXT:    ## kill: def $cl killed $cl killed $ecx
-; GENERIC-NEXT:    shll %cl, %eax
-; GENERIC-NEXT:    retq
-;
-; ATOM-LABEL: trunc_select_miscompile:
-; ATOM:       ## %bb.0:
-; ATOM-NEXT:    ## kill: def $esi killed $esi def $rsi
-; ATOM-NEXT:    leal 2(%rsi), %ecx
-; ATOM-NEXT:    movl %edi, %eax
-; ATOM-NEXT:    ## kill: def $cl killed $cl killed $ecx
-; ATOM-NEXT:    shll %cl, %eax
-; ATOM-NEXT:    nop
-; ATOM-NEXT:    nop
-; ATOM-NEXT:    retq
+; CHECK-LABEL: trunc_select_miscompile:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
+; CHECK-NEXT:    leal (,%rdi,8), %ecx
+; CHECK-NEXT:    leal (,%rdi,4), %eax
+; CHECK-NEXT:    testl %esi, %esi
+; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    retq
 ;
 ; ATHLON-LABEL: trunc_select_miscompile:
 ; ATHLON:       ## %bb.0:
 ; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; ATHLON-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; ATHLON-NEXT:    orb $2, %cl
-; ATHLON-NEXT:    shll %cl, %eax
+; ATHLON-NEXT:    leal (,%eax,8), %ecx
+; ATHLON-NEXT:    shll $2, %eax
+; ATHLON-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
+; ATHLON-NEXT:    cmovnel %ecx, %eax
 ; ATHLON-NEXT:    retl
 ;
 ; MCU-LABEL: trunc_select_miscompile:
 ; MCU:       # %bb.0:
-; MCU-NEXT:    movl %edx, %ecx
-; MCU-NEXT:    orb $2, %cl
-; MCU-NEXT:    # kill: def $cl killed $cl killed $ecx
-; MCU-NEXT:    shll %cl, %eax
+; MCU-NEXT:    testl %edx, %edx
+; MCU-NEXT:    jne .LBB20_1
+; MCU-NEXT:  # %bb.2:
+; MCU-NEXT:    shll $2, %eax
+; MCU-NEXT:    retl
+; MCU-NEXT:  .LBB20_1:
+; MCU-NEXT:    shll $3, %eax
 ; MCU-NEXT:    retl
   %tmp1 = select i1 %cc, i32 3, i32 2
   %tmp2 = shl i32 %a, %tmp1
diff --git a/llvm/test/CodeGen/X86/shift-parts.ll b/llvm/test/CodeGen/X86/shift-parts.ll
--- a/llvm/test/CodeGen/X86/shift-parts.ll
+++ b/llvm/test/CodeGen/X86/shift-parts.ll
@@ -10,17 +10,14 @@
 ; CHECK-LABEL: int87:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movq g_144+{{.*}}(%rip), %rax
-; CHECK-NEXT:    movq g_144+{{.*}}(%rip), %rdx
-; CHECK-NEXT:    movzbl %sil, %ecx
-; CHECK-NEXT:    shll $6, %ecx
+; CHECK-NEXT:    movq g_144+{{.*}}(%rip), %rcx
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB0_1: # %for.cond
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movq %rdx, %rsi
-; CHECK-NEXT:    shrdq %cl, %rax, %rsi
-; CHECK-NEXT:    testb $64, %cl
-; CHECK-NEXT:    cmovneq %rax, %rsi
-; CHECK-NEXT:    orl $0, %esi
+; CHECK-NEXT:    testb $1, %sil
+; CHECK-NEXT:    movl %ecx, %edx
+; CHECK-NEXT:    cmovnel %eax, %edx
+; CHECK-NEXT:    testl %edx, %edx
 ; CHECK-NEXT:    je .LBB0_1
 ; CHECK-NEXT:  # %bb.2: # %if.then
 ; CHECK-NEXT:    movl $1, %eax