diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -6046,8 +6046,8 @@ !isOperationLegalOrCustomOrPromote(ISD::OR, VT))) return false; - // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW))) - // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW)) + // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW)) + // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW) SDValue X = Node->getOperand(0); SDValue Y = Node->getOperand(1); SDValue Z = Node->getOperand(2); @@ -6057,30 +6057,29 @@ SDLoc DL(SDValue(Node, 0)); EVT ShVT = Z.getValueType(); - SDValue BitWidthC = DAG.getConstant(EltSizeInBits, DL, ShVT); - SDValue Zero = DAG.getConstant(0, DL, ShVT); - + SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, ShVT); SDValue ShAmt; if (isPowerOf2_32(EltSizeInBits)) { - SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, ShVT); + // Z % BW -> Z & (BW - 1) ShAmt = DAG.getNode(ISD::AND, DL, ShVT, Z, Mask); } else { + SDValue BitWidthC = DAG.getConstant(EltSizeInBits, DL, ShVT); ShAmt = DAG.getNode(ISD::UREM, DL, ShVT, Z, BitWidthC); } + SDValue InvShAmt = DAG.getNode(ISD::SUB, DL, ShVT, Mask, ShAmt); - SDValue InvShAmt = DAG.getNode(ISD::SUB, DL, ShVT, BitWidthC, ShAmt); - SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, X, IsFSHL ? ShAmt : InvShAmt); - SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Y, IsFSHL ? InvShAmt : ShAmt); - SDValue Or = DAG.getNode(ISD::OR, DL, VT, ShX, ShY); - - // If (Z % BW == 0), then the opposite direction shift is shift-by-bitwidth, - // and that is undefined. We must compare and select to avoid UB. - EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ShVT); - - // For fshl, 0-shift returns the 1st arg (X). - // For fshr, 0-shift returns the 2nd arg (Y). - SDValue IsZeroShift = DAG.getSetCC(DL, CCVT, ShAmt, Zero, ISD::SETEQ); - Result = DAG.getSelect(DL, VT, IsZeroShift, IsFSHL ? 
X : Y, Or); + SDValue One = DAG.getConstant(1, DL, ShVT); + SDValue ShX, ShY; + if (IsFSHL) { + ShX = DAG.getNode(ISD::SHL, DL, VT, X, ShAmt); + SDValue ShY1 = DAG.getNode(ISD::SRL, DL, VT, Y, One); + ShY = DAG.getNode(ISD::SRL, DL, VT, ShY1, InvShAmt); + } else { + SDValue ShX1 = DAG.getNode(ISD::SHL, DL, VT, X, One); + ShX = DAG.getNode(ISD::SHL, DL, VT, ShX1, InvShAmt); + ShY = DAG.getNode(ISD::SRL, DL, VT, Y, ShAmt); + } + Result = DAG.getNode(ISD::OR, DL, VT, ShX, ShY); return true; } diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll --- a/llvm/test/CodeGen/X86/fshl.ll +++ b/llvm/test/CodeGen/X86/fshl.ll @@ -65,27 +65,16 @@ ; ; X86-SLOW-LABEL: var_shift_i16: ; X86-SLOW: # %bb.0: -; X86-SLOW-NEXT: pushl %edi -; X86-SLOW-NEXT: pushl %esi -; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %esi -; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %dl -; X86-SLOW-NEXT: andb $15, %dl -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SLOW-NEXT: movl %eax, %edi -; X86-SLOW-NEXT: movl %edx, %ecx -; X86-SLOW-NEXT: shll %cl, %edi -; X86-SLOW-NEXT: movb $16, %cl -; X86-SLOW-NEXT: subb %dl, %cl -; X86-SLOW-NEXT: shrl %cl, %esi -; X86-SLOW-NEXT: testb %dl, %dl -; X86-SLOW-NEXT: je .LBB1_2 -; X86-SLOW-NEXT: # %bb.1: -; X86-SLOW-NEXT: orl %esi, %edi -; X86-SLOW-NEXT: movl %edi, %eax -; X86-SLOW-NEXT: .LBB1_2: +; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-SLOW-NEXT: andb $15, %cl +; X86-SLOW-NEXT: shll %cl, %edx +; X86-SLOW-NEXT: shrl %eax +; X86-SLOW-NEXT: xorb $15, %cl +; X86-SLOW-NEXT: shrl %cl, %eax +; X86-SLOW-NEXT: orl %edx, %eax ; X86-SLOW-NEXT: # kill: def $ax killed $ax killed $eax -; X86-SLOW-NEXT: popl %esi -; X86-SLOW-NEXT: popl %edi ; X86-SLOW-NEXT: retl ; ; X64-FAST-LABEL: var_shift_i16: @@ -100,17 +89,15 @@ ; ; X64-SLOW-LABEL: var_shift_i16: ; X64-SLOW: # %bb.0: -; X64-SLOW-NEXT: movzwl %si, %eax -; X64-SLOW-NEXT: andb $15, %dl -; X64-SLOW-NEXT: movl %edi, %esi ; X64-SLOW-NEXT: movl %edx, %ecx -; X64-SLOW-NEXT: shll %cl, %esi -; X64-SLOW-NEXT: movb $16, %cl -; X64-SLOW-NEXT: subb %dl, %cl +; X64-SLOW-NEXT: movzwl %si, %eax +; X64-SLOW-NEXT: andb $15, %cl +; X64-SLOW-NEXT: shll %cl, %edi +; X64-SLOW-NEXT: xorb $15, %cl +; X64-SLOW-NEXT: shrl %eax +; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-SLOW-NEXT: shrl %cl, %eax -; X64-SLOW-NEXT: orl %esi, %eax -; X64-SLOW-NEXT: testb %dl, %dl -; X64-SLOW-NEXT: cmovel %edi, %eax +; X64-SLOW-NEXT: orl %edi, %eax ; X64-SLOW-NEXT: # kill: def $ax killed $ax killed $eax ; X64-SLOW-NEXT: retq %tmp = tail call i16 @llvm.fshl.i16(i16 %x, i16 %y, i16 %z) @@ -128,26 +115,15 @@ ; ; X86-SLOW-LABEL: var_shift_i32: ; X86-SLOW: # %bb.0: -; X86-SLOW-NEXT: pushl %edi -; X86-SLOW-NEXT: pushl %esi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %dl ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SLOW-NEXT: movl %eax, %edi -; X86-SLOW-NEXT: movl %edx, %ecx -; X86-SLOW-NEXT: shll %cl, %edi -; X86-SLOW-NEXT: andb $31, %dl -; X86-SLOW-NEXT: movl %edx, %ecx -; X86-SLOW-NEXT: negb %cl -; X86-SLOW-NEXT: shrl %cl, %esi -; X86-SLOW-NEXT: testb %dl, %dl -; X86-SLOW-NEXT: je .LBB2_2 -; X86-SLOW-NEXT: # %bb.1: -; X86-SLOW-NEXT: orl %esi, %edi -; X86-SLOW-NEXT: movl %edi, %eax -; X86-SLOW-NEXT: .LBB2_2: -; X86-SLOW-NEXT: popl %esi -; X86-SLOW-NEXT: popl %edi +; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SLOW-NEXT: shll %cl, %edx +; X86-SLOW-NEXT: shrl %eax +; 
X86-SLOW-NEXT: andb $31, %cl +; X86-SLOW-NEXT: xorb $31, %cl +; X86-SLOW-NEXT: shrl %cl, %eax +; X86-SLOW-NEXT: orl %edx, %eax ; X86-SLOW-NEXT: retl ; ; X64-FAST-LABEL: var_shift_i32: @@ -160,17 +136,15 @@ ; ; X64-SLOW-LABEL: var_shift_i32: ; X64-SLOW: # %bb.0: -; X64-SLOW-NEXT: movl %esi, %eax -; X64-SLOW-NEXT: movl %edi, %esi ; X64-SLOW-NEXT: movl %edx, %ecx -; X64-SLOW-NEXT: shll %cl, %esi -; X64-SLOW-NEXT: andb $31, %dl -; X64-SLOW-NEXT: movl %edx, %ecx -; X64-SLOW-NEXT: negb %cl +; X64-SLOW-NEXT: movl %esi, %eax +; X64-SLOW-NEXT: shll %cl, %edi +; X64-SLOW-NEXT: shrl %eax +; X64-SLOW-NEXT: andb $31, %cl +; X64-SLOW-NEXT: xorb $31, %cl +; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-SLOW-NEXT: shrl %cl, %eax -; X64-SLOW-NEXT: orl %esi, %eax -; X64-SLOW-NEXT: testb %dl, %dl -; X64-SLOW-NEXT: cmovel %edi, %eax +; X64-SLOW-NEXT: orl %edi, %eax ; X64-SLOW-NEXT: retq %tmp = tail call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z) ret i32 %tmp @@ -279,78 +253,61 @@ ; X86-SLOW-NEXT: pushl %ebx ; X86-SLOW-NEXT: pushl %edi ; X86-SLOW-NEXT: pushl %esi -; X86-SLOW-NEXT: subl $8, %esp -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-SLOW-NEXT: andl $63, %ebx -; X86-SLOW-NEXT: movb $64, %dh -; X86-SLOW-NEXT: subb %bl, %dh -; X86-SLOW-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-SLOW-NEXT: movb %dh, %cl -; X86-SLOW-NEXT: shrl %cl, %eax -; X86-SLOW-NEXT: movb %dh, %dl -; X86-SLOW-NEXT: andb $31, %dl -; X86-SLOW-NEXT: movl %edx, %ecx -; X86-SLOW-NEXT: negb %cl -; X86-SLOW-NEXT: movl %esi, %ebp -; X86-SLOW-NEXT: shll %cl, %ebp -; X86-SLOW-NEXT: testb %dl, %dl -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: je .LBB5_2 -; X86-SLOW-NEXT: # %bb.1: -; X86-SLOW-NEXT: orl %eax, %ebp -; X86-SLOW-NEXT: movl %ebp, (%esp) # 4-byte Spill -; X86-SLOW-NEXT: .LBB5_2: -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-SLOW-NEXT: movl %ebp, %eax -; X86-SLOW-NEXT: movl %ebx, %ecx -; X86-SLOW-NEXT: shll %cl, %eax -; X86-SLOW-NEXT: movb %bl, %ch -; X86-SLOW-NEXT: andb $31, %ch +; X86-SLOW-NEXT: movb $64, %ch +; X86-SLOW-NEXT: subb %bl, %ch ; X86-SLOW-NEXT: movb %ch, %cl -; X86-SLOW-NEXT: negb %cl +; X86-SLOW-NEXT: shrl %cl, %edx +; X86-SLOW-NEXT: andb $31, %cl +; X86-SLOW-NEXT: xorb $31, %cl +; X86-SLOW-NEXT: addl %eax, %eax +; X86-SLOW-NEXT: shll %cl, %eax +; X86-SLOW-NEXT: movb %bl, %cl +; X86-SLOW-NEXT: shll %cl, %ebp +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SLOW-NEXT: movl %esi, %edi +; X86-SLOW-NEXT: shrl %edi +; X86-SLOW-NEXT: andb $31, %cl +; X86-SLOW-NEXT: xorb $31, %cl ; X86-SLOW-NEXT: shrl %cl, %edi -; X86-SLOW-NEXT: testb %ch, %ch -; X86-SLOW-NEXT: je .LBB5_4 -; X86-SLOW-NEXT: # %bb.3: -; X86-SLOW-NEXT: orl %edi, %eax -; X86-SLOW-NEXT: movl %eax, %ebp -; X86-SLOW-NEXT: .LBB5_4: -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SLOW-NEXT: movl %eax, %edi -; X86-SLOW-NEXT: movl %ebx, %ecx -; X86-SLOW-NEXT: shll %cl, %edi +; X86-SLOW-NEXT: movb %bl, %cl +; X86-SLOW-NEXT: shll %cl, %esi ; X86-SLOW-NEXT: testb $32, %bl -; X86-SLOW-NEXT: je .LBB5_6 +; X86-SLOW-NEXT: jne .LBB5_1 +; X86-SLOW-NEXT: # %bb.2: +; X86-SLOW-NEXT: orl %edi, %ebp +; X86-SLOW-NEXT: jmp .LBB5_3 +; X86-SLOW-NEXT: .LBB5_1: +; X86-SLOW-NEXT: movl %esi, %ebp +; X86-SLOW-NEXT: xorl %esi, %esi +; X86-SLOW-NEXT: 
.LBB5_3: +; X86-SLOW-NEXT: movb %ch, %cl +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SLOW-NEXT: shrl %cl, %edi +; X86-SLOW-NEXT: testb $32, %ch +; X86-SLOW-NEXT: jne .LBB5_4 ; X86-SLOW-NEXT: # %bb.5: -; X86-SLOW-NEXT: movl %edi, %ebp +; X86-SLOW-NEXT: orl %edx, %eax +; X86-SLOW-NEXT: movl %eax, %ecx +; X86-SLOW-NEXT: jmp .LBB5_6 +; X86-SLOW-NEXT: .LBB5_4: +; X86-SLOW-NEXT: movl %edi, %ecx ; X86-SLOW-NEXT: xorl %edi, %edi ; X86-SLOW-NEXT: .LBB5_6: -; X86-SLOW-NEXT: movb %dh, %cl -; X86-SLOW-NEXT: shrl %cl, %esi -; X86-SLOW-NEXT: testb $32, %dh -; X86-SLOW-NEXT: jne .LBB5_7 -; X86-SLOW-NEXT: # %bb.8: -; X86-SLOW-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-SLOW-NEXT: testl %ebx, %ebx -; X86-SLOW-NEXT: jne .LBB5_10 -; X86-SLOW-NEXT: jmp .LBB5_11 -; X86-SLOW-NEXT: .LBB5_7: -; X86-SLOW-NEXT: movl %esi, %ecx -; X86-SLOW-NEXT: xorl %esi, %esi +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SLOW-NEXT: testl %ebx, %ebx -; X86-SLOW-NEXT: je .LBB5_11 -; X86-SLOW-NEXT: .LBB5_10: -; X86-SLOW-NEXT: orl %esi, %ebp -; X86-SLOW-NEXT: orl %ecx, %edi -; X86-SLOW-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl %edi, %eax -; X86-SLOW-NEXT: .LBB5_11: -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-SLOW-NEXT: addl $8, %esp +; X86-SLOW-NEXT: je .LBB5_8 +; X86-SLOW-NEXT: # %bb.7: +; X86-SLOW-NEXT: orl %edi, %ebp +; X86-SLOW-NEXT: orl %ecx, %esi +; X86-SLOW-NEXT: movl %ebp, %edx +; X86-SLOW-NEXT: movl %esi, %eax +; X86-SLOW-NEXT: .LBB5_8: ; X86-SLOW-NEXT: popl %esi ; X86-SLOW-NEXT: popl %edi ; X86-SLOW-NEXT: popl %ebx @@ -367,17 +324,15 @@ ; ; X64-SLOW-LABEL: var_shift_i64: ; X64-SLOW: # %bb.0: +; X64-SLOW-NEXT: movq %rdx, %rcx ; X64-SLOW-NEXT: movq %rsi, %rax -; X64-SLOW-NEXT: movq %rdi, %rsi -; X64-SLOW-NEXT: movl %edx, %ecx -; X64-SLOW-NEXT: shlq %cl, %rsi -; X64-SLOW-NEXT: andb $63, %dl -; X64-SLOW-NEXT: movl %edx, %ecx -; X64-SLOW-NEXT: negb %cl +; X64-SLOW-NEXT: shlq %cl, %rdi +; X64-SLOW-NEXT: shrq %rax +; X64-SLOW-NEXT: andb $63, %cl +; X64-SLOW-NEXT: xorb $63, %cl +; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-SLOW-NEXT: shrq %cl, %rax -; X64-SLOW-NEXT: orq %rsi, %rax -; X64-SLOW-NEXT: testb %dl, %dl -; X64-SLOW-NEXT: cmoveq %rdi, %rax +; X64-SLOW-NEXT: orq %rdi, %rax ; X64-SLOW-NEXT: retq %tmp = tail call i64 @llvm.fshl.i64(i64 %x, i64 %y, i64 %z) ret i64 %tmp diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll --- a/llvm/test/CodeGen/X86/fshr.ll +++ b/llvm/test/CodeGen/X86/fshr.ll @@ -65,27 +65,16 @@ ; ; X86-SLOW-LABEL: var_shift_i16: ; X86-SLOW: # %bb.0: -; X86-SLOW-NEXT: pushl %edi -; X86-SLOW-NEXT: pushl %esi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %dl -; X86-SLOW-NEXT: andb $15, %dl -; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SLOW-NEXT: movl %eax, %edi -; X86-SLOW-NEXT: movl %edx, %ecx -; X86-SLOW-NEXT: shrl %cl, %edi -; X86-SLOW-NEXT: movb $16, %cl -; X86-SLOW-NEXT: subb %dl, %cl -; X86-SLOW-NEXT: shll %cl, %esi -; X86-SLOW-NEXT: testb %dl, %dl -; X86-SLOW-NEXT: je .LBB1_2 -; X86-SLOW-NEXT: # %bb.1: -; X86-SLOW-NEXT: orl %edi, %esi -; X86-SLOW-NEXT: movl %esi, %eax -; X86-SLOW-NEXT: .LBB1_2: +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-SLOW-NEXT: andb $15, %cl +; X86-SLOW-NEXT: shrl %cl, %edx +; X86-SLOW-NEXT: addl %eax, %eax +; X86-SLOW-NEXT: xorb $15, %cl +; X86-SLOW-NEXT: shll %cl, 
%eax +; X86-SLOW-NEXT: orl %edx, %eax ; X86-SLOW-NEXT: # kill: def $ax killed $ax killed $eax -; X86-SLOW-NEXT: popl %esi -; X86-SLOW-NEXT: popl %edi ; X86-SLOW-NEXT: retl ; ; X64-FAST-LABEL: var_shift_i16: @@ -100,16 +89,16 @@ ; ; X64-SLOW-LABEL: var_shift_i16: ; X64-SLOW: # %bb.0: -; X64-SLOW-NEXT: movzwl %si, %eax -; X64-SLOW-NEXT: andb $15, %dl ; X64-SLOW-NEXT: movl %edx, %ecx -; X64-SLOW-NEXT: shrl %cl, %eax -; X64-SLOW-NEXT: movb $16, %cl -; X64-SLOW-NEXT: subb %dl, %cl -; X64-SLOW-NEXT: shll %cl, %edi -; X64-SLOW-NEXT: orl %edi, %eax -; X64-SLOW-NEXT: testb %dl, %dl -; X64-SLOW-NEXT: cmovel %esi, %eax +; X64-SLOW-NEXT: # kill: def $edi killed $edi def $rdi +; X64-SLOW-NEXT: movzwl %si, %edx +; X64-SLOW-NEXT: andb $15, %cl +; X64-SLOW-NEXT: shrl %cl, %edx +; X64-SLOW-NEXT: leal (%rdi,%rdi), %eax +; X64-SLOW-NEXT: xorb $15, %cl +; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-SLOW-NEXT: shll %cl, %eax +; X64-SLOW-NEXT: orl %edx, %eax ; X64-SLOW-NEXT: # kill: def $ax killed $ax killed $eax ; X64-SLOW-NEXT: retq %tmp = tail call i16 @llvm.fshr.i16(i16 %x, i16 %y, i16 %z) @@ -127,26 +116,15 @@ ; ; X86-SLOW-LABEL: var_shift_i32: ; X86-SLOW: # %bb.0: -; X86-SLOW-NEXT: pushl %edi -; X86-SLOW-NEXT: pushl %esi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %dl ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SLOW-NEXT: movl %eax, %edi -; X86-SLOW-NEXT: movl %edx, %ecx -; X86-SLOW-NEXT: shrl %cl, %edi -; X86-SLOW-NEXT: andb $31, %dl -; X86-SLOW-NEXT: movl %edx, %ecx -; X86-SLOW-NEXT: negb %cl -; X86-SLOW-NEXT: shll %cl, %esi -; X86-SLOW-NEXT: testb %dl, %dl -; X86-SLOW-NEXT: je .LBB2_2 -; X86-SLOW-NEXT: # %bb.1: -; X86-SLOW-NEXT: orl %edi, %esi -; X86-SLOW-NEXT: movl %esi, %eax -; X86-SLOW-NEXT: .LBB2_2: -; X86-SLOW-NEXT: popl %esi -; X86-SLOW-NEXT: popl %edi +; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SLOW-NEXT: shrl %cl, %edx +; X86-SLOW-NEXT: addl %eax, %eax +; X86-SLOW-NEXT: andb $31, %cl +; X86-SLOW-NEXT: xorb $31, %cl +; X86-SLOW-NEXT: shll %cl, %eax +; X86-SLOW-NEXT: orl %edx, %eax ; X86-SLOW-NEXT: retl ; ; X64-FAST-LABEL: var_shift_i32: @@ -159,17 +137,15 @@ ; ; X64-SLOW-LABEL: var_shift_i32: ; X64-SLOW: # %bb.0: -; X64-SLOW-NEXT: movl %edi, %eax -; X64-SLOW-NEXT: movl %esi, %edi -; X64-SLOW-NEXT: movl %edx, %ecx -; X64-SLOW-NEXT: shrl %cl, %edi -; X64-SLOW-NEXT: andb $31, %dl ; X64-SLOW-NEXT: movl %edx, %ecx -; X64-SLOW-NEXT: negb %cl +; X64-SLOW-NEXT: # kill: def $edi killed $edi def $rdi +; X64-SLOW-NEXT: shrl %cl, %esi +; X64-SLOW-NEXT: leal (%rdi,%rdi), %eax +; X64-SLOW-NEXT: andb $31, %cl +; X64-SLOW-NEXT: xorb $31, %cl +; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-SLOW-NEXT: shll %cl, %eax -; X64-SLOW-NEXT: orl %edi, %eax -; X64-SLOW-NEXT: testb %dl, %dl -; X64-SLOW-NEXT: cmovel %esi, %eax +; X64-SLOW-NEXT: orl %esi, %eax ; X64-SLOW-NEXT: retq %tmp = tail call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z) ret i32 %tmp @@ -276,76 +252,61 @@ ; X86-SLOW-NEXT: pushl %esi ; X86-SLOW-NEXT: subl $8, %esp ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-SLOW-NEXT: andl $63, %ebx -; X86-SLOW-NEXT: movb $64, %al -; X86-SLOW-NEXT: subb %bl, %al -; X86-SLOW-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-SLOW-NEXT: movl %eax, %ecx -; X86-SLOW-NEXT: shll %cl, %edx -; 
X86-SLOW-NEXT: movb %al, %ch -; X86-SLOW-NEXT: andb $31, %ch +; X86-SLOW-NEXT: movb $64, %ch +; X86-SLOW-NEXT: subb %bl, %ch ; X86-SLOW-NEXT: movb %ch, %cl -; X86-SLOW-NEXT: negb %cl -; X86-SLOW-NEXT: movl %esi, %edi -; X86-SLOW-NEXT: shrl %cl, %edi -; X86-SLOW-NEXT: testb %ch, %ch -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-SLOW-NEXT: je .LBB5_2 -; X86-SLOW-NEXT: # %bb.1: -; X86-SLOW-NEXT: orl %edi, %edx -; X86-SLOW-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-SLOW-NEXT: .LBB5_2: -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SLOW-NEXT: movl %ebx, %ecx -; X86-SLOW-NEXT: shrl %cl, %edx -; X86-SLOW-NEXT: movb %bl, %ah -; X86-SLOW-NEXT: andb $31, %ah -; X86-SLOW-NEXT: movb %ah, %cl -; X86-SLOW-NEXT: negb %cl -; X86-SLOW-NEXT: movl %ebp, %edi -; X86-SLOW-NEXT: shll %cl, %edi -; X86-SLOW-NEXT: testb %ah, %ah -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-SLOW-NEXT: je .LBB5_4 -; X86-SLOW-NEXT: # %bb.3: -; X86-SLOW-NEXT: orl %edx, %edi -; X86-SLOW-NEXT: movl %edi, %ebp -; X86-SLOW-NEXT: .LBB5_4: +; X86-SLOW-NEXT: shll %cl, %edx +; X86-SLOW-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %esi, %edx +; X86-SLOW-NEXT: andb $31, %cl +; X86-SLOW-NEXT: xorb $31, %cl +; X86-SLOW-NEXT: shrl %esi +; X86-SLOW-NEXT: shrl %cl, %esi +; X86-SLOW-NEXT: movb %bl, %cl +; X86-SLOW-NEXT: shrl %cl, %eax +; X86-SLOW-NEXT: andb $31, %cl +; X86-SLOW-NEXT: xorb $31, %cl ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SLOW-NEXT: movl %ebx, %ecx +; X86-SLOW-NEXT: leal (%edi,%edi), %ebp +; X86-SLOW-NEXT: shll %cl, %ebp +; X86-SLOW-NEXT: movb %bl, %cl ; X86-SLOW-NEXT: shrl %cl, %edi ; X86-SLOW-NEXT: testb $32, %bl -; X86-SLOW-NEXT: je .LBB5_6 -; X86-SLOW-NEXT: # %bb.5: +; X86-SLOW-NEXT: jne .LBB5_1 +; X86-SLOW-NEXT: # %bb.2: +; X86-SLOW-NEXT: orl %eax, %ebp +; X86-SLOW-NEXT: jmp .LBB5_3 +; X86-SLOW-NEXT: .LBB5_1: ; X86-SLOW-NEXT: movl %edi, %ebp ; X86-SLOW-NEXT: xorl %edi, %edi +; X86-SLOW-NEXT: .LBB5_3: +; X86-SLOW-NEXT: movb %ch, %cl +; X86-SLOW-NEXT: shll %cl, %edx +; X86-SLOW-NEXT: testb $32, %ch +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SLOW-NEXT: jne .LBB5_4 +; X86-SLOW-NEXT: # %bb.5: +; X86-SLOW-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SLOW-NEXT: orl %esi, %ecx +; X86-SLOW-NEXT: jmp .LBB5_6 +; X86-SLOW-NEXT: .LBB5_4: +; X86-SLOW-NEXT: movl %edx, %ecx +; X86-SLOW-NEXT: movl $0, (%esp) # 4-byte Folded Spill ; X86-SLOW-NEXT: .LBB5_6: -; X86-SLOW-NEXT: movl %eax, %ecx -; X86-SLOW-NEXT: shll %cl, %esi -; X86-SLOW-NEXT: testb $32, %al ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SLOW-NEXT: jne .LBB5_7 -; X86-SLOW-NEXT: # %bb.8: -; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-SLOW-NEXT: testl %ebx, %ebx -; X86-SLOW-NEXT: jne .LBB5_10 -; X86-SLOW-NEXT: jmp .LBB5_11 -; X86-SLOW-NEXT: .LBB5_7: -; X86-SLOW-NEXT: movl %esi, %eax -; X86-SLOW-NEXT: xorl %esi, %esi ; X86-SLOW-NEXT: testl %ebx, %ebx -; X86-SLOW-NEXT: je .LBB5_11 -; X86-SLOW-NEXT: .LBB5_10: -; X86-SLOW-NEXT: orl %ebp, %esi -; X86-SLOW-NEXT: orl %edi, %eax -; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl %eax, %edx -; X86-SLOW-NEXT: .LBB5_11: -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-SLOW-NEXT: je .LBB5_8 +; X86-SLOW-NEXT: # %bb.7: +; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-SLOW-NEXT: orl %ebp, %eax +; X86-SLOW-NEXT: orl %edi, %ecx +; X86-SLOW-NEXT: movl %ecx, %edx +; X86-SLOW-NEXT: .LBB5_8: ; X86-SLOW-NEXT: addl 
$8, %esp ; X86-SLOW-NEXT: popl %esi ; X86-SLOW-NEXT: popl %edi @@ -363,17 +324,14 @@ ; ; X64-SLOW-LABEL: var_shift_i64: ; X64-SLOW: # %bb.0: -; X64-SLOW-NEXT: movq %rdi, %rax -; X64-SLOW-NEXT: movq %rsi, %rdi -; X64-SLOW-NEXT: movl %edx, %ecx -; X64-SLOW-NEXT: shrq %cl, %rdi -; X64-SLOW-NEXT: andb $63, %dl -; X64-SLOW-NEXT: movl %edx, %ecx -; X64-SLOW-NEXT: negb %cl +; X64-SLOW-NEXT: movq %rdx, %rcx +; X64-SLOW-NEXT: shrq %cl, %rsi +; X64-SLOW-NEXT: leaq (%rdi,%rdi), %rax +; X64-SLOW-NEXT: andb $63, %cl +; X64-SLOW-NEXT: xorb $63, %cl +; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-SLOW-NEXT: shlq %cl, %rax -; X64-SLOW-NEXT: orq %rdi, %rax -; X64-SLOW-NEXT: testb %dl, %dl -; X64-SLOW-NEXT: cmoveq %rsi, %rax +; X64-SLOW-NEXT: orq %rsi, %rax ; X64-SLOW-NEXT: retq %tmp = tail call i64 @llvm.fshr.i64(i64 %x, i64 %y, i64 %z) ret i64 %tmp