diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -201,6 +201,7 @@
     // For slow shld targets we only lower for code size.
     LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
 
+    setOperationAction(ShiftOp             , MVT::i8   , Custom);
     setOperationAction(ShiftOp             , MVT::i16  , Custom);
     setOperationAction(ShiftOp             , MVT::i32  , ShiftDoubleAction);
     if (Subtarget.is64Bit())
@@ -19074,13 +19075,36 @@
     return DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
                        Op0, Op1, Amt);
   }
-
-  assert((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
-         "Unexpected funnel shift type!");
+  assert(
+      (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
+      "Unexpected funnel shift type!");
 
   // Expand slow SHLD/SHRD cases if we are not optimizing for size.
   bool OptForSize = DAG.shouldOptForSize();
-  if (!OptForSize && Subtarget.isSHLDSlow())
+  bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
+
+  // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
+  // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
+  if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
+      !isa<ConstantSDNode>(Amt)) {
+    unsigned EltSizeInBits = VT.getScalarSizeInBits();
+    SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
+    SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
+    Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
+    Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
+    Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
+    SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
+    Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
+    if (IsFSHR) {
+      Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
+    } else {
+      Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
+      Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
+    }
+    return DAG.getZExtOrTrunc(Res, DL, VT);
+  }
+
+  if (VT == MVT::i8 || ExpandFunnel)
    return SDValue();
 
   // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
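Illustration (not part of the patch): the widened expansion the new lowering selects for non-constant i8 amounts (and for i16 on slow-SHLD targets, with bw = 16) can be sanity-checked outside of SelectionDAG. The following standalone C++ sketch is illustrative only; the helper names (fshl8_ref, fshl8_widened, and so on) are made up here. It exhaustively compares the `(((x << 8) | y) << (z & 7)) >> 8` and `((x << 8) | y) >> (z & 7)` forms from the comments above against the reference fshl/fshr semantics for every i8 input:

```cpp
#include <cassert>
#include <cstdint>

// Reference semantics of llvm.fshl.i8 / llvm.fshr.i8 (shift amount modulo 8).
static uint8_t fshl8_ref(uint8_t x, uint8_t y, uint8_t z) {
  unsigned s = z & 7;
  return s ? (uint8_t)((x << s) | (y >> (8 - s))) : x;
}
static uint8_t fshr8_ref(uint8_t x, uint8_t y, uint8_t z) {
  unsigned s = z & 7;
  return s ? (uint8_t)((y >> s) | (x << (8 - s))) : y;
}

// The widened form emitted by the new lowering: concatenate x:y into a
// 32-bit value, shift by the masked amount, then pick out the result byte.
static uint8_t fshl8_widened(uint8_t x, uint8_t y, uint8_t z) {
  uint32_t concat = ((uint32_t)x << 8) | y;
  return (uint8_t)((concat << (z & 7)) >> 8);
}
static uint8_t fshr8_widened(uint8_t x, uint8_t y, uint8_t z) {
  uint32_t concat = ((uint32_t)x << 8) | y;
  return (uint8_t)(concat >> (z & 7));
}

int main() {
  // Exhaustive check over all 2^24 (x, y, z) combinations.
  for (unsigned x = 0; x < 256; ++x)
    for (unsigned y = 0; y < 256; ++y)
      for (unsigned z = 0; z < 256; ++z) {
        assert(fshl8_widened(x, y, z) == fshl8_ref(x, y, z));
        assert(fshr8_widened(x, y, z) == fshr8_ref(x, y, z));
      }
  return 0;
}
```

Because the masked shift on the concatenated value already handles a zero amount (it returns x for fshl and y for fshr), the new code sequences in the regenerated tests below no longer need the old testb/je or testb/cmov guard around the zero-shift case.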
diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll
--- a/llvm/test/CodeGen/X86/fshl.ll
+++ b/llvm/test/CodeGen/X86/fshl.ll
@@ -16,37 +16,26 @@
 define i8 @var_shift_i8(i8 %x, i8 %y, i8 %z) nounwind {
 ; X86-LABEL: var_shift_i8:
 ; X86:       # %bb.0:
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ah
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %dl
-; X86-NEXT:    andb $7, %dl
-; X86-NEXT:    movb %al, %ch
-; X86-NEXT:    movb %dl, %cl
-; X86-NEXT:    shlb %cl, %ch
-; X86-NEXT:    movb $8, %cl
-; X86-NEXT:    subb %dl, %cl
-; X86-NEXT:    shrb %cl, %ah
-; X86-NEXT:    testb %dl, %dl
-; X86-NEXT:    je .LBB0_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    orb %ah, %ch
-; X86-NEXT:    movb %ch, %al
-; X86-NEXT:  .LBB0_2:
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shll $8, %eax
+; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    andb $7, %cl
+; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    movb %ah, %al
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: var_shift_i8:
 ; X64:       # %bb.0:
-; X64-NEXT:    andb $7, %dl
-; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    movl %edx, %ecx
-; X64-NEXT:    shlb %cl, %al
-; X64-NEXT:    movb $8, %cl
-; X64-NEXT:    subb %dl, %cl
-; X64-NEXT:    shrb %cl, %sil
-; X64-NEXT:    orb %al, %sil
+; X64-NEXT:    shll $8, %edi
 ; X64-NEXT:    movzbl %sil, %eax
-; X64-NEXT:    testb %dl, %dl
-; X64-NEXT:    cmovel %edi, %eax
+; X64-NEXT:    orl %edi, %eax
+; X64-NEXT:    andb $7, %cl
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shll %cl, %eax
+; X64-NEXT:    shrl $8, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %tmp = tail call i8 @llvm.fshl.i8(i8 %x, i8 %y, i8 %z)
@@ -65,15 +54,14 @@
 ;
 ; X86-SLOW-LABEL: var_shift_i16:
 ; X86-SLOW:       # %bb.0:
-; X86-SLOW-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SLOW-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-SLOW-NEXT:    andb $15, %cl
-; X86-SLOW-NEXT:    shll %cl, %edx
-; X86-SLOW-NEXT:    shrl %eax
-; X86-SLOW-NEXT:    xorb $15, %cl
-; X86-SLOW-NEXT:    shrl %cl, %eax
+; X86-SLOW-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-NEXT:    shll $16, %eax
 ; X86-SLOW-NEXT:    orl %edx, %eax
+; X86-SLOW-NEXT:    andb $15, %cl
+; X86-SLOW-NEXT:    shll %cl, %eax
+; X86-SLOW-NEXT:    shrl $16, %eax
 ; X86-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-SLOW-NEXT:    retl
 ;
@@ -90,14 +78,13 @@
 ; X64-SLOW-LABEL: var_shift_i16:
 ; X64-SLOW:       # %bb.0:
 ; X64-SLOW-NEXT:    movl %edx, %ecx
+; X64-SLOW-NEXT:    shll $16, %edi
 ; X64-SLOW-NEXT:    movzwl %si, %eax
+; X64-SLOW-NEXT:    orl %edi, %eax
 ; X64-SLOW-NEXT:    andb $15, %cl
-; X64-SLOW-NEXT:    shll %cl, %edi
-; X64-SLOW-NEXT:    xorb $15, %cl
-; X64-SLOW-NEXT:    shrl %eax
 ; X64-SLOW-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-SLOW-NEXT:    shrl %cl, %eax
-; X64-SLOW-NEXT:    orl %edi, %eax
+; X64-SLOW-NEXT:    shll %cl, %eax
+; X64-SLOW-NEXT:    shrl $16, %eax
 ; X64-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-SLOW-NEXT:    retq
   %tmp = tail call i16 @llvm.fshl.i16(i16 %x, i16 %y, i16 %z)
diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll
--- a/llvm/test/CodeGen/X86/fshr.ll
+++ b/llvm/test/CodeGen/X86/fshr.ll
@@ -16,37 +16,25 @@
 define i8 @var_shift_i8(i8 %x, i8 %y, i8 %z) nounwind {
 ; X86-LABEL: var_shift_i8:
 ; X86:       # %bb.0:
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ah
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %dl
-; X86-NEXT:    andb $7, %dl
-; X86-NEXT:    movb %al, %ch
-; X86-NEXT:    movb %dl, %cl
-; X86-NEXT:    shrb %cl, %ch
-; X86-NEXT:    movb $8, %cl
-; X86-NEXT:    subb %dl, %cl
-; X86-NEXT:    shlb %cl, %ah
-; X86-NEXT:    testb %dl, %dl
-; X86-NEXT:    je .LBB0_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    orb %ch, %ah
-; X86-NEXT:    movb %ah, %al
-; X86-NEXT:  .LBB0_2:
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shll $8, %eax
+; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    andb $7, %cl
+; X86-NEXT:    shrl %cl, %eax
+; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: var_shift_i8:
 ; X64:       # %bb.0:
-; X64-NEXT:    andb $7, %dl
-; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    movl %edx, %ecx
-; X64-NEXT:    shrb %cl, %al
-; X64-NEXT:    movb $8, %cl
-; X64-NEXT:    subb %dl, %cl
-; X64-NEXT:    shlb %cl, %dil
-; X64-NEXT:    orb %al, %dil
-; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    testb %dl, %dl
-; X64-NEXT:    cmovel %esi, %eax
+; X64-NEXT:    shll $8, %edi
+; X64-NEXT:    movzbl %sil, %eax
+; X64-NEXT:    orl %edi, %eax
+; X64-NEXT:    andb $7, %cl
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shrl %cl, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %tmp = tail call i8 @llvm.fshr.i8(i8 %x, i8 %y, i8 %z)
@@ -65,15 +53,13 @@
 ;
 ; X86-SLOW-LABEL: var_shift_i16:
 ; X86-SLOW:       # %bb.0:
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
 ; X86-SLOW-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-SLOW-NEXT:    andb $15, %cl
-; X86-SLOW-NEXT:    shrl %cl, %edx
-; X86-SLOW-NEXT:    addl %eax, %eax
-; X86-SLOW-NEXT:    xorb $15, %cl
-; X86-SLOW-NEXT:    shll %cl, %eax
+; X86-SLOW-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-NEXT:    shll $16, %eax
 ; X86-SLOW-NEXT:    orl %edx, %eax
+; X86-SLOW-NEXT:    andb $15, %cl
+; X86-SLOW-NEXT:    shrl %cl, %eax
 ; X86-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-SLOW-NEXT:    retl
 ;
@@ -90,15 +76,12 @@
 ; X64-SLOW-LABEL: var_shift_i16:
 ; X64-SLOW:       # %bb.0:
 ; X64-SLOW-NEXT:    movl %edx, %ecx
-; X64-SLOW-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-SLOW-NEXT:    movzwl %si, %edx
+; X64-SLOW-NEXT:    shll $16, %edi
+; X64-SLOW-NEXT:    movzwl %si, %eax
+; X64-SLOW-NEXT:    orl %edi, %eax
 ; X64-SLOW-NEXT:    andb $15, %cl
-; X64-SLOW-NEXT:    shrl %cl, %edx
-; X64-SLOW-NEXT:    leal (%rdi,%rdi), %eax
-; X64-SLOW-NEXT:    xorb $15, %cl
 ; X64-SLOW-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-SLOW-NEXT:    shll %cl, %eax
-; X64-SLOW-NEXT:    orl %edx, %eax
+; X64-SLOW-NEXT:    shrl %cl, %eax
 ; X64-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-SLOW-NEXT:    retq
   %tmp = tail call i16 @llvm.fshr.i16(i16 %x, i16 %y, i16 %z)
diff --git a/llvm/test/CodeGen/X86/rotate-extract.ll b/llvm/test/CodeGen/X86/rotate-extract.ll
--- a/llvm/test/CodeGen/X86/rotate-extract.ll
+++ b/llvm/test/CodeGen/X86/rotate-extract.ll
@@ -232,31 +232,31 @@
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    imull $171, %eax, %ecx
-; X86-NEXT:    shlb $3, %ch
-; X86-NEXT:    andb $-16, %ch
 ; X86-NEXT:    imull $79, %eax, %edx
 ; X86-NEXT:    subb %dh, %al
 ; X86-NEXT:    shrb %al
 ; X86-NEXT:    addb %dh, %al
 ; X86-NEXT:    shrb $5, %al
-; X86-NEXT:    orb %ch, %al
-; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    shlb $3, %ch
+; X86-NEXT:    orb %al, %ch
+; X86-NEXT:    andb $-9, %ch
+; X86-NEXT:    movb %ch, %al
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: no_extract_udiv:
 ; X64:       # %bb.0:
-; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    imull $171, %eax, %ecx
-; X64-NEXT:    shrl $8, %ecx
-; X64-NEXT:    shlb $3, %cl
-; X64-NEXT:    andb $-16, %cl
-; X64-NEXT:    imull $79, %eax, %edx
+; X64-NEXT:    movzbl %dil, %ecx
+; X64-NEXT:    imull $171, %ecx, %eax
+; X64-NEXT:    shrl $8, %eax
+; X64-NEXT:    imull $79, %ecx, %edx
 ; X64-NEXT:    shrl $8, %edx
-; X64-NEXT:    subb %dl, %al
-; X64-NEXT:    shrb %al
-; X64-NEXT:    addb %dl, %al
-; X64-NEXT:    shrb $5, %al
+; X64-NEXT:    subb %dl, %cl
+; X64-NEXT:    shrb %cl
+; X64-NEXT:    addb %dl, %cl
+; X64-NEXT:    shrb $5, %cl
+; X64-NEXT:    shlb $3, %al
 ; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    andb $-9, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %lhs_div = udiv i8 %i, 3