Index: llvm/trunk/include/llvm/CodeGen/ISDOpcodes.h =================================================================== --- llvm/trunk/include/llvm/CodeGen/ISDOpcodes.h +++ llvm/trunk/include/llvm/CodeGen/ISDOpcodes.h @@ -394,9 +394,13 @@ /// When the 1st operand is a vector, the shift amount must be in the same /// type. (TLI.getShiftAmountTy() will return the same type when the input /// type is a vector.) - /// For rotates, the shift amount is treated as an unsigned amount modulo - /// the element size of the first operand. - SHL, SRA, SRL, ROTL, ROTR, + /// For rotates and funnel shifts, the shift amount is treated as an unsigned + /// amount modulo the element size of the first operand. + /// + /// Funnel 'double' shifts take 3 operands, 2 inputs and the shift amount. + /// fshl(X,Y,Z): (X << (Z % BW)) | (Y >> (BW - (Z % BW))) + /// fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW)) + SHL, SRA, SRL, ROTL, ROTR, FSHL, FSHR, /// Byte Swap and Counting operators. BSWAP, CTTZ, CTLZ, CTPOP, BITREVERSE, Index: llvm/trunk/include/llvm/CodeGen/TargetLowering.h =================================================================== --- llvm/trunk/include/llvm/CodeGen/TargetLowering.h +++ llvm/trunk/include/llvm/CodeGen/TargetLowering.h @@ -3689,6 +3689,12 @@ SDValue LL = SDValue(), SDValue LH = SDValue(), SDValue RL = SDValue(), SDValue RH = SDValue()) const; + /// Expand funnel shift. + /// \param N Node to expand + /// \param Result output after conversion + /// \returns True, if the expansion was successful, false otherwise + bool expandFunnelShift(SDNode *N, SDValue &Result, SelectionDAG &DAG) const; + /// Expand float(f32) to SINT(i64) conversion /// \param N Node to expand /// \param Result output after conversion Index: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -325,6 +325,7 @@ SDValue visitSHL(SDNode *N); SDValue visitSRA(SDNode *N); SDValue visitSRL(SDNode *N); + SDValue visitFunnelShift(SDNode *N); SDValue visitRotate(SDNode *N); SDValue visitABS(SDNode *N); SDValue visitBSWAP(SDNode *N); @@ -1513,6 +1514,8 @@ case ISD::SRL: return visitSRL(N); case ISD::ROTR: case ISD::ROTL: return visitRotate(N); + case ISD::FSHL: + case ISD::FSHR: return visitFunnelShift(N); case ISD::ABS: return visitABS(N); case ISD::BSWAP: return visitBSWAP(N); case ISD::BITREVERSE: return visitBITREVERSE(N); @@ -6926,6 +6929,39 @@ return SDValue(); } +SDValue DAGCombiner::visitFunnelShift(SDNode *N) { + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue N2 = N->getOperand(2); + bool IsFSHL = N->getOpcode() == ISD::FSHL; + unsigned BitWidth = VT.getScalarSizeInBits(); + + // fold (fshl N0, N1, 0) -> N0 + // fold (fshr N0, N1, 0) -> N1 + if (DAG.MaskedValueIsZero(N2, APInt::getAllOnesValue(BitWidth))) + return IsFSHL ? 
N0 : N1; + + // fold (fsh* N0, N1, c) -> (fsh* N0, N1, c % BitWidth) + if (ConstantSDNode *Cst = isConstOrConstSplat(N2)) { + if (Cst->getAPIntValue().uge(BitWidth)) { + uint64_t RotAmt = Cst->getAPIntValue().urem(BitWidth); + return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N0, N1, + DAG.getConstant(RotAmt, SDLoc(N), N2.getValueType())); + } + } + + // fold (fshl N0, N0, N2) -> (rotl N0, N2) + // fold (fshr N0, N0, N2) -> (rotr N0, N2) + // TODO: Investigate flipping this rotate if only one is legal, if funnel shift + // is legal as well we might be better off avoiding non-constant (BW - N2). + unsigned RotOpc = IsFSHL ? ISD::ROTL : ISD::ROTR; + if (N0 == N1 && hasOperation(RotOpc, VT)) + return DAG.getNode(RotOpc, SDLoc(N), VT, N0, N2); + + return SDValue(); +} + SDValue DAGCombiner::visitABS(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); Index: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp =================================================================== --- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -1170,6 +1170,8 @@ } } break; + case ISD::FSHL: + case ISD::FSHR: case ISD::SRL_PARTS: case ISD::SRA_PARTS: case ISD::SHL_PARTS: { @@ -3262,6 +3264,11 @@ } break; } + case ISD::FSHL: + case ISD::FSHR: + if (TLI.expandFunnelShift(Node, Tmp1, DAG)) + Results.push_back(Tmp1); + break; case ISD::SADDSAT: case ISD::UADDSAT: case ISD::SSUBSAT: Index: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp =================================================================== --- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -129,12 +129,13 @@ SDValue ExpandFNEG(SDValue Op); SDValue ExpandFSUB(SDValue Op); SDValue ExpandBITREVERSE(SDValue Op); - SDValue ExpandCTPOP(SDValue Op); - SDValue ExpandCTLZ(SDValue Op); - SDValue ExpandCTTZ(SDValue Op); - SDValue ExpandFMINNUM_FMAXNUM(SDValue Op); - SDValue ExpandStrictFPOp(SDValue Op); - + SDValue ExpandCTPOP(SDValue Op); + SDValue ExpandCTLZ(SDValue Op); + SDValue ExpandCTTZ(SDValue Op); + SDValue ExpandFunnelShift(SDValue Op); + SDValue ExpandFMINNUM_FMAXNUM(SDValue Op); + SDValue ExpandStrictFPOp(SDValue Op); + /// Implements vector promotion. 
/// /// This is essentially just bitcasting the operands to a different type and @@ -746,12 +747,15 @@ case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: return ExpandCTLZ(Op); - case ISD::CTTZ: - case ISD::CTTZ_ZERO_UNDEF: - return ExpandCTTZ(Op); - case ISD::FMINNUM: - case ISD::FMAXNUM: - return ExpandFMINNUM_FMAXNUM(Op); + case ISD::CTTZ: + case ISD::CTTZ_ZERO_UNDEF: + return ExpandCTTZ(Op); + case ISD::FSHL: + case ISD::FSHR: + return ExpandFunnelShift(Op); + case ISD::FMINNUM: + case ISD::FMAXNUM: + return ExpandFMINNUM_FMAXNUM(Op); case ISD::STRICT_FADD: case ISD::STRICT_FSUB: case ISD::STRICT_FMUL: @@ -1123,32 +1127,40 @@ return Op; // Defer to LegalizeDAG return DAG.UnrollVectorOp(Op.getNode()); -} - -SDValue VectorLegalizer::ExpandCTPOP(SDValue Op) { - SDValue Result; - if (TLI.expandCTPOP(Op.getNode(), Result, DAG)) - return Result; - - return DAG.UnrollVectorOp(Op.getNode()); -} - -SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) { - SDValue Result; - if (TLI.expandCTLZ(Op.getNode(), Result, DAG)) - return Result; - - return DAG.UnrollVectorOp(Op.getNode()); -} - -SDValue VectorLegalizer::ExpandCTTZ(SDValue Op) { - SDValue Result; - if (TLI.expandCTTZ(Op.getNode(), Result, DAG)) - return Result; - - return DAG.UnrollVectorOp(Op.getNode()); -} - +} + +SDValue VectorLegalizer::ExpandCTPOP(SDValue Op) { + SDValue Result; + if (TLI.expandCTPOP(Op.getNode(), Result, DAG)) + return Result; + + return DAG.UnrollVectorOp(Op.getNode()); +} + +SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) { + SDValue Result; + if (TLI.expandCTLZ(Op.getNode(), Result, DAG)) + return Result; + + return DAG.UnrollVectorOp(Op.getNode()); +} + +SDValue VectorLegalizer::ExpandCTTZ(SDValue Op) { + SDValue Result; + if (TLI.expandCTTZ(Op.getNode(), Result, DAG)) + return Result; + + return DAG.UnrollVectorOp(Op.getNode()); +} + +SDValue VectorLegalizer::ExpandFunnelShift(SDValue Op) { + SDValue Result; + if (TLI.expandFunnelShift(Op.getNode(), Result, DAG)) + return Result; + + return DAG.UnrollVectorOp(Op.getNode()); +} + SDValue VectorLegalizer::ExpandFMINNUM_FMAXNUM(SDValue Op) { if (SDValue Expanded = TLI.expandFMINNUM_FMAXNUM(Op.getNode(), DAG)) return Expanded; Index: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp =================================================================== --- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -5751,6 +5751,12 @@ SDValue Zero = DAG.getConstant(0, sdl, VT); SDValue ShAmt = DAG.getNode(ISD::UREM, sdl, VT, Z, BitWidthC); + auto FunnelOpcode = IsFSHL ? ISD::FSHL : ISD::FSHR; + if (TLI.isOperationLegalOrCustom(FunnelOpcode, VT)) { + setValue(&I, DAG.getNode(FunnelOpcode, sdl, VT, X, Y, Z)); + return nullptr; + } + // When X == Y, this is rotate. If the data type has a power-of-2 size, we // avoid the select that is necessary in the general case to filter out // the 0-shift possibility that leads to UB. 
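As a reading aid, here is a minimal standalone sketch of the scalar semantics the ISDOpcodes.h comment above describes. It is illustrative only and not part of the patch; the helper names fshl32/fshr32/rotl32 are invented for this sketch. It also shows why the zero-amount folds and the fshl(X,X,Z) -> rotl(X,Z) fold in visitFunnelShift are sound.

#include <cstdint>

// Reference semantics of ISD::FSHL/FSHR for a 32-bit element: the shift
// amount is taken modulo the bit width, and a zero amount returns the first
// input for fshl and the second for fshr (guarding it also avoids the
// undefined shift by 32 in C++).
static uint32_t fshl32(uint32_t X, uint32_t Y, uint32_t Z) {
  unsigned Amt = Z % 32;
  return Amt == 0 ? X : (X << Amt) | (Y >> (32 - Amt));
}

static uint32_t fshr32(uint32_t X, uint32_t Y, uint32_t Z) {
  unsigned Amt = Z % 32;
  return Amt == 0 ? Y : (X << (32 - Amt)) | (Y >> Amt);
}

// With both inputs equal, a funnel shift is a rotate, which is the
// rotl/rotr fold performed in DAGCombiner::visitFunnelShift.
static uint32_t rotl32(uint32_t X, uint32_t Z) { return fshl32(X, X, Z); }

The (fsh* N0, N1, c) -> (fsh* N0, N1, c % BitWidth) fold corresponds directly to the modulo taken on the first line of each helper.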
Index: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp =================================================================== --- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -237,6 +237,8 @@ case ISD::SRL: return "srl"; case ISD::ROTL: return "rotl"; case ISD::ROTR: return "rotr"; + case ISD::FSHL: return "fshl"; + case ISD::FSHR: return "fshr"; case ISD::FADD: return "fadd"; case ISD::STRICT_FADD: return "strict_fadd"; case ISD::FSUB: return "fsub"; Index: llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp =================================================================== --- llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -4114,6 +4114,54 @@ return Ok; } +bool TargetLowering::expandFunnelShift(SDNode *Node, SDValue &Result, + SelectionDAG &DAG) const { + EVT VT = Node->getValueType(0); + + if (VT.isVector() && (!isOperationLegalOrCustom(ISD::SHL, VT) || + !isOperationLegalOrCustom(ISD::SRL, VT) || + !isOperationLegalOrCustom(ISD::SUB, VT) || + !isOperationLegalOrCustomOrPromote(ISD::OR, VT))) + return false; + + // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW))) + // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW)) + SDValue X = Node->getOperand(0); + SDValue Y = Node->getOperand(1); + SDValue Z = Node->getOperand(2); + + unsigned EltSizeInBits = VT.getScalarSizeInBits(); + bool IsFSHL = Node->getOpcode() == ISD::FSHL; + SDLoc DL(SDValue(Node, 0)); + + EVT ShVT = Z.getValueType(); + SDValue BitWidthC = DAG.getConstant(EltSizeInBits, DL, ShVT); + SDValue Zero = DAG.getConstant(0, DL, ShVT); + + SDValue ShAmt; + if (isPowerOf2_32(EltSizeInBits)) { + SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, ShVT); + ShAmt = DAG.getNode(ISD::AND, DL, ShVT, Z, Mask); + } else { + ShAmt = DAG.getNode(ISD::UREM, DL, ShVT, Z, BitWidthC); + } + + SDValue InvShAmt = DAG.getNode(ISD::SUB, DL, ShVT, BitWidthC, ShAmt); + SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, X, IsFSHL ? ShAmt : InvShAmt); + SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Y, IsFSHL ? InvShAmt : ShAmt); + SDValue Or = DAG.getNode(ISD::OR, DL, VT, ShX, ShY); + + // If (Z % BW == 0), then the opposite direction shift is shift-by-bitwidth, + // and that is undefined. We must compare and select to avoid UB. + EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ShVT); + + // For fshl, 0-shift returns the 1st arg (X). + // For fshr, 0-shift returns the 2nd arg (Y). + SDValue IsZeroShift = DAG.getSetCC(DL, CCVT, ShAmt, Zero, ISD::SETEQ); + Result = DAG.getSelect(DL, VT, IsZeroShift, IsFSHL ? 
X : Y, Or); + return true; +} + bool TargetLowering::expandFP_TO_SINT(SDNode *Node, SDValue &Result, SelectionDAG &DAG) const { SDValue Src = Node->getOperand(0); Index: llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp =================================================================== --- llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp +++ llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp @@ -610,6 +610,8 @@ setOperationAction(ISD::UMIN, VT, Expand); setOperationAction(ISD::UMAX, VT, Expand); setOperationAction(ISD::ABS, VT, Expand); + setOperationAction(ISD::FSHL, VT, Expand); + setOperationAction(ISD::FSHR, VT, Expand); setOperationAction(ISD::SADDSAT, VT, Expand); setOperationAction(ISD::UADDSAT, VT, Expand); setOperationAction(ISD::SSUBSAT, VT, Expand); Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -195,6 +195,14 @@ setOperationAction(ISD::ABS , MVT::i64 , Custom); } + // Funnel shifts. + for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) { + setOperationAction(ShiftOp , MVT::i16 , Custom); + setOperationAction(ShiftOp , MVT::i32 , Custom); + if (Subtarget.is64Bit()) + setOperationAction(ShiftOp , MVT::i64 , Custom); + } + // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this // operation. setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); @@ -16972,6 +16980,7 @@ /// Lower SRA_PARTS and friends, which return two i32 values /// and take a 2 x i32 value to shift plus a shift amount. +/// TODO: Can this be moved to general expansion code? static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); MVT VT = Op.getSimpleValueType(); @@ -16981,8 +16990,8 @@ SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); - // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the - // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away + // ISD::FSHL and ISD::FSHR have defined overflow behavior but ISD::SHL and + // ISD::SRA/L nodes haven't. Insert an AND to be safe, it's optimized away // during isel. SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, DAG.getConstant(VTBits - 1, dl, MVT::i8)); @@ -16992,10 +17001,10 @@ SDValue Tmp2, Tmp3; if (Op.getOpcode() == ISD::SHL_PARTS) { - Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); + Tmp2 = DAG.getNode(ISD::FSHL, dl, VT, ShOpHi, ShOpLo, ShAmt); Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt); } else { - Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); + Tmp2 = DAG.getNode(ISD::FSHR, dl, VT, ShOpHi, ShOpLo, ShAmt); Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt); } @@ -17019,6 +17028,37 @@ return DAG.getMergeValues({ Lo, Hi }, dl); } +static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) && + "Unexpected funnel shift opcode!"); + assert((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) && + "Unexpected funnel shift type!"); + + SDLoc DL(Op); + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDValue Amt = Op.getOperand(2); + + // Expand slow SHLD/SHRD cases. + // TODO - can we be more selective here: OptSize/RMW etc.? 
+ if (Subtarget.isSHLDSlow()) + return SDValue(); + + bool IsFSHR = Op.getOpcode() == ISD::FSHR; + if (IsFSHR) + std::swap(Op0, Op1); + + // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo. + if (VT == MVT::i16) + Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, + DAG.getConstant(15, DL, Amt.getValueType())); + + unsigned SHDOp = (IsFSHR ? X86ISD::SHRD : X86ISD::SHLD); + return DAG.getNode(SHDOp, DL, VT, Op0, Op1, Amt); +} + // Try to use a packed vector operation to handle i64 on 32-bit targets when // AVX512DQ is enabled. static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG, @@ -26115,6 +26155,8 @@ case ISD::SHL_PARTS: case ISD::SRA_PARTS: case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); + case ISD::FSHL: + case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG); case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); Index: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp +++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp @@ -2022,7 +2022,7 @@ static const CostTblEntry X64CostTbl[] = { // 64-bit targets { ISD::ROTL, MVT::i64, 1 }, { ISD::ROTR, MVT::i64, 1 }, - { X86ISD::SHLD, MVT::i64, 4 } + { ISD::FSHL, MVT::i64, 4 } }; static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets { ISD::ROTL, MVT::i32, 1 }, @@ -2031,9 +2031,9 @@ { ISD::ROTR, MVT::i32, 1 }, { ISD::ROTR, MVT::i16, 1 }, { ISD::ROTR, MVT::i8, 1 }, - { X86ISD::SHLD, MVT::i32, 4 }, - { X86ISD::SHLD, MVT::i16, 4 }, - { X86ISD::SHLD, MVT::i8, 4 } + { ISD::FSHL, MVT::i32, 4 }, + { ISD::FSHL, MVT::i16, 4 }, + { ISD::FSHL, MVT::i8, 4 } }; unsigned ISD = ISD::DELETED_NODE; @@ -2041,13 +2041,13 @@ default: break; case Intrinsic::fshl: - ISD = X86ISD::SHLD; + ISD = ISD::FSHL; if (Args[0] == Args[1]) ISD = ISD::ROTL; break; case Intrinsic::fshr: - // SHRD has same costs so don't duplicate. - ISD = X86ISD::SHLD; + // FSHR has same costs so don't duplicate. 
+ ISD = ISD::FSHL; if (Args[0] == Args[1]) ISD = ISD::ROTR; break; Index: llvm/trunk/test/CodeGen/X86/fshl.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/fshl.ll +++ llvm/trunk/test/CodeGen/X86/fshl.ll @@ -58,20 +58,11 @@ define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind { ; X86-FAST-LABEL: var_shift_i16: ; X86-FAST: # %bb.0: -; X86-FAST-NEXT: pushl %esi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %esi -; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-FAST-NEXT: andl $15, %ecx -; X86-FAST-NEXT: movl %eax, %edx -; X86-FAST-NEXT: shldw %cl, %si, %dx -; X86-FAST-NEXT: testw %cx, %cx -; X86-FAST-NEXT: je .LBB1_2 -; X86-FAST-NEXT: # %bb.1: -; X86-FAST-NEXT: movl %edx, %eax -; X86-FAST-NEXT: .LBB1_2: -; X86-FAST-NEXT: # kill: def $ax killed $ax killed $eax -; X86-FAST-NEXT: popl %esi +; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-FAST-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-FAST-NEXT: andb $15, %cl +; X86-FAST-NEXT: shldw %cl, %dx, %ax ; X86-FAST-NEXT: retl ; ; X86-SLOW-LABEL: var_shift_i16: @@ -79,17 +70,16 @@ ; X86-SLOW-NEXT: pushl %edi ; X86-SLOW-NEXT: pushl %esi ; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %esi -; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; X86-SLOW-NEXT: andl $15, %edx +; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %dl +; X86-SLOW-NEXT: andb $15, %dl ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SLOW-NEXT: movl %eax, %edi ; X86-SLOW-NEXT: movl %edx, %ecx ; X86-SLOW-NEXT: shll %cl, %edi -; X86-SLOW-NEXT: movl $16, %ecx -; X86-SLOW-NEXT: subl %edx, %ecx -; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SLOW-NEXT: movb $16, %cl +; X86-SLOW-NEXT: subb %dl, %cl ; X86-SLOW-NEXT: shrl %cl, %esi -; X86-SLOW-NEXT: testw %dx, %dx +; X86-SLOW-NEXT: testb %dl, %dl ; X86-SLOW-NEXT: je .LBB1_2 ; X86-SLOW-NEXT: # %bb.1: ; X86-SLOW-NEXT: orl %esi, %edi @@ -103,27 +93,25 @@ ; X64-FAST-LABEL: var_shift_i16: ; X64-FAST: # %bb.0: ; X64-FAST-NEXT: movl %edx, %ecx -; X64-FAST-NEXT: andl $15, %ecx ; X64-FAST-NEXT: movl %edi, %eax +; X64-FAST-NEXT: andb $15, %cl +; X64-FAST-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-FAST-NEXT: shldw %cl, %si, %ax -; X64-FAST-NEXT: testw %cx, %cx -; X64-FAST-NEXT: cmovel %edi, %eax ; X64-FAST-NEXT: # kill: def $ax killed $ax killed $eax ; X64-FAST-NEXT: retq ; ; X64-SLOW-LABEL: var_shift_i16: ; X64-SLOW: # %bb.0: ; X64-SLOW-NEXT: movzwl %si, %eax -; X64-SLOW-NEXT: andl $15, %edx +; X64-SLOW-NEXT: andb $15, %dl ; X64-SLOW-NEXT: movl %edi, %esi ; X64-SLOW-NEXT: movl %edx, %ecx ; X64-SLOW-NEXT: shll %cl, %esi -; X64-SLOW-NEXT: movl $16, %ecx -; X64-SLOW-NEXT: subl %edx, %ecx -; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-SLOW-NEXT: movb $16, %cl +; X64-SLOW-NEXT: subb %dl, %cl ; X64-SLOW-NEXT: shrl %cl, %eax ; X64-SLOW-NEXT: orl %esi, %eax -; X64-SLOW-NEXT: testw %dx, %dx +; X64-SLOW-NEXT: testb %dl, %dl ; X64-SLOW-NEXT: cmovel %edi, %eax ; X64-SLOW-NEXT: # kill: def $ax killed $ax killed $eax ; X64-SLOW-NEXT: retq @@ -134,19 +122,10 @@ define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind { ; X86-FAST-LABEL: var_shift_i32: ; X86-FAST: # %bb.0: -; X86-FAST-NEXT: pushl %esi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-FAST-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-FAST-NEXT: andl $31, %ecx -; X86-FAST-NEXT: movl %eax, 
%edx -; X86-FAST-NEXT: shldl %cl, %esi, %edx -; X86-FAST-NEXT: testl %ecx, %ecx -; X86-FAST-NEXT: je .LBB2_2 -; X86-FAST-NEXT: # %bb.1: -; X86-FAST-NEXT: movl %edx, %eax -; X86-FAST-NEXT: .LBB2_2: -; X86-FAST-NEXT: popl %esi +; X86-FAST-NEXT: shldl %cl, %edx, %eax ; X86-FAST-NEXT: retl ; ; X86-SLOW-LABEL: var_shift_i32: @@ -154,17 +133,16 @@ ; X86-SLOW-NEXT: pushl %edi ; X86-SLOW-NEXT: pushl %esi ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %dl ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SLOW-NEXT: andl $31, %edx ; X86-SLOW-NEXT: movl %eax, %edi ; X86-SLOW-NEXT: movl %edx, %ecx ; X86-SLOW-NEXT: shll %cl, %edi +; X86-SLOW-NEXT: andb $31, %dl ; X86-SLOW-NEXT: movl %edx, %ecx -; X86-SLOW-NEXT: negl %ecx -; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SLOW-NEXT: negb %cl ; X86-SLOW-NEXT: shrl %cl, %esi -; X86-SLOW-NEXT: testl %edx, %edx +; X86-SLOW-NEXT: testb %dl, %dl ; X86-SLOW-NEXT: je .LBB2_2 ; X86-SLOW-NEXT: # %bb.1: ; X86-SLOW-NEXT: orl %esi, %edi @@ -177,26 +155,23 @@ ; X64-FAST-LABEL: var_shift_i32: ; X64-FAST: # %bb.0: ; X64-FAST-NEXT: movl %edx, %ecx -; X64-FAST-NEXT: andl $31, %ecx ; X64-FAST-NEXT: movl %edi, %eax +; X64-FAST-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-FAST-NEXT: shldl %cl, %esi, %eax -; X64-FAST-NEXT: testl %ecx, %ecx -; X64-FAST-NEXT: cmovel %edi, %eax ; X64-FAST-NEXT: retq ; ; X64-SLOW-LABEL: var_shift_i32: ; X64-SLOW: # %bb.0: ; X64-SLOW-NEXT: movl %esi, %eax -; X64-SLOW-NEXT: andl $31, %edx ; X64-SLOW-NEXT: movl %edi, %esi ; X64-SLOW-NEXT: movl %edx, %ecx ; X64-SLOW-NEXT: shll %cl, %esi +; X64-SLOW-NEXT: andb $31, %dl ; X64-SLOW-NEXT: movl %edx, %ecx -; X64-SLOW-NEXT: negl %ecx -; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-SLOW-NEXT: negb %cl ; X64-SLOW-NEXT: shrl %cl, %eax ; X64-SLOW-NEXT: orl %esi, %eax -; X64-SLOW-NEXT: testl %edx, %edx +; X64-SLOW-NEXT: testb %dl, %dl ; X64-SLOW-NEXT: cmovel %edi, %eax ; X64-SLOW-NEXT: retq %tmp = tail call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z) @@ -204,85 +179,166 @@ } define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind { -; X86-LABEL: var_shift_i64: -; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: andl $63, %ebx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: shll %cl, %edi -; X86-NEXT: shldl %cl, %eax, %ebp -; X86-NEXT: testb $32, %bl -; X86-NEXT: je .LBB3_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %edi, %ebp -; X86-NEXT: xorl %edi, %edi -; X86-NEXT: .LBB3_2: -; X86-NEXT: movb $64, %cl -; X86-NEXT: subb %bl, %cl -; X86-NEXT: movl %edx, %esi -; X86-NEXT: shrl %cl, %esi -; X86-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill -; X86-NEXT: testb $32, %cl -; X86-NEXT: jne .LBB3_3 -; X86-NEXT: # %bb.4: -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NEXT: testl %ebx, %ebx -; X86-NEXT: jne .LBB3_6 -; X86-NEXT: jmp .LBB3_7 -; X86-NEXT: .LBB3_3: -; X86-NEXT: movl %esi, %ecx -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: testl %ebx, %ebx -; X86-NEXT: je .LBB3_7 -; X86-NEXT: .LBB3_6: -; X86-NEXT: orl %esi, %ebp -; X86-NEXT: orl %ecx, 
%edi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl %ebp, %edx -; X86-NEXT: .LBB3_7: -; X86-NEXT: addl $4, %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp -; X86-NEXT: retl +; X86-FAST-LABEL: var_shift_i64: +; X86-FAST: # %bb.0: +; X86-FAST-NEXT: pushl %ebp +; X86-FAST-NEXT: pushl %ebx +; X86-FAST-NEXT: pushl %edi +; X86-FAST-NEXT: pushl %esi +; X86-FAST-NEXT: pushl %eax +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-FAST-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-FAST-NEXT: andl $63, %ebx +; X86-FAST-NEXT: movl %eax, %edi +; X86-FAST-NEXT: movl %ebx, %ecx +; X86-FAST-NEXT: shll %cl, %edi +; X86-FAST-NEXT: shldl %cl, %eax, %ebp +; X86-FAST-NEXT: testb $32, %bl +; X86-FAST-NEXT: je .LBB3_2 +; X86-FAST-NEXT: # %bb.1: +; X86-FAST-NEXT: movl %edi, %ebp +; X86-FAST-NEXT: xorl %edi, %edi +; X86-FAST-NEXT: .LBB3_2: +; X86-FAST-NEXT: movb $64, %cl +; X86-FAST-NEXT: subb %bl, %cl +; X86-FAST-NEXT: movl %edx, %esi +; X86-FAST-NEXT: shrl %cl, %esi +; X86-FAST-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill +; X86-FAST-NEXT: testb $32, %cl +; X86-FAST-NEXT: jne .LBB3_3 +; X86-FAST-NEXT: # %bb.4: +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-FAST-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-FAST-NEXT: testl %ebx, %ebx +; X86-FAST-NEXT: jne .LBB3_6 +; X86-FAST-NEXT: jmp .LBB3_7 +; X86-FAST-NEXT: .LBB3_3: +; X86-FAST-NEXT: movl %esi, %ecx +; X86-FAST-NEXT: xorl %esi, %esi +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-FAST-NEXT: testl %ebx, %ebx +; X86-FAST-NEXT: je .LBB3_7 +; X86-FAST-NEXT: .LBB3_6: +; X86-FAST-NEXT: orl %esi, %ebp +; X86-FAST-NEXT: orl %ecx, %edi +; X86-FAST-NEXT: movl %edi, %eax +; X86-FAST-NEXT: movl %ebp, %edx +; X86-FAST-NEXT: .LBB3_7: +; X86-FAST-NEXT: addl $4, %esp +; X86-FAST-NEXT: popl %esi +; X86-FAST-NEXT: popl %edi +; X86-FAST-NEXT: popl %ebx +; X86-FAST-NEXT: popl %ebp +; X86-FAST-NEXT: retl +; +; X86-SLOW-LABEL: var_shift_i64: +; X86-SLOW: # %bb.0: +; X86-SLOW-NEXT: pushl %ebp +; X86-SLOW-NEXT: pushl %ebx +; X86-SLOW-NEXT: pushl %edi +; X86-SLOW-NEXT: pushl %esi +; X86-SLOW-NEXT: subl $8, %esp +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-SLOW-NEXT: andl $63, %ebx +; X86-SLOW-NEXT: movb $64, %dh +; X86-SLOW-NEXT: subb %bl, %dh +; X86-SLOW-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: movb %dh, %cl +; X86-SLOW-NEXT: shrl %cl, %eax +; X86-SLOW-NEXT: movb %dh, %dl +; X86-SLOW-NEXT: andb $31, %dl +; X86-SLOW-NEXT: movl %edx, %ecx +; X86-SLOW-NEXT: negb %cl +; X86-SLOW-NEXT: movl %esi, %ebp +; X86-SLOW-NEXT: shll %cl, %ebp +; X86-SLOW-NEXT: testb %dl, %dl +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: je .LBB3_2 +; X86-SLOW-NEXT: # %bb.1: +; X86-SLOW-NEXT: orl %eax, %ebp +; X86-SLOW-NEXT: movl %ebp, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: .LBB3_2: +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-SLOW-NEXT: movl %ebp, %eax +; X86-SLOW-NEXT: movl %ebx, %ecx +; X86-SLOW-NEXT: shll %cl, %eax +; X86-SLOW-NEXT: movb %bl, %ch +; X86-SLOW-NEXT: andb $31, %ch +; X86-SLOW-NEXT: movb %ch, %cl +; X86-SLOW-NEXT: negb %cl +; X86-SLOW-NEXT: shrl %cl, %edi +; X86-SLOW-NEXT: testb %ch, %ch +; 
X86-SLOW-NEXT: je .LBB3_4 +; X86-SLOW-NEXT: # %bb.3: +; X86-SLOW-NEXT: orl %edi, %eax +; X86-SLOW-NEXT: movl %eax, %ebp +; X86-SLOW-NEXT: .LBB3_4: +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SLOW-NEXT: movl %eax, %edi +; X86-SLOW-NEXT: movl %ebx, %ecx +; X86-SLOW-NEXT: shll %cl, %edi +; X86-SLOW-NEXT: testb $32, %bl +; X86-SLOW-NEXT: je .LBB3_6 +; X86-SLOW-NEXT: # %bb.5: +; X86-SLOW-NEXT: movl %edi, %ebp +; X86-SLOW-NEXT: xorl %edi, %edi +; X86-SLOW-NEXT: .LBB3_6: +; X86-SLOW-NEXT: movb %dh, %cl +; X86-SLOW-NEXT: shrl %cl, %esi +; X86-SLOW-NEXT: testb $32, %dh +; X86-SLOW-NEXT: jne .LBB3_7 +; X86-SLOW-NEXT: # %bb.8: +; X86-SLOW-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-SLOW-NEXT: testl %ebx, %ebx +; X86-SLOW-NEXT: jne .LBB3_10 +; X86-SLOW-NEXT: jmp .LBB3_11 +; X86-SLOW-NEXT: .LBB3_7: +; X86-SLOW-NEXT: movl %esi, %ecx +; X86-SLOW-NEXT: xorl %esi, %esi +; X86-SLOW-NEXT: testl %ebx, %ebx +; X86-SLOW-NEXT: je .LBB3_11 +; X86-SLOW-NEXT: .LBB3_10: +; X86-SLOW-NEXT: orl %esi, %ebp +; X86-SLOW-NEXT: orl %ecx, %edi +; X86-SLOW-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %edi, %eax +; X86-SLOW-NEXT: .LBB3_11: +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-SLOW-NEXT: addl $8, %esp +; X86-SLOW-NEXT: popl %esi +; X86-SLOW-NEXT: popl %edi +; X86-SLOW-NEXT: popl %ebx +; X86-SLOW-NEXT: popl %ebp +; X86-SLOW-NEXT: retl ; ; X64-FAST-LABEL: var_shift_i64: ; X64-FAST: # %bb.0: ; X64-FAST-NEXT: movq %rdx, %rcx -; X64-FAST-NEXT: andl $63, %ecx ; X64-FAST-NEXT: movq %rdi, %rax +; X64-FAST-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-FAST-NEXT: shldq %cl, %rsi, %rax -; X64-FAST-NEXT: testq %rcx, %rcx -; X64-FAST-NEXT: cmoveq %rdi, %rax ; X64-FAST-NEXT: retq ; ; X64-SLOW-LABEL: var_shift_i64: ; X64-SLOW: # %bb.0: ; X64-SLOW-NEXT: movq %rsi, %rax -; X64-SLOW-NEXT: andl $63, %edx ; X64-SLOW-NEXT: movq %rdi, %rsi ; X64-SLOW-NEXT: movl %edx, %ecx ; X64-SLOW-NEXT: shlq %cl, %rsi +; X64-SLOW-NEXT: andb $63, %dl ; X64-SLOW-NEXT: movl %edx, %ecx -; X64-SLOW-NEXT: negl %ecx -; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-SLOW-NEXT: negb %cl ; X64-SLOW-NEXT: shrq %cl, %rax ; X64-SLOW-NEXT: orq %rsi, %rax -; X64-SLOW-NEXT: testq %rdx, %rdx +; X64-SLOW-NEXT: testb %dl, %dl ; X64-SLOW-NEXT: cmoveq %rdi, %rax ; X64-SLOW-NEXT: retq %tmp = tail call i64 @llvm.fshl.i64(i64 %x, i64 %y, i64 %z) Index: llvm/trunk/test/CodeGen/X86/fshr.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/fshr.ll +++ llvm/trunk/test/CodeGen/X86/fshr.ll @@ -58,20 +58,11 @@ define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind { ; X86-FAST-LABEL: var_shift_i16: ; X86-FAST: # %bb.0: -; X86-FAST-NEXT: pushl %esi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %esi -; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-FAST-NEXT: andl $15, %ecx -; X86-FAST-NEXT: movl %eax, %edx -; X86-FAST-NEXT: shrdw %cl, %si, %dx -; X86-FAST-NEXT: testw %cx, %cx -; X86-FAST-NEXT: je .LBB1_2 -; X86-FAST-NEXT: # %bb.1: -; X86-FAST-NEXT: movl %edx, %eax -; X86-FAST-NEXT: .LBB1_2: -; X86-FAST-NEXT: # kill: def $ax killed $ax killed $eax -; X86-FAST-NEXT: popl %esi +; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-FAST-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-FAST-NEXT: andb $15, %cl +; X86-FAST-NEXT: shrdw %cl, %dx, %ax ; X86-FAST-NEXT: retl ; ; X86-SLOW-LABEL: var_shift_i16: @@ -79,17 +70,16 @@ ; X86-SLOW-NEXT: pushl %edi ; 
X86-SLOW-NEXT: pushl %esi ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; X86-SLOW-NEXT: andl $15, %edx +; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %dl +; X86-SLOW-NEXT: andb $15, %dl ; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-SLOW-NEXT: movl %eax, %edi ; X86-SLOW-NEXT: movl %edx, %ecx ; X86-SLOW-NEXT: shrl %cl, %edi -; X86-SLOW-NEXT: movl $16, %ecx -; X86-SLOW-NEXT: subl %edx, %ecx -; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SLOW-NEXT: movb $16, %cl +; X86-SLOW-NEXT: subb %dl, %cl ; X86-SLOW-NEXT: shll %cl, %esi -; X86-SLOW-NEXT: testw %dx, %dx +; X86-SLOW-NEXT: testb %dl, %dl ; X86-SLOW-NEXT: je .LBB1_2 ; X86-SLOW-NEXT: # %bb.1: ; X86-SLOW-NEXT: orl %edi, %esi @@ -103,26 +93,24 @@ ; X64-FAST-LABEL: var_shift_i16: ; X64-FAST: # %bb.0: ; X64-FAST-NEXT: movl %edx, %ecx -; X64-FAST-NEXT: andl $15, %ecx ; X64-FAST-NEXT: movl %esi, %eax +; X64-FAST-NEXT: andb $15, %cl +; X64-FAST-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-FAST-NEXT: shrdw %cl, %di, %ax -; X64-FAST-NEXT: testw %cx, %cx -; X64-FAST-NEXT: cmovel %esi, %eax ; X64-FAST-NEXT: # kill: def $ax killed $ax killed $eax ; X64-FAST-NEXT: retq ; ; X64-SLOW-LABEL: var_shift_i16: ; X64-SLOW: # %bb.0: ; X64-SLOW-NEXT: movzwl %si, %eax -; X64-SLOW-NEXT: andl $15, %edx +; X64-SLOW-NEXT: andb $15, %dl ; X64-SLOW-NEXT: movl %edx, %ecx ; X64-SLOW-NEXT: shrl %cl, %eax -; X64-SLOW-NEXT: movl $16, %ecx -; X64-SLOW-NEXT: subl %edx, %ecx -; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-SLOW-NEXT: movb $16, %cl +; X64-SLOW-NEXT: subb %dl, %cl ; X64-SLOW-NEXT: shll %cl, %edi ; X64-SLOW-NEXT: orl %edi, %eax -; X64-SLOW-NEXT: testw %dx, %dx +; X64-SLOW-NEXT: testb %dl, %dl ; X64-SLOW-NEXT: cmovel %esi, %eax ; X64-SLOW-NEXT: # kill: def $ax killed $ax killed $eax ; X64-SLOW-NEXT: retq @@ -133,19 +121,10 @@ define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind { ; X86-FAST-LABEL: var_shift_i32: ; X86-FAST: # %bb.0: -; X86-FAST-NEXT: pushl %esi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-FAST-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-FAST-NEXT: andl $31, %ecx -; X86-FAST-NEXT: movl %eax, %edx -; X86-FAST-NEXT: shrdl %cl, %esi, %edx -; X86-FAST-NEXT: testl %ecx, %ecx -; X86-FAST-NEXT: je .LBB2_2 -; X86-FAST-NEXT: # %bb.1: -; X86-FAST-NEXT: movl %edx, %eax -; X86-FAST-NEXT: .LBB2_2: -; X86-FAST-NEXT: popl %esi +; X86-FAST-NEXT: shrdl %cl, %edx, %eax ; X86-FAST-NEXT: retl ; ; X86-SLOW-LABEL: var_shift_i32: @@ -153,17 +132,16 @@ ; X86-SLOW-NEXT: pushl %edi ; X86-SLOW-NEXT: pushl %esi ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %dl ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SLOW-NEXT: andl $31, %edx ; X86-SLOW-NEXT: movl %eax, %edi ; X86-SLOW-NEXT: movl %edx, %ecx ; X86-SLOW-NEXT: shrl %cl, %edi +; X86-SLOW-NEXT: andb $31, %dl ; X86-SLOW-NEXT: movl %edx, %ecx -; X86-SLOW-NEXT: negl %ecx -; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SLOW-NEXT: negb %cl ; X86-SLOW-NEXT: shll %cl, %esi -; X86-SLOW-NEXT: testl %edx, %edx +; X86-SLOW-NEXT: testb %dl, %dl ; X86-SLOW-NEXT: je .LBB2_2 ; X86-SLOW-NEXT: # %bb.1: ; X86-SLOW-NEXT: orl %edi, %esi @@ -176,26 +154,23 @@ ; X64-FAST-LABEL: var_shift_i32: ; X64-FAST: # %bb.0: ; X64-FAST-NEXT: movl %edx, %ecx -; X64-FAST-NEXT: andl $31, %ecx ; X64-FAST-NEXT: movl %esi, %eax +; 
X64-FAST-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-FAST-NEXT: shrdl %cl, %edi, %eax -; X64-FAST-NEXT: testl %ecx, %ecx -; X64-FAST-NEXT: cmovel %esi, %eax ; X64-FAST-NEXT: retq ; ; X64-SLOW-LABEL: var_shift_i32: ; X64-SLOW: # %bb.0: ; X64-SLOW-NEXT: movl %edi, %eax -; X64-SLOW-NEXT: andl $31, %edx ; X64-SLOW-NEXT: movl %esi, %edi ; X64-SLOW-NEXT: movl %edx, %ecx ; X64-SLOW-NEXT: shrl %cl, %edi +; X64-SLOW-NEXT: andb $31, %dl ; X64-SLOW-NEXT: movl %edx, %ecx -; X64-SLOW-NEXT: negl %ecx -; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-SLOW-NEXT: negb %cl ; X64-SLOW-NEXT: shll %cl, %eax ; X64-SLOW-NEXT: orl %edi, %eax -; X64-SLOW-NEXT: testl %edx, %edx +; X64-SLOW-NEXT: testb %dl, %dl ; X64-SLOW-NEXT: cmovel %esi, %eax ; X64-SLOW-NEXT: retq %tmp = tail call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z) @@ -203,81 +178,164 @@ } define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind { -; X86-LABEL: var_shift_i64: -; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: andl $63, %ebx -; X86-NEXT: movb $64, %cl -; X86-NEXT: subb %bl, %cl -; X86-NEXT: movl %eax, %edi -; X86-NEXT: shll %cl, %edi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: testb $32, %cl -; X86-NEXT: je .LBB3_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %edi, %esi -; X86-NEXT: xorl %edi, %edi -; X86-NEXT: .LBB3_2: -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: shrl %cl, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shrdl %cl, %edx, %eax -; X86-NEXT: testb $32, %bl -; X86-NEXT: je .LBB3_4 -; X86-NEXT: # %bb.3: -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: xorl %ebp, %ebp -; X86-NEXT: .LBB3_4: -; X86-NEXT: testl %ebx, %ebx -; X86-NEXT: je .LBB3_6 -; X86-NEXT: # %bb.5: -; X86-NEXT: orl %ebp, %esi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: movl %edi, (%esp) # 4-byte Spill -; X86-NEXT: movl %esi, %edx -; X86-NEXT: .LBB3_6: -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: addl $4, %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp -; X86-NEXT: retl +; X86-FAST-LABEL: var_shift_i64: +; X86-FAST: # %bb.0: +; X86-FAST-NEXT: pushl %ebp +; X86-FAST-NEXT: pushl %ebx +; X86-FAST-NEXT: pushl %edi +; X86-FAST-NEXT: pushl %esi +; X86-FAST-NEXT: pushl %eax +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-FAST-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-FAST-NEXT: andl $63, %ebx +; X86-FAST-NEXT: movb $64, %cl +; X86-FAST-NEXT: subb %bl, %cl +; X86-FAST-NEXT: movl %eax, %edi +; X86-FAST-NEXT: shll %cl, %edi +; X86-FAST-NEXT: shldl %cl, %eax, %esi +; X86-FAST-NEXT: testb $32, %cl +; X86-FAST-NEXT: je .LBB3_2 +; X86-FAST-NEXT: # %bb.1: +; X86-FAST-NEXT: movl %edi, %esi +; X86-FAST-NEXT: xorl %edi, %edi +; X86-FAST-NEXT: .LBB3_2: +; X86-FAST-NEXT: movl %edx, %ebp +; X86-FAST-NEXT: movl %ebx, %ecx +; X86-FAST-NEXT: shrl %cl, %ebp +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-FAST-NEXT: shrdl %cl, %edx, %eax +; X86-FAST-NEXT: testb $32, %bl +; X86-FAST-NEXT: je .LBB3_4 +; X86-FAST-NEXT: # %bb.3: +; 
X86-FAST-NEXT: movl %ebp, %eax +; X86-FAST-NEXT: xorl %ebp, %ebp +; X86-FAST-NEXT: .LBB3_4: +; X86-FAST-NEXT: testl %ebx, %ebx +; X86-FAST-NEXT: je .LBB3_6 +; X86-FAST-NEXT: # %bb.5: +; X86-FAST-NEXT: orl %ebp, %esi +; X86-FAST-NEXT: orl %eax, %edi +; X86-FAST-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-FAST-NEXT: movl %esi, %edx +; X86-FAST-NEXT: .LBB3_6: +; X86-FAST-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-FAST-NEXT: addl $4, %esp +; X86-FAST-NEXT: popl %esi +; X86-FAST-NEXT: popl %edi +; X86-FAST-NEXT: popl %ebx +; X86-FAST-NEXT: popl %ebp +; X86-FAST-NEXT: retl +; +; X86-SLOW-LABEL: var_shift_i64: +; X86-SLOW: # %bb.0: +; X86-SLOW-NEXT: pushl %ebp +; X86-SLOW-NEXT: pushl %ebx +; X86-SLOW-NEXT: pushl %edi +; X86-SLOW-NEXT: pushl %esi +; X86-SLOW-NEXT: subl $8, %esp +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-SLOW-NEXT: andl $63, %ebx +; X86-SLOW-NEXT: movb $64, %al +; X86-SLOW-NEXT: subb %bl, %al +; X86-SLOW-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: movl %eax, %ecx +; X86-SLOW-NEXT: shll %cl, %edx +; X86-SLOW-NEXT: movb %al, %ch +; X86-SLOW-NEXT: andb $31, %ch +; X86-SLOW-NEXT: movb %ch, %cl +; X86-SLOW-NEXT: negb %cl +; X86-SLOW-NEXT: movl %esi, %edi +; X86-SLOW-NEXT: shrl %cl, %edi +; X86-SLOW-NEXT: testb %ch, %ch +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-SLOW-NEXT: je .LBB3_2 +; X86-SLOW-NEXT: # %bb.1: +; X86-SLOW-NEXT: orl %edi, %edx +; X86-SLOW-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: .LBB3_2: +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SLOW-NEXT: movl %ecx, %edx +; X86-SLOW-NEXT: movl %ebx, %ecx +; X86-SLOW-NEXT: shrl %cl, %edx +; X86-SLOW-NEXT: movb %bl, %ah +; X86-SLOW-NEXT: andb $31, %ah +; X86-SLOW-NEXT: movb %ah, %cl +; X86-SLOW-NEXT: negb %cl +; X86-SLOW-NEXT: movl %ebp, %edi +; X86-SLOW-NEXT: shll %cl, %edi +; X86-SLOW-NEXT: testb %ah, %ah +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-SLOW-NEXT: je .LBB3_4 +; X86-SLOW-NEXT: # %bb.3: +; X86-SLOW-NEXT: orl %edx, %edi +; X86-SLOW-NEXT: movl %edi, %ebp +; X86-SLOW-NEXT: .LBB3_4: +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SLOW-NEXT: movl %ebx, %ecx +; X86-SLOW-NEXT: shrl %cl, %edi +; X86-SLOW-NEXT: testb $32, %bl +; X86-SLOW-NEXT: je .LBB3_6 +; X86-SLOW-NEXT: # %bb.5: +; X86-SLOW-NEXT: movl %edi, %ebp +; X86-SLOW-NEXT: xorl %edi, %edi +; X86-SLOW-NEXT: .LBB3_6: +; X86-SLOW-NEXT: movl %eax, %ecx +; X86-SLOW-NEXT: shll %cl, %esi +; X86-SLOW-NEXT: testb $32, %al +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SLOW-NEXT: jne .LBB3_7 +; X86-SLOW-NEXT: # %bb.8: +; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-SLOW-NEXT: testl %ebx, %ebx +; X86-SLOW-NEXT: jne .LBB3_10 +; X86-SLOW-NEXT: jmp .LBB3_11 +; X86-SLOW-NEXT: .LBB3_7: +; X86-SLOW-NEXT: movl %esi, %eax +; X86-SLOW-NEXT: xorl %esi, %esi +; X86-SLOW-NEXT: testl %ebx, %ebx +; X86-SLOW-NEXT: je .LBB3_11 +; X86-SLOW-NEXT: .LBB3_10: +; X86-SLOW-NEXT: orl %ebp, %esi +; X86-SLOW-NEXT: orl %edi, %eax +; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %eax, %edx +; X86-SLOW-NEXT: .LBB3_11: +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-SLOW-NEXT: addl $8, %esp +; X86-SLOW-NEXT: popl %esi +; X86-SLOW-NEXT: popl %edi +; X86-SLOW-NEXT: popl %ebx +; X86-SLOW-NEXT: popl %ebp +; X86-SLOW-NEXT: retl ; ; X64-FAST-LABEL: 
var_shift_i64: ; X64-FAST: # %bb.0: ; X64-FAST-NEXT: movq %rdx, %rcx -; X64-FAST-NEXT: andl $63, %ecx ; X64-FAST-NEXT: movq %rsi, %rax +; X64-FAST-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-FAST-NEXT: shrdq %cl, %rdi, %rax -; X64-FAST-NEXT: testq %rcx, %rcx -; X64-FAST-NEXT: cmoveq %rsi, %rax ; X64-FAST-NEXT: retq ; ; X64-SLOW-LABEL: var_shift_i64: ; X64-SLOW: # %bb.0: ; X64-SLOW-NEXT: movq %rdi, %rax -; X64-SLOW-NEXT: andl $63, %edx ; X64-SLOW-NEXT: movq %rsi, %rdi ; X64-SLOW-NEXT: movl %edx, %ecx ; X64-SLOW-NEXT: shrq %cl, %rdi +; X64-SLOW-NEXT: andb $63, %dl ; X64-SLOW-NEXT: movl %edx, %ecx -; X64-SLOW-NEXT: negl %ecx -; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-SLOW-NEXT: negb %cl ; X64-SLOW-NEXT: shlq %cl, %rax ; X64-SLOW-NEXT: orq %rdi, %rax -; X64-SLOW-NEXT: testq %rdx, %rdx +; X64-SLOW-NEXT: testb %dl, %dl ; X64-SLOW-NEXT: cmoveq %rsi, %rax ; X64-SLOW-NEXT: retq %tmp = tail call i64 @llvm.fshr.i64(i64 %x, i64 %y, i64 %z) @@ -315,7 +373,7 @@ ; X86-FAST: # %bb.0: ; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-FAST-NEXT: shldw $9, %cx, %ax +; X86-FAST-NEXT: shrdw $7, %cx, %ax ; X86-FAST-NEXT: retl ; ; X86-SLOW-LABEL: const_shift_i16: @@ -330,8 +388,8 @@ ; ; X64-FAST-LABEL: const_shift_i16: ; X64-FAST: # %bb.0: -; X64-FAST-NEXT: movl %edi, %eax -; X64-FAST-NEXT: shldw $9, %si, %ax +; X64-FAST-NEXT: movl %esi, %eax +; X64-FAST-NEXT: shrdw $7, %di, %ax ; X64-FAST-NEXT: # kill: def $ax killed $ax killed $eax ; X64-FAST-NEXT: retq ; @@ -352,7 +410,7 @@ ; X86-FAST: # %bb.0: ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-FAST-NEXT: shldl $25, %ecx, %eax +; X86-FAST-NEXT: shrdl $7, %ecx, %eax ; X86-FAST-NEXT: retl ; ; X86-SLOW-LABEL: const_shift_i32: Index: llvm/trunk/test/CodeGen/X86/funnel-shift.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/funnel-shift.ll +++ llvm/trunk/test/CodeGen/X86/funnel-shift.ll @@ -14,31 +14,23 @@ declare i64 @llvm.fshr.i64(i64, i64, i64) declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) -; General case - all operands can be variables - x86 has shld, but the mask and cmov are not needed? +; General case - all operands can be variables define i32 @fshl_i32(i32 %x, i32 %y, i32 %z) nounwind { ; X32-SSE2-LABEL: fshl_i32: ; X32-SSE2: # %bb.0: -; X32-SSE2-NEXT: pushl %esi +; X32-SSE2-NEXT: movb {{[0-9]+}}(%esp), %cl ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-SSE2-NEXT: andl $31, %ecx -; X32-SSE2-NEXT: movl %esi, %eax +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE2-NEXT: shldl %cl, %edx, %eax -; X32-SSE2-NEXT: testl %ecx, %ecx -; X32-SSE2-NEXT: cmovel %esi, %eax -; X32-SSE2-NEXT: popl %esi ; X32-SSE2-NEXT: retl ; ; X64-AVX2-LABEL: fshl_i32: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: movl %edx, %ecx -; X64-AVX2-NEXT: andl $31, %ecx ; X64-AVX2-NEXT: movl %edi, %eax +; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-AVX2-NEXT: shldl %cl, %esi, %eax -; X64-AVX2-NEXT: testl %ecx, %ecx -; X64-AVX2-NEXT: cmovel %edi, %eax ; X64-AVX2-NEXT: retq %f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z) ret i32 %f @@ -212,31 +204,23 @@ ; Repeat everything for funnel shift right. -; General case - all operands can be variables - x86 has 'shrd', but the mask and cmov are not needed? 
+; General case - all operands can be variables
define i32 @fshr_i32(i32 %x, i32 %y, i32 %z) nounwind {
; X32-SSE2-LABEL: fshr_i32:
; X32-SSE2: # %bb.0:
-; X32-SSE2-NEXT: pushl %esi
+; X32-SSE2-NEXT: movb {{[0-9]+}}(%esp), %cl
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE2-NEXT: andl $31, %ecx
-; X32-SSE2-NEXT: movl %esi, %eax
+; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: shrdl %cl, %edx, %eax
-; X32-SSE2-NEXT: testl %ecx, %ecx
-; X32-SSE2-NEXT: cmovel %esi, %eax
-; X32-SSE2-NEXT: popl %esi
; X32-SSE2-NEXT: retl
;
; X64-AVX2-LABEL: fshr_i32:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: movl %edx, %ecx
-; X64-AVX2-NEXT: andl $31, %ecx
; X64-AVX2-NEXT: movl %esi, %eax
+; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-AVX2-NEXT: shrdl %cl, %edi, %eax
-; X64-AVX2-NEXT: testl %ecx, %ecx
-; X64-AVX2-NEXT: cmovel %esi, %eax
; X64-AVX2-NEXT: retq
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
ret i32 %f
@@ -341,7 +325,7 @@
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: shldl $23, %ecx, %eax
+; X32-SSE2-NEXT: shrdl $9, %ecx, %eax
; X32-SSE2-NEXT: retl
;
; X64-AVX2-LABEL: fshr_i32_const_shift:
@@ -353,14 +337,14 @@
ret i32 %f
}
-; Check modulo math on shift amount. 41-32=9, but right-shift became left, so 32-9=23.
+; Check modulo math on shift amount. 41-32=9, but right-shift may become left, so 32-9=23.
define i32 @fshr_i32_const_overshift(i32 %x, i32 %y) nounwind {
; X32-SSE2-LABEL: fshr_i32_const_overshift:
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: shldl $23, %ecx, %eax
+; X32-SSE2-NEXT: shrdl $9, %ecx, %eax
; X32-SSE2-NEXT: retl
;
; X64-AVX2-LABEL: fshr_i32_const_overshift:
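The X86-SLOW runs above exercise the generic expansion. As a hedged sketch (not part of the patch; a 32-bit element is assumed and the helper name expandFunnelShift32 is invented here), the node sequence built by TargetLowering::expandFunnelShift can be modelled in scalar C++: reduce the amount with AND for a power-of-two width (UREM otherwise), form the inverse amount with SUB, perform the SHL/SRL pair, OR the halves, and use SETCC plus SELECT to filter the zero-amount case where the opposite shift would be a shift by the full bit width.

#include <cstdint>

// Scalar model of the generic expansion for a 32-bit funnel shift. In the
// real DAG the OR is built unconditionally and the final SELECT chooses
// between it and X/Y; the early return here keeps the C++ shifts defined.
static uint32_t expandFunnelShift32(bool IsFSHL, uint32_t X, uint32_t Y,
                                    uint32_t Z) {
  const unsigned BW = 32;
  uint32_t ShAmt = Z & (BW - 1);                    // ISD::AND (power-of-2 BW)
  if (ShAmt == 0)                                   // models SETEQ + SELECT:
    return IsFSHL ? X : Y;                          // fshl -> X, fshr -> Y
  uint32_t InvShAmt = BW - ShAmt;                   // ISD::SUB
  uint32_t ShX = X << (IsFSHL ? ShAmt : InvShAmt);  // ISD::SHL
  uint32_t ShY = Y >> (IsFSHL ? InvShAmt : ShAmt);  // ISD::SRL
  return ShX | ShY;                                 // ISD::OR
}

For example, expandFunnelShift32(false, x, y, 41) reduces the amount to 41 & 31 = 9 and computes (x << 23) | (y >> 9), which is the modulo math the shrdl $9 checks above verify. The X86-FAST runs instead take the LowerFunnelShift path, which maps directly to SHLD/SHRD; per the comment in that function, only i16 needs the explicit mask of the amount, since the i32/i64 forms apply the modulo implicitly.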