diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -14331,6 +14331,136 @@ %res = call i4 @llvm.udiv.fix.i4(i4 3, i4 4, i32 1) ; %res = 2 (or 1) (1.5 / 2 = 0.75) +'``llvm.sdiv.fix.sat.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax +""""""" + +This is an overloaded intrinsic. You can use ``llvm.sdiv.fix.sat`` +on any integer bit width or vectors of integers. + +:: + + declare i16 @llvm.sdiv.fix.sat.i16(i16 %a, i16 %b, i32 %scale) + declare i32 @llvm.sdiv.fix.sat.i32(i32 %a, i32 %b, i32 %scale) + declare i64 @llvm.sdiv.fix.sat.i64(i64 %a, i64 %b, i32 %scale) + declare <4 x i32> @llvm.sdiv.fix.sat.v4i32(<4 x i32> %a, <4 x i32> %b, i32 %scale) + +Overview +""""""""" + +The '``llvm.sdiv.fix.sat``' family of intrinsic functions perform signed +fixed point saturation division on 2 arguments of the same scale. + +Arguments +"""""""""" + +The arguments (%a and %b) and the result may be of integer types of any bit +width, but they must have the same bit width. ``%a`` and ``%b`` are the two +values that will undergo signed fixed point division. The argument +``%scale`` represents the scale of both operands, and must be a constant +integer. + +Semantics: +"""""""""" + +This operation performs fixed point division on the 2 arguments of a +specified scale. The result will also be returned in the same scale specified +in the third argument. + +If the result value cannot be precisely represented in the given scale, the +value is rounded up or down to the closest representable value. The rounding +direction is unspecified. + +The maximum value this operation can clamp to is the largest signed value +representable by the bit width of the first 2 arguments. The minimum value is the +smallest signed value representable by this bit width. + +It is undefined behavior if the second argument is zero. + + +Examples +""""""""" + +.. code-block:: llvm + + %res = call i4 @llvm.sdiv.fix.sat.i4(i4 6, i4 2, i32 0) ; %res = 3 (6 / 2 = 3) + %res = call i4 @llvm.sdiv.fix.sat.i4(i4 6, i4 4, i32 1) ; %res = 3 (3 / 2 = 1.5) + %res = call i4 @llvm.sdiv.fix.sat.i4(i4 3, i4 -2, i32 1) ; %res = -3 (1.5 / -1 = -1.5) + + ; The result in the following could be rounded up to 1 or down to 0.5 + %res = call i4 @llvm.sdiv.fix.sat.i4(i4 3, i4 4, i32 1) ; %res = 2 (or 1) (1.5 / 2 = 0.75) + + ; Saturation + %res = call i4 @llvm.sdiv.fix.sat.i4(i4 -8, i4 -1, i32 0) ; %res = 7 (-8 / -1 = 8 => 7) + %res = call i4 @llvm.sdiv.fix.sat.i4(i4 4, i4 2, i32 2) ; %res = 7 (1 / 0.5 = 2 => 1.75) + %res = call i4 @llvm.sdiv.fix.sat.i4(i4 -4, i4 1, i32 2) ; %res = -8 (-1 / 0.25 = -4 => -2) + + +'``llvm.udiv.fix.sat.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax +""""""" + +This is an overloaded intrinsic. You can use ``llvm.udiv.fix.sat`` +on any integer bit width or vectors of integers. + +:: + + declare i16 @llvm.udiv.fix.sat.i16(i16 %a, i16 %b, i32 %scale) + declare i32 @llvm.udiv.fix.sat.i32(i32 %a, i32 %b, i32 %scale) + declare i64 @llvm.udiv.fix.sat.i64(i64 %a, i64 %b, i32 %scale) + declare <4 x i32> @llvm.udiv.fix.sat.v4i32(<4 x i32> %a, <4 x i32> %b, i32 %scale) + +Overview +""""""""" + +The '``llvm.udiv.fix.sat``' family of intrinsic functions perform unsigned +fixed point saturation division on 2 arguments of the same scale. + +Arguments +"""""""""" + +The arguments (%a and %b) and the result may be of integer types of any bit +width, but they must have the same bit width. 
``%a`` and ``%b`` are the two +values that will undergo unsigned fixed point division. The argument +``%scale`` represents the scale of both operands, and must be a constant +integer. + +Semantics: +"""""""""" + +This operation performs fixed point division on the 2 arguments of a +specified scale. The result will also be returned in the same scale specified +in the third argument. + +If the result value cannot be precisely represented in the given scale, the +value is rounded up or down to the closest representable value. The rounding +direction is unspecified. + +The maximum value this operation can clamp to is the largest unsigned value +representable by the bit width of the first 2 arguments. The minimum value is the +smallest unsigned value representable by this bit width (zero). + +It is undefined behavior if the second argument is zero. + +Examples +""""""""" + +.. code-block:: llvm + + %res = call i4 @llvm.udiv.fix.sat.i4(i4 6, i4 2, i32 0) ; %res = 3 (6 / 2 = 3) + %res = call i4 @llvm.udiv.fix.sat.i4(i4 6, i4 4, i32 1) ; %res = 3 (3 / 2 = 1.5) + + ; The result in the following could be rounded down to 0.5 or up to 1 + %res = call i4 @llvm.udiv.fix.sat.i4(i4 3, i4 4, i32 1) ; %res = 1 (or 2) (1.5 / 2 = 0.75) + + ; Saturation + %res = call i4 @llvm.udiv.fix.sat.i4(i4 8, i4 2, i32 2) ; %res = 15 (2 / 0.5 = 4 => 3.75) + + Specialised Arithmetic Intrinsics --------------------------------- diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -291,6 +291,11 @@ /// constant integer. SDIVFIX, UDIVFIX, + /// Same as the corresponding unsaturated fixed point instructions, but the + /// result is clamped between the min and max values representable by the + /// bits of the first 2 operands. + SDIVFIXSAT, UDIVFIXSAT, + /// Simple binary floating point operators. FADD, FSUB, FMUL, FDIV, FREM, diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -1043,7 +1043,9 @@ case ISD::UMULFIX: case ISD::UMULFIXSAT: case ISD::SDIVFIX: + case ISD::SDIVFIXSAT: case ISD::UDIVFIX: + case ISD::UDIVFIXSAT: Supported = isSupportedFixedPointOperation(Op, VT, Scale); break; } @@ -4269,7 +4271,7 @@ /// method accepts integers as its arguments. SDValue expandFixedPointMul(SDNode *Node, SelectionDAG &DAG) const; - /// Method for building the DAG expansion of ISD::[US]DIVFIX. This + /// Method for building the DAG expansion of ISD::[US]DIVFIX[SAT]. This /// method accepts integers as its arguments. /// Note: This method may fail if the division could not be performed /// within the type. Clients must retry with a wider type if this happens. 
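For readers skimming the codegen changes that follow, it may help to have a concrete model of what the new saturating intrinsics compute. The sketch below mirrors the LangRef semantics above for the scalar i64 case only; it is illustrative, not part of the patch (the function name, the use of the __int128 extension, and the choice of truncation as the rounding direction are all assumptions made here):

.. code-block:: c++

  // Illustrative reference model of @llvm.sdiv.fix.sat.i64(%x, %y, %scale);
  // not part of this patch. Assumes a compiler providing __int128, a non-zero
  // divisor (division by zero is UB for the intrinsic as well), and
  // scale < 64, as the Verifier requires for the signed intrinsics.
  #include <cstdint>

  int64_t SDivFixSat64Ref(int64_t X, int64_t Y, unsigned Scale) {
    // Interpreting X and Y as fixed point values with 'Scale' fractional
    // bits, the quotient expressed in the same format is (X * 2^Scale) / Y.
    // Compute it in a wider type so the upscaled dividend cannot overflow.
    __int128 Num =
        static_cast<__int128>(X) * (static_cast<__int128>(1) << Scale);
    __int128 Quot = Num / Y; // truncation: one of the two permitted roundings

    // Saturate to the range of the 64-bit result type instead of wrapping.
    if (Quot > static_cast<__int128>(INT64_MAX))
      return INT64_MAX;
    if (Quot < static_cast<__int128>(INT64_MIN))
      return INT64_MIN;
    return static_cast<int64_t>(Quot);
  }

The legalization changes below follow a broadly similar outline for the cases that cannot be handled natively (widen, upscale the dividend, divide, then clamp back to the original width), with extra care for the unspecified rounding direction and to avoid forming a division that could itself overflow, as discussed in the TargetLowering.cpp comments further down.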
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -969,6 +969,14 @@ [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative, ImmArg<2>]>; +def int_sdiv_fix_sat : Intrinsic<[llvm_anyint_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], + [IntrNoMem, ImmArg<2>]>; + +def int_udiv_fix_sat : Intrinsic<[llvm_anyint_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], + [IntrNoMem, ImmArg<2>]>; + //===------------------------- Memory Use Markers -------------------------===// // def int_lifetime_start : Intrinsic<[], diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -402,7 +402,9 @@ def umulfix : SDNode<"ISD::UMULFIX" , SDTIntScaledBinOp, [SDNPCommutative]>; def umulfixsat : SDNode<"ISD::UMULFIXSAT", SDTIntScaledBinOp, [SDNPCommutative]>; def sdivfix : SDNode<"ISD::SDIVFIX" , SDTIntScaledBinOp>; +def sdivfixsat : SDNode<"ISD::SDIVFIXSAT", SDTIntScaledBinOp>; def udivfix : SDNode<"ISD::UDIVFIX" , SDTIntScaledBinOp>; +def udivfixsat : SDNode<"ISD::UDIVFIXSAT", SDTIntScaledBinOp>; def sext_inreg : SDNode<"ISD::SIGN_EXTEND_INREG", SDTExtInreg>; def sext_invec : SDNode<"ISD::SIGN_EXTEND_VECTOR_INREG", SDTExtInvec>; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -1132,7 +1132,9 @@ case ISD::UMULFIX: case ISD::UMULFIXSAT: case ISD::SDIVFIX: - case ISD::UDIVFIX: { + case ISD::SDIVFIXSAT: + case ISD::UDIVFIX: + case ISD::UDIVFIXSAT: { unsigned Scale = Node->getConstantOperandVal(2); Action = TLI.getFixedPointOperationAction(Node->getOpcode(), Node->getValueType(0), Scale); @@ -3489,7 +3491,9 @@ Results.push_back(TLI.expandFixedPointMul(Node, DAG)); break; case ISD::SDIVFIX: + case ISD::SDIVFIXSAT: case ISD::UDIVFIX: + case ISD::UDIVFIXSAT: if (SDValue V = TLI.expandFixedPointDiv(Node->getOpcode(), SDLoc(Node), Node->getOperand(0), Node->getOperand(1), diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -162,7 +162,9 @@ case ISD::UMULFIXSAT: Res = PromoteIntRes_MULFIX(N); break; case ISD::SDIVFIX: - case ISD::UDIVFIX: Res = PromoteIntRes_DIVFIX(N); break; + case ISD::SDIVFIXSAT: + case ISD::UDIVFIX: + case ISD::UDIVFIXSAT: Res = PromoteIntRes_DIVFIX(N); break; case ISD::ABS: Res = PromoteIntRes_ABS(N); break; @@ -784,22 +786,51 @@ N->getOperand(2)); } +static SDValue SaturateWidenedDIVFIX(SDValue V, SDLoc &dl, + unsigned SatW, bool Signed, + const TargetLowering &TLI, + SelectionDAG &DAG) { + EVT VT = V.getValueType(); + unsigned VTW = VT.getScalarSizeInBits(); + + if (!Signed) { + // Saturate to the unsigned maximum by getting the minimum of V and the + // maximum. + return DAG.getNode(ISD::UMIN, dl, VT, V, + DAG.getConstant(APInt::getLowBitsSet(VTW, SatW), + dl, VT)); + } + + // Saturate to the signed maximum (the low SatW - 1 bits) by taking the + // signed minimum of it and V. 
+ V = DAG.getNode(ISD::SMIN, dl, VT, V, + DAG.getConstant(APInt::getLowBitsSet(VTW, SatW - 1), + dl, VT)); + // Saturate to the signed minimum (the high SatW + 1 bits) by taking the + // signed maximum of it and V. + V = DAG.getNode(ISD::SMAX, dl, VT, V, + DAG.getConstant(APInt::getHighBitsSet(VTW, VTW - SatW + 1), + dl, VT)); + return V; +} + static SDValue earlyExpandDIVFIX(SDNode *N, SDValue LHS, SDValue RHS, - unsigned Scale, const TargetLowering &TLI, - SelectionDAG &DAG) { + unsigned Scale, const TargetLowering &TLI, + SelectionDAG &DAG, unsigned SatW = 0) { EVT VT = LHS.getValueType(); - bool Signed = N->getOpcode() == ISD::SDIVFIX; + unsigned VTSize = VT.getScalarSizeInBits(); + bool Signed = N->getOpcode() == ISD::SDIVFIX || + N->getOpcode() == ISD::SDIVFIXSAT; + bool Saturating = N->getOpcode() == ISD::SDIVFIXSAT || + N->getOpcode() == ISD::UDIVFIXSAT; SDLoc dl(N); - // See if we can perform the division in this type without widening. - if (SDValue V = TLI.expandFixedPointDiv(N->getOpcode(), dl, LHS, RHS, Scale, - DAG)) - return V; - - // If that didn't work, double the type width and try again. That must work, - // or something is wrong. - EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), - VT.getScalarSizeInBits() * 2); + // Widen the types by a factor of two. This is guaranteed to expand, since it + // will always have enough high bits in the LHS to shift into. + EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), VTSize * 2); + if (VT.isVector()) + WideVT = EVT::getVectorVT(*DAG.getContext(), WideVT, + VT.getVectorElementCount()); if (Signed) { LHS = DAG.getSExtOrTrunc(LHS, dl, WideVT); RHS = DAG.getSExtOrTrunc(RHS, dl, WideVT); @@ -808,18 +839,28 @@ RHS = DAG.getZExtOrTrunc(RHS, dl, WideVT); } - // TODO: Saturation. - SDValue Res = TLI.expandFixedPointDiv(N->getOpcode(), dl, LHS, RHS, Scale, DAG); assert(Res && "Expanding DIVFIX with wide type failed?"); + if (Saturating) { + // If the caller has told us to saturate at something less, use that width + // instead of the type before doubling. However, it cannot be more than + // what we just widened! + assert(SatW <= VTSize && + "Tried to saturate to more than the original type?"); + Res = SaturateWidenedDIVFIX(Res, dl, SatW == 0 ? VTSize : SatW, Signed, + TLI, DAG); + } return DAG.getZExtOrTrunc(Res, dl, VT); } SDValue DAGTypeLegalizer::PromoteIntRes_DIVFIX(SDNode *N) { SDLoc dl(N); SDValue Op1Promoted, Op2Promoted; - bool Signed = N->getOpcode() == ISD::SDIVFIX; + bool Signed = N->getOpcode() == ISD::SDIVFIX || + N->getOpcode() == ISD::SDIVFIXSAT; + bool Saturating = N->getOpcode() == ISD::SDIVFIXSAT || + N->getOpcode() == ISD::UDIVFIXSAT; if (Signed) { Op1Promoted = SExtPromotedInteger(N->getOperand(0)); Op2Promoted = SExtPromotedInteger(N->getOperand(1)); @@ -830,23 +871,41 @@ EVT PromotedType = Op1Promoted.getValueType(); unsigned Scale = N->getConstantOperandVal(2); - SDValue Res; // If the type is already legal and the operation is legal in that type, we // should not early expand. 
if (TLI.isTypeLegal(PromotedType)) { TargetLowering::LegalizeAction Action = TLI.getFixedPointOperationAction(N->getOpcode(), PromotedType, Scale); - if (Action == TargetLowering::Legal || Action == TargetLowering::Custom) - Res = DAG.getNode(N->getOpcode(), dl, PromotedType, Op1Promoted, - Op2Promoted, N->getOperand(2)); + if (Action == TargetLowering::Legal || Action == TargetLowering::Custom) { + EVT ShiftTy = TLI.getShiftAmountTy(PromotedType, DAG.getDataLayout()); + unsigned Diff = PromotedType.getScalarSizeInBits() - + N->getValueType(0).getScalarSizeInBits(); + if (Saturating) + Op1Promoted = DAG.getNode(ISD::SHL, dl, PromotedType, Op1Promoted, + DAG.getConstant(Diff, dl, ShiftTy)); + SDValue Res = DAG.getNode(N->getOpcode(), dl, PromotedType, Op1Promoted, + Op2Promoted, N->getOperand(2)); + if (Saturating) + Res = DAG.getNode(Signed ? ISD::SRA : ISD::SRL, dl, PromotedType, Res, + DAG.getConstant(Diff, dl, ShiftTy)); + return Res; + } } - if (!Res) - Res = earlyExpandDIVFIX(N, Op1Promoted, Op2Promoted, Scale, TLI, DAG); - - // TODO: Saturation. - - return Res; + // See if we can perform the division in this type without expanding. + if (SDValue Res = TLI.expandFixedPointDiv(N->getOpcode(), dl, Op1Promoted, + Op2Promoted, Scale, DAG)) { + if (Saturating) + Res = SaturateWidenedDIVFIX(Res, dl, + N->getValueType(0).getScalarSizeInBits(), + Signed, TLI, DAG); + return Res; + } + // If we cannot, expand it to twice the type width. If we are saturating, give + // it the original width as a saturating width so we don't need to emit + // two saturations. + return earlyExpandDIVFIX(N, Op1Promoted, Op2Promoted, Scale, TLI, DAG, + N->getValueType(0).getScalarSizeInBits()); } SDValue DAGTypeLegalizer::PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo) { @@ -1315,7 +1374,9 @@ case ISD::UMULFIX: case ISD::UMULFIXSAT: case ISD::SDIVFIX: - case ISD::UDIVFIX: Res = PromoteIntOp_FIX(N); break; + case ISD::SDIVFIXSAT: + case ISD::UDIVFIX: + case ISD::UDIVFIXSAT: Res = PromoteIntOp_FIX(N); break; case ISD::FPOWI: Res = PromoteIntOp_FPOWI(N); break; @@ -1923,7 +1984,9 @@ case ISD::UMULFIXSAT: ExpandIntRes_MULFIX(N, Lo, Hi); break; case ISD::SDIVFIX: - case ISD::UDIVFIX: ExpandIntRes_DIVFIX(N, Lo, Hi); break; + case ISD::SDIVFIXSAT: + case ISD::UDIVFIX: + case ISD::UDIVFIXSAT: ExpandIntRes_DIVFIX(N, Lo, Hi); break; case ISD::VECREDUCE_ADD: case ISD::VECREDUCE_MUL: @@ -3253,8 +3316,15 @@ void DAGTypeLegalizer::ExpandIntRes_DIVFIX(SDNode *N, SDValue &Lo, SDValue &Hi) { - SDValue Res = earlyExpandDIVFIX(N, N->getOperand(0), N->getOperand(1), - N->getConstantOperandVal(2), TLI, DAG); + SDLoc dl(N); + // Try expanding in the existing type first. 
+ SDValue Res = TLI.expandFixedPointDiv(N->getOpcode(), dl, N->getOperand(0), + N->getOperand(1), + N->getConstantOperandVal(2), DAG); + + if (!Res) + Res = earlyExpandDIVFIX(N, N->getOperand(0), N->getOperand(1), + N->getConstantOperandVal(2), TLI, DAG); SplitInteger(Res, Lo, Hi); } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -142,7 +142,7 @@ void ExpandUADDSUBO(SDNode *Node, SmallVectorImpl &Results); void ExpandSADDSUBO(SDNode *Node, SmallVectorImpl &Results); void ExpandMULO(SDNode *Node, SmallVectorImpl &Results); - SDValue ExpandFixedPointDiv(SDNode *Node); + void ExpandFixedPointDiv(SDNode *Node, SmallVectorImpl &Results); SDValue ExpandStrictFPOp(SDNode *Node); void ExpandStrictFPOp(SDNode *Node, SmallVectorImpl &Results); @@ -463,7 +463,9 @@ case ISD::UMULFIX: case ISD::UMULFIXSAT: case ISD::SDIVFIX: - case ISD::UDIVFIX: { + case ISD::SDIVFIXSAT: + case ISD::UDIVFIX: + case ISD::UDIVFIXSAT: { unsigned Scale = Node->getConstantOperandVal(2); Action = TLI.getFixedPointOperationAction(Node->getOpcode(), Node->getValueType(0), Scale); @@ -968,8 +970,11 @@ break; case ISD::SDIVFIX: case ISD::UDIVFIX: - Results.push_back(ExpandFixedPointDiv(Node)); + ExpandFixedPointDiv(Node, Results); return; + case ISD::SDIVFIXSAT: + case ISD::UDIVFIXSAT: + break; #define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ case ISD::STRICT_##DAGN: #include "llvm/IR/ConstrainedOps.def" @@ -1454,12 +1459,12 @@ Results.push_back(Overflow); } -SDValue VectorLegalizer::ExpandFixedPointDiv(SDNode *Node) { +void VectorLegalizer::ExpandFixedPointDiv(SDNode *Node, + SmallVectorImpl &Results) { SDNode *N = Node; if (SDValue Expanded = TLI.expandFixedPointDiv(N->getOpcode(), SDLoc(N), N->getOperand(0), N->getOperand(1), N->getConstantOperandVal(2), DAG)) - return Expanded; - return DAG.UnrollVectorOp(N); + Results.push_back(Expanded); } void VectorLegalizer::ExpandStrictFPOp(SDNode *Node, diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -166,7 +166,9 @@ case ISD::UMULFIX: case ISD::UMULFIXSAT: case ISD::SDIVFIX: + case ISD::SDIVFIXSAT: case ISD::UDIVFIX: + case ISD::UDIVFIXSAT: R = ScalarizeVecRes_FIX(N); break; } @@ -956,7 +958,9 @@ case ISD::UMULFIX: case ISD::UMULFIXSAT: case ISD::SDIVFIX: + case ISD::SDIVFIXSAT: case ISD::UDIVFIX: + case ISD::UDIVFIXSAT: SplitVecRes_FIX(N, Lo, Hi); break; } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -5451,7 +5451,8 @@ SDValue LHS, SDValue RHS, SDValue Scale, SelectionDAG &DAG, const TargetLowering &TLI) { EVT VT = LHS.getValueType(); - bool Signed = Opcode == ISD::SDIVFIX; + bool Signed = Opcode == ISD::SDIVFIX || Opcode == ISD::SDIVFIXSAT; + bool Saturating = Opcode == ISD::SDIVFIXSAT || Opcode == ISD::UDIVFIXSAT; LLVMContext &Ctx = *DAG.getContext(); // If the type is legal but the operation isn't, this node might survive all @@ -5463,14 +5464,16 @@ // by bumping the size by one bit. This will force it to Promote, enabling the // early expansion and avoiding the need to expand later. 
- // We don't have to do this if Scale is 0; that can always be expanded. + // We don't have to do this if Scale is 0; that can always be expanded, unless + // it's a saturating signed operation. Those can experience true integer + // division overflow, a case which we must avoid. // FIXME: We wouldn't have to do this (or any of the early // expansion/promotion) if it was possible to expand a libcall of an // illegal type during operation legalization. But it's not, so things // get a bit hacky. unsigned ScaleInt = cast(Scale)->getZExtValue(); - if (ScaleInt > 0 && + if ((ScaleInt > 0 || (Saturating && Signed)) && (TLI.isTypeLegal(VT) || (VT.isVector() && TLI.isTypeLegal(VT.getVectorElementType())))) { TargetLowering::LegalizeAction Action = TLI.getFixedPointOperationAction( @@ -5492,8 +5495,16 @@ LHS = DAG.getZExtOrTrunc(LHS, DL, PromVT); RHS = DAG.getZExtOrTrunc(RHS, DL, PromVT); } - // TODO: Saturation. + EVT ShiftTy = TLI.getShiftAmountTy(PromVT, DAG.getDataLayout()); + // For saturating operations, we need to shift up the LHS to get the + // proper saturation width, and then shift down again afterwards. + if (Saturating) + LHS = DAG.getNode(ISD::SHL, DL, PromVT, LHS, + DAG.getConstant(1, DL, ShiftTy)); SDValue Res = DAG.getNode(Opcode, DL, PromVT, LHS, RHS, Scale); + if (Saturating) + Res = DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, PromVT, Res, + DAG.getConstant(1, DL, ShiftTy)); return DAG.getZExtOrTrunc(Res, DL, VT); } } @@ -5757,6 +5768,10 @@ return ISD::SDIVFIX; case Intrinsic::udiv_fix: return ISD::UDIVFIX; + case Intrinsic::sdiv_fix_sat: + return ISD::SDIVFIXSAT; + case Intrinsic::udiv_fix_sat: + return ISD::UDIVFIXSAT; default: llvm_unreachable("Unhandled fixed point intrinsic"); } @@ -6460,7 +6475,9 @@ return; } case Intrinsic::sdiv_fix: - case Intrinsic::udiv_fix: { + case Intrinsic::udiv_fix: + case Intrinsic::sdiv_fix_sat: + case Intrinsic::udiv_fix_sat: { SDValue Op1 = getValue(I.getArgOperand(0)); SDValue Op2 = getValue(I.getArgOperand(1)); SDValue Op3 = getValue(I.getArgOperand(2)); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -314,7 +314,9 @@ case ISD::UMULFIXSAT: return "umulfixsat"; case ISD::SDIVFIX: return "sdivfix"; + case ISD::SDIVFIXSAT: return "sdivfixsat"; case ISD::UDIVFIX: return "udivfix"; + case ISD::UDIVFIXSAT: return "udivfixsat"; // Conversion operators. 
case ISD::SIGN_EXTEND: return "sign_extend"; diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -7332,12 +7332,13 @@ TargetLowering::expandFixedPointDiv(unsigned Opcode, const SDLoc &dl, SDValue LHS, SDValue RHS, unsigned Scale, SelectionDAG &DAG) const { - assert((Opcode == ISD::SDIVFIX || - Opcode == ISD::UDIVFIX) && + assert((Opcode == ISD::SDIVFIX || Opcode == ISD::SDIVFIXSAT || + Opcode == ISD::UDIVFIX || Opcode == ISD::UDIVFIXSAT) && "Expected a fixed point division opcode"); EVT VT = LHS.getValueType(); - bool Signed = Opcode == ISD::SDIVFIX; + bool Signed = Opcode == ISD::SDIVFIX || Opcode == ISD::SDIVFIXSAT; + bool Saturating = Opcode == ISD::SDIVFIXSAT || Opcode == ISD::UDIVFIXSAT; EVT BoolVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); // If there is enough room in the type to upscale the LHS or downscale the @@ -7349,7 +7350,15 @@ : DAG.computeKnownBits(LHS).countMinLeadingZeros(); unsigned RHSTrail = DAG.computeKnownBits(RHS).countMinTrailingZeros(); - if (LHSLead + RHSTrail < Scale) + // For signed saturating operations, we need to be able to detect true integer + // division overflow; that is, when you have MIN / -EPS. However, this + // is undefined behavior and if we emit divisions that could take such + // values it may cause undesired behavior (arithmetic exceptions on x86, for + // example). + // Avoid this by requiring an extra bit so that we never get this case. + // FIXME: This is a bit unfortunate as it means that for an 8-bit 7-scale + // signed saturating division, we need to emit a whopping 32-bit division. + if (LHSLead + RHSTrail < Scale + (unsigned)(Saturating && Signed)) return SDValue(); unsigned LHSShift = std::min(LHSLead, Scale); @@ -7403,8 +7412,6 @@ Quot = DAG.getNode(ISD::UDIV, dl, VT, LHS, RHS); - // TODO: Saturation. 
- return Quot; } diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -660,7 +660,9 @@ setOperationAction(ISD::UMULFIX, VT, Expand); setOperationAction(ISD::UMULFIXSAT, VT, Expand); setOperationAction(ISD::SDIVFIX, VT, Expand); + setOperationAction(ISD::SDIVFIXSAT, VT, Expand); setOperationAction(ISD::UDIVFIX, VT, Expand); + setOperationAction(ISD::UDIVFIXSAT, VT, Expand); // Overflow operations default to expand setOperationAction(ISD::SADDO, VT, Expand); diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -4727,7 +4727,9 @@ case Intrinsic::umul_fix: case Intrinsic::umul_fix_sat: case Intrinsic::sdiv_fix: - case Intrinsic::udiv_fix: { + case Intrinsic::sdiv_fix_sat: + case Intrinsic::udiv_fix: + case Intrinsic::udiv_fix_sat: { Value *Op1 = Call.getArgOperand(0); Value *Op2 = Call.getArgOperand(1); Assert(Op1->getType()->isIntOrIntVectorTy(), @@ -4742,7 +4744,7 @@ "third argument of [us][mul|div]_fix[_sat] must fit within 32 bits"); if (ID == Intrinsic::smul_fix || ID == Intrinsic::smul_fix_sat || - ID == Intrinsic::sdiv_fix) { + ID == Intrinsic::sdiv_fix || ID == Intrinsic::sdiv_fix_sat) { Assert( Op3->getZExtValue() < Op1->getType()->getScalarSizeInBits(), "the scale of s[mul|div]_fix[_sat] must be less than the width of " diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll @@ -0,0 +1,1411 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=X86 + +declare i4 @llvm.sdiv.fix.sat.i4 (i4, i4, i32) +declare i15 @llvm.sdiv.fix.sat.i15 (i15, i15, i32) +declare i16 @llvm.sdiv.fix.sat.i16 (i16, i16, i32) +declare i18 @llvm.sdiv.fix.sat.i18 (i18, i18, i32) +declare i64 @llvm.sdiv.fix.sat.i64 (i64, i64, i32) +declare <4 x i32> @llvm.sdiv.fix.sat.v4i32(<4 x i32>, <4 x i32>, i32) + +define i16 @func(i16 %x, i16 %y) nounwind { +; +; X64-LABEL: func: +; X64: # %bb.0: +; X64-NEXT: movswl %si, %esi +; X64-NEXT: movswl %di, %ecx +; X64-NEXT: shll $8, %ecx +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: cltd +; X64-NEXT: idivl %esi +; X64-NEXT: # kill: def $eax killed $eax def $rax +; X64-NEXT: leal -1(%rax), %edi +; X64-NEXT: testl %esi, %esi +; X64-NEXT: sets %sil +; X64-NEXT: testl %ecx, %ecx +; X64-NEXT: sets %cl +; X64-NEXT: xorb %sil, %cl +; X64-NEXT: testl %edx, %edx +; X64-NEXT: setne %dl +; X64-NEXT: testb %cl, %dl +; X64-NEXT: cmovel %eax, %edi +; X64-NEXT: cmpl $65535, %edi # imm = 0xFFFF +; X64-NEXT: movl $65535, %ecx # imm = 0xFFFF +; X64-NEXT: cmovll %edi, %ecx +; X64-NEXT: cmpl $-65536, %ecx # imm = 0xFFFF0000 +; X64-NEXT: movl $-65536, %eax # imm = 0xFFFF0000 +; X64-NEXT: cmovgl %ecx, %eax +; X64-NEXT: shrl %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq +; +; X86-LABEL: func: +; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movswl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movswl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shll $8, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: cltd +; X86-NEXT: idivl %esi +; X86-NEXT: leal -1(%eax), %edi +; X86-NEXT: testl %esi, %esi +; X86-NEXT: sets %bl +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: 
sets %cl +; X86-NEXT: xorb %bl, %cl +; X86-NEXT: testl %edx, %edx +; X86-NEXT: setne %dl +; X86-NEXT: testb %cl, %dl +; X86-NEXT: cmovel %eax, %edi +; X86-NEXT: cmpl $65535, %edi # imm = 0xFFFF +; X86-NEXT: movl $65535, %ecx # imm = 0xFFFF +; X86-NEXT: cmovll %edi, %ecx +; X86-NEXT: cmpl $-65536, %ecx # imm = 0xFFFF0000 +; X86-NEXT: movl $-65536, %eax # imm = 0xFFFF0000 +; X86-NEXT: cmovgl %ecx, %eax +; X86-NEXT: shrl %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: retl + %tmp = call i16 @llvm.sdiv.fix.sat.i16(i16 %x, i16 %y, i32 7) + ret i16 %tmp +} + +define i16 @func2(i8 %x, i8 %y) nounwind { +; +; X64-LABEL: func2: +; X64: # %bb.0: +; X64-NEXT: movsbl %dil, %eax +; X64-NEXT: movsbl %sil, %ecx +; X64-NEXT: movswl %cx, %esi +; X64-NEXT: movswl %ax, %ecx +; X64-NEXT: shll $14, %ecx +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: cltd +; X64-NEXT: idivl %esi +; X64-NEXT: # kill: def $eax killed $eax def $rax +; X64-NEXT: leal -1(%rax), %edi +; X64-NEXT: testl %esi, %esi +; X64-NEXT: sets %sil +; X64-NEXT: testl %ecx, %ecx +; X64-NEXT: sets %cl +; X64-NEXT: xorb %sil, %cl +; X64-NEXT: testl %edx, %edx +; X64-NEXT: setne %dl +; X64-NEXT: testb %cl, %dl +; X64-NEXT: cmovel %eax, %edi +; X64-NEXT: cmpl $16383, %edi # imm = 0x3FFF +; X64-NEXT: movl $16383, %ecx # imm = 0x3FFF +; X64-NEXT: cmovll %edi, %ecx +; X64-NEXT: cmpl $-16384, %ecx # imm = 0xC000 +; X64-NEXT: movl $-16384, %eax # imm = 0xC000 +; X64-NEXT: cmovgl %ecx, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq +; +; X86-LABEL: func2: +; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movsbl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movsbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shll $14, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: cltd +; X86-NEXT: idivl %esi +; X86-NEXT: leal -1(%eax), %edi +; X86-NEXT: testl %esi, %esi +; X86-NEXT: sets %bl +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: sets %cl +; X86-NEXT: xorb %bl, %cl +; X86-NEXT: testl %edx, %edx +; X86-NEXT: setne %dl +; X86-NEXT: testb %cl, %dl +; X86-NEXT: cmovel %eax, %edi +; X86-NEXT: cmpl $16383, %edi # imm = 0x3FFF +; X86-NEXT: movl $16383, %ecx # imm = 0x3FFF +; X86-NEXT: cmovll %edi, %ecx +; X86-NEXT: cmpl $-16384, %ecx # imm = 0xC000 +; X86-NEXT: movl $-16384, %eax # imm = 0xC000 +; X86-NEXT: cmovgl %ecx, %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: retl + %x2 = sext i8 %x to i15 + %y2 = sext i8 %y to i15 + %tmp = call i15 @llvm.sdiv.fix.sat.i15(i15 %x2, i15 %y2, i32 14) + %tmp2 = sext i15 %tmp to i16 + ret i16 %tmp2 +} + +define i16 @func3(i15 %x, i8 %y) nounwind { +; +; X64-LABEL: func3: +; X64: # %bb.0: +; X64-NEXT: shll $8, %esi +; X64-NEXT: movswl %si, %ecx +; X64-NEXT: addl %edi, %edi +; X64-NEXT: shrl $4, %ecx +; X64-NEXT: movl %edi, %eax +; X64-NEXT: cwtd +; X64-NEXT: idivw %cx +; X64-NEXT: # kill: def $ax killed $ax def $rax +; X64-NEXT: leal -1(%rax), %esi +; X64-NEXT: testw %di, %di +; X64-NEXT: sets %dil +; X64-NEXT: testw %cx, %cx +; X64-NEXT: sets %cl +; X64-NEXT: xorb %dil, %cl +; X64-NEXT: testw %dx, %dx +; X64-NEXT: setne %dl +; X64-NEXT: testb %cl, %dl +; X64-NEXT: cmovel %eax, %esi +; X64-NEXT: movswl %si, %eax +; X64-NEXT: cmpl $16383, %eax # imm = 0x3FFF +; X64-NEXT: movl $16383, %ecx # imm = 0x3FFF +; X64-NEXT: cmovll %esi, %ecx +; X64-NEXT: movswl %cx, %eax +; X64-NEXT: cmpl $-16384, %eax # imm = 0xC000 
+; X64-NEXT: movl $49152, %eax # imm = 0xC000 +; X64-NEXT: cmovgl %ecx, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq +; +; X86-LABEL: func3: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shll $8, %eax +; X86-NEXT: movswl %ax, %esi +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: shrl $4, %esi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: cwtd +; X86-NEXT: idivw %si +; X86-NEXT: # kill: def $ax killed $ax def $eax +; X86-NEXT: leal -1(%eax), %edi +; X86-NEXT: testw %cx, %cx +; X86-NEXT: sets %cl +; X86-NEXT: testw %si, %si +; X86-NEXT: sets %ch +; X86-NEXT: xorb %cl, %ch +; X86-NEXT: testw %dx, %dx +; X86-NEXT: setne %cl +; X86-NEXT: testb %ch, %cl +; X86-NEXT: cmovel %eax, %edi +; X86-NEXT: movswl %di, %eax +; X86-NEXT: cmpl $16383, %eax # imm = 0x3FFF +; X86-NEXT: movl $16383, %ecx # imm = 0x3FFF +; X86-NEXT: cmovll %edi, %ecx +; X86-NEXT: movswl %cx, %eax +; X86-NEXT: cmpl $-16384, %eax # imm = 0xC000 +; X86-NEXT: movl $49152, %eax # imm = 0xC000 +; X86-NEXT: cmovgl %ecx, %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl + %y2 = sext i8 %y to i15 + %y3 = shl i15 %y2, 7 + %tmp = call i15 @llvm.sdiv.fix.sat.i15(i15 %x, i15 %y3, i32 4) + %tmp2 = sext i15 %tmp to i16 + ret i16 %tmp2 +} + +define i4 @func4(i4 %x, i4 %y) nounwind { +; +; X64-LABEL: func4: +; X64: # %bb.0: +; X64-NEXT: pushq %rbx +; X64-NEXT: shlb $4, %sil +; X64-NEXT: sarb $4, %sil +; X64-NEXT: shlb $4, %dil +; X64-NEXT: sarb $4, %dil +; X64-NEXT: shlb $2, %dil +; X64-NEXT: movsbl %dil, %ecx +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: idivb %sil +; X64-NEXT: movsbl %ah, %ebx +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: leal -1(%rax), %edi +; X64-NEXT: movzbl %dil, %edi +; X64-NEXT: testb %sil, %sil +; X64-NEXT: sets %dl +; X64-NEXT: testb %cl, %cl +; X64-NEXT: sets %cl +; X64-NEXT: xorb %dl, %cl +; X64-NEXT: testb %bl, %bl +; X64-NEXT: setne %dl +; X64-NEXT: testb %cl, %dl +; X64-NEXT: cmovel %eax, %edi +; X64-NEXT: cmpb $7, %dil +; X64-NEXT: movl $7, %ecx +; X64-NEXT: cmovll %edi, %ecx +; X64-NEXT: cmpb $-8, %cl +; X64-NEXT: movl $248, %eax +; X64-NEXT: cmovgl %ecx, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: popq %rbx +; X64-NEXT: retq +; +; X86-LABEL: func4: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movb {{[0-9]+}}(%esp), %dl +; X86-NEXT: shlb $4, %dl +; X86-NEXT: sarb $4, %dl +; X86-NEXT: movb {{[0-9]+}}(%esp), %dh +; X86-NEXT: shlb $4, %dh +; X86-NEXT: sarb $4, %dh +; X86-NEXT: shlb $2, %dh +; X86-NEXT: movsbl %dh, %eax +; X86-NEXT: idivb %dl +; X86-NEXT: movsbl %ah, %ecx +; X86-NEXT: movzbl %al, %esi +; X86-NEXT: decb %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: testb %dl, %dl +; X86-NEXT: sets %dl +; X86-NEXT: testb %dh, %dh +; X86-NEXT: sets %dh +; X86-NEXT: xorb %dl, %dh +; X86-NEXT: testb %cl, %cl +; X86-NEXT: setne %cl +; X86-NEXT: testb %dh, %cl +; X86-NEXT: cmovel %esi, %eax +; X86-NEXT: cmpb $7, %al +; X86-NEXT: movl $7, %ecx +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: cmpb $-8, %cl +; X86-NEXT: movl $248, %eax +; X86-NEXT: cmovgl %ecx, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: popl %esi +; X86-NEXT: retl + %tmp = call i4 @llvm.sdiv.fix.sat.i4(i4 %x, i4 %y, i32 2) + ret i4 %tmp +} + +define i64 @func5(i64 %x, i64 %y) nounwind { +; +; X64-LABEL: func5: +; X64: # %bb.0: +; X64-NEXT: pushq %rbp +; X64-NEXT: pushq %r15 +; X64-NEXT: pushq %r14 +; 
X64-NEXT: pushq %r13 +; X64-NEXT: pushq %r12 +; X64-NEXT: pushq %rbx +; X64-NEXT: subq $40, %rsp +; X64-NEXT: movq %rdi, %r15 +; X64-NEXT: leaq (%rdi,%rdi), %rax +; X64-NEXT: shrq $33, %rax +; X64-NEXT: movq %rdi, %r12 +; X64-NEXT: sarq $63, %r12 +; X64-NEXT: shlq $31, %r12 +; X64-NEXT: orq %rax, %r12 +; X64-NEXT: sets {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; X64-NEXT: shlq $32, %r15 +; X64-NEXT: movq %rsi, %rdx +; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rsi, %r13 +; X64-NEXT: sarq $63, %r13 +; X64-NEXT: movq %r15, %rdi +; X64-NEXT: movq %r12, %rsi +; X64-NEXT: movq %r13, %rcx +; X64-NEXT: callq __divti3 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rdx, %rbp +; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: subq $1, %rbx +; X64-NEXT: sbbq $0, %rbp +; X64-NEXT: testq %r13, %r13 +; X64-NEXT: sets %r14b +; X64-NEXT: xorb {{[-0-9]+}}(%r{{[sb]}}p), %r14b # 1-byte Folded Reload +; X64-NEXT: movq %r15, %rdi +; X64-NEXT: movq %r12, %rsi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; X64-NEXT: movq %r13, %rcx +; X64-NEXT: callq __modti3 +; X64-NEXT: orq %rax, %rdx +; X64-NEXT: setne %al +; X64-NEXT: testb %r14b, %al +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload +; X64-NEXT: cmpq $-1, %rbx +; X64-NEXT: movq $-1, %rax +; X64-NEXT: movq $-1, %rcx +; X64-NEXT: cmovbq %rbx, %rcx +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: testq %rbp, %rbp +; X64-NEXT: cmovnsq %rax, %rbx +; X64-NEXT: cmoveq %rcx, %rbx +; X64-NEXT: cmovnsq %rdx, %rbp +; X64-NEXT: testq %rbx, %rbx +; X64-NEXT: movl $0, %ecx +; X64-NEXT: cmovaq %rbx, %rcx +; X64-NEXT: testq %rbp, %rbp +; X64-NEXT: cmovnsq %rbp, %rax +; X64-NEXT: cmovsq %rdx, %rbx +; X64-NEXT: cmpq $-1, %rbp +; X64-NEXT: cmoveq %rcx, %rbx +; X64-NEXT: shrdq $1, %rax, %rbx +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: addq $40, %rsp +; X64-NEXT: popq %rbx +; X64-NEXT: popq %r12 +; X64-NEXT: popq %r13 +; X64-NEXT: popq %r14 +; X64-NEXT: popq %r15 +; X64-NEXT: popq %rbp +; X64-NEXT: retq +; +; X86-LABEL: func5: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $88, %esp +; X86-NEXT: movl 8(%ebp), %ecx +; X86-NEXT: movl 12(%ebp), %eax +; X86-NEXT: movl 20(%ebp), %ebx +; X86-NEXT: sarl $31, %ebx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: sarl $31, %edi +; X86-NEXT: movl %edi, %edx +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl $31, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl $31, %ecx, %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll $31, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl 20(%ebp) +; X86-NEXT: pushl 16(%ebp) +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %edx +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %eax +; X86-NEXT: calll __divti3 +; X86-NEXT: addl $32, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: 
movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: subl $1, %esi +; X86-NEXT: sbbl $0, %edi +; X86-NEXT: sbbl $0, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: testl %edx, %edx +; X86-NEXT: sets %al +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: sets %ah +; X86-NEXT: xorb %al, %ah +; X86-NEXT: movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl %edx +; X86-NEXT: pushl %edx +; X86-NEXT: pushl 20(%ebp) +; X86-NEXT: pushl 16(%ebp) +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: pushl %eax +; X86-NEXT: calll __modti3 +; X86-NEXT: addl $32, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: setne %al +; X86-NEXT: testb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: movl $0, %ecx +; X86-NEXT: cmovsl %ebx, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: cmovsl %esi, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF +; X86-NEXT: cmovsl %edi, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: andl %eax, %edx +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: cmovel %ebx, %edx +; X86-NEXT: cmpl $-1, %esi +; X86-NEXT: movl $-1, %eax +; X86-NEXT: cmovbl %esi, %eax +; X86-NEXT: cmpl $2147483647, %edi # imm = 0x7FFFFFFF +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: cmovael %ecx, %esi +; X86-NEXT: cmovel %eax, %esi +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: cmovael %eax, %edi +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: testl %esi, %esi +; X86-NEXT: movl $0, %eax +; X86-NEXT: cmoval %esi, %eax +; X86-NEXT: cmpl $-2147483648, %edi # imm = 0x80000000 +; X86-NEXT: movl $0, %ecx +; X86-NEXT: cmoval %esi, %ecx +; X86-NEXT: cmovel %eax, %ecx +; X86-NEXT: movl $-2147483648, %eax # imm = 0x80000000 +; X86-NEXT: cmoval %edi, %eax +; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: movl $-2147483648, %ebx # imm = 0x80000000 +; X86-NEXT: cmovsl %ebx, %edi +; X86-NEXT: movl $0, %ebx +; X86-NEXT: cmovsl 
%ebx, %esi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: cmpl $-1, %edx +; X86-NEXT: cmovel %ecx, %esi +; X86-NEXT: cmovel %eax, %edi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edi, %edx +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl + %tmp = call i64 @llvm.sdiv.fix.sat.i64(i64 %x, i64 %y, i32 31) + ret i64 %tmp +} + +define i18 @func6(i16 %x, i16 %y) nounwind { +; +; X64-LABEL: func6: +; X64: # %bb.0: +; X64-NEXT: movswl %di, %ecx +; X64-NEXT: movswl %si, %esi +; X64-NEXT: shll $7, %ecx +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: cltd +; X64-NEXT: idivl %esi +; X64-NEXT: # kill: def $eax killed $eax def $rax +; X64-NEXT: leal -1(%rax), %edi +; X64-NEXT: testl %esi, %esi +; X64-NEXT: sets %sil +; X64-NEXT: testl %ecx, %ecx +; X64-NEXT: sets %cl +; X64-NEXT: xorb %sil, %cl +; X64-NEXT: testl %edx, %edx +; X64-NEXT: setne %dl +; X64-NEXT: testb %cl, %dl +; X64-NEXT: cmovel %eax, %edi +; X64-NEXT: cmpl $131071, %edi # imm = 0x1FFFF +; X64-NEXT: movl $131071, %ecx # imm = 0x1FFFF +; X64-NEXT: cmovll %edi, %ecx +; X64-NEXT: cmpl $-131072, %ecx # imm = 0xFFFE0000 +; X64-NEXT: movl $-131072, %eax # imm = 0xFFFE0000 +; X64-NEXT: cmovgl %ecx, %eax +; X64-NEXT: retq +; +; X86-LABEL: func6: +; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movswl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movswl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shll $7, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: cltd +; X86-NEXT: idivl %esi +; X86-NEXT: leal -1(%eax), %edi +; X86-NEXT: testl %esi, %esi +; X86-NEXT: sets %bl +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: sets %cl +; X86-NEXT: xorb %bl, %cl +; X86-NEXT: testl %edx, %edx +; X86-NEXT: setne %dl +; X86-NEXT: testb %cl, %dl +; X86-NEXT: cmovel %eax, %edi +; X86-NEXT: cmpl $131071, %edi # imm = 0x1FFFF +; X86-NEXT: movl $131071, %ecx # imm = 0x1FFFF +; X86-NEXT: cmovll %edi, %ecx +; X86-NEXT: cmpl $-131072, %ecx # imm = 0xFFFE0000 +; X86-NEXT: movl $-131072, %eax # imm = 0xFFFE0000 +; X86-NEXT: cmovgl %ecx, %eax +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: retl + %x2 = sext i16 %x to i18 + %y2 = sext i16 %y to i18 + %tmp = call i18 @llvm.sdiv.fix.sat.i18(i18 %x2, i18 %y2, i32 7) + ret i18 %tmp +} + +define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { +; +; X64-LABEL: vec: +; X64: # %bb.0: +; X64-NEXT: pushq %rbp +; X64-NEXT: pushq %r15 +; X64-NEXT: pushq %r14 +; X64-NEXT: pushq %r13 +; X64-NEXT: pushq %r12 +; X64-NEXT: pushq %rbx +; X64-NEXT: subq $104, %rsp +; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: pxor %xmm2, %xmm2 +; X64-NEXT: pcmpgtd %xmm0, %xmm2 +; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-NEXT: paddq %xmm0, %xmm0 +; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: movq %xmm0, %rbp +; X64-NEXT: movq %rbp, %r12 +; X64-NEXT: shrq $33, %r12 +; X64-NEXT: movq %rbp, %r14 +; X64-NEXT: sarq $63, %r14 +; X64-NEXT: shlq $31, %r14 +; X64-NEXT: orq %r14, %r12 +; X64-NEXT: pxor %xmm0, %xmm0 +; X64-NEXT: pcmpgtd %xmm1, %xmm0 +; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: movq %xmm1, %rdx +; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rdx, %rbx +; 
X64-NEXT: sarq $63, %rbx +; X64-NEXT: shlq $31, %rbp +; X64-NEXT: movq %rbp, %rdi +; X64-NEXT: movq %r12, %rsi +; X64-NEXT: movq %rbx, %rcx +; X64-NEXT: callq __divti3 +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: subq $1, %r13 +; X64-NEXT: sbbq $0, %r15 +; X64-NEXT: shrq $63, %r14 +; X64-NEXT: xorl %ebx, %r14d +; X64-NEXT: movq %rbp, %rdi +; X64-NEXT: movq %r12, %rsi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; X64-NEXT: movq %rbx, %rcx +; X64-NEXT: callq __modti3 +; X64-NEXT: orq %rax, %rdx +; X64-NEXT: setne %al +; X64-NEXT: testb %r14b, %al +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; X64-NEXT: movl $4294967295, %edx # imm = 0xFFFFFFFF +; X64-NEXT: cmpq %rdx, %r13 +; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF +; X64-NEXT: cmovbq %r13, %rax +; X64-NEXT: xorl %ecx, %ecx +; X64-NEXT: testq %r15, %r15 +; X64-NEXT: cmovnsq %rdx, %r13 +; X64-NEXT: cmoveq %rax, %r13 +; X64-NEXT: cmovnsq %rcx, %r15 +; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000 +; X64-NEXT: cmpq %rcx, %r13 +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: cmovaq %r13, %rax +; X64-NEXT: testq %r15, %r15 +; X64-NEXT: cmovsq %rcx, %r13 +; X64-NEXT: cmpq $-1, %r15 +; X64-NEXT: cmoveq %rax, %r13 +; X64-NEXT: movq %r13, %xmm0 +; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-NEXT: # xmm0 = mem[2,3,0,1] +; X64-NEXT: movq %xmm0, %r13 +; X64-NEXT: movq %r13, %rbx +; X64-NEXT: shrq $33, %rbx +; X64-NEXT: movq %r13, %r14 +; X64-NEXT: sarq $63, %r14 +; X64-NEXT: shlq $31, %r14 +; X64-NEXT: orq %r14, %rbx +; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-NEXT: # xmm0 = mem[2,3,0,1] +; X64-NEXT: movq %xmm0, %rdx +; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rdx, %rbp +; X64-NEXT: sarq $63, %rbp +; X64-NEXT: shlq $31, %r13 +; X64-NEXT: movq %r13, %rdi +; X64-NEXT: movq %rbx, %rsi +; X64-NEXT: movq %rbp, %rcx +; X64-NEXT: callq __divti3 +; X64-NEXT: movq %rax, %r12 +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: subq $1, %r12 +; X64-NEXT: sbbq $0, %r15 +; X64-NEXT: shrq $63, %r14 +; X64-NEXT: xorl %ebp, %r14d +; X64-NEXT: movq %r13, %rdi +; X64-NEXT: movq %rbx, %rsi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; X64-NEXT: movq %rbp, %rcx +; X64-NEXT: callq __modti3 +; X64-NEXT: orq %rax, %rdx +; X64-NEXT: setne %al +; X64-NEXT: testb %r14b, %al +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload +; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF +; X64-NEXT: cmpq %rcx, %r12 +; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF +; X64-NEXT: cmovbq %r12, %rax +; X64-NEXT: testq %r15, %r15 +; X64-NEXT: cmovnsq %rcx, %r12 +; X64-NEXT: cmoveq %rax, %r12 +; X64-NEXT: movl $0, %eax +; X64-NEXT: cmovnsq %rax, %r15 +; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000 +; X64-NEXT: cmpq %rcx, %r12 +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: cmovaq %r12, %rax +; X64-NEXT: testq %r15, %r15 +; X64-NEXT: cmovsq %rcx, %r12 +; 
X64-NEXT: cmpq $-1, %r15 +; X64-NEXT: cmoveq %rax, %r12 +; X64-NEXT: movq %r12, %xmm0 +; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; X64-NEXT: psrlq $1, %xmm1 +; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; X64-NEXT: # xmm1 = mem[2,3,0,1] +; X64-NEXT: pxor %xmm0, %xmm0 +; X64-NEXT: pcmpgtd %xmm1, %xmm0 +; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-NEXT: paddq %xmm1, %xmm1 +; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: movq %xmm1, %r12 +; X64-NEXT: movq %r12, %rbx +; X64-NEXT: shrq $33, %rbx +; X64-NEXT: movq %r12, %r14 +; X64-NEXT: sarq $63, %r14 +; X64-NEXT: shlq $31, %r14 +; X64-NEXT: orq %r14, %rbx +; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; X64-NEXT: # xmm1 = mem[2,3,0,1] +; X64-NEXT: pxor %xmm0, %xmm0 +; X64-NEXT: pcmpgtd %xmm1, %xmm0 +; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: movq %xmm1, %rdx +; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rdx, %rbp +; X64-NEXT: sarq $63, %rbp +; X64-NEXT: shlq $31, %r12 +; X64-NEXT: movq %r12, %rdi +; X64-NEXT: movq %rbx, %rsi +; X64-NEXT: movq %rbp, %rcx +; X64-NEXT: callq __divti3 +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: subq $1, %r13 +; X64-NEXT: sbbq $0, %r15 +; X64-NEXT: shrq $63, %r14 +; X64-NEXT: xorl %ebp, %r14d +; X64-NEXT: movq %r12, %rdi +; X64-NEXT: movq %rbx, %rsi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; X64-NEXT: movq %rbp, %rcx +; X64-NEXT: callq __modti3 +; X64-NEXT: orq %rax, %rdx +; X64-NEXT: setne %al +; X64-NEXT: testb %r14b, %al +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF +; X64-NEXT: cmpq %rcx, %r13 +; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF +; X64-NEXT: cmovbq %r13, %rax +; X64-NEXT: testq %r15, %r15 +; X64-NEXT: cmovnsq %rcx, %r13 +; X64-NEXT: cmoveq %rax, %r13 +; X64-NEXT: movl $0, %eax +; X64-NEXT: cmovnsq %rax, %r15 +; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000 +; X64-NEXT: cmpq %rcx, %r13 +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: cmovaq %r13, %rax +; X64-NEXT: testq %r15, %r15 +; X64-NEXT: cmovsq %rcx, %r13 +; X64-NEXT: cmpq $-1, %r15 +; X64-NEXT: cmoveq %rax, %r13 +; X64-NEXT: movq %r13, %xmm0 +; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-NEXT: # xmm0 = mem[2,3,0,1] +; X64-NEXT: movq %xmm0, %r13 +; X64-NEXT: movq %r13, %rbx +; X64-NEXT: shrq $33, %rbx +; X64-NEXT: movq %r13, %r14 +; X64-NEXT: sarq $63, %r14 +; X64-NEXT: shlq $31, %r14 +; X64-NEXT: orq %r14, %rbx +; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-NEXT: # xmm0 = mem[2,3,0,1] +; X64-NEXT: movq %xmm0, %rdx +; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rdx, %rbp +; X64-NEXT: sarq $63, %rbp +; X64-NEXT: shlq $31, %r13 +; X64-NEXT: movq %r13, %rdi +; X64-NEXT: movq %rbx, %rsi +; 
X64-NEXT: movq %rbp, %rcx +; X64-NEXT: callq __divti3 +; X64-NEXT: movq %rax, %r12 +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: subq $1, %r12 +; X64-NEXT: sbbq $0, %r15 +; X64-NEXT: shrq $63, %r14 +; X64-NEXT: xorl %ebp, %r14d +; X64-NEXT: movq %r13, %rdi +; X64-NEXT: movq %rbx, %rsi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; X64-NEXT: movq %rbp, %rcx +; X64-NEXT: callq __modti3 +; X64-NEXT: orq %rax, %rdx +; X64-NEXT: setne %al +; X64-NEXT: testb %r14b, %al +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload +; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF +; X64-NEXT: cmpq %rcx, %r12 +; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF +; X64-NEXT: cmovbq %r12, %rax +; X64-NEXT: testq %r15, %r15 +; X64-NEXT: cmovnsq %rcx, %r12 +; X64-NEXT: cmoveq %rax, %r12 +; X64-NEXT: movl $0, %eax +; X64-NEXT: cmovnsq %rax, %r15 +; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000 +; X64-NEXT: cmpq %rcx, %r12 +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: cmovaq %r12, %rax +; X64-NEXT: testq %r15, %r15 +; X64-NEXT: cmovsq %rcx, %r12 +; X64-NEXT: cmpq $-1, %r15 +; X64-NEXT: cmoveq %rax, %r12 +; X64-NEXT: movq %r12, %xmm0 +; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; X64-NEXT: psrlq $1, %xmm1 +; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X64-NEXT: addq $104, %rsp +; X64-NEXT: popq %rbx +; X64-NEXT: popq %r12 +; X64-NEXT: popq %r13 +; X64-NEXT: popq %r14 +; X64-NEXT: popq %r15 +; X64-NEXT: popq %rbp +; X64-NEXT: retq +; +; X86-LABEL: vec: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $256, %esp # imm = 0x100 +; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl 40(%ebp), %ebx +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: adcl %eax, %eax +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: andl $1, %eax +; X86-NEXT: movl %eax, %edi +; X86-NEXT: shll $31, %eax +; X86-NEXT: shrl %ecx +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: shll $31, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: negl %edi +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: pushl %edx +; X86-NEXT: pushl %edx +; X86-NEXT: pushl %edx +; X86-NEXT: pushl %ebx +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %eax +; X86-NEXT: calll __modti3 +; X86-NEXT: addl $32, %esp +; X86-NEXT: movl 36(%ebp), %edi +; X86-NEXT: movl %edi, %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: movl 20(%ebp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: adcl %eax, %eax +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: andl $1, %eax +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: shll $31, %eax +; X86-NEXT: shrl %ecx +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: shll $31, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: negl %ebx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: pushl %edx +; X86-NEXT: pushl %edx +; X86-NEXT: pushl %edx +; X86-NEXT: pushl %edi +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %eax +; X86-NEXT: calll __modti3 +; X86-NEXT: addl $32, %esp +; X86-NEXT: movl 28(%ebp), %edi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: sarl $31, %ebx +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: adcl %eax, %eax +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: andl $1, %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: shll $31, %eax +; X86-NEXT: shrl %ecx +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: shll $31, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: negl %esi +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %edx +; X86-NEXT: pushl %eax +; X86-NEXT: calll __divti3 +; X86-NEXT: addl $32, %esp +; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: sarl $31, %edi +; X86-NEXT: movl 16(%ebp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: adcl %eax, %eax +; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: andl $1, %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: shll $31, %eax +; X86-NEXT: shrl %ecx +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: shll $31, %ebx +; X86-NEXT: negl %esi +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %edx +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %eax +; X86-NEXT: calll __modti3 +; X86-NEXT: addl $32, %esp +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %edi +; X86-NEXT: pushl 32(%ebp) +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %esi +; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %eax +; X86-NEXT: calll __divti3 +; X86-NEXT: addl $32, %esp +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl 40(%ebp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: pushl %eax +; X86-NEXT: calll __divti3 +; X86-NEXT: addl $32, %esp +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: pushl %ecx +; X86-NEXT: 
pushl %ecx +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl 36(%ebp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %ecx +; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: pushl %eax +; X86-NEXT: calll __divti3 +; X86-NEXT: addl $32, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: subl $1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: sbbl $0, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: sets %bl +; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: sets %bh +; X86-NEXT: xorb %bl, %bh +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: orl {{[0-9]+}}(%esp), %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: setne %al +; X86-NEXT: testb %bh, %al +; X86-NEXT: cmovel %esi, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: subl $1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: sbbl $0, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: sbbl $0, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: sets %bl +; X86-NEXT: testl %edi, %edi +; X86-NEXT: sets %bh +; X86-NEXT: xorb %bl, %bh +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: orl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl %edi, %eax +; X86-NEXT: setne %al +; X86-NEXT: testb %bh, %al +; X86-NEXT: cmovel %edx, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: cmovel %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: subl $1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: sbbl $0, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %eax +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: testl %edx, %edx +; X86-NEXT: sets %al +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: sets %bl +; X86-NEXT: xorb %al, %bl +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl 28(%ebp) +; X86-NEXT: pushl %edx +; X86-NEXT: pushl %edx +; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: pushl %eax +; X86-NEXT: calll __modti3 +; X86-NEXT: addl $32, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: setne %al +; X86-NEXT: testb %bl, %al +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: cmovel %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: subl $1, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: sbbl $0, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %ecx +; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: sets %bl +; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: sets %bh +; X86-NEXT: xorb %bl, 
%bh +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: orl {{[0-9]+}}(%esp), %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: setne %al +; X86-NEXT: testb %bh, %al +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: cmovel %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: movl $0, %eax +; X86-NEXT: cmovsl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $-1, %eax +; X86-NEXT: cmovsl %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: testl %eax, %eax +; X86-NEXT: cmovel %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, %edx +; X86-NEXT: cmovsl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $-1, %eax +; X86-NEXT: cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: sarl $31, %ebx +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: testl %eax, %eax +; X86-NEXT: cmovel %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, %edx +; X86-NEXT: cmovsl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $-1, %eax +; X86-NEXT: cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, %esi +; X86-NEXT: sarl $31, %esi +; X86-NEXT: movl %esi, %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: testl %eax, %eax +; X86-NEXT: cmovel %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, %edi +; X86-NEXT: cmovsl %eax, %edi +; X86-NEXT: movl $-1, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: cmovsl %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: cmovel %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: cmpl $-1, %edx +; X86-NEXT: movl $-1, %eax +; X86-NEXT: cmovael %eax, %ecx +; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: cmpl $1, %edx +; X86-NEXT: movl $0, %eax +; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: notl %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: testl %edx, %edx +; X86-NEXT: movl $0, %ecx +; X86-NEXT: cmovbl %edx, %ecx +; X86-NEXT: andl %edx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: cmovel %ecx, %esi +; X86-NEXT: cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: testl %eax, %eax +; X86-NEXT: movl $0, %ecx +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: cmpl $-1, %esi +; X86-NEXT: movl $0, %edx +; X86-NEXT: cmovnel %edx, %ecx +; X86-NEXT: testl %edi, %edi +; X86-NEXT: movl $-1, %edx +; X86-NEXT: cmovsl %edx, %esi +; X86-NEXT: movl $0, %edx +; X86-NEXT: cmovsl %edx, %eax +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: cmpl $-1, %edi +; X86-NEXT: cmovel %ecx, %eax +; X86-NEXT: cmovnel %esi, %edi +; X86-NEXT: shldl $31, %eax, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: cmpl $-1, %eax +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: cmovael %ecx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: cmpl $1, %esi +; X86-NEXT: movl $0, %eax +; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: notl %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: testl %esi, %esi +; X86-NEXT: movl $0, %ecx +; X86-NEXT: cmovbl %esi, %ecx +; X86-NEXT: andl %esi, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: cmovel %ecx, %ebx +; X86-NEXT: cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: testl %eax, %eax +; X86-NEXT: movl $0, %ecx +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: cmpl $-1, %ebx +; X86-NEXT: movl $0, %edi +; X86-NEXT: cmovnel %edi, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: testl %esi, %esi +; X86-NEXT: movl $-1, %edx +; X86-NEXT: cmovsl %edx, %ebx +; X86-NEXT: cmovsl %edi, %eax +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: cmpl $-1, %esi +; X86-NEXT: cmovel %ecx, %eax +; X86-NEXT: cmovnel %ebx, %esi +; X86-NEXT: shldl $31, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: cmpl $-1, %eax +; X86-NEXT: cmovael %edx, %eax +; X86-NEXT: movl $-1, %ebx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: cmpl $1, %edx +; X86-NEXT: movl $0, %eax +; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: notl %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: testl %edx, %edx +; X86-NEXT: movl $0, %ecx +; X86-NEXT: cmovbl %edx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: andl %edx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: cmovel %ecx, %edi +; X86-NEXT: cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: testl %eax, %eax +; X86-NEXT: movl $0, %ecx +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: cmpl $-1, %edi +; X86-NEXT: movl $0, %edx +; X86-NEXT: cmovnel %edx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: testl 
%esi, %esi +; X86-NEXT: cmovsl %ebx, %edi +; X86-NEXT: cmovsl %edx, %eax +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: cmpl $-1, %esi +; X86-NEXT: cmovel %ecx, %eax +; X86-NEXT: cmovnel %edi, %esi +; X86-NEXT: shldl $31, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: cmpl $-1, %eax +; X86-NEXT: cmovael %ebx, %eax +; X86-NEXT: movl $-1, %esi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: cmpl $1, %edx +; X86-NEXT: movl $0, %eax +; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: notl %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: testl %edx, %edx +; X86-NEXT: movl $0, %ecx +; X86-NEXT: cmovbl %edx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: andl %edx, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: cmovel %ecx, %ebx +; X86-NEXT: cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: testl %eax, %eax +; X86-NEXT: movl $0, %ecx +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: cmpl $-1, %ebx +; X86-NEXT: movl $0, %edi +; X86-NEXT: cmovnel %edi, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: testl %edx, %edx +; X86-NEXT: cmovsl %esi, %ebx +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: cmovsl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: andl %edx, %ebx +; X86-NEXT: cmpl $-1, %ebx +; X86-NEXT: cmovel %ecx, %eax +; X86-NEXT: cmovnel %esi, %ebx +; X86-NEXT: shldl $31, %eax, %ebx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 4(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %tmp = call <4 x i32> @llvm.sdiv.fix.sat.v4i32(<4 x i32> %x, <4 x i32> %y, i32 31) + ret <4 x i32> %tmp +} diff --git a/llvm/test/CodeGen/X86/udiv_fix_sat.ll b/llvm/test/CodeGen/X86/udiv_fix_sat.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/udiv_fix_sat.ll @@ -0,0 +1,528 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=X86 + +declare i4 @llvm.udiv.fix.sat.i4 (i4, i4, i32) +declare i15 @llvm.udiv.fix.sat.i15 (i15, i15, i32) +declare i16 @llvm.udiv.fix.sat.i16 (i16, i16, i32) +declare i18 @llvm.udiv.fix.sat.i18 (i18, i18, i32) +declare i64 @llvm.udiv.fix.sat.i64 (i64, i64, i32) +declare <4 x i32> @llvm.udiv.fix.sat.v4i32(<4 x i32>, <4 x i32>, i32) + +define i16 @func(i16 %x, i16 %y) nounwind { +; X64-LABEL: func: +; X64: # %bb.0: +; X64-NEXT: movzwl %si, %ecx +; X64-NEXT: movzwl %di, %eax +; X64-NEXT: shll $8, %eax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: divl %ecx +; X64-NEXT: cmpl $131071, %eax # imm = 0x1FFFF +; X64-NEXT: movl $131071, %ecx # imm = 0x1FFFF +; X64-NEXT: cmovael %ecx, %eax +; X64-NEXT: shrl %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq +; +; X86-LABEL: func: +; X86: # 
%bb.0: +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl %ax, %eax +; X86-NEXT: shll $8, %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: divl %ecx +; X86-NEXT: cmpl $131071, %eax # imm = 0x1FFFF +; X86-NEXT: movl $131071, %ecx # imm = 0x1FFFF +; X86-NEXT: cmovael %ecx, %eax +; X86-NEXT: shrl %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl + %tmp = call i16 @llvm.udiv.fix.sat.i16(i16 %x, i16 %y, i32 7) + ret i16 %tmp +} + +define i16 @func2(i8 %x, i8 %y) nounwind { +; X64-LABEL: func2: +; X64: # %bb.0: +; X64-NEXT: movsbl %dil, %eax +; X64-NEXT: andl $32767, %eax # imm = 0x7FFF +; X64-NEXT: movsbl %sil, %ecx +; X64-NEXT: andl $32767, %ecx # imm = 0x7FFF +; X64-NEXT: shll $14, %eax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: divl %ecx +; X64-NEXT: cmpl $32767, %eax # imm = 0x7FFF +; X64-NEXT: movl $32767, %ecx # imm = 0x7FFF +; X64-NEXT: cmovbl %eax, %ecx +; X64-NEXT: addl %ecx, %ecx +; X64-NEXT: movswl %cx, %eax +; X64-NEXT: shrl %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq +; +; X86-LABEL: func2: +; X86: # %bb.0: +; X86-NEXT: movsbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl $32767, %ecx # imm = 0x7FFF +; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $32767, %eax # imm = 0x7FFF +; X86-NEXT: shll $14, %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: divl %ecx +; X86-NEXT: cmpl $32767, %eax # imm = 0x7FFF +; X86-NEXT: movl $32767, %ecx # imm = 0x7FFF +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: movswl %cx, %eax +; X86-NEXT: shrl %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl + %x2 = sext i8 %x to i15 + %y2 = sext i8 %y to i15 + %tmp = call i15 @llvm.udiv.fix.sat.i15(i15 %x2, i15 %y2, i32 14) + %tmp2 = sext i15 %tmp to i16 + ret i16 %tmp2 +} + +define i16 @func3(i15 %x, i8 %y) nounwind { +; X64-LABEL: func3: +; X64: # %bb.0: +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: leal (%rdi,%rdi), %eax +; X64-NEXT: movzbl %sil, %ecx +; X64-NEXT: shll $4, %ecx +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: divw %cx +; X64-NEXT: # kill: def $ax killed $ax def $eax +; X64-NEXT: movzwl %ax, %ecx +; X64-NEXT: cmpl $32767, %ecx # imm = 0x7FFF +; X64-NEXT: movl $32767, %ecx # imm = 0x7FFF +; X64-NEXT: cmovbl %eax, %ecx +; X64-NEXT: addl %ecx, %ecx +; X64-NEXT: movswl %cx, %eax +; X64-NEXT: shrl %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq +; +; X86-LABEL: func3: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: addl %eax, %eax +; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: shll $4, %ecx +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: divw %cx +; X86-NEXT: # kill: def $ax killed $ax def $eax +; X86-NEXT: movzwl %ax, %ecx +; X86-NEXT: cmpl $32767, %ecx # imm = 0x7FFF +; X86-NEXT: movl $32767, %ecx # imm = 0x7FFF +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: movswl %cx, %eax +; X86-NEXT: shrl %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl + %y2 = sext i8 %y to i15 + %y3 = shl i15 %y2, 7 + %tmp = call i15 @llvm.udiv.fix.sat.i15(i15 %x, i15 %y3, i32 4) + %tmp2 = sext i15 %tmp to i16 + ret i16 %tmp2 +} + +define i4 @func4(i4 %x, i4 %y) nounwind { +; X64-LABEL: func4: +; X64: # %bb.0: +; X64-NEXT: andb $15, %sil +; X64-NEXT: andb $15, %dil +; X64-NEXT: shlb $2, %dil +; X64-NEXT: movzbl %dil, %eax 
+; X64-NEXT: divb %sil +; X64-NEXT: movzbl %al, %ecx +; X64-NEXT: cmpb $15, %cl +; X64-NEXT: movl $15, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; +; X86-LABEL: func4: +; X86: # %bb.0: +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: andb $15, %cl +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: andb $15, %al +; X86-NEXT: shlb $2, %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: divb %cl +; X86-NEXT: movzbl %al, %ecx +; X86-NEXT: cmpb $15, %al +; X86-NEXT: movl $15, %eax +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %tmp = call i4 @llvm.udiv.fix.sat.i4(i4 %x, i4 %y, i32 2) + ret i4 %tmp +} + +define i64 @func5(i64 %x, i64 %y) nounwind { +; X64-LABEL: func5: +; X64: # %bb.0: +; X64-NEXT: pushq %rbx +; X64-NEXT: movq %rsi, %rdx +; X64-NEXT: leaq (%rdi,%rdi), %rsi +; X64-NEXT: shrq $33, %rsi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: shrq $32, %rax +; X64-NEXT: andl $-2147483648, %eax # imm = 0x80000000 +; X64-NEXT: orq %rax, %rsi +; X64-NEXT: shlq $32, %rdi +; X64-NEXT: xorl %ebx, %ebx +; X64-NEXT: xorl %ecx, %ecx +; X64-NEXT: callq __udivti3 +; X64-NEXT: cmpq $-1, %rax +; X64-NEXT: movq $-1, %rcx +; X64-NEXT: cmovbq %rax, %rcx +; X64-NEXT: cmpq $1, %rdx +; X64-NEXT: movl $1, %esi +; X64-NEXT: cmovbq %rdx, %rsi +; X64-NEXT: sbbq %rbx, %rbx +; X64-NEXT: notq %rbx +; X64-NEXT: orq %rax, %rbx +; X64-NEXT: cmpq $1, %rdx +; X64-NEXT: cmoveq %rcx, %rbx +; X64-NEXT: shrdq $1, %rsi, %rbx +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: popq %rbx +; X64-NEXT: retq +; +; X86-LABEL: func5: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $24, %esp +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: shrl %edx +; X86-NEXT: shldl $31, %eax, %ecx +; X86-NEXT: shll $31, %eax +; X86-NEXT: movl %esp, %esi +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $0 +; X86-NEXT: pushl 20(%ebp) +; X86-NEXT: pushl 16(%ebp) +; X86-NEXT: pushl $0 +; X86-NEXT: pushl %edx +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %eax +; X86-NEXT: pushl %esi +; X86-NEXT: calll __udivti3 +; X86-NEXT: addl $32, %esp +; X86-NEXT: movl (%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmpl $-1, %eax +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $-1, %esi +; X86-NEXT: cmovbl %eax, %esi +; X86-NEXT: cmpl $-1, %edx +; X86-NEXT: cmovel %edx, %eax +; X86-NEXT: cmovel %esi, %eax +; X86-NEXT: cmovael %ecx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: orl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmovnel %ecx, %edx +; X86-NEXT: cmovnel %ecx, %eax +; X86-NEXT: leal -4(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %ebp +; X86-NEXT: retl + %tmp = call i64 @llvm.udiv.fix.sat.i64(i64 %x, i64 %y, i32 31) + ret i64 %tmp +} + +define i18 @func6(i16 %x, i16 %y) nounwind { +; X64-LABEL: func6: +; X64: # %bb.0: +; X64-NEXT: movswl %di, %eax +; X64-NEXT: andl $262143, %eax # imm = 0x3FFFF +; X64-NEXT: movswl %si, %ecx +; X64-NEXT: andl $262143, %ecx # imm = 0x3FFFF +; X64-NEXT: shll $7, %eax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: divl %ecx +; X64-NEXT: cmpl $262143, %eax # imm = 0x3FFFF +; X64-NEXT: movl $262143, %ecx # imm = 0x3FFFF +; X64-NEXT: cmovael %ecx, %eax +; X64-NEXT: retq +; +; X86-LABEL: func6: +; X86: # %bb.0: +; X86-NEXT: movswl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl $262143, %ecx # imm = 0x3FFFF +; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl 
$262143, %eax # imm = 0x3FFFF +; X86-NEXT: shll $7, %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: divl %ecx +; X86-NEXT: cmpl $262143, %eax # imm = 0x3FFFF +; X86-NEXT: movl $262143, %ecx # imm = 0x3FFFF +; X86-NEXT: cmovael %ecx, %eax +; X86-NEXT: retl + %x2 = sext i16 %x to i18 + %y2 = sext i16 %y to i18 + %tmp = call i18 @llvm.udiv.fix.sat.i18(i18 %x2, i18 %y2, i32 7) + ret i18 %tmp +} + +define i16 @func7(i16 %x, i16 %y) nounwind { +; X64-LABEL: func7: +; X64: # %bb.0: +; X64-NEXT: movzwl %si, %ecx +; X64-NEXT: movzwl %di, %eax +; X64-NEXT: addl %eax, %eax +; X64-NEXT: shlq $16, %rax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: divq %rcx +; X64-NEXT: cmpq $131071, %rax # imm = 0x1FFFF +; X64-NEXT: movl $131071, %ecx # imm = 0x1FFFF +; X64-NEXT: cmovaeq %rcx, %rax +; X64-NEXT: shrl %eax +; X64-NEXT: # kill: def $ax killed $ax killed $rax +; X64-NEXT: retq +; +; X86-LABEL: func7: +; X86: # %bb.0: +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl %cx, %ecx +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: shrl $16, %edx +; X86-NEXT: shll $16, %ecx +; X86-NEXT: pushl $0 +; X86-NEXT: pushl %eax +; X86-NEXT: pushl %edx +; X86-NEXT: pushl %ecx +; X86-NEXT: calll __udivdi3 +; X86-NEXT: addl $16, %esp +; X86-NEXT: cmpl $131071, %eax # imm = 0x1FFFF +; X86-NEXT: movl $131071, %ecx # imm = 0x1FFFF +; X86-NEXT: cmovael %ecx, %eax +; X86-NEXT: testl %edx, %edx +; X86-NEXT: cmovnel %ecx, %eax +; X86-NEXT: shrl %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl + %tmp = call i16 @llvm.udiv.fix.sat.i16(i16 %x, i16 %y, i32 16) + ret i16 %tmp +} + +define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { +; X64-LABEL: vec: +; X64: # %bb.0: +; X64-NEXT: pxor %xmm8, %xmm8 +; X64-NEXT: movdqa %xmm1, %xmm2 +; X64-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; X64-NEXT: movq %xmm2, %rcx +; X64-NEXT: movdqa %xmm0, %xmm4 +; X64-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; X64-NEXT: paddq %xmm4, %xmm4 +; X64-NEXT: psllq $31, %xmm4 +; X64-NEXT: movq %xmm4, %rax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: divq %rcx +; X64-NEXT: movq %rax, %xmm7 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; X64-NEXT: movq %xmm2, %rcx +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1] +; X64-NEXT: movq %xmm2, %rax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: divq %rcx +; X64-NEXT: movq %rax, %xmm2 +; X64-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm2[0] +; X64-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] +; X64-NEXT: movdqa %xmm7, %xmm2 +; X64-NEXT: pxor %xmm4, %xmm2 +; X64-NEXT: movdqa {{.*#+}} xmm9 = [9223372043297226751,9223372043297226751] +; X64-NEXT: movdqa %xmm9, %xmm6 +; X64-NEXT: pcmpgtd %xmm2, %xmm6 +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] +; X64-NEXT: pcmpeqd %xmm9, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] +; X64-NEXT: pand %xmm3, %xmm5 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; X64-NEXT: por %xmm5, %xmm2 +; X64-NEXT: movdqa {{.*#+}} xmm6 = [8589934591,8589934591] +; X64-NEXT: pand %xmm2, %xmm7 +; X64-NEXT: pandn %xmm6, %xmm2 +; X64-NEXT: por %xmm7, %xmm2 +; X64-NEXT: psrlq $1, %xmm2 +; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; X64-NEXT: movq %xmm1, %rcx +; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; X64-NEXT: paddq %xmm0, %xmm0 +; X64-NEXT: psllq $31, %xmm0 +; X64-NEXT: movq %xmm0, %rax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: divq %rcx +; X64-NEXT: movq %rax, 
%xmm3 +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X64-NEXT: movq %xmm1, %rcx +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X64-NEXT: movq %xmm0, %rax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: divq %rcx +; X64-NEXT: movq %rax, %xmm0 +; X64-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; X64-NEXT: pxor %xmm3, %xmm4 +; X64-NEXT: movdqa %xmm9, %xmm0 +; X64-NEXT: pcmpgtd %xmm4, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] +; X64-NEXT: pcmpeqd %xmm9, %xmm4 +; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; X64-NEXT: pand %xmm1, %xmm4 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X64-NEXT: por %xmm4, %xmm0 +; X64-NEXT: pand %xmm0, %xmm3 +; X64-NEXT: pandn %xmm6, %xmm0 +; X64-NEXT: por %xmm3, %xmm0 +; X64-NEXT: psrlq $1, %xmm0 +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; X64-NEXT: retq +; +; X86-LABEL: vec: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: setb %al +; X86-NEXT: shldl $31, %ecx, %eax +; X86-NEXT: shll $31, %ecx +; X86-NEXT: pushl $0 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl %eax +; X86-NEXT: pushl %ecx +; X86-NEXT: calll __udivdi3 +; X86-NEXT: addl $16, %esp +; X86-NEXT: cmpl $-1, %eax +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: cmpl $1, %edx +; X86-NEXT: movl $0, %edi +; X86-NEXT: sbbl %edi, %edi +; X86-NEXT: notl %edi +; X86-NEXT: orl %eax, %edi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: addl %esi, %esi +; X86-NEXT: setb %al +; X86-NEXT: cmpl $1, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmovel %ecx, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $1, %ecx +; X86-NEXT: cmovael %ecx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl $31, %esi, %eax +; X86-NEXT: shll $31, %esi +; X86-NEXT: pushl $0 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl %eax +; X86-NEXT: pushl %esi +; X86-NEXT: calll __udivdi3 +; X86-NEXT: addl $16, %esp +; X86-NEXT: cmpl $-1, %eax +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: cmpl $1, %edx +; X86-NEXT: movl $1, %esi +; X86-NEXT: cmovbl %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, %esi +; X86-NEXT: sbbl %esi, %esi +; X86-NEXT: notl %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: addl %edi, %edi +; X86-NEXT: setb %al +; X86-NEXT: cmpl $1, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: cmovel %ecx, %esi +; X86-NEXT: shldl $31, %edi, %eax +; X86-NEXT: shll $31, %edi +; X86-NEXT: pushl $0 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl %eax +; X86-NEXT: pushl %edi +; X86-NEXT: calll __udivdi3 +; X86-NEXT: addl $16, %esp +; X86-NEXT: cmpl $-1, %eax +; X86-NEXT: movl $-1, %ebx +; X86-NEXT: cmovbl %eax, %ebx +; X86-NEXT: cmpl $1, %edx +; X86-NEXT: movl $0, %edi +; X86-NEXT: sbbl %edi, %edi +; X86-NEXT: notl %edi +; X86-NEXT: orl %eax, %edi +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: addl %ebp, %ebp +; X86-NEXT: setb %cl +; X86-NEXT: cmpl $1, %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl $1, %edx +; X86-NEXT: cmovael %edx, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: cmovel %ebx, %edi +; X86-NEXT: shldl $31, %ebp, %ecx +; X86-NEXT: shll $31, %ebp +; X86-NEXT: 
pushl $0 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %ebp +; X86-NEXT: calll __udivdi3 +; X86-NEXT: addl $16, %esp +; X86-NEXT: cmpl $-1, %eax +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: cmpl $1, %edx +; X86-NEXT: movl $1, %ebx +; X86-NEXT: cmovbl %edx, %ebx +; X86-NEXT: movl $0, %ebp +; X86-NEXT: sbbl %ebp, %ebp +; X86-NEXT: notl %ebp +; X86-NEXT: orl %eax, %ebp +; X86-NEXT: cmpl $1, %edx +; X86-NEXT: cmovel %ecx, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shrdl $1, %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shrdl $1, %eax, %esi +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: shrdl $1, %eax, %edi +; X86-NEXT: shrdl $1, %ebx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ebp, 12(%eax) +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: addl $16, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %tmp = call <4 x i32> @llvm.udiv.fix.sat.v4i32(<4 x i32> %x, <4 x i32> %y, i32 31) + ret <4 x i32> %tmp +}