diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -93,14 +93,14 @@ namespace Sched { - enum Preference { - None, // No preference - Source, // Follow source order. - RegPressure, // Scheduling for lowest register pressure. - Hybrid, // Scheduling for both latency and register pressure. - ILP, // Scheduling for ILP in low register pressure mode. - VLIW // Scheduling for VLIW targets. - }; +enum Preference { + None, // No preference + Source, // Follow source order. + RegPressure, // Scheduling for lowest register pressure. + Hybrid, // Scheduling for both latency and register pressure. + ILP, // Scheduling for ILP in low register pressure mode. + VLIW // Scheduling for VLIW targets. +}; } // end namespace Sched @@ -190,11 +190,11 @@ /// This enum indicates whether operations are valid for a target, and if not, /// what action should be used to make them valid. enum LegalizeAction : uint8_t { - Legal, // The target natively supports this operation. - Promote, // This operation should be executed in a larger type. - Expand, // Try to expand this to other ops, otherwise use a libcall. - LibCall, // Don't try to expand this to other ops, always use a libcall. - Custom // Use the LowerOperation hook to implement custom lowering. + Legal, // The target natively supports this operation. + Promote, // This operation should be executed in a larger type. + Expand, // Try to expand this to other ops, otherwise use a libcall. + LibCall, // Don't try to expand this to other ops, always use a libcall. + Custom // Use the LowerOperation hook to implement custom lowering. }; /// This enum indicates whether a types are legal for a target, and if not, @@ -210,13 +210,13 @@ TypeWidenVector, // This vector should be widened into a larger vector. TypePromoteFloat, // Replace this float with a larger one. TypeSoftPromoteHalf, // Soften half to i16 and use float to do arithmetic. - TypeScalarizeScalableVector, // This action is explicitly left unimplemented. - // While it is theoretically possible to - // legalize operations on scalable types with a - // loop that handles the vscale * #lanes of the - // vector, this is non-trivial at SelectionDAG - // level and these types are better to be - // widened or promoted. + TypeScalarizeScalableVector, // This action is explicitly left + // unimplemented. While it is theoretically + // possible to legalize operations on scalable + // types with a loop that handles the vscale * + // #lanes of the vector, this is non-trivial at + // SelectionDAG level and these types are + // better to be widened or promoted. }; /// LegalizeKind holds the legalization kind that needs to happen to EVT @@ -225,18 +225,18 @@ /// Enum that describes how the target represents true/false values. enum BooleanContent { - UndefinedBooleanContent, // Only bit 0 counts, the rest can hold garbage. - ZeroOrOneBooleanContent, // All bits zero except for bit 0. + UndefinedBooleanContent, // Only bit 0 counts, the rest can hold garbage. + ZeroOrOneBooleanContent, // All bits zero except for bit 0. ZeroOrNegativeOneBooleanContent // All bits equal to bit 0. }; /// Enum that describes what type of support for selects the target has. enum SelectSupportKind { - ScalarValSelect, // The target supports scalar selects (ex: cmov). - ScalarCondVectorVal, // The target supports selects with a scalar condition - // and vector values (ex: cmov). 
- VectorMaskSelect // The target supports vector selects with a vector - // mask (ex: x86 blends). + ScalarValSelect, // The target supports scalar selects (ex: cmov). + ScalarCondVectorVal, // The target supports selects with a scalar condition + // and vector values (ex: cmov). + VectorMaskSelect // The target supports vector selects with a vector + // mask (ex: x86 blends). }; /// Enum that specifies what an atomic load/AtomicRMWInst is expanded @@ -262,9 +262,9 @@ /// Enum that specifies when a float negation is beneficial. enum class NegatibleCost { - Cheaper = 0, // Negated expression is cheaper. - Neutral = 1, // Negated expression has the same cost. - Expensive = 2 // Negated expression is more expensive. + Cheaper = 0, // Negated expression is cheaper. + Neutral = 1, // Negated expression has the same cost. + Expensive = 2 // Negated expression is more expensive. }; class ArgListEntry { @@ -323,9 +323,7 @@ virtual ~TargetLoweringBase() = default; /// Return true if the target support strict float operation - bool isStrictFPEnabled() const { - return IsStrictFPEnabled; - } + bool isStrictFPEnabled() const { return IsStrictFPEnabled; } protected: /// Initialize all of the actions to default values. @@ -392,7 +390,8 @@ /// This callback is used to inspect load/store instructions and add /// target-specific MachineMemOperand flags to them. The default /// implementation does nothing. - virtual MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const { + virtual MachineMemOperand::Flags + getTargetMMOFlags(const Instruction &I) const { return MachineMemOperand::MONone; } @@ -411,9 +410,7 @@ /// a constant pool load whose address depends on the select condition. The /// parameter may be used to differentiate a select with FP compare from /// integer compare. - virtual bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const { - return true; - } + virtual bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const { return true; } /// Return true if multiple condition registers are available. bool hasMultipleConditionRegisters() const { @@ -464,9 +461,7 @@ virtual bool isIntDivCheap(EVT VT, AttributeList Attr) const { return false; } /// Return true if the target can handle a standalone remainder operation. - virtual bool hasStandaloneRem(EVT VT) const { - return true; - } + virtual bool hasStandaloneRem(EVT VT) const { return true; } /// Return true if SQRT(X) shouldn't be replaced with X*RSQRT(X). virtual bool isFsqrtCheap(SDValue X, SelectionDAG &DAG) const { @@ -475,11 +470,7 @@ } /// Reciprocal estimate status values used by the functions below. - enum ReciprocalEstimate : int { - Unspecified = -1, - Disabled = 0, - Enabled = 1 - }; + enum ReciprocalEstimate : int { Unspecified = -1, Disabled = 0, Enabled = 1 }; /// Return a ReciprocalEstimate enum value for a square root of the given type /// based on the function's attributes. If the operation is not overridden by @@ -556,7 +547,8 @@ bool Fast = false; return allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), BitcastVT, - MMO, &Fast) && Fast; + MMO, &Fast) && + Fast; } /// Return true if the following transform is beneficial: @@ -571,8 +563,7 @@ /// Return true if it is expected to be cheaper to do a store of a non-zero /// vector constant with the given size and type for the address space than to /// store the individual scalar element constants. 
- virtual bool storeOfVectorConstantIsCheap(EVT MemVT, - unsigned NumElem, + virtual bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem, unsigned AddrSpace) const { return false; } @@ -580,9 +571,7 @@ /// Allow store merging for the specified type after legalization in addition /// to before legalization. This may transform stores that do not exist /// earlier (for example, stores created from intrinsics). - virtual bool mergeStoresAfterLegalization(EVT MemVT) const { - return true; - } + virtual bool mergeStoresAfterLegalization(EVT MemVT) const { return true; } /// Returns if it's reasonable to merge stores to MemVT size. virtual bool canMergeStoresTo(unsigned AS, EVT MemVT, @@ -591,19 +580,13 @@ } /// Return true if it is cheap to speculate a call to intrinsic cttz. - virtual bool isCheapToSpeculateCttz() const { - return false; - } + virtual bool isCheapToSpeculateCttz() const { return false; } /// Return true if it is cheap to speculate a call to intrinsic ctlz. - virtual bool isCheapToSpeculateCtlz() const { - return false; - } + virtual bool isCheapToSpeculateCtlz() const { return false; } /// Return true if ctlz instruction is fast. - virtual bool isCtlzFast() const { - return false; - } + virtual bool isCtlzFast() const { return false; } /// Return true if instruction generated for equality comparison is folded /// with instruction generated for signed comparison. @@ -613,9 +596,7 @@ /// into the equivalent floating-point operation. This should be set to true /// if the target has IEEE-754-compliant fabs/fneg operations for the input /// type. - virtual bool hasBitPreservingFPLogic(EVT VT) const { - return false; - } + virtual bool hasBitPreservingFPLogic(EVT VT) const { return false; } /// Return true if it is cheaper to split the store of a merged int val /// from a pair of smaller values into multiple stores. @@ -642,9 +623,7 @@ /// This should be true when it takes more than one instruction to lower /// setcc (cmp+set on x86 scalar), when bitwise ops are faster than logic on /// condition bits (crand on PowerPC), and/or when reducing cmp+br is a win. - virtual bool convertSetCCLogicToBitwiseLogic(EVT VT) const { - return false; - } + virtual bool convertSetCCLogicToBitwiseLogic(EVT VT) const { return false; } /// Return the preferred operand type if the target has a quick way to compare /// integer values of the given size. Assume that any legal integer type can @@ -667,9 +646,7 @@ /// because a mask and compare of a single bit can be handled by inverting the /// predicate, for example: /// (X & 8) == 8 ---> (X & 8) != 0 - virtual bool hasAndNotCompare(SDValue Y) const { - return false; - } + virtual bool hasAndNotCompare(SDValue Y) const { return false; } /// Return true if the target has a bitwise and-not operation: /// X = ~A & B @@ -775,16 +752,12 @@ /// Return true if inserting a scalar into a variable element of an undef /// vector is more efficiently handled by splatting the scalar instead. - virtual bool shouldSplatInsEltVarIndex(EVT) const { - return false; - } + virtual bool shouldSplatInsEltVarIndex(EVT) const { return false; } /// Return true if target always beneficiates from combining into FMA for a /// given value type. This must typically return false on targets where FMA /// takes more cycles to execute than FADD. - virtual bool enableAggressiveFMAFusion(EVT VT) const { - return false; - } + virtual bool enableAggressiveFMAFusion(EVT VT) const { return false; } /// Return the ValueType of the result of SETCC operations. 
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, @@ -793,8 +766,7 @@ /// Return the ValueType for comparison libcalls. Comparions libcalls include /// floating point comparion calls, and Ordered/Unordered check calls on /// floating point numbers. - virtual - MVT::SimpleValueType getCmpLibcallReturnType() const; + virtual MVT::SimpleValueType getCmpLibcallReturnType() const; /// For targets without i1 registers, this gives the nature of the high-bits /// of boolean values held in types wider than i1. @@ -833,7 +805,8 @@ /// Return the register class that should be used for the specified value /// type. - virtual const TargetRegisterClass *getRegClassFor(MVT VT, bool isDivergent = false) const { + virtual const TargetRegisterClass * + getRegClassFor(MVT VT, bool isDivergent = false) const { (void)isDivergent; const TargetRegisterClass *RC = RegClassForVT[VT.SimpleTy]; assert(RC && "This value type is not natively supported!"); @@ -970,16 +943,16 @@ } struct IntrinsicInfo { - unsigned opc = 0; // target opcode - EVT memVT; // memory VT + unsigned opc = 0; // target opcode + EVT memVT; // memory VT // value representing memory location PointerUnion ptrVal; - int offset = 0; // offset off of ptrVal - uint64_t size = 0; // the size of the memory location - // (taken from memVT if zero) - MaybeAlign align = Align(1); // alignment + int offset = 0; // offset off of ptrVal + uint64_t size = 0; // the size of the memory location + // (taken from memVT if zero) + MaybeAlign align = Align(1); // alignment MachineMemOperand::Flags flags = MachineMemOperand::MONone; IntrinsicInfo() = default; @@ -1029,10 +1002,12 @@ /// be promoted to a larger size, needs to be expanded to some other code /// sequence, or the target has a custom expander for it. LegalizeAction getOperationAction(unsigned Op, EVT VT) const { - if (VT.isExtended()) return Expand; + if (VT.isExtended()) + return Expand; // If a target-specific SDNode requires legalization, require the target // to provide custom legalization for it. 
- if (Op >= array_lengthof(OpActions[0])) return Custom; + if (Op >= array_lengthof(OpActions[0])) + return Custom; return OpActions[(unsigned)VT.getSimpleVT().SimpleTy][Op]; } @@ -1080,11 +1055,16 @@ LegalizeAction getStrictFPOperationAction(unsigned Op, EVT VT) const { unsigned EqOpc; switch (Op) { - default: llvm_unreachable("Unexpected FP pseudo-opcode"); + default: + llvm_unreachable("Unexpected FP pseudo-opcode"); #define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ - case ISD::STRICT_##DAGN: EqOpc = ISD::DAGN; break; + case ISD::STRICT_##DAGN: \ + EqOpc = ISD::DAGN; \ + break; #define CMP_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ - case ISD::STRICT_##DAGN: EqOpc = ISD::SETCC; break; + case ISD::STRICT_##DAGN: \ + EqOpc = ISD::SETCC; \ + break; #include "llvm/IR/ConstrainedOps.def" } @@ -1101,8 +1081,8 @@ return isOperationLegal(Op, VT); return (VT == MVT::Other || isTypeLegal(VT)) && - (getOperationAction(Op, VT) == Legal || - getOperationAction(Op, VT) == Custom); + (getOperationAction(Op, VT) == Legal || + getOperationAction(Op, VT) == Custom); } /// Return true if the specified operation is legal on this target or can be @@ -1115,8 +1095,8 @@ return isOperationLegal(Op, VT); return (VT == MVT::Other || isTypeLegal(VT)) && - (getOperationAction(Op, VT) == Legal || - getOperationAction(Op, VT) == Promote); + (getOperationAction(Op, VT) == Legal || + getOperationAction(Op, VT) == Promote); } /// Return true if the specified operation is legal on this target or can be @@ -1129,9 +1109,9 @@ return isOperationLegal(Op, VT); return (VT == MVT::Other || isTypeLegal(VT)) && - (getOperationAction(Op, VT) == Legal || - getOperationAction(Op, VT) == Custom || - getOperationAction(Op, VT) == Promote); + (getOperationAction(Op, VT) == Legal || + getOperationAction(Op, VT) == Custom || + getOperationAction(Op, VT) == Promote); } /// Return true if the operation uses custom lowering, regardless of whether @@ -1209,9 +1189,10 @@ /// code sequence, or the target has a custom expander for it. LegalizeAction getLoadExtAction(unsigned ExtType, EVT ValVT, EVT MemVT) const { - if (ValVT.isExtended() || MemVT.isExtended()) return Expand; - unsigned ValI = (unsigned) ValVT.getSimpleVT().SimpleTy; - unsigned MemI = (unsigned) MemVT.getSimpleVT().SimpleTy; + if (ValVT.isExtended() || MemVT.isExtended()) + return Expand; + unsigned ValI = (unsigned)ValVT.getSimpleVT().SimpleTy; + unsigned MemI = (unsigned)MemVT.getSimpleVT().SimpleTy; assert(ExtType < ISD::LAST_LOADEXT_TYPE && ValI < MVT::LAST_VALUETYPE && MemI < MVT::LAST_VALUETYPE && "Table isn't big enough!"); unsigned Shift = 4 * ExtType; @@ -1234,9 +1215,10 @@ /// legal, needs to be promoted to a larger size, needs to be expanded to some /// other code sequence, or the target has a custom expander for it. LegalizeAction getTruncStoreAction(EVT ValVT, EVT MemVT) const { - if (ValVT.isExtended() || MemVT.isExtended()) return Expand; - unsigned ValI = (unsigned) ValVT.getSimpleVT().SimpleTy; - unsigned MemI = (unsigned) MemVT.getSimpleVT().SimpleTy; + if (ValVT.isExtended() || MemVT.isExtended()) + return Expand; + unsigned ValI = (unsigned)ValVT.getSimpleVT().SimpleTy; + unsigned MemI = (unsigned)MemVT.getSimpleVT().SimpleTy; assert(ValI < MVT::LAST_VALUETYPE && MemI < MVT::LAST_VALUETYPE && "Table isn't big enough!"); return TruncStoreActions[ValI][MemI]; @@ -1251,9 +1233,8 @@ /// Return true if the specified store with truncation has solution on this /// target. 
bool isTruncStoreLegalOrCustom(EVT ValVT, EVT MemVT) const { - return isTypeLegal(ValVT) && - (getTruncStoreAction(ValVT, MemVT) == Legal || - getTruncStoreAction(ValVT, MemVT) == Custom); + return isTypeLegal(ValVT) && (getTruncStoreAction(ValVT, MemVT) == Legal || + getTruncStoreAction(ValVT, MemVT) == Custom); } /// Return how the indexed load should be treated: either it is legal, needs @@ -1266,8 +1247,8 @@ /// Return true if the specified indexed load is legal on this target. bool isIndexedLoadLegal(unsigned IdxMode, EVT VT) const { return VT.isSimple() && - (getIndexedLoadAction(IdxMode, VT.getSimpleVT()) == Legal || - getIndexedLoadAction(IdxMode, VT.getSimpleVT()) == Custom); + (getIndexedLoadAction(IdxMode, VT.getSimpleVT()) == Legal || + getIndexedLoadAction(IdxMode, VT.getSimpleVT()) == Custom); } /// Return how the indexed store should be treated: either it is legal, needs @@ -1280,8 +1261,8 @@ /// Return true if the specified indexed load is legal on this target. bool isIndexedStoreLegal(unsigned IdxMode, EVT VT) const { return VT.isSimple() && - (getIndexedStoreAction(IdxMode, VT.getSimpleVT()) == Legal || - getIndexedStoreAction(IdxMode, VT.getSimpleVT()) == Custom); + (getIndexedStoreAction(IdxMode, VT.getSimpleVT()) == Legal || + getIndexedStoreAction(IdxMode, VT.getSimpleVT()) == Custom); } /// Return how the indexed load should be treated: either it is legal, needs @@ -1315,15 +1296,14 @@ /// Return how the condition code should be treated: either it is legal, needs /// to be expanded to some other code sequence, or the target has a custom /// expander for it. - LegalizeAction - getCondCodeAction(ISD::CondCode CC, MVT VT) const { + LegalizeAction getCondCodeAction(ISD::CondCode CC, MVT VT) const { assert((unsigned)CC < array_lengthof(CondCodeActions) && ((unsigned)VT.SimpleTy >> 3) < array_lengthof(CondCodeActions[0]) && "Table isn't big enough!"); // See setCondCodeAction for how this is encoded. uint32_t Shift = 4 * (VT.SimpleTy & 0x7); uint32_t Value = CondCodeActions[CC][VT.SimpleTy >> 3]; - LegalizeAction Action = (LegalizeAction) ((Value >> Shift) & 0xF); + LegalizeAction Action = (LegalizeAction)((Value >> Shift) & 0xF); assert(Action != Promote && "Can't promote condition code!"); return Action; } @@ -1349,19 +1329,19 @@ // See if this has an explicit type specified. std::map, MVT::SimpleValueType>::const_iterator PTTI = - PromoteToType.find(std::make_pair(Op, VT.SimpleTy)); - if (PTTI != PromoteToType.end()) return PTTI->second; + PromoteToType.find(std::make_pair(Op, VT.SimpleTy)); + if (PTTI != PromoteToType.end()) + return PTTI->second; assert((VT.isInteger() || VT.isFloatingPoint()) && "Cannot autopromote this type, add it with AddPromotedToType."); MVT NVT = VT; do { - NVT = (MVT::SimpleValueType)(NVT.SimpleTy+1); + NVT = (MVT::SimpleValueType)(NVT.SimpleTy + 1); assert(NVT.isInteger() == VT.isInteger() && NVT != MVT::isVoid && "Didn't find type to promote to!"); - } while (!isTypeLegal(NVT) || - getOperationAction(Op, NVT) == Promote); + } while (!isTypeLegal(NVT) || getOperationAction(Op, NVT) == Promote); return NVT; } @@ -1407,7 +1387,6 @@ return getValueType(DL, Ty, AllowUnknown); } - /// Return the MVT corresponding to this LLVM type. See getValueType. 
MVT getSimpleValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown = false) const { @@ -1429,15 +1408,15 @@ MVT getRegisterType(LLVMContext &Context, EVT VT) const { if (VT.isSimple()) { assert((unsigned)VT.getSimpleVT().SimpleTy < - array_lengthof(RegisterTypeForVT)); + array_lengthof(RegisterTypeForVT)); return RegisterTypeForVT[VT.getSimpleVT().SimpleTy]; } if (VT.isVector()) { EVT VT1; MVT RegisterVT; unsigned NumIntermediates; - (void)getVectorTypeBreakdown(Context, VT, VT1, - NumIntermediates, RegisterVT); + (void)getVectorTypeBreakdown(Context, VT, VT1, NumIntermediates, + RegisterVT); return RegisterVT; } if (VT.isInteger()) { @@ -1457,7 +1436,7 @@ unsigned getNumRegisters(LLVMContext &Context, EVT VT) const { if (VT.isSimple()) { assert((unsigned)VT.getSimpleVT().SimpleTy < - array_lengthof(NumRegistersForVT)); + array_lengthof(NumRegistersForVT)); return NumRegistersForVT[VT.getSimpleVT().SimpleTy]; } if (VT.isVector()) { @@ -1526,7 +1505,7 @@ /// perform for the specified node. bool hasTargetDAGCombine(ISD::NodeType NT) const { assert(unsigned(NT >> 3) < array_lengthof(TargetDAGCombineArray)); - return TargetDAGCombineArray[NT >> 3] & (1 << (NT&7)); + return TargetDAGCombineArray[NT >> 3] & (1 << (NT & 7)); } unsigned getGatherAllAliasesMaxDepth() const { @@ -1732,9 +1711,7 @@ /// Should loops be aligned even when the function is marked OptSize (but not /// MinSize). - virtual bool alignLoopsWithOptSize() const { - return false; - } + virtual bool alignLoopsWithOptSize() const { return false; } /// If the target has a standard location for the stack protector guard, /// returns the address of that location. Otherwise, returns nullptr. @@ -1946,9 +1923,7 @@ } /// Returns true if arguments should be extended in lib calls. - virtual bool shouldExtendTypeInLibCall(EVT Type) const { - return true; - } + virtual bool shouldExtendTypeInLibCall(EVT Type) const { return true; } /// Returns how the given (atomic) load should be expanded by the /// IR-level AtomicExpand pass. @@ -1965,9 +1940,10 @@ /// Returns how the IR-level AtomicExpand pass should expand the given /// AtomicRMW, if at all. Default is to never expand. - virtual AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { - return RMW->isFloatingPointOperation() ? - AtomicExpansionKind::CmpXChg : AtomicExpansionKind::None; + virtual AtomicExpansionKind + shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { + return RMW->isFloatingPointOperation() ? AtomicExpansionKind::CmpXChg + : AtomicExpansionKind::None; } /// On some platforms, an AtomicRMW that never actually modifies the value @@ -2021,7 +1997,7 @@ // registers. LegalizeTypeAction Action = getTypeAction(Context, VT); return Action != TypeExpandInteger && Action != TypeExpandFloat && - Action != TypeSplitVector; + Action != TypeSplitVector; } virtual bool isProfitableToCombineMinNumMaxNum(EVT VT) const { return true; } @@ -2029,17 +2005,15 @@ /// Return true if a select of constants (select Cond, C1, C2) should be /// transformed into simple math ops with the condition value. For example: /// select Cond, C1, C1-1 --> add (zext Cond), C1-1 - virtual bool convertSelectOfConstantsToMath(EVT VT) const { - return false; - } + virtual bool convertSelectOfConstantsToMath(EVT VT) const { return false; } /// Return true if it is profitable to transform an integer /// multiplication-by-constant into simpler operations like shifts and adds. 
/// This may be true if the target does not directly support the /// multiplication operation for the specified type or the sequence of simpler /// ops is faster than the multiply. - virtual bool decomposeMulByConstant(LLVMContext &Context, - EVT VT, SDValue C) const { + virtual bool decomposeMulByConstant(LLVMContext &Context, EVT VT, + SDValue C) const { return false; } @@ -2143,8 +2117,7 @@ /// Indicate that the specified operation does not work with the specified /// type and indicate what to do about it. Note that VT may refer to either /// the type of a result or that of an operand of Op. - void setOperationAction(unsigned Op, MVT VT, - LegalizeAction Action) { + void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action) { assert(Op < array_lengthof(OpActions[0]) && "Table isn't big enough!"); OpActions[(unsigned)VT.SimpleTy][Op] = Action; } @@ -2163,8 +2136,7 @@ /// Indicate that the specified truncating store does not work with the /// specified type and indicate what to do about it. - void setTruncStoreAction(MVT ValVT, MVT MemVT, - LegalizeAction Action) { + void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action) { assert(ValVT.isValid() && MemVT.isValid() && "Table isn't big enough!"); TruncStoreActions[(unsigned)ValVT.SimpleTy][MemVT.SimpleTy] = Action; } @@ -2209,8 +2181,7 @@ /// Indicate that the specified condition code is or isn't supported on the /// target and indicate what to do about it. - void setCondCodeAction(ISD::CondCode CC, MVT VT, - LegalizeAction Action) { + void setCondCodeAction(ISD::CondCode CC, MVT VT, LegalizeAction Action) { assert(VT.isValid() && (unsigned)CC < array_lengthof(CondCodeActions) && "Table isn't big enough!"); assert((unsigned)Action < 0x10 && "too many bits for bitfield array"); @@ -2242,7 +2213,7 @@ /// PerformDAGCombine virtual method. void setTargetDAGCombine(ISD::NodeType NT) { assert(unsigned(NT >> 3) < array_lengthof(TargetDAGCombineArray)); - TargetDAGCombineArray[NT >> 3] |= 1 << (NT&7); + TargetDAGCombineArray[NT >> 3] |= 1 << (NT & 7); } /// Set the target's minimum function alignment. @@ -2295,8 +2266,8 @@ /// targets also pass back when this should be done on intrinsics which /// load/store. virtual bool getAddrModeArguments(IntrinsicInst * /*I*/, - SmallVectorImpl &/*Ops*/, - Type *&/*AccessTy*/) const { + SmallVectorImpl & /*Ops*/, + Type *& /*AccessTy*/) const { return false; } @@ -2309,9 +2280,9 @@ /// no scale. struct AddrMode { GlobalValue *BaseGV = nullptr; - int64_t BaseOffs = 0; - bool HasBaseReg = false; - int64_t Scale = 0; + int64_t BaseOffs = 0; + bool HasBaseReg = false; + int64_t Scale = 0; AddrMode() = default; }; @@ -2347,16 +2318,12 @@ /// Return true if the specified immediate is legal icmp immediate, that is /// the target has icmp instructions which can compare a register against the /// immediate without having to materialize the immediate into a register. - virtual bool isLegalICmpImmediate(int64_t) const { - return true; - } + virtual bool isLegalICmpImmediate(int64_t) const { return true; } /// Return true if the specified immediate is legal add immediate, that is the /// target has add instructions which can add a register with the immediate /// without having to materialize the immediate into a register. - virtual bool isLegalAddImmediate(int64_t) const { - return true; - } + virtual bool isLegalAddImmediate(int64_t) const { return true; } /// Return true if the specified immediate is legal for the value input of a /// store instruction. 
@@ -2371,16 +2338,14 @@ /// AVX2 for example, there is a "psllw" instruction for the former case, but /// no simple instruction for a general "a << b" operation on vectors. /// This should also apply to lowering for vector funnel shifts (rotates). - virtual bool isVectorShiftByScalarCheap(Type *Ty) const { - return false; - } + virtual bool isVectorShiftByScalarCheap(Type *Ty) const { return false; } /// Given a shuffle vector SVI representing a vector splat, return a new /// scalar type of size equal to SVI's scalar type if the new type is more /// profitable. Returns nullptr otherwise. For example under MVE float splats /// are converted to integer to prevent the need to move from SPR to GPR /// registers. - virtual Type* shouldConvertSplatType(ShuffleVectorInst* SVI) const { + virtual Type *shouldConvertSplatType(ShuffleVectorInst *SVI) const { return nullptr; } @@ -2424,7 +2389,8 @@ case ISD::FMINIMUM: case ISD::FMAXIMUM: return true; - default: return false; + default: + return false; } } @@ -2456,9 +2422,7 @@ /// ToTy. e.g. On x86 it's free to truncate a i32 value in register EAX to i16 /// by referencing its sub-register AX. /// Targets must return false when FromTy <= ToTy. - virtual bool isTruncateFree(Type *FromTy, Type *ToTy) const { - return false; - } + virtual bool isTruncateFree(Type *FromTy, Type *ToTy) const { return false; } /// Return true if a truncation from FromTy to ToTy is permitted when deciding /// whether a call is in tail position. Typically this means that both results @@ -2469,9 +2433,7 @@ return false; } - virtual bool isTruncateFree(EVT FromVT, EVT ToVT) const { - return false; - } + virtual bool isTruncateFree(EVT FromVT, EVT ToVT) const { return false; } virtual bool isProfitableToHoist(Instruction *I) const { return true; } @@ -2542,13 +2504,9 @@ /// explicit truncate, which is not necessarily free, but this function /// does not deal with those cases. /// Targets must return false when FromTy >= ToTy. - virtual bool isZExtFree(Type *FromTy, Type *ToTy) const { - return false; - } + virtual bool isZExtFree(Type *FromTy, Type *ToTy) const { return false; } - virtual bool isZExtFree(EVT FromTy, EVT ToTy) const { - return false; - } + virtual bool isZExtFree(EVT FromTy, EVT ToTy) const { return false; } /// Return true if sign-extension from FromTy to ToTy is cheaper than /// zero-extension. @@ -2724,9 +2682,7 @@ /// Try to convert an extract element of a vector binary operation into an /// extract element followed by a scalar operation. - virtual bool shouldScalarizeBinop(SDValue VecOp) const { - return false; - } + virtual bool shouldScalarizeBinop(SDValue VecOp) const { return false; } /// Return true if extraction of a scalar element from the given vector type /// at the given index is cheap. For example, if scalar operations occur on @@ -2820,8 +2776,8 @@ // GlobalISel Hooks //===----------------------------------------------------------------------===// /// Check whether or not \p MI needs to be moved close to its uses. - virtual bool shouldLocalize(const MachineInstr &MI, const TargetTransformInfo *TTI) const; - + virtual bool shouldLocalize(const MachineInstr &MI, + const TargetTransformInfo *TTI) const; private: const TargetMachine &TM; @@ -2843,7 +2799,7 @@ /// instructions. For example, BypassSlowDivWidths[32,8] tells the code /// generator to bypass 32-bit integer div/rem with an 8-bit unsigned integer /// div/rem when the operands are positive and less than 256. 
- DenseMap BypassSlowDivWidths; + DenseMap BypassSlowDivWidths; /// Tells the code generator that it shouldn't generate extra flow control /// instructions and should attempt to combine flow control instructions via @@ -2963,7 +2919,7 @@ /// callbacks for by calling setTargetDAGCombine(), which sets a bit in this /// array. unsigned char - TargetDAGCombineArray[(ISD::BUILTIN_OP_END+CHAR_BIT-1)/CHAR_BIT]; + TargetDAGCombineArray[(ISD::BUILTIN_OP_END + CHAR_BIT - 1) / CHAR_BIT]; /// For operations that must be promoted to a specific type, this holds the /// destination type. This map should be sparse, so don't hold it as an @@ -2972,7 +2928,7 @@ /// Targets add entries to this map with AddPromotedToType(..), clients access /// this with getTypeToPromoteTo(..). std::map, MVT::SimpleValueType> - PromoteToType; + PromoteToType; /// Stores the name each libcall. const char *LibcallRoutineNames[RTLIB::UNKNOWN_LIBCALL + 1]; @@ -3142,17 +3098,15 @@ return false; } - virtual bool isSDNodeAlwaysUniform(const SDNode * N) const { - return false; - } + virtual bool isSDNodeAlwaysUniform(const SDNode *N) const { return false; } /// Returns true by value, base pointer and offset pointer and addressing mode /// by reference if the node's address can be legally represented as /// pre-indexed load / store address. - virtual bool getPreIndexedAddressParts(SDNode * /*N*/, SDValue &/*Base*/, - SDValue &/*Offset*/, - ISD::MemIndexedMode &/*AM*/, - SelectionDAG &/*DAG*/) const { + virtual bool getPreIndexedAddressParts(SDNode * /*N*/, SDValue & /*Base*/, + SDValue & /*Offset*/, + ISD::MemIndexedMode & /*AM*/, + SelectionDAG & /*DAG*/) const { return false; } @@ -3160,10 +3114,10 @@ /// by reference if this node can be combined with a load / store to form a /// post-indexed load / store. virtual bool getPostIndexedAddressParts(SDNode * /*N*/, SDNode * /*Op*/, - SDValue &/*Base*/, - SDValue &/*Offset*/, - ISD::MemIndexedMode &/*AM*/, - SelectionDAG &/*DAG*/) const { + SDValue & /*Base*/, + SDValue & /*Offset*/, + ISD::MemIndexedMode & /*AM*/, + SelectionDAG & /*DAG*/) const { return false; } @@ -3182,7 +3136,7 @@ virtual const MCExpr * LowerCustomJumpTableEntry(const MachineJumpTableInfo * /*MJTI*/, const MachineBasicBlock * /*MBB*/, unsigned /*uid*/, - MCContext &/*Ctx*/) const { + MCContext & /*Ctx*/) const { llvm_unreachable("Need to implement this hook if target has custom JTIs"); } @@ -3192,9 +3146,9 @@ /// This returns the relocation base for the given PIC jumptable, the same as /// getPICJumpTableRelocBase, but as an MCExpr. - virtual const MCExpr * - getPICJumpTableRelocBaseExpr(const MachineFunction *MF, - unsigned JTI, MCContext &Ctx) const; + virtual const MCExpr *getPICJumpTableRelocBaseExpr(const MachineFunction *MF, + unsigned JTI, + MCContext &Ctx) const; /// Return true if folding a constant offset with the given GlobalAddress is /// legal. It is frequently not legal in PIC relocation models. @@ -3226,9 +3180,9 @@ /// registers are the same as from the calling function. This needs to be /// checked for tail call eligibility. 
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, - const uint32_t *CallerPreservedMask, - const SmallVectorImpl &ArgLocs, - const SmallVectorImpl &OutVals) const; + const uint32_t *CallerPreservedMask, + const SmallVectorImpl &ArgLocs, + const SmallVectorImpl &OutVals) const; //===--------------------------------------------------------------------===// // TargetLowering Optimization Methods @@ -3244,9 +3198,8 @@ SDValue Old; SDValue New; - explicit TargetLoweringOpt(SelectionDAG &InDAG, - bool LT, bool LO) : - DAG(InDAG), LegalTys(LT), LegalOps(LO) {} + explicit TargetLoweringOpt(SelectionDAG &InDAG, bool LT, bool LO) + : DAG(InDAG), LegalTys(LT), LegalOps(LO) {} bool LegalTypes() const { return LegalTys; } bool LegalOperations() const { return LegalOps; } @@ -3258,9 +3211,9 @@ } }; - /// Determines the optimal series of memory ops to replace the memset / memcpy. - /// Return true if the number of memory ops is below the threshold (Limit). - /// It returns the types of the sequence of memory ops to perform + /// Determines the optimal series of memory ops to replace the memset / + /// memcpy. Return true if the number of memory ops is below the threshold + /// (Limit). It returns the types of the sequence of memory ops to perform /// memset / memcpy by reference. bool findOptimalMemOpLowering(std::vector &MemOps, unsigned Limit, const MemOp &Op, unsigned DstAS, unsigned SrcAS, @@ -3374,8 +3327,7 @@ /// or one and return them in the KnownZero/KnownOne bitsets. The DemandedElts /// argument allows us to only collect the known bits that are shared by the /// requested vector elements. - virtual void computeKnownBitsForTargetNode(const SDValue Op, - KnownBits &Known, + virtual void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth = 0) const; @@ -3402,8 +3354,7 @@ /// Determine which of the bits of FrameIndex \p FIOp are known to be 0. /// Default implementation computes low bits based on alignment /// information. This should preserve known bits passed into it. - virtual void computeKnownBitsForFrameIndex(int FIOp, - KnownBits &Known, + virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const; /// This method can be implemented by targets that want to expose additional @@ -3419,11 +3370,9 @@ /// information about sign bits to GlobalISel combiners. The DemandedElts /// argument allows us to only collect the minimum sign bits that are shared /// by the requested vector elements. - virtual unsigned computeNumSignBitsForTargetInstr(GISelKnownBits &Analysis, - Register R, - const APInt &DemandedElts, - const MachineRegisterInfo &MRI, - unsigned Depth = 0) const; + virtual unsigned computeNumSignBitsForTargetInstr( + GISelKnownBits &Analysis, Register R, const APInt &DemandedElts, + const MachineRegisterInfo &MRI, unsigned Depth = 0) const; /// Attempt to simplify any target nodes based on the demanded vector /// elements, returning true on success. Otherwise, analyze the expression and @@ -3439,12 +3388,9 @@ /// expression and return a mask of KnownOne and KnownZero bits for the /// expression (used to simplify the caller). The KnownZero/One bits may only /// be accurate for those bits in the Demanded masks. 
- virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, - const APInt &DemandedBits, - const APInt &DemandedElts, - KnownBits &Known, - TargetLoweringOpt &TLO, - unsigned Depth = 0) const; + virtual bool SimplifyDemandedBitsForTargetNode( + SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, + KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth = 0) const; /// More limited version of SimplifyDemandedBits that can be used to "look /// through" ops that don't contribute to the DemandedBits/DemandedElts - @@ -3468,20 +3414,19 @@ /// If \p SNaN is false, \returns true if \p Op is known to never be any /// NaN. If \p sNaN is true, returns if \p Op is known to never be a signaling /// NaN. - virtual bool isKnownNeverNaNForTargetNode(SDValue Op, - const SelectionDAG &DAG, + virtual bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN = false, unsigned Depth = 0) const; struct DAGCombinerInfo { - void *DC; // The DAG Combiner object. + void *DC; // The DAG Combiner object. CombineLevel Level; bool CalledByLegalizer; public: SelectionDAG &DAG; - DAGCombinerInfo(SelectionDAG &dag, CombineLevel level, bool cl, void *dc) - : DC(dc), Level(level), CalledByLegalizer(cl), DAG(dag) {} + DAGCombinerInfo(SelectionDAG &dag, CombineLevel level, bool cl, void *dc) + : DC(dc), Level(level), CalledByLegalizer(cl), DAG(dag) {} bool isBeforeLegalize() const { return Level == BeforeLegalizeTypes; } bool isBeforeLegalizeOps() const { return Level < AfterLegalizeVectorOps; } @@ -3521,8 +3466,8 @@ /// Returns true (and the GlobalValue and the offset) if the node is a /// GlobalAddress + offset. - virtual bool - isGAPlusOffset(SDNode *N, const GlobalValue* &GA, int64_t &Offset) const; + virtual bool isGAPlusOffset(SDNode *N, const GlobalValue *&GA, + int64_t &Offset) const; /// This method will be invoked for all target nodes and for any /// target-independent nodes that the target has registered with invoke it @@ -3572,21 +3517,17 @@ /// This method query the target whether it is beneficial for dag combiner to /// promote the specified node. If true, it should return the desired /// promotion type by reference. - virtual bool IsDesirableToPromoteOp(SDValue /*Op*/, EVT &/*PVT*/) const { + virtual bool IsDesirableToPromoteOp(SDValue /*Op*/, EVT & /*PVT*/) const { return false; } /// Return true if the target supports swifterror attribute. It optimizes /// loads and stores to reading and writing a specific register. - virtual bool supportSwiftError() const { - return false; - } + virtual bool supportSwiftError() const { return false; } /// Return true if the target supports that a subset of CSRs for the given /// machine function is handled explicitly via copies. - virtual bool supportSplitCSR(MachineFunction *MF) const { - return false; - } + virtual bool supportSplitCSR(MachineFunction *MF) const { return false; } /// Perform necessary initialization to handle a subset of CSRs explicitly /// via copies. 
This function is called at the beginning of instruction @@ -3678,16 +3619,16 @@ struct CallLoweringInfo { SDValue Chain; Type *RetTy = nullptr; - bool RetSExt : 1; - bool RetZExt : 1; - bool IsVarArg : 1; - bool IsInReg : 1; - bool DoesNotReturn : 1; + bool RetSExt : 1; + bool RetZExt : 1; + bool IsVarArg : 1; + bool IsInReg : 1; + bool DoesNotReturn : 1; bool IsReturnValueUsed : 1; - bool IsConvergent : 1; - bool IsPatchPoint : 1; + bool IsConvergent : 1; + bool IsPatchPoint : 1; bool IsPreallocated : 1; - bool NoMerge : 1; + bool NoMerge : 1; // IsTailCall should be modified by implementations of // TargetLowering::LowerCall that perform tail call conversions. @@ -3711,8 +3652,8 @@ CallLoweringInfo(SelectionDAG &DAG) : RetSExt(false), RetZExt(false), IsVarArg(false), IsInReg(false), DoesNotReturn(false), IsReturnValueUsed(true), IsConvergent(false), - IsPatchPoint(false), IsPreallocated(false), NoMerge(false), - DAG(DAG) {} + IsPatchPoint(false), IsPreallocated(false), NoMerge(false), DAG(DAG) { + } CallLoweringInfo &setDebugLoc(const SDLoc &dl) { DL = dl; @@ -3762,7 +3703,7 @@ RetSExt = Call.hasRetAttr(Attribute::SExt); RetZExt = Call.hasRetAttr(Attribute::ZExt); NoMerge = Call.hasFnAttr(Attribute::NoMerge); - + Callee = Target; CallConv = Call.getCallingConv(); @@ -3824,14 +3765,12 @@ return *this; } - CallLoweringInfo &setIsPostTypeLegalization(bool Value=true) { + CallLoweringInfo &setIsPostTypeLegalization(bool Value = true) { IsPostTypeLegalization = Value; return *this; } - ArgListTy &getArgs() { - return Args; - } + ArgListTy &getArgs() { return Args; } }; /// This structure is used to pass arguments to makeLibCall function. @@ -3890,9 +3829,8 @@ /// and the values to be returned by the call are described by the Ins /// array. The implementation should fill in the InVals array with legal-type /// return values from the call, and return the resulting token chain value. - virtual SDValue - LowerCall(CallLoweringInfo &/*CLI*/, - SmallVectorImpl &/*InVals*/) const { + virtual SDValue LowerCall(CallLoweringInfo & /*CLI*/, + SmallVectorImpl & /*InVals*/) const { llvm_unreachable("Not Implemented"); } @@ -3903,10 +3841,9 @@ /// described by the Outs array can fit into the return registers. If false /// is returned, an sret-demotion is performed. virtual bool CanLowerReturn(CallingConv::ID /*CallConv*/, - MachineFunction &/*MF*/, bool /*isVarArg*/, - const SmallVectorImpl &/*Outs*/, - LLVMContext &/*Context*/) const - { + MachineFunction & /*MF*/, bool /*isVarArg*/, + const SmallVectorImpl & /*Outs*/, + LLVMContext & /*Context*/) const { // Return true by default to get preexisting behavior. return true; } @@ -3928,27 +3865,25 @@ /// /// This is used to determine whether it is possible to codegen a libcall as /// tail call at legalization time. - virtual bool isUsedByReturnOnly(SDNode *, SDValue &/*Chain*/) const { + virtual bool isUsedByReturnOnly(SDNode *, SDValue & /*Chain*/) const { return false; } /// Return true if the target may be able emit the call instruction as a tail /// call. This is used by optimization passes to determine if it's profitable /// to duplicate return instructions to enable tailcall optimization. 
- virtual bool mayBeEmittedAsTailCall(const CallInst *) const { - return false; - } + virtual bool mayBeEmittedAsTailCall(const CallInst *) const { return false; } /// Return the builtin name for the __builtin___clear_cache intrinsic /// Default is to invoke the clear cache library call - virtual const char * getClearCacheBuiltinName() const { + virtual const char *getClearCacheBuiltinName() const { return "__clear_cache"; } /// Return the register ID of the name passed in. Used by named register /// global variables extension. There is no target-independent behaviour /// so the default action is to bail. - virtual Register getRegisterByName(const char* RegName, LLT Ty, + virtual Register getRegisterByName(const char *RegName, LLT Ty, const MachineFunction &MF) const { report_fatal_error("Named registers not implemented for this target"); } @@ -3960,7 +3895,7 @@ /// conventions. The frontend should handle this and include all of the /// necessary information. virtual EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, - ISD::NodeType /*ExtendKind*/) const { + ISD::NodeType /*ExtendKind*/) const { EVT MinVT = getRegisterType(Context, MVT::i32); return VT.bitsLT(MinVT) ? MinVT : VT; } @@ -4019,7 +3954,6 @@ return false; } - /// This callback is invoked by the type legalizer to legalize nodes with an /// illegal operand type but legal result types. It replaces the /// LowerOperation callback in the type Legalizer. The reason we can not do @@ -4054,8 +3988,8 @@ /// If the target has no operations that require custom lowering, it need not /// implement this. The default implementation aborts. virtual void ReplaceNodeResults(SDNode * /*N*/, - SmallVectorImpl &/*Results*/, - SelectionDAG &/*DAG*/) const { + SmallVectorImpl & /*Results*/, + SelectionDAG & /*DAG*/) const { llvm_unreachable("ReplaceNodeResults not implemented for this target!"); } @@ -4080,33 +4014,31 @@ /// llvm code if it wants to. This is useful for turning simple inline asms /// into LLVM intrinsics, which gives the compiler more information about the /// behavior of the code. - virtual bool ExpandInlineAsm(CallInst *) const { - return false; - } + virtual bool ExpandInlineAsm(CallInst *) const { return false; } enum ConstraintType { - C_Register, // Constraint represents specific register(s). - C_RegisterClass, // Constraint represents any of register(s) in class. - C_Memory, // Memory constraint. - C_Immediate, // Requires an immediate. - C_Other, // Something else. - C_Unknown // Unsupported constraint. + C_Register, // Constraint represents specific register(s). + C_RegisterClass, // Constraint represents any of register(s) in class. + C_Memory, // Memory constraint. + C_Immediate, // Requires an immediate. + C_Other, // Something else. + C_Unknown // Unsupported constraint. }; enum ConstraintWeight { // Generic weights. - CW_Invalid = -1, // No match. - CW_Okay = 0, // Acceptable. - CW_Good = 1, // Good weight. - CW_Better = 2, // Better weight. - CW_Best = 3, // Best weight. + CW_Invalid = -1, // No match. + CW_Okay = 0, // Acceptable. + CW_Good = 1, // Good weight. + CW_Better = 2, // Better weight. + CW_Best = 3, // Best weight. // Well-known weights. - CW_SpecificReg = CW_Okay, // Specific register operands. - CW_Register = CW_Good, // Register operands. - CW_Memory = CW_Better, // Memory operands. - CW_Constant = CW_Best, // Constant operand. - CW_Default = CW_Okay // Default or don't know type. + CW_SpecificReg = CW_Okay, // Specific register operands. + CW_Register = CW_Good, // Register operands. 
+ CW_Memory = CW_Better, // Memory operands. + CW_Constant = CW_Best, // Constant operand. + CW_Default = CW_Okay // Default or don't know type. }; /// This contains information for each constraint that we are lowering. @@ -4153,20 +4085,20 @@ /// Examine constraint type and operand type and determine a weight value. /// The operand object must already have been set up with the operand type. - virtual ConstraintWeight getMultipleConstraintMatchWeight( - AsmOperandInfo &info, int maIndex) const; + virtual ConstraintWeight + getMultipleConstraintMatchWeight(AsmOperandInfo &info, int maIndex) const; /// Examine constraint string and operand type and determine a weight value. /// The operand object must already have been set up with the operand type. - virtual ConstraintWeight getSingleConstraintMatchWeight( - AsmOperandInfo &info, const char *constraint) const; + virtual ConstraintWeight + getSingleConstraintMatchWeight(AsmOperandInfo &info, + const char *constraint) const; /// Determines the constraint code and constraint type to use for the specific /// AsmOperandInfo, setting OpInfo.ConstraintCode and OpInfo.ConstraintType. /// If the actual operand being passed in is available, it can be passed in as /// Op, otherwise an empty SDValue can be passed. - virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, - SDValue Op, + virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG = nullptr) const; /// Given a constraint, return the type of constraint it is for this target. @@ -4211,9 +4143,11 @@ //===--------------------------------------------------------------------===// // Div utility functions // - SDValue BuildSDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization, + SDValue BuildSDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterOpLegalization, + bool IsAfterTyLegalization, SmallVectorImpl &Created) const; - SDValue BuildUDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization, + SDValue BuildUDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterOpLegalization, + bool IsAfterTyLegalization, SmallVectorImpl &Created) const; /// Targets may override this function to provide custom SDIV lowering for @@ -4228,9 +4162,7 @@ /// divisor. If the transform should never be done, return zero. If the /// transform should be done, return the minimum number of divisor uses /// that must exist. - virtual unsigned combineRepeatedFPDivisors() const { - return 0; - } + virtual unsigned combineRepeatedFPDivisors() const { return 0; } /// Hooks for building estimates in place of slower divisions and square /// roots. @@ -4425,9 +4357,9 @@ /// method accepts integers as its arguments. /// Note: This method may fail if the division could not be performed /// within the type. Clients must retry with a wider type if this happens. - SDValue expandFixedPointDiv(unsigned Opcode, const SDLoc &dl, - SDValue LHS, SDValue RHS, - unsigned Scale, SelectionDAG &DAG) const; + SDValue expandFixedPointDiv(unsigned Opcode, const SDLoc &dl, SDValue LHS, + SDValue RHS, unsigned Scale, + SelectionDAG &DAG) const; /// Method for building the DAG expansion of ISD::U(ADD|SUB)O. Expansion /// always suceeds and populates the Result and Overflow arguments. @@ -4476,9 +4408,7 @@ /// If this function returns true, SelectionDAGBuilder emits a /// LOAD_STACK_GUARD node when it is lowering Intrinsic::stackprotector. 
- virtual bool useLoadStackGuardNode() const { - return false; - } + virtual bool useLoadStackGuardNode() const { return false; } virtual SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const { @@ -4491,7 +4421,8 @@ /// Expands target specific indirect branch for the case of JumpTable /// expanasion. - virtual SDValue expandIndirectJTBranch(const SDLoc& dl, SDValue Value, SDValue Addr, + virtual SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, + SDValue Addr, SelectionDAG &DAG) const { return DAG.getNode(ISD::BRIND, dl, MVT::Other, Value, Addr); } diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -21621,7 +21621,7 @@ return SDValue(); SmallVector Built; - if (SDValue S = TLI.BuildSDIV(N, DAG, LegalOperations, Built)) { + if (SDValue S = TLI.BuildSDIV(N, DAG, LegalOperations, LegalTypes, Built)) { for (SDNode *N : Built) AddToWorklist(N); return S; @@ -21662,7 +21662,7 @@ return SDValue(); SmallVector Built; - if (SDValue S = TLI.BuildUDIV(N, DAG, LegalOperations, Built)) { + if (SDValue S = TLI.BuildUDIV(N, DAG, LegalOperations, LegalTypes, Built)) { for (SDNode *N : Built) AddToWorklist(N); return S; diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -75,8 +75,8 @@ return isUsedByReturnOnly(Node, Chain); } -bool TargetLowering::parametersInCSRMatch(const MachineRegisterInfo &MRI, - const uint32_t *CallerPreservedMask, +bool TargetLowering::parametersInCSRMatch( + const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl &ArgLocs, const SmallVectorImpl &OutVals) const { for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) { @@ -126,12 +126,9 @@ /// Generate a libcall taking the given operands as arguments and returning a /// result of type RetVT. -std::pair -TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, - ArrayRef Ops, - MakeLibCallOptions CallOptions, - const SDLoc &dl, - SDValue InChain) const { +std::pair TargetLowering::makeLibCall( + SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef Ops, + MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue InChain) const { if (!InChain) InChain = DAG.getEntryNode(); @@ -143,8 +140,8 @@ SDValue NewOp = Ops[i]; Entry.Node = NewOp; Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext()); - Entry.IsSExt = shouldSignExtendTypeInLibCall(NewOp.getValueType(), - CallOptions.IsSExt); + Entry.IsSExt = + shouldSignExtendTypeInLibCall(NewOp.getValueType(), CallOptions.IsSExt); Entry.IsZExt = !Entry.IsSExt; if (CallOptions.IsSoften && @@ -274,8 +271,8 @@ /// SELECT_CC, and SETCC handlers. 
void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, - ISD::CondCode &CCCode, - const SDLoc &dl, const SDValue OldLHS, + ISD::CondCode &CCCode, const SDLoc &dl, + const SDValue OldLHS, const SDValue OldRHS) const { SDValue Chain; return softenSetCCOperands(DAG, VT, NewLHS, NewRHS, CCCode, dl, OldLHS, @@ -284,17 +281,17 @@ void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, - ISD::CondCode &CCCode, - const SDLoc &dl, const SDValue OldLHS, - const SDValue OldRHS, - SDValue &Chain, + ISD::CondCode &CCCode, const SDLoc &dl, + const SDValue OldLHS, + const SDValue OldRHS, SDValue &Chain, bool IsSignaling) const { // FIXME: Currently we cannot really respect all IEEE predicates due to libgcc // not supporting it. We can update this code when libgcc provides such // functions. - assert((VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f128 || VT == MVT::ppcf128) - && "Unsupported setcc type!"); + assert((VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f128 || + VT == MVT::ppcf128) && + "Unsupported setcc type!"); // Expand into one or more soft-fp libcall(s). RTLIB::Libcall LC1 = RTLIB::UNKNOWN_LIBCALL, LC2 = RTLIB::UNKNOWN_LIBCALL; @@ -302,85 +299,112 @@ switch (CCCode) { case ISD::SETEQ: case ISD::SETOEQ: - LC1 = (VT == MVT::f32) ? RTLIB::OEQ_F32 : - (VT == MVT::f64) ? RTLIB::OEQ_F64 : - (VT == MVT::f128) ? RTLIB::OEQ_F128 : RTLIB::OEQ_PPCF128; + LC1 = (VT == MVT::f32) + ? RTLIB::OEQ_F32 + : (VT == MVT::f64) + ? RTLIB::OEQ_F64 + : (VT == MVT::f128) ? RTLIB::OEQ_F128 : RTLIB::OEQ_PPCF128; break; case ISD::SETNE: case ISD::SETUNE: - LC1 = (VT == MVT::f32) ? RTLIB::UNE_F32 : - (VT == MVT::f64) ? RTLIB::UNE_F64 : - (VT == MVT::f128) ? RTLIB::UNE_F128 : RTLIB::UNE_PPCF128; + LC1 = (VT == MVT::f32) + ? RTLIB::UNE_F32 + : (VT == MVT::f64) + ? RTLIB::UNE_F64 + : (VT == MVT::f128) ? RTLIB::UNE_F128 : RTLIB::UNE_PPCF128; break; case ISD::SETGE: case ISD::SETOGE: - LC1 = (VT == MVT::f32) ? RTLIB::OGE_F32 : - (VT == MVT::f64) ? RTLIB::OGE_F64 : - (VT == MVT::f128) ? RTLIB::OGE_F128 : RTLIB::OGE_PPCF128; + LC1 = (VT == MVT::f32) + ? RTLIB::OGE_F32 + : (VT == MVT::f64) + ? RTLIB::OGE_F64 + : (VT == MVT::f128) ? RTLIB::OGE_F128 : RTLIB::OGE_PPCF128; break; case ISD::SETLT: case ISD::SETOLT: - LC1 = (VT == MVT::f32) ? RTLIB::OLT_F32 : - (VT == MVT::f64) ? RTLIB::OLT_F64 : - (VT == MVT::f128) ? RTLIB::OLT_F128 : RTLIB::OLT_PPCF128; + LC1 = (VT == MVT::f32) + ? RTLIB::OLT_F32 + : (VT == MVT::f64) + ? RTLIB::OLT_F64 + : (VT == MVT::f128) ? RTLIB::OLT_F128 : RTLIB::OLT_PPCF128; break; case ISD::SETLE: case ISD::SETOLE: - LC1 = (VT == MVT::f32) ? RTLIB::OLE_F32 : - (VT == MVT::f64) ? RTLIB::OLE_F64 : - (VT == MVT::f128) ? RTLIB::OLE_F128 : RTLIB::OLE_PPCF128; + LC1 = (VT == MVT::f32) + ? RTLIB::OLE_F32 + : (VT == MVT::f64) + ? RTLIB::OLE_F64 + : (VT == MVT::f128) ? RTLIB::OLE_F128 : RTLIB::OLE_PPCF128; break; case ISD::SETGT: case ISD::SETOGT: - LC1 = (VT == MVT::f32) ? RTLIB::OGT_F32 : - (VT == MVT::f64) ? RTLIB::OGT_F64 : - (VT == MVT::f128) ? RTLIB::OGT_F128 : RTLIB::OGT_PPCF128; + LC1 = (VT == MVT::f32) + ? RTLIB::OGT_F32 + : (VT == MVT::f64) + ? RTLIB::OGT_F64 + : (VT == MVT::f128) ? RTLIB::OGT_F128 : RTLIB::OGT_PPCF128; break; case ISD::SETO: ShouldInvertCC = true; LLVM_FALLTHROUGH; case ISD::SETUO: - LC1 = (VT == MVT::f32) ? RTLIB::UO_F32 : - (VT == MVT::f64) ? RTLIB::UO_F64 : - (VT == MVT::f128) ? RTLIB::UO_F128 : RTLIB::UO_PPCF128; + LC1 = (VT == MVT::f32) + ? RTLIB::UO_F32 + : (VT == MVT::f64) + ? 
RTLIB::UO_F64 + : (VT == MVT::f128) ? RTLIB::UO_F128 : RTLIB::UO_PPCF128; break; case ISD::SETONE: // SETONE = O && UNE ShouldInvertCC = true; LLVM_FALLTHROUGH; case ISD::SETUEQ: - LC1 = (VT == MVT::f32) ? RTLIB::UO_F32 : - (VT == MVT::f64) ? RTLIB::UO_F64 : - (VT == MVT::f128) ? RTLIB::UO_F128 : RTLIB::UO_PPCF128; - LC2 = (VT == MVT::f32) ? RTLIB::OEQ_F32 : - (VT == MVT::f64) ? RTLIB::OEQ_F64 : - (VT == MVT::f128) ? RTLIB::OEQ_F128 : RTLIB::OEQ_PPCF128; + LC1 = (VT == MVT::f32) + ? RTLIB::UO_F32 + : (VT == MVT::f64) + ? RTLIB::UO_F64 + : (VT == MVT::f128) ? RTLIB::UO_F128 : RTLIB::UO_PPCF128; + LC2 = (VT == MVT::f32) + ? RTLIB::OEQ_F32 + : (VT == MVT::f64) + ? RTLIB::OEQ_F64 + : (VT == MVT::f128) ? RTLIB::OEQ_F128 : RTLIB::OEQ_PPCF128; break; default: // Invert CC for unordered comparisons ShouldInvertCC = true; switch (CCCode) { case ISD::SETULT: - LC1 = (VT == MVT::f32) ? RTLIB::OGE_F32 : - (VT == MVT::f64) ? RTLIB::OGE_F64 : - (VT == MVT::f128) ? RTLIB::OGE_F128 : RTLIB::OGE_PPCF128; + LC1 = (VT == MVT::f32) + ? RTLIB::OGE_F32 + : (VT == MVT::f64) ? RTLIB::OGE_F64 + : (VT == MVT::f128) ? RTLIB::OGE_F128 + : RTLIB::OGE_PPCF128; break; case ISD::SETULE: - LC1 = (VT == MVT::f32) ? RTLIB::OGT_F32 : - (VT == MVT::f64) ? RTLIB::OGT_F64 : - (VT == MVT::f128) ? RTLIB::OGT_F128 : RTLIB::OGT_PPCF128; + LC1 = (VT == MVT::f32) + ? RTLIB::OGT_F32 + : (VT == MVT::f64) ? RTLIB::OGT_F64 + : (VT == MVT::f128) ? RTLIB::OGT_F128 + : RTLIB::OGT_PPCF128; break; case ISD::SETUGT: - LC1 = (VT == MVT::f32) ? RTLIB::OLE_F32 : - (VT == MVT::f64) ? RTLIB::OLE_F64 : - (VT == MVT::f128) ? RTLIB::OLE_F128 : RTLIB::OLE_PPCF128; + LC1 = (VT == MVT::f32) + ? RTLIB::OLE_F32 + : (VT == MVT::f64) ? RTLIB::OLE_F64 + : (VT == MVT::f128) ? RTLIB::OLE_F128 + : RTLIB::OLE_PPCF128; break; case ISD::SETUGE: - LC1 = (VT == MVT::f32) ? RTLIB::OLT_F32 : - (VT == MVT::f64) ? RTLIB::OLT_F64 : - (VT == MVT::f128) ? RTLIB::OLT_F128 : RTLIB::OLT_PPCF128; + LC1 = (VT == MVT::f32) + ? RTLIB::OLT_F32 + : (VT == MVT::f64) ? RTLIB::OLT_F64 + : (VT == MVT::f128) ? RTLIB::OLT_F128 + : RTLIB::OLT_PPCF128; break; - default: llvm_unreachable("Do not know how to soften this setcc!"); + default: + llvm_unreachable("Do not know how to soften this setcc!"); } } @@ -388,8 +412,7 @@ EVT RetVT = getCmpLibcallReturnType(); SDValue Ops[2] = {NewLHS, NewRHS}; TargetLowering::MakeLibCallOptions CallOptions; - EVT OpsVT[2] = { OldLHS.getValueType(), - OldRHS.getValueType() }; + EVT OpsVT[2] = {OldLHS.getValueType(), OldRHS.getValueType()}; CallOptions.setTypeListBeforeSoften(OpsVT, RetVT, true); auto Call = makeLibCall(DAG, LC1, RetVT, Ops, CallOptions, dl, Chain); NewLHS = Call.first; @@ -451,15 +474,13 @@ /// This returns the relocation base for the given PIC jumptable, the same as /// getPICJumpTableRelocBase, but as an MCExpr. -const MCExpr * -TargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF, - unsigned JTI,MCContext &Ctx) const{ +const MCExpr *TargetLowering::getPICJumpTableRelocBaseExpr( + const MachineFunction *MF, unsigned JTI, MCContext &Ctx) const { // The normal PIC reloc base is the label at the start of the jump table. 
return MCSymbolRefExpr::create(MF->getJTISymbol(JTI, Ctx), Ctx); } -bool -TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { +bool TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { const TargetMachine &TM = getTargetMachine(); const GlobalValue *GV = GA->getGlobal(); @@ -1330,7 +1351,7 @@ if (DemandedBits.isSubsetOf(Known.Zero | Known2.Zero)) return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::OR, dl, VT, Op0, Op1)); - ConstantSDNode* C = isConstOrConstSplat(Op1, DemandedElts); + ConstantSDNode *C = isConstOrConstSplat(Op1, DemandedElts); if (C) { // If one side is a constant, and all of the set bits in the constant are // also known set on the other side, turn this into an AND, as we know @@ -1346,8 +1367,7 @@ // If the RHS is a constant, see if we can change it. Don't alter a -1 // constant because that's a 'not' op, and that is better for combining // and codegen. - if (!C->isAllOnesValue() && - DemandedBits.isSubsetOf(C->getAPIntValue())) { + if (!C->isAllOnesValue() && DemandedBits.isSubsetOf(C->getAPIntValue())) { // We're flipping all demanded bits. Flip the undemanded bits too. SDValue New = TLO.DAG.getNOT(dl, Op0, VT); return TLO.CombineTo(Op, New); @@ -1704,8 +1724,8 @@ // For pow-2 bitwidths we only demand the bottom modulo amt bits. if (isPowerOf2_32(BitWidth)) { APInt DemandedAmtBits(Op2.getScalarValueSizeInBits(), BitWidth - 1); - if (SimplifyDemandedBits(Op2, DemandedAmtBits, DemandedElts, - Known2, TLO, Depth + 1)) + if (SimplifyDemandedBits(Op2, DemandedAmtBits, DemandedElts, Known2, TLO, + Depth + 1)) return true; } break; @@ -1753,8 +1773,8 @@ // op legalization. // FIXME: Limit to scalars for now. if (DemandedBits.isOneValue() && !TLO.LegalOps && !VT.isVector()) - return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::PARITY, dl, VT, - Op.getOperand(0))); + return TLO.CombineTo( + Op, TLO.DAG.getNode(ISD::PARITY, dl, VT, Op.getOperand(0))); Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth); break; @@ -2706,9 +2726,8 @@ // Update legal shuffle masks based on demanded elements if it won't reduce // to Identity which can cause premature removal of the shuffle mask. if (Updated && !IdentityLHS && !IdentityRHS && !TLO.LegalOps) { - SDValue LegalShuffle = - buildLegalVectorShuffle(VT, DL, Op.getOperand(0), Op.getOperand(1), - NewMask, TLO.DAG); + SDValue LegalShuffle = buildLegalVectorShuffle( + VT, DL, Op.getOperand(0), Op.getOperand(1), NewMask, TLO.DAG); if (LegalShuffle) return TLO.CombineTo(Op, LegalShuffle); } @@ -2912,14 +2931,14 @@ } void TargetLowering::computeKnownBitsForFrameIndex( - const int FrameIdx, KnownBits &Known, const MachineFunction &MF) const { + const int FrameIdx, KnownBits &Known, const MachineFunction &MF) const { // The low bits are known zero if the pointer is aligned. 
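The (X - Y) == Y --> X == Y << 1 fold in the setcc hunk above relies only on fixed-width wrapping arithmetic, where addition and subtraction are invertible modulo 2^N. A small self-checking sketch of the identity (function names are illustrative):

#include <cassert>
#include <cstdint>

// For wrapping 32-bit arithmetic, (X - Y) == Y is equivalent to
// X == (Y << 1): add Y to both sides; +/- are inverses modulo 2^32.
static bool beforeFold(uint32_t X, uint32_t Y) { return X - Y == Y; }
static bool afterFold(uint32_t X, uint32_t Y) { return X == Y << 1; }

int main() {
  // Spot checks, including a case where the subtraction wraps.
  assert(beforeFold(10, 5) == afterFold(10, 5));
  assert(beforeFold(2, 0x80000001u) == afterFold(2, 0x80000001u));
  assert(beforeFold(7, 3) == afterFold(7, 3));
  return 0;
}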
Known.Zero.setLowBits(Log2(MF.getFrameInfo().getObjectAlign(FrameIdx))); } Align TargetLowering::computeKnownAlignForTargetInstr( - GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, - unsigned Depth) const { + GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, + unsigned Depth) const { return Align(1); } @@ -2939,8 +2958,8 @@ } unsigned TargetLowering::computeNumSignBitsForTargetInstr( - GISelKnownBits &Analysis, Register R, const APInt &DemandedElts, - const MachineRegisterInfo &MRI, unsigned Depth) const { + GISelKnownBits &Analysis, Register R, const APInt &DemandedElts, + const MachineRegisterInfo &MRI, unsigned Depth) const { return 1; } @@ -2982,10 +3001,10 @@ return SDValue(); } -SDValue -TargetLowering::buildLegalVectorShuffle(EVT VT, const SDLoc &DL, SDValue N0, - SDValue N1, MutableArrayRef Mask, - SelectionDAG &DAG) const { +SDValue TargetLowering::buildLegalVectorShuffle(EVT VT, const SDLoc &DL, + SDValue N0, SDValue N1, + MutableArrayRef Mask, + SelectionDAG &DAG) const { bool LegalMask = isShuffleMaskLegal(Mask, VT); if (!LegalMask) { std::swap(N0, N1); @@ -2999,7 +3018,7 @@ return DAG.getVectorShuffle(VT, DL, N0, N1, Mask); } -const Constant *TargetLowering::getTargetConstantFromLoad(LoadSDNode*) const { +const Constant *TargetLowering::getTargetConstantFromLoad(LoadSDNode *) const { return nullptr; } @@ -3359,8 +3378,8 @@ return SDValue(); // (X - Y) == Y --> X == Y << 1 - EVT ShiftVT = getShiftAmountTy(OpVT, DAG.getDataLayout(), - !DCI.isBeforeLegalize()); + EVT ShiftVT = + getShiftAmountTy(OpVT, DAG.getDataLayout(), !DCI.isBeforeLegalize()); SDValue One = DAG.getConstant(1, DL, ShiftVT); SDValue YShl1 = DAG.getNode(ISD::SHL, DL, N1.getValueType(), Y, One); if (!DCI.isCalledByLegalizer()) @@ -3397,8 +3416,8 @@ if (!isConstOrConstSplat(N0) && !isConstOrConstSplat(N1) && (DCI.isBeforeLegalizeOps() || isCondCodeLegal(SwappedCC, N0.getSimpleValueType())) && - DAG.getNodeIfExists(ISD::SUB, DAG.getVTList(OpVT), { N1, N0 } ) && - !DAG.getNodeIfExists(ISD::SUB, DAG.getVTList(OpVT), { N0, N1 } )) + DAG.getNodeIfExists(ISD::SUB, DAG.getVTList(OpVT), {N1, N0}) && + !DAG.getNodeIfExists(ISD::SUB, DAG.getVTList(OpVT), {N0, N1})) return DAG.getSetCC(dl, VT, N1, N0, SwappedCC); if (auto *N1C = dyn_cast(N1.getNode())) { @@ -3423,8 +3442,7 @@ Cond = ISD::SETEQ; } SDValue Zero = DAG.getConstant(0, dl, N0.getValueType()); - return DAG.getSetCC(dl, VT, N0.getOperand(0).getOperand(0), - Zero, Cond); + return DAG.getSetCC(dl, VT, N0.getOperand(0).getOperand(0), Zero, Cond); } } @@ -3441,7 +3459,8 @@ // (ctpop x) u< 2 -> (x & x-1) == 0 // (ctpop x) u> 1 -> (x & x-1) != 0 - if ((Cond == ISD::SETULT && C1 == 2) || (Cond == ISD::SETUGT && C1 == 1)){ + if ((Cond == ISD::SETULT && C1 == 2) || + (Cond == ISD::SETUGT && C1 == 1)) { SDValue NegOne = DAG.getAllOnesConstant(dl, CTVT); SDValue Add = DAG.getNode(ISD::ADD, dl, CTVT, CTOp, NegOne); SDValue And = DAG.getNode(ISD::AND, dl, CTVT, CTOp, Add); @@ -3469,8 +3488,8 @@ // (zext x) == C --> x == (trunc C) // (sext x) == C --> x == (trunc C) - if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && - DCI.isBeforeLegalize() && N0->hasOneUse()) { + if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && DCI.isBeforeLegalize() && + N0->hasOneUse()) { unsigned MinBits = N0.getValueSizeInBits(); SDValue PreExt; bool Signed = false; @@ -3481,7 +3500,7 @@ } else if (N0->getOpcode() == ISD::AND) { // DAGCombine turns costly ZExts into ANDs if (auto *C = dyn_cast(N0->getOperand(1))) - if ((C->getAPIntValue()+1).isPowerOf2()) { + if 
((C->getAPIntValue() + 1).isPowerOf2()) { MinBits = C->getAPIntValue().countTrailingOnes(); PreExt = N0->getOperand(0); } @@ -3503,14 +3522,11 @@ } // Figure out how many bits we need to preserve this constant. - unsigned ReqdBits = Signed ? - C1.getBitWidth() - C1.getNumSignBits() + 1 : - C1.getActiveBits(); + unsigned ReqdBits = Signed ? C1.getBitWidth() - C1.getNumSignBits() + 1 + : C1.getActiveBits(); // Make sure we're not losing bits from the constant. - if (MinBits > 0 && - MinBits < C1.getBitWidth() && - MinBits >= ReqdBits) { + if (MinBits > 0 && MinBits < C1.getBitWidth() && MinBits >= ReqdBits) { EVT MinVT = EVT::getIntegerVT(*DAG.getContext(), MinBits); if (isTypeDesirableForOp(ISD::SETCC, MinVT)) { // Will get folded away. @@ -3550,8 +3566,7 @@ cast(TopSetCC.getOperand(2))->get(), TopSetCC.getOperand(0).getValueType()); return DAG.getSetCC(dl, VT, TopSetCC.getOperand(0), - TopSetCC.getOperand(1), - InvCond); + TopSetCC.getOperand(1), InvCond); } } } @@ -3559,10 +3574,8 @@ // If the LHS is '(and load, const)', the RHS is 0, the test is for // equality or unsigned, and all 1 bits of the const are in the same // partial word, see if we can shorten the load. - if (DCI.isBeforeLegalize() && - !ISD::isSignedIntSetCC(Cond) && - N0.getOpcode() == ISD::AND && C1 == 0 && - N0.getNode()->hasOneUse() && + if (DCI.isBeforeLegalize() && !ISD::isSignedIntSetCC(Cond) && + N0.getOpcode() == ISD::AND && C1 == 0 && N0.getNode()->hasOneUse() && isa(N0.getOperand(0)) && N0.getOperand(0).getNode()->hasOneUse() && isa(N0.getOperand(1))) { @@ -3577,15 +3590,15 @@ if (Lod->getExtensionType() != ISD::NON_EXTLOAD) origWidth = Lod->getMemoryVT().getSizeInBits(); const APInt &Mask = N0.getConstantOperandAPInt(1); - for (unsigned width = origWidth / 2; width>=8; width /= 2) { + for (unsigned width = origWidth / 2; width >= 8; width /= 2) { APInt newMask = APInt::getLowBitsSet(maskWidth, width); - for (unsigned offset=0; offsetgetChain(), Ptr, Lod->getPointerInfo().getWithOffset(bestOffset), Lod->getOriginalAlign()); - return DAG.getSetCC(dl, VT, - DAG.getNode(ISD::AND, dl, newVT, NewLoad, - DAG.getConstant(bestMask.trunc(bestWidth), - dl, newVT)), - DAG.getConstant(0LL, dl, newVT), Cond); + return DAG.getSetCC( + dl, VT, + DAG.getNode( + ISD::AND, dl, newVT, NewLoad, + DAG.getConstant(bestMask.trunc(bestWidth), dl, newVT)), + DAG.getConstant(0LL, dl, newVT), Cond); } } } @@ -3659,8 +3673,8 @@ EVT NewSetCCVT = getSetCCResultType(Layout, *DAG.getContext(), newVT); SDValue NewConst = DAG.getConstant(C1.trunc(InSize), dl, newVT); - SDValue NewSetCC = DAG.getSetCC(dl, NewSetCCVT, N0.getOperand(0), - NewConst, Cond); + SDValue NewSetCC = + DAG.getSetCC(dl, NewSetCCVT, N0.getOperand(0), NewConst, Cond); return DAG.getBoolExtOrTrunc(NewSetCC, dl, VT, N0.getValueType()); } break; @@ -3692,20 +3706,19 @@ if (!DCI.isCalledByLegalizer()) DCI.AddToWorklist(ZextOp.getNode()); // Otherwise, make this a use of a zext. 
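The narrow-load transform above ((and (load), const) == 0 with all mask bits inside one sub-word) replaces a wide load, mask and compare with a narrower load at the offset that holds the mask bits. A standalone sketch of the equivalence for a byte-sized mask; the offset shown assumes a little-endian layout, the case the code above adjusts separately for big-endian targets (names are illustrative):

#include <cassert>
#include <cstdint>
#include <cstring>

// Original shape: 32-bit load, AND with a mask, compare against zero.
static bool wideTest(const uint8_t *p) {
  uint32_t v;
  std::memcpy(&v, p, 4);
  return (v & 0x0000FF00u) == 0;
}

// Narrowed shape: all set mask bits live in byte 1 (little-endian), so an
// 8-bit load at offset 1 answers the same question.
static bool narrowTest(const uint8_t *p) { return p[1] == 0; }

int main() {
  uint8_t buf[4] = {0x12, 0x00, 0x56, 0x78};
  assert(wideTest(buf) == narrowTest(buf));
  buf[1] = 0x34;
  assert(wideTest(buf) == narrowTest(buf));
  return 0;
}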
- return DAG.getSetCC(dl, VT, ZextOp, - DAG.getConstant(C1 & APInt::getLowBitsSet( - ExtDstTyBits, - ExtSrcTyBits), - dl, ExtDstTy), - Cond); + return DAG.getSetCC( + dl, VT, ZextOp, + DAG.getConstant(C1 & APInt::getLowBitsSet(ExtDstTyBits, ExtSrcTyBits), + dl, ExtDstTy), + Cond); } else if ((N1C->isNullValue() || N1C->isOne()) && - (Cond == ISD::SETEQ || Cond == ISD::SETNE)) { + (Cond == ISD::SETEQ || Cond == ISD::SETNE)) { // SETCC (SETCC), [0|1], [EQ|NE] -> SETCC - if (N0.getOpcode() == ISD::SETCC && - isTypeLegal(VT) && VT.bitsLE(N0.getValueType()) && + if (N0.getOpcode() == ISD::SETCC && isTypeLegal(VT) && + VT.bitsLE(N0.getValueType()) && (N0.getValueType() == MVT::i1 || getBooleanContents(N0.getOperand(0).getValueType()) == - ZeroOrOneBooleanContent)) { + ZeroOrOneBooleanContent)) { bool TrueWhenTrue = (Cond == ISD::SETEQ) ^ (!N1C->isOne()); if (TrueWhenTrue) return DAG.getNode(ISD::TRUNCATE, dl, VT, N0); @@ -3726,20 +3739,18 @@ // If this is (X^1) == 0/1, swap the RHS and eliminate the xor. We // can only do this if the top bits are known zero. unsigned BitWidth = N0.getValueSizeInBits(); - if (DAG.MaskedValueIsZero(N0, - APInt::getHighBitsSet(BitWidth, - BitWidth-1))) { + if (DAG.MaskedValueIsZero( + N0, APInt::getHighBitsSet(BitWidth, BitWidth - 1))) { // Okay, get the un-inverted input value. SDValue Val; if (N0.getOpcode() == ISD::XOR) { Val = N0.getOperand(0); } else { assert(N0.getOpcode() == ISD::AND && - N0.getOperand(0).getOpcode() == ISD::XOR); + N0.getOperand(0).getOpcode() == ISD::XOR); // ((X^1)&1)^1 -> X & 1 Val = DAG.getNode(ISD::AND, dl, N0.getValueType(), - N0.getOperand(0).getOperand(0), - N0.getOperand(1)); + N0.getOperand(0).getOperand(0), N0.getOperand(1)); } return DAG.getSetCC(dl, VT, Val, N1, @@ -3758,9 +3769,9 @@ // Ensure that the input setccs return an i1 type or 0/1 value. if (Op0.getValueType() == MVT::i1 || (getBooleanContents(XorLHS.getOperand(0).getValueType()) == - ZeroOrOneBooleanContent && + ZeroOrOneBooleanContent && getBooleanContents(XorRHS.getOperand(0).getValueType()) == - ZeroOrOneBooleanContent)) { + ZeroOrOneBooleanContent)) { // (xor (setcc), (setcc)) == / != 1 -> (setcc) != / == (setcc) Cond = (Cond == ISD::SETEQ) ? ISD::SETNE : ISD::SETEQ; return DAG.getSetCC(dl, VT, XorLHS, XorRHS, Cond); @@ -3771,13 +3782,15 @@ cast(Op0.getOperand(1))->isOne()) { // If this is (X&1) == / != 1, normalize it to (X&1) != / == 0. 
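For targets with ZeroOrOneBooleanContent, the xor-of-setcc and (X&1) folds in the hunk above are ordinary truth-table identities. A tiny self-check over all 0/1 inputs:

#include <cassert>

// (a ^ b) == 1  <=>  a != b   and   (a ^ b) == 0  <=>  a == b
// for 0/1 booleans, which is why the fold flips EQ/NE instead of keeping
// the xor around.
int main() {
  for (int a = 0; a <= 1; ++a)
    for (int b = 0; b <= 1; ++b) {
      assert(((a ^ b) == 1) == (a != b));
      assert(((a ^ b) == 0) == (a == b));
    }
  return 0;
}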
if (Op0.getValueType().bitsGT(VT)) - Op0 = DAG.getNode(ISD::AND, dl, VT, - DAG.getNode(ISD::TRUNCATE, dl, VT, Op0.getOperand(0)), - DAG.getConstant(1, dl, VT)); + Op0 = DAG.getNode( + ISD::AND, dl, VT, + DAG.getNode(ISD::TRUNCATE, dl, VT, Op0.getOperand(0)), + DAG.getConstant(1, dl, VT)); else if (Op0.getValueType().bitsLT(VT)) - Op0 = DAG.getNode(ISD::AND, dl, VT, - DAG.getNode(ISD::ANY_EXTEND, dl, VT, Op0.getOperand(0)), - DAG.getConstant(1, dl, VT)); + Op0 = DAG.getNode( + ISD::AND, dl, VT, + DAG.getNode(ISD::ANY_EXTEND, dl, VT, Op0.getOperand(0)), + DAG.getConstant(1, dl, VT)); return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, Op0.getValueType()), @@ -3838,8 +3851,7 @@ (!N1C->isOpaque() || (C.getBitWidth() <= 64 && isLegalICmpImmediate(C.getSExtValue())))) { return DAG.getSetCC(dl, VT, N0, - DAG.getConstant(C, dl, N1.getValueType()), - NewCC); + DAG.getConstant(C, dl, N1.getValueType()), NewCC); } } } @@ -3858,8 +3870,7 @@ (!N1C->isOpaque() || (C.getBitWidth() <= 64 && isLegalICmpImmediate(C.getSExtValue())))) { return DAG.getSetCC(dl, VT, N0, - DAG.getConstant(C, dl, N1.getValueType()), - NewCC); + DAG.getConstant(C, dl, N1.getValueType()), NewCC); } } } @@ -3875,7 +3886,7 @@ return DAG.getSetCC(dl, VT, N0, N1, ISD::SETNE); // If we have setult X, 1, turn it into seteq X, 0 - if (C1 == MinVal+1) + if (C1 == MinVal + 1) return DAG.getSetCC(dl, VT, N0, DAG.getConstant(MinVal, dl, N0.getValueType()), ISD::SETEQ); @@ -3893,7 +3904,7 @@ return DAG.getSetCC(dl, VT, N0, N1, ISD::SETNE); // If we have setugt X, Max-1, turn it into seteq X, Max - if (C1 == MaxVal-1) + if (C1 == MaxVal - 1) return DAG.getSetCC(dl, VT, N0, DAG.getConstant(MaxVal, dl, N0.getValueType()), ISD::SETEQ); @@ -3916,9 +3927,8 @@ // SETUGE X, SINTMIN -> SETLT X, 0 if ((Cond == ISD::SETUGT && C1.isMaxSignedValue()) || (Cond == ISD::SETUGE && C1.isMinSignedValue())) - return DAG.getSetCC(dl, VT, N0, - DAG.getConstant(0, dl, N1.getValueType()), - ISD::SETLT); + return DAG.getSetCC( + dl, VT, N0, DAG.getConstant(0, dl, N1.getValueType()), ISD::SETLT); // SETULT X, SINTMIN -> SETGT X, -1 // SETULE X, SINTMAX -> SETGT X, -1 @@ -3944,7 +3954,7 @@ if (auto *AndRHS = dyn_cast(N0.getOperand(1))) { EVT ShiftTy = getShiftAmountTy(ShValTy, Layout, !DCI.isBeforeLegalize()); - if (Cond == ISD::SETNE && C1 == 0) {// (X & 8) != 0 --> (X & 8) >> 3 + if (Cond == ISD::SETNE && C1 == 0) { // (X & 8) != 0 --> (X & 8) >> 3 // Perform the xform if the AND RHS is a single bit. unsigned ShCt = AndRHS->getAPIntValue().logBase2(); if (AndRHS->getAPIntValue().isPowerOf2() && @@ -3979,8 +3989,8 @@ unsigned ShiftBits = AndRHSC.countTrailingZeros(); if (!TLI.shouldAvoidTransformToShift(ShValTy, ShiftBits)) { SDValue Shift = - DAG.getNode(ISD::SRL, dl, ShValTy, N0.getOperand(0), - DAG.getConstant(ShiftBits, dl, ShiftTy)); + DAG.getNode(ISD::SRL, dl, ShValTy, N0.getOperand(0), + DAG.getConstant(ShiftBits, dl, ShiftTy)); SDValue CmpRHS = DAG.getConstant(C1.lshr(ShiftBits), dl, ShValTy); return DAG.getSetCC(dl, VT, Shift, CmpRHS, Cond); } @@ -4047,11 +4057,20 @@ bool IsNegInf = CFP->getValueAPF().isNegative(); ISD::CondCode NewCond = ISD::SETCC_INVALID; switch (Cond) { - case ISD::SETOEQ: NewCond = IsNegInf ? ISD::SETOLE : ISD::SETOGE; break; - case ISD::SETUEQ: NewCond = IsNegInf ? ISD::SETULE : ISD::SETUGE; break; - case ISD::SETUNE: NewCond = IsNegInf ? ISD::SETUGT : ISD::SETULT; break; - case ISD::SETONE: NewCond = IsNegInf ? ISD::SETOGT : ISD::SETOLT; break; - default: break; + case ISD::SETOEQ: + NewCond = IsNegInf ? 
ISD::SETOLE : ISD::SETOGE; + break; + case ISD::SETUEQ: + NewCond = IsNegInf ? ISD::SETULE : ISD::SETUGE; + break; + case ISD::SETUNE: + NewCond = IsNegInf ? ISD::SETUGT : ISD::SETULT; + break; + case ISD::SETONE: + NewCond = IsNegInf ? ISD::SETOGT : ISD::SETOLT; + break; + default: + break; } if (NewCond != ISD::SETCC_INVALID && isCondCodeLegal(NewCond, N0.getSimpleValueType())) @@ -4075,8 +4094,7 @@ // Otherwise, we can't fold it. However, we can simplify it to SETUO/SETO // if it is not already. ISD::CondCode NewCond = UOF == 0 ? ISD::SETO : ISD::SETUO; - if (NewCond != Cond && - (DCI.isBeforeLegalizeOps() || + if (NewCond != Cond && (DCI.isBeforeLegalizeOps() || isCondCodeLegal(NewCond, N0.getSimpleValueType()))) return DAG.getSetCC(dl, VT, N0, N1, NewCond); } @@ -4110,10 +4128,11 @@ if (auto *LHSR = dyn_cast(N0.getOperand(1))) { // Turn (X+C1) == C2 --> X == C2-C1 if (N0.getOpcode() == ISD::ADD && N0.getNode()->hasOneUse()) { - return DAG.getSetCC(dl, VT, N0.getOperand(0), - DAG.getConstant(RHSC->getAPIntValue()- - LHSR->getAPIntValue(), - dl, N0.getValueType()), Cond); + return DAG.getSetCC( + dl, VT, N0.getOperand(0), + DAG.getConstant(RHSC->getAPIntValue() - LHSR->getAPIntValue(), + dl, N0.getValueType()), + Cond); } // Turn (X^C1) == C2 into X == C1^C2 iff X&~C1 = 0. @@ -4121,23 +4140,21 @@ // If we know that all of the inverted bits are zero, don't bother // performing the inversion. if (DAG.MaskedValueIsZero(N0.getOperand(0), ~LHSR->getAPIntValue())) - return - DAG.getSetCC(dl, VT, N0.getOperand(0), - DAG.getConstant(LHSR->getAPIntValue() ^ - RHSC->getAPIntValue(), - dl, N0.getValueType()), - Cond); + return DAG.getSetCC( + dl, VT, N0.getOperand(0), + DAG.getConstant(LHSR->getAPIntValue() ^ RHSC->getAPIntValue(), + dl, N0.getValueType()), + Cond); } // Turn (C1-X) == C2 --> X == C1-C2 if (auto *SUBC = dyn_cast(N0.getOperand(0))) { if (N0.getOpcode() == ISD::SUB && N0.getNode()->hasOneUse()) { - return - DAG.getSetCC(dl, VT, N0.getOperand(1), - DAG.getConstant(SUBC->getAPIntValue() - - RHSC->getAPIntValue(), - dl, N0.getValueType()), - Cond); + return DAG.getSetCC( + dl, VT, N0.getOperand(1), + DAG.getConstant(SUBC->getAPIntValue() - RHSC->getAPIntValue(), + dl, N0.getValueType()), + Cond); } } @@ -4186,14 +4203,15 @@ if (N0.getValueType().getScalarType() == MVT::i1 && foldBooleans) { SDValue Temp; switch (Cond) { - default: llvm_unreachable("Unknown integer setcc!"); - case ISD::SETEQ: // X == Y -> ~(X^Y) + default: + llvm_unreachable("Unknown integer setcc!"); + case ISD::SETEQ: // X == Y -> ~(X^Y) Temp = DAG.getNode(ISD::XOR, dl, OpVT, N0, N1); N0 = DAG.getNOT(dl, Temp, OpVT); if (!DCI.isCalledByLegalizer()) DCI.AddToWorklist(Temp.getNode()); break; - case ISD::SETNE: // X != Y --> (X^Y) + case ISD::SETNE: // X != Y --> (X^Y) N0 = DAG.getNode(ISD::XOR, dl, OpVT, N0, N1); break; case ISD::SETGT: // X >s Y --> X == 0 & Y == 1 --> ~X & Y @@ -4285,7 +4303,8 @@ if (S == 1) { switch (Constraint[0]) { - default: break; + default: + break; case 'r': return C_RegisterClass; case 'm': // memory @@ -4346,21 +4365,23 @@ std::vector &Ops, SelectionDAG &DAG) const { - if (Constraint.length() > 1) return; + if (Constraint.length() > 1) + return; char ConstraintLetter = Constraint[0]; switch (ConstraintLetter) { - default: break; - case 'X': // Allows any operand; labels (basic block) use this. + default: + break; + case 'X': // Allows any operand; labels (basic block) use this. 
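The constant-RHS rewrites above ((X + C1) == C2, (C1 - X) == C2, (X ^ C1) == C2) each move the constant to the other side using the inverse operation in modular arithmetic; the DAG only applies them under additional one-use and known-bits checks. A self-checking sketch of the underlying identities:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t C1 = 5, C2 = 12;
  for (uint32_t x : {0u, 1u, 7u, 0xFFFFFFFFu, 0x80000000u}) {
    assert(((x + C1) == C2) == (x == C2 - C1));   // (X + C1) == C2  ->  X == C2 - C1
    assert(((C1 - x) == C2) == (x == C1 - C2));   // (C1 - X) == C2  ->  X == C1 - C2
    assert(((x ^ C1) == C2) == (x == (C1 ^ C2))); // (X ^ C1) == C2  ->  X == C1 ^ C2
  }
  return 0;
}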
if (Op.getOpcode() == ISD::BasicBlock || Op.getOpcode() == ISD::TargetBlockAddress) { Ops.push_back(Op); return; } LLVM_FALLTHROUGH; - case 'i': // Simple Integer or Relocatable Constant - case 'n': // Simple Integer - case 's': { // Relocatable Constant + case 'i': // Simple Integer or Relocatable Constant + case 'n': // Simple Integer + case 's': { // Relocatable Constant GlobalAddressSDNode *GA; ConstantSDNode *C; @@ -4385,12 +4406,12 @@ // ScheduleDAGSDNodes::EmitNode, which is very generic. bool IsBool = C->getConstantIntValue()->getBitWidth() == 1; BooleanContent BCont = getBooleanContents(MVT::i64); - ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont) - : ISD::SIGN_EXTEND; - int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? C->getZExtValue() - : C->getSExtValue(); - Ops.push_back(DAG.getTargetConstant(Offset + ExtVal, - SDLoc(C), MVT::i64)); + ISD::NodeType ExtOpc = + IsBool ? getExtendForContent(BCont) : ISD::SIGN_EXTEND; + int64_t ExtVal = + ExtOpc == ISD::ZERO_EXTEND ? C->getZExtValue() : C->getSExtValue(); + Ops.push_back( + DAG.getTargetConstant(Offset + ExtVal, SDLoc(C), MVT::i64)); return; } else if ((BA = dyn_cast(Op)) && ConstraintLetter != 'n') { @@ -4441,8 +4462,8 @@ if (!isLegalRC(*RI, *RC)) continue; - for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); - I != E; ++I) { + for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); I != E; + ++I) { if (RegName.equals_lower(RI->getRegAsmName(*I))) { std::pair S = std::make_pair(*I, RC); @@ -4555,7 +4576,8 @@ if (!OpTy->isSingleValueType() && OpTy->isSized()) { unsigned BitSize = DL.getTypeSizeInBits(OpTy); switch (BitSize) { - default: break; + default: + break; case 1: case 8: case 16: @@ -4688,8 +4710,8 @@ /// This object must already have been set up with the operand type /// and the current alternative constraint selected. TargetLowering::ConstraintWeight - TargetLowering::getMultipleConstraintMatchWeight( - AsmOperandInfo &info, int maIndex) const { +TargetLowering::getMultipleConstraintMatchWeight(AsmOperandInfo &info, + int maIndex) const { InlineAsm::ConstraintCodeVector *rCodes; if (maIndex >= (int)info.multipleAlternatives.size()) rCodes = &info.Codes; @@ -4700,7 +4722,7 @@ // Loop over the options, keeping track of the most general one. for (unsigned i = 0, e = rCodes->size(); i != e; ++i) { ConstraintWeight weight = - getSingleConstraintMatchWeight(info, (*rCodes)[i].c_str()); + getSingleConstraintMatchWeight(info, (*rCodes)[i].c_str()); if (weight > BestWeight) BestWeight = weight; } @@ -4712,44 +4734,44 @@ /// This object must already have been set up with the operand type /// and the current alternative constraint selected. TargetLowering::ConstraintWeight - TargetLowering::getSingleConstraintMatchWeight( - AsmOperandInfo &info, const char *constraint) const { +TargetLowering::getSingleConstraintMatchWeight(AsmOperandInfo &info, + const char *constraint) const { ConstraintWeight weight = CW_Invalid; Value *CallOperandVal = info.CallOperandVal; - // If we don't have a value, we can't do a match, - // but allow it at the lowest weight. + // If we don't have a value, we can't do a match, + // but allow it at the lowest weight. if (!CallOperandVal) return CW_Default; // Look at the constraint type. switch (*constraint) { - case 'i': // immediate integer. - case 'n': // immediate integer with a known value. - if (isa(CallOperandVal)) - weight = CW_Constant; - break; - case 's': // non-explicit intregal immediate. 
- if (isa(CallOperandVal)) - weight = CW_Constant; - break; - case 'E': // immediate float if host format. - case 'F': // immediate float. - if (isa(CallOperandVal)) - weight = CW_Constant; - break; - case '<': // memory operand with autodecrement. - case '>': // memory operand with autoincrement. - case 'm': // memory operand. - case 'o': // offsettable memory operand - case 'V': // non-offsettable memory operand - weight = CW_Memory; - break; - case 'r': // general register. - case 'g': // general register, memory operand or immediate integer. - // note: Clang converts "g" to "imr". - if (CallOperandVal->getType()->isIntegerTy()) - weight = CW_Register; - break; - case 'X': // any operand. + case 'i': // immediate integer. + case 'n': // immediate integer with a known value. + if (isa(CallOperandVal)) + weight = CW_Constant; + break; + case 's': // non-explicit intregal immediate. + if (isa(CallOperandVal)) + weight = CW_Constant; + break; + case 'E': // immediate float if host format. + case 'F': // immediate float. + if (isa(CallOperandVal)) + weight = CW_Constant; + break; + case '<': // memory operand with autodecrement. + case '>': // memory operand with autoincrement. + case 'm': // memory operand. + case 'o': // offsettable memory operand + case 'V': // non-offsettable memory operand + weight = CW_Memory; + break; + case 'r': // general register. + case 'g': // general register, memory operand or immediate integer. + // note: Clang converts "g" to "imr". + if (CallOperandVal->getType()->isIntegerTy()) + weight = CW_Register; + break; + case 'X': // any operand. default: weight = CW_Default; break; @@ -4778,8 +4800,8 @@ /// 'm' over 'r', for example. /// static void ChooseConstraint(TargetLowering::AsmOperandInfo &OpInfo, - const TargetLowering &TLI, - SDValue Op, SelectionDAG *DAG) { + const TargetLowering &TLI, SDValue Op, + SelectionDAG *DAG) { assert(OpInfo.Codes.size() > 1 && "Doesn't have multiple constraint options"); unsigned BestIdx = 0; TargetLowering::ConstraintType BestType = TargetLowering::C_Unknown; @@ -4788,7 +4810,7 @@ // Loop over the options, keeping track of the most general one. for (unsigned i = 0, e = OpInfo.Codes.size(); i != e; ++i) { TargetLowering::ConstraintType CType = - TLI.getConstraintType(OpInfo.Codes[i]); + TLI.getConstraintType(OpInfo.Codes[i]); // Indirect 'other' or 'immediate' constraints are not allowed. if (OpInfo.isIndirect && !(CType == TargetLowering::C_Memory || @@ -4801,12 +4823,12 @@ // the operand is an integer in the range [0..31] we want to use I (saving a // load of a register), otherwise we must use 'r'. if ((CType == TargetLowering::C_Other || - CType == TargetLowering::C_Immediate) && Op.getNode()) { + CType == TargetLowering::C_Immediate) && + Op.getNode()) { assert(OpInfo.Codes[i].size() == 1 && "Unhandled multi-letter 'other' constraint"); std::vector ResultOps; - TLI.LowerAsmOperandForConstraint(Op, OpInfo.Codes[i], - ResultOps, *DAG); + TLI.LowerAsmOperandForConstraint(Op, OpInfo.Codes[i], ResultOps, *DAG); if (!ResultOps.empty()) { BestType = CType; BestIdx = i; @@ -4834,8 +4856,7 @@ /// Determines the constraint code and constraint type to use for the specific /// AsmOperandInfo, setting OpInfo.ConstraintCode and OpInfo.ConstraintType. 
-void TargetLowering::ComputeConstraintToUse(AsmOperandInfo &OpInfo, - SDValue Op, +void TargetLowering::ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG) const { assert(!OpInfo.Codes.empty() && "Must have at least one constraint"); @@ -4931,7 +4952,8 @@ return DAG.getNode(ISD::MUL, dl, VT, Res, Factor); } -SDValue TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, +SDValue +TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl &Created) const { AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); @@ -4946,20 +4968,16 @@ /// multiplying by a magic number. /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide". SDValue TargetLowering::BuildSDIV(SDNode *N, SelectionDAG &DAG, - bool IsAfterLegalization, + bool IsAfterOpLegalization, + bool IsAfterTyLegalization, SmallVectorImpl &Created) const { SDLoc dl(N); EVT VT = N->getValueType(0); EVT SVT = VT.getScalarType(); - EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout()); + EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout(), IsAfterTyLegalization); EVT ShSVT = ShVT.getScalarType(); unsigned EltBits = VT.getScalarSizeInBits(); - // Check to see if we can do this. - // FIXME: We should be more aggressive here. - if (!isTypeLegal(VT)) - return SDValue(); - // If the sdiv has an 'exact' bit we can use a simpler lowering. if (N->getFlags().hasExact()) return BuildExactSDIV(*this, N, dl, DAG, Created); @@ -5017,18 +5035,34 @@ } // Multiply the numerator (operand 0) by the magic value. - // FIXME: We should support doing a MUL in a wider type. SDValue Q; - if (IsAfterLegalization ? isOperationLegal(ISD::MULHS, VT) - : isOperationLegalOrCustom(ISD::MULHS, VT)) + if (IsAfterOpLegalization ? isOperationLegal(ISD::MULHS, VT) + : isOperationLegalOrCustom(ISD::MULHS, VT)) Q = DAG.getNode(ISD::MULHS, dl, VT, N0, MagicFactor); - else if (IsAfterLegalization ? isOperationLegal(ISD::SMUL_LOHI, VT) - : isOperationLegalOrCustom(ISD::SMUL_LOHI, VT)) { + else if (IsAfterOpLegalization + ? isOperationLegal(ISD::SMUL_LOHI, VT) + : isOperationLegalOrCustom(ISD::SMUL_LOHI, VT)) { SDValue LoHi = DAG.getNode(ISD::SMUL_LOHI, dl, DAG.getVTList(VT, VT), N0, MagicFactor); Q = SDValue(LoHi.getNode(), 1); - } else - return SDValue(); // No mulhs or equivalent. + } else { + if (IsAfterTyLegalization) + return SDValue(); + // No MULHU or UMUL_LOHI. Multiply in a wider integer and take the upper + // part. Even if targets do not support the other two instructions well, + // they are pretty good at supporting plain `MUL` at any width. + EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), EltBits * 2); + if (VT.isVector()) + WideVT = EVT::getVectorVT(*DAG.getContext(), WideVT, + VT.getVectorNumElements()); + SDValue DoubleMul = DAG.getNode( + ISD::MUL, dl, WideVT, DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, N0), + DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, MagicFactor)); + SDValue Upper = DAG.getNode(ISD::SRL, dl, WideVT, DoubleMul, + DAG.getConstant(EltBits, dl, WideVT)); + SDValue Hi = DAG.getNode(ISD::TRUNCATE, dl, VT, Upper); + Q = Hi; + } Created.push_back(Q.getNode()); // (Optionally) Add/subtract the numerator using Factor. @@ -5055,20 +5089,16 @@ /// multiplying by a magic number. /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide". 
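The new fallback added to BuildSDIV above (and to BuildUDIV below) synthesizes the missing multiply-high by sign- or zero-extending into a type twice as wide, multiplying, shifting right by the element width and truncating. A standalone sketch of that fallback plus one concrete magic-number division; 0xAAAAAAAB is the well-known magic constant for unsigned division by 3 (helper names are illustrative, not LLVM API):

#include <cassert>
#include <cstdint>

// MULHS/MULHU synthesized from a plain multiply in a 2x wider type, which is
// exactly what the new path emits as SIGN_EXTEND/ZERO_EXTEND + MUL + SRL +
// TRUNCATE.
static int32_t mulhs32(int32_t a, int32_t b) {
  return (int32_t)(((int64_t)a * (int64_t)b) >> 32);
}
static uint32_t mulhu32(uint32_t a, uint32_t b) {
  return (uint32_t)(((uint64_t)a * (uint64_t)b) >> 32);
}

// Magic-number division ("Hacker's Delight"): n / 3 for any uint32_t n,
// using the multiply-high above plus one extra shift.
static uint32_t udiv3(uint32_t n) {
  return mulhu32(n, 0xAAAAAAABu) >> 1;   // 0xAAAAAAAB = ceil(2^33 / 3)
}

int main() {
  for (uint32_t n : {0u, 1u, 2u, 3u, 100u, 0x80000000u, 0xFFFFFFFFu})
    assert(udiv3(n) == n / 3);
  assert(mulhs32(-2, 3) == -1);          // high half of -6 is all ones
  return 0;
}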
SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG, - bool IsAfterLegalization, + bool IsAfterOpLegalization, + bool IsAfterTyLegalization, SmallVectorImpl &Created) const { SDLoc dl(N); EVT VT = N->getValueType(0); EVT SVT = VT.getScalarType(); - EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout()); + EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout(), IsAfterTyLegalization); EVT ShSVT = ShVT.getScalarType(); unsigned EltBits = VT.getScalarSizeInBits(); - // Check to see if we can do this. - // FIXME: We should be more aggressive here. - if (!isTypeLegal(VT)) - return SDValue(); - bool UseNPQ = false; SmallVector PreShifts, PostShifts, MagicFactors, NPQFactors; @@ -5137,18 +5167,32 @@ Q = DAG.getNode(ISD::SRL, dl, VT, Q, PreShift); Created.push_back(Q.getNode()); - // FIXME: We should support doing a MUL in a wider type. auto GetMULHU = [&](SDValue X, SDValue Y) { - if (IsAfterLegalization ? isOperationLegal(ISD::MULHU, VT) - : isOperationLegalOrCustom(ISD::MULHU, VT)) + if (IsAfterOpLegalization ? isOperationLegal(ISD::MULHU, VT) + : isOperationLegalOrCustom(ISD::MULHU, VT)) return DAG.getNode(ISD::MULHU, dl, VT, X, Y); - if (IsAfterLegalization ? isOperationLegal(ISD::UMUL_LOHI, VT) - : isOperationLegalOrCustom(ISD::UMUL_LOHI, VT)) { + if (IsAfterOpLegalization ? isOperationLegal(ISD::UMUL_LOHI, VT) + : isOperationLegalOrCustom(ISD::UMUL_LOHI, VT)) { SDValue LoHi = DAG.getNode(ISD::UMUL_LOHI, dl, DAG.getVTList(VT, VT), X, Y); return SDValue(LoHi.getNode(), 1); } - return SDValue(); // No mulhu or equivalent + if (IsAfterTyLegalization) + return SDValue(); + // No MULHU or UMUL_LOHI. Multiply in a wider integer and take the upper + // part. Even if targets do not support the other two instructions well, + // they are pretty good at supporting plain `MUL` at any width. + EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), EltBits * 2); + if (VT.isVector()) + WideVT = EVT::getVectorVT(*DAG.getContext(), WideVT, + VT.getVectorNumElements()); + SDValue DoubleMul = DAG.getNode( + ISD::MUL, dl, WideVT, DAG.getNode(ISD::ZERO_EXTEND, dl, WideVT, X), + DAG.getNode(ISD::ZERO_EXTEND, dl, WideVT, Y)); + SDValue Upper = DAG.getNode(ISD::SRL, dl, WideVT, DoubleMul, + DAG.getConstant(EltBits, dl, WideVT)); + SDValue Hi = DAG.getNode(ISD::TRUNCATE, dl, VT, Upper); + return Hi; }; // Multiply the numerator (operand 0) by the magic value. @@ -5708,8 +5752,8 @@ return Blended; } -bool TargetLowering:: -verifyReturnAddressArgumentIsConstant(SDValue Op, SelectionDAG &DAG) const { +bool TargetLowering::verifyReturnAddressArgumentIsConstant( + SDValue Op, SelectionDAG &DAG) const { if (!isa(Op.getOperand(0))) { DAG.getContext()->emitError("argument to '__builtin_return_address' must " "be a constant integer"); @@ -6387,8 +6431,7 @@ } bool TargetLowering::expandFP_TO_UINT(SDNode *Node, SDValue &Result, - SDValue &Chain, - SelectionDAG &DAG) const { + SDValue &Chain, SelectionDAG &DAG) const { SDLoc dl(SDValue(Node, 0)); unsigned OpNo = Node->isStrictFPOpcode() ? 1 : 0; SDValue Src = Node->getOperand(OpNo); @@ -6401,8 +6444,8 @@ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), DstVT); // Only expand vector types if we have the appropriate vector bit operations. - unsigned SIntOpcode = Node->isStrictFPOpcode() ? ISD::STRICT_FP_TO_SINT : - ISD::FP_TO_SINT; + unsigned SIntOpcode = + Node->isStrictFPOpcode() ? 
ISD::STRICT_FP_TO_SINT : ISD::FP_TO_SINT; if (DstVT.isVector() && (!isOperationLegalOrCustom(SIntOpcode, DstVT) || !isOperationLegalOrCustomOrPromote(ISD::XOR, SrcVT))) return false; @@ -6416,8 +6459,8 @@ if (APFloat::opOverflow & APF.convertFromAPInt(SignMask, false, APFloat::rmNearestTiesToEven)) { if (Node->isStrictFPOpcode()) { - Result = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { DstVT, MVT::Other }, - { Node->getOperand(0), Src }); + Result = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {DstVT, MVT::Other}, + {Node->getOperand(0), Src}); Chain = Result.getValue(1); } else Result = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, Src); @@ -6447,18 +6490,18 @@ // Result = fp_to_sint(Src - FltOfs) ^ IntOfs // TODO: Should any fast-math-flags be set for the FSUB? - SDValue FltOfs = DAG.getSelect(dl, SrcVT, Sel, - DAG.getConstantFP(0.0, dl, SrcVT), Cst); + SDValue FltOfs = + DAG.getSelect(dl, SrcVT, Sel, DAG.getConstantFP(0.0, dl, SrcVT), Cst); Sel = DAG.getBoolExtOrTrunc(Sel, dl, DstSetCCVT, DstVT); - SDValue IntOfs = DAG.getSelect(dl, DstVT, Sel, - DAG.getConstant(0, dl, DstVT), - DAG.getConstant(SignMask, dl, DstVT)); + SDValue IntOfs = + DAG.getSelect(dl, DstVT, Sel, DAG.getConstant(0, dl, DstVT), + DAG.getConstant(SignMask, dl, DstVT)); SDValue SInt; if (Node->isStrictFPOpcode()) { - SDValue Val = DAG.getNode(ISD::STRICT_FSUB, dl, { SrcVT, MVT::Other }, - { Chain, Src, FltOfs }); - SInt = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { DstVT, MVT::Other }, - { Val.getValue(1), Val }); + SDValue Val = DAG.getNode(ISD::STRICT_FSUB, dl, {SrcVT, MVT::Other}, + {Chain, Src, FltOfs}); + SInt = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {DstVT, MVT::Other}, + {Val.getValue(1), Val}); Chain = SInt.getValue(1); } else { SDValue Val = DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FltOfs); @@ -6484,8 +6527,7 @@ } bool TargetLowering::expandUINT_TO_FP(SDNode *Node, SDValue &Result, - SDValue &Chain, - SelectionDAG &DAG) const { + SDValue &Chain, SelectionDAG &DAG) const { unsigned OpNo = Node->isStrictFPOpcode() ? 1 : 0; SDValue Src = Node->getOperand(OpNo); EVT SrcVT = Src.getValueType(); @@ -6510,8 +6552,8 @@ // of performing rounding correctly, both in the default rounding mode // and in all alternate rounding modes. 
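The comment closing the hunk above refers to the classic i64 -> f64 expansion used by expandUINT_TO_FP: split the input into 32-bit halves, bit-OR each half into a double with a fixed exponent (2^52 and 2^84), subtract the constant 2^84 + 2^52 exactly, and let the final add perform the only rounding step. A standalone sketch using the same bit patterns that appear in the code that follows (function names are illustrative):

#include <cassert>
#include <cstdint>
#include <cstring>

static double bitsToDouble(uint64_t bits) {
  double d;
  std::memcpy(&d, &bits, sizeof(d));
  return d;
}

// Lo is encoded as 2^52 + lo, Hi as 2^84 + hi*2^32; the FSUB is exact and
// the final FADD rounds the mathematically exact sum once, so the result is
// correct in every rounding mode.
static double u64ToF64(uint64_t x) {
  double Lo = bitsToDouble((x & 0xFFFFFFFFu) | 0x4330000000000000ULL); // 2^52 + lo
  double Hi = bitsToDouble((x >> 32)         | 0x4530000000000000ULL); // 2^84 + hi*2^32
  double TwoP84PlusTwoP52 = bitsToDouble(0x4530000000100000ULL);
  return (Hi - TwoP84PlusTwoP52) + Lo;
}

int main() {
  for (uint64_t x : {0ULL, 1ULL, 0xFFFFFFFFULL, 0x123456789ABCDEFULL,
                     0xFFFFFFFFFFFFFFFFULL})
    assert(u64ToF64(x) == (double)x);
  return 0;
}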
SDValue TwoP52 = DAG.getConstant(UINT64_C(0x4330000000000000), dl, SrcVT); - SDValue TwoP84PlusTwoP52 = DAG.getConstantFP( - BitsToDouble(UINT64_C(0x4530000000100000)), dl, DstVT); + SDValue TwoP84PlusTwoP52 = + DAG.getConstantFP(BitsToDouble(UINT64_C(0x4530000000100000)), dl, DstVT); SDValue TwoP84 = DAG.getConstant(UINT64_C(0x4530000000000000), dl, SrcVT); SDValue LoMask = DAG.getConstant(UINT64_C(0x00000000FFFFFFFF), dl, SrcVT); SDValue HiShift = DAG.getConstant(32, dl, ShiftVT); @@ -6523,15 +6565,13 @@ SDValue LoFlt = DAG.getBitcast(DstVT, LoOr); SDValue HiFlt = DAG.getBitcast(DstVT, HiOr); if (Node->isStrictFPOpcode()) { - SDValue HiSub = - DAG.getNode(ISD::STRICT_FSUB, dl, {DstVT, MVT::Other}, - {Node->getOperand(0), HiFlt, TwoP84PlusTwoP52}); + SDValue HiSub = DAG.getNode(ISD::STRICT_FSUB, dl, {DstVT, MVT::Other}, + {Node->getOperand(0), HiFlt, TwoP84PlusTwoP52}); Result = DAG.getNode(ISD::STRICT_FADD, dl, {DstVT, MVT::Other}, {HiSub.getValue(1), LoFlt, HiSub}); Chain = Result.getValue(1); } else { - SDValue HiSub = - DAG.getNode(ISD::FSUB, dl, DstVT, HiFlt, TwoP84PlusTwoP52); + SDValue HiSub = DAG.getNode(ISD::FSUB, dl, DstVT, HiFlt, TwoP84PlusTwoP52); Result = DAG.getNode(ISD::FADD, dl, DstVT, LoFlt, HiSub); } return true; @@ -6540,8 +6580,8 @@ SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node, SelectionDAG &DAG) const { SDLoc dl(Node); - unsigned NewOp = Node->getOpcode() == ISD::FMINNUM ? - ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE; + unsigned NewOp = + Node->getOpcode() == ISD::FMINNUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE; EVT VT = Node->getValueType(0); if (isOperationLegalOrCustom(NewOp, VT)) { SDValue Quiet0 = Node->getOperand(0); @@ -6551,12 +6591,12 @@ // Insert canonicalizes if it's possible we need to quiet to get correct // sNaN behavior. if (!DAG.isKnownNeverSNaN(Quiet0)) { - Quiet0 = DAG.getNode(ISD::FCANONICALIZE, dl, VT, Quiet0, - Node->getFlags()); + Quiet0 = + DAG.getNode(ISD::FCANONICALIZE, dl, VT, Quiet0, Node->getFlags()); } if (!DAG.isKnownNeverSNaN(Quiet1)) { - Quiet1 = DAG.getNode(ISD::FCANONICALIZE, dl, VT, Quiet1, - Node->getFlags()); + Quiet1 = + DAG.getNode(ISD::FCANONICALIZE, dl, VT, Quiet1, Node->getFlags()); } } @@ -6787,8 +6827,7 @@ } std::pair -TargetLowering::scalarizeVectorLoad(LoadSDNode *LD, - SelectionDAG &DAG) const { +TargetLowering::scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const { SDLoc SL(LD); SDValue Chain = LD->getChain(); SDValue BasePTR = LD->getBasePtr(); @@ -6968,20 +7007,19 @@ if (VT.isFloatingPoint() || VT.isVector()) { EVT intVT = EVT::getIntegerVT(*DAG.getContext(), LoadedVT.getSizeInBits()); if (isTypeLegal(intVT) && isTypeLegal(LoadedVT)) { - if (!isOperationLegalOrCustom(ISD::LOAD, intVT) && - LoadedVT.isVector()) { + if (!isOperationLegalOrCustom(ISD::LOAD, intVT) && LoadedVT.isVector()) { // Scalarize the load and let the individual components be handled. return scalarizeVectorLoad(LD, DAG); } // Expand to a (misaligned) integer load of the same size, // then bitconvert to floating point or vector. - SDValue newLoad = DAG.getLoad(intVT, dl, Chain, Ptr, - LD->getMemOperand()); + SDValue newLoad = DAG.getLoad(intVT, dl, Chain, Ptr, LD->getMemOperand()); SDValue Result = DAG.getNode(ISD::BITCAST, dl, LoadedVT, newLoad); if (LoadedVT != VT) - Result = DAG.getNode(VT.isFloatingPoint() ? ISD::FP_EXTEND : - ISD::ANY_EXTEND, dl, VT, Result); + Result = + DAG.getNode(VT.isFloatingPoint() ? 
ISD::FP_EXTEND : ISD::ANY_EXTEND, + dl, VT, Result); return std::make_pair(Result, newLoad.getValue(1)); } @@ -7025,8 +7063,8 @@ } // The last copy may be partial. Do an extending load. - EVT MemVT = EVT::getIntegerVT(*DAG.getContext(), - 8 * (LoadedBytes - Offset)); + EVT MemVT = + EVT::getIntegerVT(*DAG.getContext(), 8 * (LoadedBytes - Offset)); SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, RegVT, Chain, Ptr, LD->getPointerInfo().getWithOffset(Offset), MemVT, @@ -7058,7 +7096,7 @@ // integer MVT. unsigned NumBits = LoadedVT.getSizeInBits(); EVT NewLoadedVT; - NewLoadedVT = EVT::getIntegerVT(*DAG.getContext(), NumBits/2); + NewLoadedVT = EVT::getIntegerVT(*DAG.getContext(), NumBits / 2); NumBits >>= 1; Align Alignment = LD->getOriginalAlign(); @@ -7094,14 +7132,13 @@ } // aggregate the two parts - SDValue ShiftAmount = - DAG.getConstant(NumBits, dl, getShiftAmountTy(Hi.getValueType(), - DAG.getDataLayout())); + SDValue ShiftAmount = DAG.getConstant( + NumBits, dl, getShiftAmountTy(Hi.getValueType(), DAG.getDataLayout())); SDValue Result = DAG.getNode(ISD::SHL, dl, VT, Hi, ShiftAmount); Result = DAG.getNode(ISD::OR, dl, VT, Result, Lo); SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), - Hi.getValue(1)); + Hi.getValue(1)); return std::make_pair(Result, TF); } @@ -7231,11 +7268,10 @@ return Result; } -SDValue -TargetLowering::IncrementMemoryAddress(SDValue Addr, SDValue Mask, - const SDLoc &DL, EVT DataVT, - SelectionDAG &DAG, - bool IsCompressedMemory) const { +SDValue TargetLowering::IncrementMemoryAddress(SDValue Addr, SDValue Mask, + const SDLoc &DL, EVT DataVT, + SelectionDAG &DAG, + bool IsCompressedMemory) const { SDValue Increment; EVT AddrVT = Addr.getValueType(); EVT MaskVT = Mask.getValueType(); @@ -7246,7 +7282,8 @@ report_fatal_error( "Cannot currently handle compressed memory with scalable vectors"); // Incrementing the pointer according to number of '1's in the mask. - EVT MaskIntVT = EVT::getIntegerVT(*DAG.getContext(), MaskVT.getSizeInBits()); + EVT MaskIntVT = + EVT::getIntegerVT(*DAG.getContext(), MaskVT.getSizeInBits()); SDValue MaskInIntReg = DAG.getBitcast(MaskIntVT, Mask); if (MaskIntVT.getSizeInBits() < 32) { MaskInIntReg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, MaskInIntReg); @@ -7257,8 +7294,8 @@ Increment = DAG.getNode(ISD::CTPOP, DL, MaskIntVT, MaskInIntReg); Increment = DAG.getZExtOrTrunc(Increment, DL, AddrVT); // Scale is an element size in bytes. 
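For the compressed-memory case in IncrementMemoryAddress above, the pointer advances by popcount(mask) scaled by the element size in bytes, which is what the CTPOP/MUL sequence computes. A one-function sketch, with C++20 std::popcount standing in for the CTPOP node (names are illustrative):

#include <bit>
#include <cassert>
#include <cstdint>

// After a compressing store, only the active lanes were written, so the
// next address is Addr + (set bits in Mask) * (element size in bytes).
static uint8_t *advanceCompressed(uint8_t *Addr, uint32_t Mask,
                                  unsigned EltBytes) {
  return Addr + std::popcount(Mask) * EltBytes;
}

int main() {
  uint8_t Buf[64];
  // 4-lane mask 0b1011 with 4-byte elements: three lanes stored, 12 bytes.
  assert(advanceCompressed(Buf, 0b1011u, 4) == Buf + 12);
  return 0;
}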
- SDValue Scale = DAG.getConstant(DataVT.getScalarSizeInBits() / 8, DL, - AddrVT); + SDValue Scale = + DAG.getConstant(DataVT.getScalarSizeInBits() / 8, DL, AddrVT); Increment = DAG.getNode(ISD::MUL, DL, AddrVT, Increment, Scale); } else if (DataVT.isScalableVector()) { Increment = DAG.getVScale(DL, AddrVT, @@ -7270,27 +7307,23 @@ return DAG.getNode(ISD::ADD, DL, AddrVT, Addr, Increment); } -static SDValue clampDynamicVectorIndex(SelectionDAG &DAG, - SDValue Idx, - EVT VecVT, - const SDLoc &dl) { +static SDValue clampDynamicVectorIndex(SelectionDAG &DAG, SDValue Idx, + EVT VecVT, const SDLoc &dl) { if (!VecVT.isScalableVector() && isa(Idx)) return Idx; EVT IdxVT = Idx.getValueType(); unsigned NElts = VecVT.getVectorMinNumElements(); if (VecVT.isScalableVector()) { - SDValue VS = DAG.getVScale(dl, IdxVT, - APInt(IdxVT.getSizeInBits().getFixedSize(), - NElts)); - SDValue Sub = DAG.getNode(ISD::SUB, dl, IdxVT, VS, - DAG.getConstant(1, dl, IdxVT)); + SDValue VS = DAG.getVScale( + dl, IdxVT, APInt(IdxVT.getSizeInBits().getFixedSize(), NElts)); + SDValue Sub = + DAG.getNode(ISD::SUB, dl, IdxVT, VS, DAG.getConstant(1, dl, IdxVT)); return DAG.getNode(ISD::UMIN, dl, IdxVT, Idx, Sub); } else { if (isPowerOf2_32(NElts)) { - APInt Imm = APInt::getLowBitsSet(IdxVT.getSizeInBits(), - Log2_32(NElts)); + APInt Imm = APInt::getLowBitsSet(IdxVT.getSizeInBits(), Log2_32(NElts)); return DAG.getNode(ISD::AND, dl, IdxVT, Idx, DAG.getConstant(Imm, dl, IdxVT)); } @@ -7310,7 +7343,8 @@ EVT EltVT = VecVT.getVectorElementType(); // Calculate the element offset and add it to the pointer. - unsigned EltSize = EltVT.getSizeInBits().getFixedSize() / 8; // FIXME: should be ABI size. + unsigned EltSize = + EltVT.getSizeInBits().getFixedSize() / 8; // FIXME: should be ABI size. assert(EltSize * 8 == EltVT.getSizeInBits().getFixedSize() && "Converting bits to bytes lost precision"); @@ -7338,7 +7372,7 @@ ArgListTy Args; ArgListEntry Entry; std::string NameString = ("__emutls_v." 
+ GA->getGlobal()->getName()).str(); - Module *VariableModule = const_cast(GA->getGlobal()->getParent()); + Module *VariableModule = const_cast(GA->getGlobal()->getParent()); StringRef EmuTlsVarName(NameString); GlobalVariable *EmuTlsVar = VariableModule->getNamedGlobal(EmuTlsVarName); assert(EmuTlsVar && "Cannot find EmuTlsVar "); @@ -7438,8 +7472,8 @@ unsigned BitWidth = LHS.getScalarValueSizeInBits(); EVT BoolVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); - SDValue Result = DAG.getNode(OverflowOp, dl, DAG.getVTList(VT, BoolVT), - LHS, RHS); + SDValue Result = + DAG.getNode(OverflowOp, dl, DAG.getVTList(VT, BoolVT), LHS, RHS); SDValue SumDiff = Result.getValue(0); SDValue Overflow = Result.getValue(1); SDValue Zero = DAG.getConstant(0, dl, VT); @@ -7485,7 +7519,7 @@ assert((Node->getOpcode() == ISD::SSHLSAT || Node->getOpcode() == ISD::USHLSAT) && - "Expected a SHLSAT opcode"); + "Expected a SHLSAT opcode"); assert(VT == RHS.getValueType() && "Expected operands to be the same type"); assert(VT.isInteger() && "Expected operands to be integers"); @@ -7500,8 +7534,8 @@ if (IsSigned) { SDValue SatMin = DAG.getConstant(APInt::getSignedMinValue(BW), dl, VT); SDValue SatMax = DAG.getConstant(APInt::getSignedMaxValue(BW), dl, VT); - SatVal = DAG.getSelectCC(dl, LHS, DAG.getConstant(0, dl, VT), - SatMin, SatMax, ISD::SETLT); + SatVal = DAG.getSelectCC(dl, LHS, DAG.getConstant(0, dl, VT), SatMin, + SatMax, ISD::SETLT); } else { SatVal = DAG.getConstant(APInt::getMaxValue(BW), dl, VT); } @@ -7510,8 +7544,8 @@ return Result; } -SDValue -TargetLowering::expandFixedPointMul(SDNode *Node, SelectionDAG &DAG) const { +SDValue TargetLowering::expandFixedPointMul(SDNode *Node, + SelectionDAG &DAG) const { assert((Node->getOpcode() == ISD::SMULFIX || Node->getOpcode() == ISD::UMULFIX || Node->getOpcode() == ISD::SMULFIXSAT || @@ -7606,11 +7640,10 @@ // Saturate to max if ((Hi >> Scale) != 0), // which is the same as if (Hi > ((1 << Scale) - 1)) APInt MaxVal = APInt::getMaxValue(VTSize); - SDValue LowMask = DAG.getConstant(APInt::getLowBitsSet(VTSize, Scale), - dl, VT); - Result = DAG.getSelectCC(dl, Hi, LowMask, - DAG.getConstant(MaxVal, dl, VT), Result, - ISD::SETUGT); + SDValue LowMask = + DAG.getConstant(APInt::getLowBitsSet(VTSize, Scale), dl, VT); + Result = DAG.getSelectCC(dl, Hi, LowMask, DAG.getConstant(MaxVal, dl, VT), + Result, ISD::SETUGT); return Result; } @@ -7628,8 +7661,8 @@ // Saturated to SatMin if wide product is negative, and SatMax if wide // product is positive ... SDValue Zero = DAG.getConstant(0, dl, VT); - SDValue ResultIfOverflow = DAG.getSelectCC(dl, Hi, Zero, SatMin, SatMax, - ISD::SETLT); + SDValue ResultIfOverflow = + DAG.getSelectCC(dl, Hi, Zero, SatMin, SatMax, ISD::SETLT); // ... but only if we overflowed. 
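The saturating expansions in the hunks above share one shape: compute the wrapped result together with an overflow flag, then select the saturation bound from a sign when overflow occurred. A standalone sketch for SADDSAT, using the GCC/Clang __builtin_add_overflow intrinsic as a stand-in for the overflow node; picking the bound from the sign of an addend is one of several equivalent choices (names are illustrative):

#include <cassert>
#include <cstdint>

// Signed saturating add: on overflow both addends have the same sign, so
// that sign tells us which bound to clamp to.
static int32_t saddsat32(int32_t a, int32_t b) {
  int32_t Sum;
  if (!__builtin_add_overflow(a, b, &Sum))
    return Sum;                            // no overflow: plain sum
  return a < 0 ? INT32_MIN : INT32_MAX;    // overflow: saturate toward the sign
}

int main() {
  assert(saddsat32(2000000000, 2000000000) == INT32_MAX);
  assert(saddsat32(-2000000000, -2000000000) == INT32_MIN);
  assert(saddsat32(5, -7) == -2);
  return 0;
}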
return DAG.getSelect(dl, VT, Overflow, ResultIfOverflow, Result); } @@ -7638,22 +7671,21 @@ // Saturate to max if ((Hi >> (Scale - 1)) > 0), // which is the same as if (Hi > (1 << (Scale - 1)) - 1) - SDValue LowMask = DAG.getConstant(APInt::getLowBitsSet(VTSize, Scale - 1), - dl, VT); + SDValue LowMask = + DAG.getConstant(APInt::getLowBitsSet(VTSize, Scale - 1), dl, VT); Result = DAG.getSelectCC(dl, Hi, LowMask, SatMax, Result, ISD::SETGT); // Saturate to min if (Hi >> (Scale - 1)) < -1), // which is the same as if (HI < (-1 << (Scale - 1)) - SDValue HighMask = - DAG.getConstant(APInt::getHighBitsSet(VTSize, VTSize - Scale + 1), - dl, VT); + SDValue HighMask = DAG.getConstant( + APInt::getHighBitsSet(VTSize, VTSize - Scale + 1), dl, VT); Result = DAG.getSelectCC(dl, Hi, HighMask, SatMin, Result, ISD::SETLT); return Result; } -SDValue -TargetLowering::expandFixedPointDiv(unsigned Opcode, const SDLoc &dl, - SDValue LHS, SDValue RHS, - unsigned Scale, SelectionDAG &DAG) const { +SDValue TargetLowering::expandFixedPointDiv(unsigned Opcode, const SDLoc &dl, + SDValue LHS, SDValue RHS, + unsigned Scale, + SelectionDAG &DAG) const { assert((Opcode == ISD::SDIVFIX || Opcode == ISD::SDIVFIXSAT || Opcode == ISD::UDIVFIX || Opcode == ISD::UDIVFIXSAT) && "Expected a fixed point division opcode"); @@ -7707,38 +7739,33 @@ // FIXME: Ideally we would always produce an SDIVREM here, but if the // type isn't legal, SDIVREM cannot be expanded. There is no reason why // we couldn't just form a libcall, but the type legalizer doesn't do it. - if (isTypeLegal(VT) && - isOperationLegalOrCustom(ISD::SDIVREM, VT)) { - Quot = DAG.getNode(ISD::SDIVREM, dl, - DAG.getVTList(VT, VT), - LHS, RHS); + if (isTypeLegal(VT) && isOperationLegalOrCustom(ISD::SDIVREM, VT)) { + Quot = DAG.getNode(ISD::SDIVREM, dl, DAG.getVTList(VT, VT), LHS, RHS); Rem = Quot.getValue(1); Quot = Quot.getValue(0); } else { - Quot = DAG.getNode(ISD::SDIV, dl, VT, - LHS, RHS); - Rem = DAG.getNode(ISD::SREM, dl, VT, - LHS, RHS); + Quot = DAG.getNode(ISD::SDIV, dl, VT, LHS, RHS); + Rem = DAG.getNode(ISD::SREM, dl, VT, LHS, RHS); } SDValue Zero = DAG.getConstant(0, dl, VT); SDValue RemNonZero = DAG.getSetCC(dl, BoolVT, Rem, Zero, ISD::SETNE); SDValue LHSNeg = DAG.getSetCC(dl, BoolVT, LHS, Zero, ISD::SETLT); SDValue RHSNeg = DAG.getSetCC(dl, BoolVT, RHS, Zero, ISD::SETLT); SDValue QuotNeg = DAG.getNode(ISD::XOR, dl, BoolVT, LHSNeg, RHSNeg); - SDValue Sub1 = DAG.getNode(ISD::SUB, dl, VT, Quot, - DAG.getConstant(1, dl, VT)); + SDValue Sub1 = + DAG.getNode(ISD::SUB, dl, VT, Quot, DAG.getConstant(1, dl, VT)); Quot = DAG.getSelect(dl, VT, DAG.getNode(ISD::AND, dl, BoolVT, RemNonZero, QuotNeg), Sub1, Quot); } else - Quot = DAG.getNode(ISD::UDIV, dl, VT, - LHS, RHS); + Quot = DAG.getNode(ISD::UDIV, dl, VT, LHS, RHS); return Quot; } -void TargetLowering::expandUADDSUBO( - SDNode *Node, SDValue &Result, SDValue &Overflow, SelectionDAG &DAG) const { +void TargetLowering::expandUADDSUBO(SDNode *Node, SDValue &Result, + SDValue &Overflow, + SelectionDAG &DAG) const { SDLoc dl(Node); SDValue LHS = Node->getOperand(0); SDValue RHS = Node->getOperand(1); @@ -7748,37 +7775,38 @@ unsigned OpcCarry = IsAdd ? 
ISD::ADDCARRY : ISD::SUBCARRY; if (isOperationLegalOrCustom(OpcCarry, Node->getValueType(0))) { SDValue CarryIn = DAG.getConstant(0, dl, Node->getValueType(1)); - SDValue NodeCarry = DAG.getNode(OpcCarry, dl, Node->getVTList(), - { LHS, RHS, CarryIn }); + SDValue NodeCarry = + DAG.getNode(OpcCarry, dl, Node->getVTList(), {LHS, RHS, CarryIn}); Result = SDValue(NodeCarry.getNode(), 0); Overflow = SDValue(NodeCarry.getNode(), 1); return; } - Result = DAG.getNode(IsAdd ? ISD::ADD : ISD::SUB, dl, - LHS.getValueType(), LHS, RHS); + Result = DAG.getNode(IsAdd ? ISD::ADD : ISD::SUB, dl, LHS.getValueType(), LHS, + RHS); EVT ResultType = Node->getValueType(1); - EVT SetCCType = getSetCCResultType( - DAG.getDataLayout(), *DAG.getContext(), Node->getValueType(0)); + EVT SetCCType = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), + Node->getValueType(0)); ISD::CondCode CC = IsAdd ? ISD::SETULT : ISD::SETUGT; SDValue SetCC = DAG.getSetCC(dl, SetCCType, Result, LHS, CC); Overflow = DAG.getBoolExtOrTrunc(SetCC, dl, ResultType, ResultType); } -void TargetLowering::expandSADDSUBO( - SDNode *Node, SDValue &Result, SDValue &Overflow, SelectionDAG &DAG) const { +void TargetLowering::expandSADDSUBO(SDNode *Node, SDValue &Result, + SDValue &Overflow, + SelectionDAG &DAG) const { SDLoc dl(Node); SDValue LHS = Node->getOperand(0); SDValue RHS = Node->getOperand(1); bool IsAdd = Node->getOpcode() == ISD::SADDO; - Result = DAG.getNode(IsAdd ? ISD::ADD : ISD::SUB, dl, - LHS.getValueType(), LHS, RHS); + Result = DAG.getNode(IsAdd ? ISD::ADD : ISD::SUB, dl, LHS.getValueType(), LHS, + RHS); EVT ResultType = Node->getValueType(1); - EVT OType = getSetCCResultType( - DAG.getDataLayout(), *DAG.getContext(), Node->getValueType(0)); + EVT OType = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), + Node->getValueType(0)); // If SADDSAT/SSUBSAT is legal, compare results to detect overflow. unsigned OpcSat = IsAdd ? ISD::SADDSAT : ISD::SSUBSAT; @@ -7826,37 +7854,39 @@ SDValue ShiftAmt = DAG.getConstant(C.logBase2(), dl, ShiftAmtTy); Result = DAG.getNode(ISD::SHL, dl, VT, LHS, ShiftAmt); Overflow = DAG.getSetCC(dl, SetCCVT, - DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, - dl, VT, Result, ShiftAmt), - LHS, ISD::SETNE); + DAG.getNode(UseArithShift ? 
ISD::SRA : ISD::SRL, + dl, VT, Result, ShiftAmt), + LHS, ISD::SETNE); return true; } } - EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), VT.getScalarSizeInBits() * 2); + EVT WideVT = + EVT::getIntegerVT(*DAG.getContext(), VT.getScalarSizeInBits() * 2); if (VT.isVector()) - WideVT = EVT::getVectorVT(*DAG.getContext(), WideVT, - VT.getVectorNumElements()); + WideVT = + EVT::getVectorVT(*DAG.getContext(), WideVT, VT.getVectorNumElements()); SDValue BottomHalf; SDValue TopHalf; - static const unsigned Ops[2][3] = - { { ISD::MULHU, ISD::UMUL_LOHI, ISD::ZERO_EXTEND }, - { ISD::MULHS, ISD::SMUL_LOHI, ISD::SIGN_EXTEND }}; + static const unsigned Ops[2][3] = { + {ISD::MULHU, ISD::UMUL_LOHI, ISD::ZERO_EXTEND}, + {ISD::MULHS, ISD::SMUL_LOHI, ISD::SIGN_EXTEND}}; if (isOperationLegalOrCustom(Ops[isSigned][0], VT)) { BottomHalf = DAG.getNode(ISD::MUL, dl, VT, LHS, RHS); TopHalf = DAG.getNode(Ops[isSigned][0], dl, VT, LHS, RHS); } else if (isOperationLegalOrCustom(Ops[isSigned][1], VT)) { - BottomHalf = DAG.getNode(Ops[isSigned][1], dl, DAG.getVTList(VT, VT), LHS, - RHS); + BottomHalf = + DAG.getNode(Ops[isSigned][1], dl, DAG.getVTList(VT, VT), LHS, RHS); TopHalf = BottomHalf.getValue(1); } else if (isTypeLegal(WideVT)) { LHS = DAG.getNode(Ops[isSigned][2], dl, WideVT, LHS); RHS = DAG.getNode(Ops[isSigned][2], dl, WideVT, RHS); SDValue Mul = DAG.getNode(ISD::MUL, dl, WideVT, LHS, RHS); BottomHalf = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul); - SDValue ShiftAmt = DAG.getConstant(VT.getScalarSizeInBits(), dl, - getShiftAmountTy(WideVT, DAG.getDataLayout())); + SDValue ShiftAmt = + DAG.getConstant(VT.getScalarSizeInBits(), dl, + getShiftAmountTy(WideVT, DAG.getDataLayout())); TopHalf = DAG.getNode(ISD::TRUNCATE, dl, VT, DAG.getNode(ISD::SRL, dl, WideVT, Mul, ShiftAmt)); } else { @@ -7884,17 +7914,15 @@ // The high part is obtained by SRA'ing all but one of the bits of low // part. unsigned LoSize = VT.getSizeInBits(); - HiLHS = - DAG.getNode(ISD::SRA, dl, VT, LHS, - DAG.getConstant(LoSize - 1, dl, - getPointerTy(DAG.getDataLayout()))); - HiRHS = - DAG.getNode(ISD::SRA, dl, VT, RHS, - DAG.getConstant(LoSize - 1, dl, - getPointerTy(DAG.getDataLayout()))); + HiLHS = DAG.getNode( + ISD::SRA, dl, VT, LHS, + DAG.getConstant(LoSize - 1, dl, getPointerTy(DAG.getDataLayout()))); + HiRHS = DAG.getNode( + ISD::SRA, dl, VT, RHS, + DAG.getConstant(LoSize - 1, dl, getPointerTy(DAG.getDataLayout()))); } else { - HiLHS = DAG.getConstant(0, dl, VT); - HiRHS = DAG.getConstant(0, dl, VT); + HiLHS = DAG.getConstant(0, dl, VT); + HiRHS = DAG.getConstant(0, dl, VT); } // Here we're passing the 2 arguments explicitly as 4 arguments that are @@ -7910,10 +7938,10 @@ // depending on platform endianness. This is usually handled by // the C calling convention, but we can't defer to it in // the legalizer. 
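expandMULO above detects overflow by producing both halves of the double-width product and checking that the top half is the sign-extension of the bottom half (signed) or zero (unsigned). A standalone scalar sketch of that check via a widening multiply (names are illustrative):

#include <cassert>
#include <cstdint>

// Signed multiply-with-overflow: overflow iff the top half of the 64-bit
// product is not the sign-extension of the bottom half.
static bool smulo32(int32_t a, int32_t b, int32_t &Bottom) {
  int64_t Wide = (int64_t)a * (int64_t)b;
  Bottom = (int32_t)Wide;
  int32_t Top = (int32_t)(Wide >> 32);
  return Top != (Bottom >> 31);
}

// Unsigned variant: overflow iff the top half is nonzero.
static bool umulo32(uint32_t a, uint32_t b, uint32_t &Bottom) {
  uint64_t Wide = (uint64_t)a * (uint64_t)b;
  Bottom = (uint32_t)Wide;
  return (uint32_t)(Wide >> 32) != 0;
}

int main() {
  int32_t SLow; uint32_t ULow;
  assert(!smulo32(46340, 46340, SLow) && SLow == 2147395600);
  assert(smulo32(46341, 46341, SLow));            // 46341^2 > INT32_MAX
  assert(!smulo32(-2, 1073741824, SLow) && SLow == INT32_MIN);
  assert(!umulo32(65536, 65535, ULow));
  assert(umulo32(65536, 65536, ULow));            // 2^32 wraps to 0
  return 0;
}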
- SDValue Args[] = { LHS, HiLHS, RHS, HiRHS }; + SDValue Args[] = {LHS, HiLHS, RHS, HiRHS}; Ret = makeLibCall(DAG, LC, WideVT, Args, CallOptions, dl).first; } else { - SDValue Args[] = { HiLHS, LHS, HiRHS, RHS }; + SDValue Args[] = {HiLHS, LHS, HiRHS, RHS}; Ret = makeLibCall(DAG, LC, WideVT, Args, CallOptions, dl).first; } assert(Ret.getOpcode() == ISD::MERGE_VALUES && @@ -7936,8 +7964,8 @@ SDValue Sign = DAG.getNode(ISD::SRA, dl, VT, BottomHalf, ShiftAmt); Overflow = DAG.getSetCC(dl, SetCCVT, TopHalf, Sign, ISD::SETNE); } else { - Overflow = DAG.getSetCC(dl, SetCCVT, TopHalf, - DAG.getConstant(0, dl, VT), ISD::SETNE); + Overflow = DAG.getSetCC(dl, SetCCVT, TopHalf, DAG.getConstant(0, dl, VT), + ISD::SETNE); } // Truncate the result if SetCC returns a larger type than needed. @@ -7954,20 +7982,47 @@ SDLoc dl(Node); unsigned BaseOpcode = 0; switch (Node->getOpcode()) { - default: llvm_unreachable("Expected VECREDUCE opcode"); - case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break; - case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break; - case ISD::VECREDUCE_ADD: BaseOpcode = ISD::ADD; break; - case ISD::VECREDUCE_MUL: BaseOpcode = ISD::MUL; break; - case ISD::VECREDUCE_AND: BaseOpcode = ISD::AND; break; - case ISD::VECREDUCE_OR: BaseOpcode = ISD::OR; break; - case ISD::VECREDUCE_XOR: BaseOpcode = ISD::XOR; break; - case ISD::VECREDUCE_SMAX: BaseOpcode = ISD::SMAX; break; - case ISD::VECREDUCE_SMIN: BaseOpcode = ISD::SMIN; break; - case ISD::VECREDUCE_UMAX: BaseOpcode = ISD::UMAX; break; - case ISD::VECREDUCE_UMIN: BaseOpcode = ISD::UMIN; break; - case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break; - case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break; + default: + llvm_unreachable("Expected VECREDUCE opcode"); + case ISD::VECREDUCE_FADD: + BaseOpcode = ISD::FADD; + break; + case ISD::VECREDUCE_FMUL: + BaseOpcode = ISD::FMUL; + break; + case ISD::VECREDUCE_ADD: + BaseOpcode = ISD::ADD; + break; + case ISD::VECREDUCE_MUL: + BaseOpcode = ISD::MUL; + break; + case ISD::VECREDUCE_AND: + BaseOpcode = ISD::AND; + break; + case ISD::VECREDUCE_OR: + BaseOpcode = ISD::OR; + break; + case ISD::VECREDUCE_XOR: + BaseOpcode = ISD::XOR; + break; + case ISD::VECREDUCE_SMAX: + BaseOpcode = ISD::SMAX; + break; + case ISD::VECREDUCE_SMIN: + BaseOpcode = ISD::SMIN; + break; + case ISD::VECREDUCE_UMAX: + BaseOpcode = ISD::UMAX; + break; + case ISD::VECREDUCE_UMIN: + BaseOpcode = ISD::UMIN; + break; + case ISD::VECREDUCE_FMAX: + BaseOpcode = ISD::FMAXNUM; + break; + case ISD::VECREDUCE_FMIN: + BaseOpcode = ISD::FMINNUM; + break; } SDValue Op = Node->getOperand(0); diff --git a/llvm/test/CodeGen/AArch64/srem-seteq.ll b/llvm/test/CodeGen/AArch64/srem-seteq.ll --- a/llvm/test/CodeGen/AArch64/srem-seteq.ll +++ b/llvm/test/CodeGen/AArch64/srem-seteq.ll @@ -83,13 +83,10 @@ define i16 @test_srem_even(i16 %X) nounwind { ; CHECK-LABEL: test_srem_even: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #9363 ; CHECK-NEXT: sxth w8, w0 -; CHECK-NEXT: movk w9, #37449, lsl #16 -; CHECK-NEXT: smull x9, w8, w9 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: add w8, w9, w8 -; CHECK-NEXT: asr w9, w8, #3 +; CHECK-NEXT: mov w9, #18725 +; CHECK-NEXT: mul w8, w8, w9 +; CHECK-NEXT: asr w9, w8, #18 ; CHECK-NEXT: add w8, w9, w8, lsr #31 ; CHECK-NEXT: mov w9, #14 ; CHECK-NEXT: msub w8, w8, w9, w0 diff --git a/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll b/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll --- a/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll @@ -4,50 +4,21 @@ define <4 
x i16> @fold_srem_vec_1(<4 x i16> %x) { ; CHECK-LABEL: fold_srem_vec_1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #63421 -; CHECK-NEXT: mov w12, #33437 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w8, v0.h[1] -; CHECK-NEXT: movk w9, #31710, lsl #16 -; CHECK-NEXT: smov w11, v0.h[2] -; CHECK-NEXT: movk w12, #21399, lsl #16 -; CHECK-NEXT: smull x12, w11, w12 -; CHECK-NEXT: smull x9, w8, w9 -; CHECK-NEXT: lsr x13, x12, #63 -; CHECK-NEXT: asr x12, x12, #37 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: add w12, w12, w13 -; CHECK-NEXT: mov w13, #98 -; CHECK-NEXT: sub w9, w9, w8 -; CHECK-NEXT: msub w11, w12, w13, w11 -; CHECK-NEXT: asr w13, w9, #6 -; CHECK-NEXT: add w9, w13, w9, lsr #31 -; CHECK-NEXT: mov w13, #37253 -; CHECK-NEXT: mov w10, #-124 -; CHECK-NEXT: smov w12, v0.h[0] -; CHECK-NEXT: movk w13, #44150, lsl #16 -; CHECK-NEXT: msub w8, w9, w10, w8 -; CHECK-NEXT: smull x10, w12, w13 -; CHECK-NEXT: lsr x10, x10, #32 -; CHECK-NEXT: add w10, w10, w12 -; CHECK-NEXT: asr w13, w10, #6 -; CHECK-NEXT: mov w9, #95 -; CHECK-NEXT: add w10, w13, w10, lsr #31 -; CHECK-NEXT: msub w9, w10, w9, w12 -; CHECK-NEXT: mov w10, #63249 -; CHECK-NEXT: smov w13, v0.h[3] -; CHECK-NEXT: movk w10, #48808, lsl #16 -; CHECK-NEXT: smull x10, w13, w10 -; CHECK-NEXT: lsr x12, x10, #63 -; CHECK-NEXT: asr x10, x10, #40 -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: add w10, w10, w12 -; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: mov w8, #-1003 -; CHECK-NEXT: mov v0.h[2], w11 -; CHECK-NEXT: msub w8, w10, w8, w13 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: adrp x8, .LCPI0_1 +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_1] +; CHECK-NEXT: adrp x8, .LCPI0_2 +; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI0_2] +; CHECK-NEXT: adrp x8, .LCPI0_3 +; CHECK-NEXT: smull v2.4s, v0.4h, v2.4h +; CHECK-NEXT: shrn v2.4h, v2.4s, #16 +; CHECK-NEXT: mla v2.4h, v0.4h, v1.4h +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI0_3] +; CHECK-NEXT: neg v3.4h, v3.4h +; CHECK-NEXT: sshl v2.4h, v2.4h, v3.4h +; CHECK-NEXT: usra v2.4h, v2.4h, #15 +; CHECK-NEXT: mls v0.4h, v2.4h, v1.4h ; CHECK-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -56,43 +27,15 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) { ; CHECK-LABEL: fold_srem_vec_2: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #37253 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w8, v0.h[1] -; CHECK-NEXT: movk w9, #44150, lsl #16 -; CHECK-NEXT: smov w10, v0.h[0] -; CHECK-NEXT: smull x13, w8, w9 -; CHECK-NEXT: smov w11, v0.h[2] -; CHECK-NEXT: smull x14, w10, w9 -; CHECK-NEXT: lsr x13, x13, #32 -; CHECK-NEXT: smov w12, v0.h[3] -; CHECK-NEXT: smull x15, w11, w9 -; CHECK-NEXT: lsr x14, x14, #32 -; CHECK-NEXT: add w13, w13, w8 -; CHECK-NEXT: smull x9, w12, w9 -; CHECK-NEXT: lsr x15, x15, #32 -; CHECK-NEXT: add w14, w14, w10 -; CHECK-NEXT: asr w16, w13, #6 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: add w15, w15, w11 -; CHECK-NEXT: add w13, w16, w13, lsr #31 -; CHECK-NEXT: asr w16, w14, #6 -; CHECK-NEXT: add w9, w9, w12 -; CHECK-NEXT: add w14, w16, w14, lsr #31 -; CHECK-NEXT: asr w16, w15, #6 -; CHECK-NEXT: add w15, w16, w15, lsr #31 -; CHECK-NEXT: asr w16, w9, #6 -; CHECK-NEXT: add w9, w16, w9, lsr #31 -; CHECK-NEXT: mov w16, #95 -; CHECK-NEXT: msub w10, w14, w16, w10 -; CHECK-NEXT: msub w8, w13, w16, w8 -; CHECK-NEXT: fmov s0, w10 -; CHECK-NEXT: msub w11, w15, w16, w11 -; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: mov v0.h[2], w11 -; CHECK-NEXT: msub 
w8, w9, w16, w12 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov w8, #44151 +; CHECK-NEXT: dup v1.4h, w8 +; CHECK-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-NEXT: add v1.4h, v1.4h, v0.4h +; CHECK-NEXT: sshr v1.4h, v1.4h, #6 +; CHECK-NEXT: usra v1.4h, v1.4h, #15 +; CHECK-NEXT: movi v2.4h, #95 +; CHECK-NEXT: mls v0.4h, v1.4h, v2.4h ; CHECK-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -103,47 +46,16 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) { ; CHECK-LABEL: combine_srem_sdiv: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #37253 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: movk w8, #44150, lsl #16 -; CHECK-NEXT: smov w9, v0.h[1] -; CHECK-NEXT: smov w10, v0.h[0] -; CHECK-NEXT: smull x13, w9, w8 -; CHECK-NEXT: smov w11, v0.h[2] -; CHECK-NEXT: smull x14, w10, w8 -; CHECK-NEXT: lsr x13, x13, #32 -; CHECK-NEXT: smov w12, v0.h[3] -; CHECK-NEXT: smull x15, w11, w8 -; CHECK-NEXT: lsr x14, x14, #32 -; CHECK-NEXT: add w13, w13, w9 -; CHECK-NEXT: smull x8, w12, w8 -; CHECK-NEXT: lsr x15, x15, #32 -; CHECK-NEXT: add w14, w14, w10 -; CHECK-NEXT: asr w16, w13, #6 -; CHECK-NEXT: lsr x8, x8, #32 -; CHECK-NEXT: add w15, w15, w11 -; CHECK-NEXT: add w13, w16, w13, lsr #31 -; CHECK-NEXT: asr w16, w14, #6 -; CHECK-NEXT: add w8, w8, w12 -; CHECK-NEXT: add w14, w16, w14, lsr #31 -; CHECK-NEXT: asr w16, w15, #6 -; CHECK-NEXT: add w15, w16, w15, lsr #31 -; CHECK-NEXT: asr w16, w8, #6 -; CHECK-NEXT: add w8, w16, w8, lsr #31 -; CHECK-NEXT: mov w16, #95 -; CHECK-NEXT: msub w10, w14, w16, w10 -; CHECK-NEXT: msub w9, w13, w16, w9 -; CHECK-NEXT: fmov s0, w14 -; CHECK-NEXT: fmov s1, w10 -; CHECK-NEXT: msub w11, w15, w16, w11 -; CHECK-NEXT: mov v0.h[1], w13 -; CHECK-NEXT: mov v1.h[1], w9 -; CHECK-NEXT: msub w12, w8, w16, w12 -; CHECK-NEXT: mov v0.h[2], w15 -; CHECK-NEXT: mov v1.h[2], w11 -; CHECK-NEXT: mov v1.h[3], w12 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: add v0.4h, v1.4h, v0.4h +; CHECK-NEXT: mov w8, #44151 +; CHECK-NEXT: dup v2.4h, w8 +; CHECK-NEXT: smull v2.4s, v0.4h, v2.4h +; CHECK-NEXT: shrn v2.4h, v2.4s, #16 +; CHECK-NEXT: add v2.4h, v2.4h, v0.4h +; CHECK-NEXT: sshr v2.4h, v2.4h, #6 +; CHECK-NEXT: movi v1.4h, #95 +; CHECK-NEXT: usra v2.4h, v2.4h, #15 +; CHECK-NEXT: mls v0.4h, v2.4h, v1.4h +; CHECK-NEXT: add v0.4h, v0.4h, v2.4h ; CHECK-NEXT: ret %1 = srem <4 x i16> %x, %2 = sdiv <4 x i16> %x, @@ -155,40 +67,19 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { ; CHECK-LABEL: dont_fold_srem_power_of_two: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w8, v0.h[1] -; CHECK-NEXT: add w12, w8, #31 // =31 -; CHECK-NEXT: cmp w8, #0 // =0 -; CHECK-NEXT: mov w11, #37253 -; CHECK-NEXT: csel w12, w12, w8, lt -; CHECK-NEXT: smov w9, v0.h[0] -; CHECK-NEXT: smov w10, v0.h[3] -; CHECK-NEXT: movk w11, #44150, lsl #16 -; CHECK-NEXT: and w12, w12, #0xffffffe0 -; CHECK-NEXT: sub w8, w8, w12 -; CHECK-NEXT: add w12, w9, #63 // =63 -; CHECK-NEXT: smull x11, w10, w11 -; CHECK-NEXT: cmp w9, #0 // =0 -; CHECK-NEXT: lsr x11, x11, #32 -; CHECK-NEXT: csel w12, w12, w9, lt -; CHECK-NEXT: add w11, w11, w10 -; CHECK-NEXT: and w12, w12, #0xffffffc0 -; CHECK-NEXT: sub w9, w9, w12 -; CHECK-NEXT: asr w12, w11, #6 -; CHECK-NEXT: add w11, w12, w11, lsr #31 -; CHECK-NEXT: smov w12, v0.h[2] -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: add w9, w12, #7 // =7 -; CHECK-NEXT: cmp w12, #0 // =0 -; CHECK-NEXT: csel w9, w9, w12, lt -; CHECK-NEXT: and w9, w9, #0xfffffff8 -; 
CHECK-NEXT: sub w9, w12, w9 -; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: mov w8, #95 -; CHECK-NEXT: mov v0.h[2], w9 -; CHECK-NEXT: msub w8, w11, w8, w10 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI3_0] +; CHECK-NEXT: adrp x9, .LCPI3_1 +; CHECK-NEXT: ldr d2, [x9, :lo12:.LCPI3_1] +; CHECK-NEXT: adrp x8, .LCPI3_2 +; CHECK-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI3_2] +; CHECK-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-NEXT: add v1.4h, v1.4h, v0.4h +; CHECK-NEXT: neg v2.4h, v2.4h +; CHECK-NEXT: sshl v1.4h, v1.4h, v2.4h +; CHECK-NEXT: usra v1.4h, v1.4h, #15 +; CHECK-NEXT: mls v0.4h, v1.4h, v3.4h ; CHECK-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -198,41 +89,24 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { ; CHECK-LABEL: dont_fold_srem_one: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #17097 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w8, v0.h[2] -; CHECK-NEXT: movk w9, #45590, lsl #16 -; CHECK-NEXT: smull x9, w8, w9 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: add w9, w9, w8 -; CHECK-NEXT: asr w12, w9, #4 -; CHECK-NEXT: add w9, w12, w9, lsr #31 -; CHECK-NEXT: mov w12, #30865 -; CHECK-NEXT: mov w10, #23 -; CHECK-NEXT: smov w11, v0.h[1] -; CHECK-NEXT: movk w12, #51306, lsl #16 -; CHECK-NEXT: msub w8, w9, w10, w8 -; CHECK-NEXT: smull x10, w11, w12 -; CHECK-NEXT: lsr x10, x10, #32 -; CHECK-NEXT: add w10, w10, w11 -; CHECK-NEXT: asr w12, w10, #9 -; CHECK-NEXT: mov w9, #654 -; CHECK-NEXT: add w10, w12, w10, lsr #31 -; CHECK-NEXT: msub w9, w10, w9, w11 -; CHECK-NEXT: mov w10, #47143 -; CHECK-NEXT: smov w12, v0.h[3] -; CHECK-NEXT: movk w10, #24749, lsl #16 -; CHECK-NEXT: smull x10, w12, w10 -; CHECK-NEXT: lsr x11, x10, #63 -; CHECK-NEXT: asr x10, x10, #43 -; CHECK-NEXT: movi d0, #0000000000000000 -; CHECK-NEXT: add w10, w10, w11 -; CHECK-NEXT: mov v0.h[1], w9 -; CHECK-NEXT: mov w9, #5423 -; CHECK-NEXT: mov v0.h[2], w8 -; CHECK-NEXT: msub w8, w10, w9, w12 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: adrp x8, .LCPI4_0 +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI4_0] +; CHECK-NEXT: adrp x8, .LCPI4_1 +; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI4_1] +; CHECK-NEXT: movi d1, #0x00ffff0000ffff +; CHECK-NEXT: smull v2.4s, v0.4h, v2.4h +; CHECK-NEXT: and v1.8b, v0.8b, v1.8b +; CHECK-NEXT: shrn v2.4h, v2.4s, #16 +; CHECK-NEXT: adrp x8, .LCPI4_2 +; CHECK-NEXT: neg v3.4h, v3.4h +; CHECK-NEXT: add v1.4h, v2.4h, v1.4h +; CHECK-NEXT: ldr d4, [x8, :lo12:.LCPI4_2] +; CHECK-NEXT: sshl v1.4h, v1.4h, v3.4h +; CHECK-NEXT: ushr v2.4h, v1.4h, #15 +; CHECK-NEXT: movi d3, #0xffffffffffff0000 +; CHECK-NEXT: and v2.8b, v2.8b, v3.8b +; CHECK-NEXT: add v1.4h, v1.4h, v2.4h +; CHECK-NEXT: mls v0.4h, v1.4h, v4.4h ; CHECK-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -242,38 +116,24 @@ define <4 x i16> @dont_fold_srem_i16_smax(<4 x i16> %x) { ; CHECK-LABEL: dont_fold_srem_i16_smax: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w10, #17097 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w9, v0.h[2] -; CHECK-NEXT: movk w10, #45590, lsl #16 -; CHECK-NEXT: smull x10, w9, w10 -; CHECK-NEXT: lsr x10, x10, #32 -; CHECK-NEXT: add w10, w10, w9 -; CHECK-NEXT: asr w12, w10, #4 -; CHECK-NEXT: mov w11, #23 -; CHECK-NEXT: add w10, w12, w10, lsr #31 -; CHECK-NEXT: msub w9, w10, w11, w9 -; CHECK-NEXT: mov w10, #47143 -; CHECK-NEXT: smov w12, v0.h[3] -; CHECK-NEXT: movk w10, #24749, lsl #16 -; 
CHECK-NEXT: smull x10, w12, w10 -; CHECK-NEXT: lsr x11, x10, #63 -; CHECK-NEXT: asr x10, x10, #43 -; CHECK-NEXT: smov w8, v0.h[1] -; CHECK-NEXT: add w10, w10, w11 -; CHECK-NEXT: mov w11, #32767 -; CHECK-NEXT: add w11, w8, w11 -; CHECK-NEXT: cmp w8, #0 // =0 -; CHECK-NEXT: csel w11, w11, w8, lt -; CHECK-NEXT: and w11, w11, #0xffff8000 -; CHECK-NEXT: sub w8, w8, w11 -; CHECK-NEXT: movi d0, #0000000000000000 -; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: mov w8, #5423 -; CHECK-NEXT: mov v0.h[2], w9 -; CHECK-NEXT: msub w8, w10, w8, w12 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: adrp x8, .LCPI5_0 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI5_0] +; CHECK-NEXT: adrp x8, .LCPI5_1 +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI5_1] +; CHECK-NEXT: adrp x8, .LCPI5_2 +; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI5_2] +; CHECK-NEXT: adrp x8, .LCPI5_3 +; CHECK-NEXT: smull v2.4s, v0.4h, v2.4h +; CHECK-NEXT: shrn v2.4h, v2.4s, #16 +; CHECK-NEXT: neg v3.4h, v3.4h +; CHECK-NEXT: mla v2.4h, v0.4h, v1.4h +; CHECK-NEXT: ldr d4, [x8, :lo12:.LCPI5_3] +; CHECK-NEXT: sshl v1.4h, v2.4h, v3.4h +; CHECK-NEXT: ushr v2.4h, v1.4h, #15 +; CHECK-NEXT: movi d3, #0xffffffffffff0000 +; CHECK-NEXT: and v2.8b, v2.8b, v3.8b +; CHECK-NEXT: add v1.4h, v1.4h, v2.4h +; CHECK-NEXT: mls v0.4h, v1.4h, v4.4h ; CHECK-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -283,41 +143,64 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) { ; CHECK-LABEL: dont_fold_srem_i64: ; CHECK: // %bb.0: +; CHECK-NEXT: mov x9, #21445 +; CHECK-NEXT: movk x9, #1603, lsl #16 +; CHECK-NEXT: movk x9, #15432, lsl #32 +; CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: movk x9, #25653, lsl #48 +; CHECK-NEXT: smulh x8, x8, x9 ; CHECK-NEXT: mov x9, #6055 ; CHECK-NEXT: movk x9, #58853, lsl #16 ; CHECK-NEXT: movk x9, #47142, lsl #32 -; CHECK-NEXT: mov x8, v1.d[1] +; CHECK-NEXT: mov x11, v1.d[1] ; CHECK-NEXT: movk x9, #24749, lsl #48 -; CHECK-NEXT: smulh x9, x8, x9 -; CHECK-NEXT: asr x12, x9, #11 +; CHECK-NEXT: smulh x9, x11, x9 +; CHECK-NEXT: mov x11, #8549 +; CHECK-NEXT: movk x11, #22795, lsl #16 +; CHECK-NEXT: adrp x10, .LCPI6_0 +; CHECK-NEXT: movk x11, #17096, lsl #32 +; CHECK-NEXT: ldr q2, [x10, :lo12:.LCPI6_0] +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: movk x11, #45590, lsl #48 +; CHECK-NEXT: smulh x10, x10, x11 +; CHECK-NEXT: adrp x11, .LCPI6_3 +; CHECK-NEXT: ldr q3, [x11, :lo12:.LCPI6_3] +; CHECK-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEXT: fmov d5, x8 +; CHECK-NEXT: mov v4.d[1], v5.d[0] +; CHECK-NEXT: fmov d5, x9 +; CHECK-NEXT: fmov d6, x10 +; CHECK-NEXT: mov v6.d[1], v5.d[0] +; CHECK-NEXT: and v5.16b, v1.16b, v2.16b +; CHECK-NEXT: add v5.2d, v6.2d, v5.2d +; CHECK-NEXT: neg v3.2d, v3.2d +; CHECK-NEXT: adrp x8, .LCPI6_1 +; CHECK-NEXT: sshl v3.2d, v5.2d, v3.2d +; CHECK-NEXT: usra v3.2d, v5.2d, #63 +; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI6_1] +; CHECK-NEXT: adrp x9, .LCPI6_2 +; CHECK-NEXT: mov x8, v3.d[1] +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI6_2] +; CHECK-NEXT: and v2.16b, v0.16b, v2.16b +; CHECK-NEXT: neg v5.2d, v5.2d +; CHECK-NEXT: add v2.2d, v4.2d, v2.2d +; CHECK-NEXT: sshl v4.2d, v2.2d, v5.2d +; CHECK-NEXT: ushr v2.2d, v2.2d, #63 +; CHECK-NEXT: mov w9, #23 +; CHECK-NEXT: and v2.16b, v2.16b, v3.16b +; CHECK-NEXT: mul x9, x10, x9 ; CHECK-NEXT: mov w10, #5423 -; CHECK-NEXT: add x9, x12, x9, lsr #63 -; CHECK-NEXT: msub x8, x9, x10, x8 -; CHECK-NEXT: mov x9, #21445 -; CHECK-NEXT: movk x9, #1603, lsl #16 -; CHECK-NEXT: movk x9, #15432, lsl #32 -; CHECK-NEXT: mov x12, v0.d[1] -; CHECK-NEXT: 
movk x9, #25653, lsl #48 -; CHECK-NEXT: smulh x9, x12, x9 -; CHECK-NEXT: asr x10, x9, #8 -; CHECK-NEXT: add x9, x10, x9, lsr #63 +; CHECK-NEXT: add v2.2d, v4.2d, v2.2d +; CHECK-NEXT: mul x8, x8, x10 ; CHECK-NEXT: mov w10, #654 -; CHECK-NEXT: msub x9, x9, x10, x12 -; CHECK-NEXT: mov x10, #8549 -; CHECK-NEXT: movk x10, #22795, lsl #16 -; CHECK-NEXT: movk x10, #17096, lsl #32 -; CHECK-NEXT: fmov x11, d1 -; CHECK-NEXT: movk x10, #45590, lsl #48 -; CHECK-NEXT: smulh x10, x11, x10 -; CHECK-NEXT: add x10, x10, x11 -; CHECK-NEXT: asr x12, x10, #4 -; CHECK-NEXT: add x10, x12, x10, lsr #63 -; CHECK-NEXT: mov w12, #23 -; CHECK-NEXT: msub x10, x10, x12, x11 -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: fmov d1, x10 -; CHECK-NEXT: mov v1.d[1], x8 -; CHECK-NEXT: mov v0.d[1], x9 +; CHECK-NEXT: fmov d3, x9 +; CHECK-NEXT: mov x9, v2.d[1] +; CHECK-NEXT: mul x9, x9, x10 +; CHECK-NEXT: mov v3.d[1], x8 +; CHECK-NEXT: mov v2.d[1], x9 +; CHECK-NEXT: sub v0.2d, v0.2d, v2.2d +; CHECK-NEXT: sub v1.2d, v1.2d, v3.2d ; CHECK-NEXT: ret %1 = srem <4 x i64> %x, ret <4 x i64> %1 diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll b/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll --- a/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll @@ -195,14 +195,15 @@ define i1 @t16_3_2(i16 %X) nounwind { ; CHECK-LABEL: t16_3_2: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #43691 ; CHECK-NEXT: and w8, w0, #0xffff -; CHECK-NEXT: movk w9, #43690, lsl #16 -; CHECK-NEXT: mov w10, #-1431655766 -; CHECK-NEXT: madd w8, w8, w9, w10 -; CHECK-NEXT: mov w9, #1431655765 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: mov w9, #43691 +; CHECK-NEXT: mul w8, w8, w9 +; CHECK-NEXT: lsr w8, w8, #17 +; CHECK-NEXT: add w8, w8, w8, lsl #1 +; CHECK-NEXT: sub w8, w0, w8 +; CHECK-NEXT: and w8, w8, #0xffff +; CHECK-NEXT: cmp w8, #2 // =2 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %urem = urem i16 %X, 3 %cmp = icmp eq i16 %urem, 2 @@ -212,14 +213,15 @@ define i1 @t8_3_2(i8 %X) nounwind { ; CHECK-LABEL: t8_3_2: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #43691 ; CHECK-NEXT: and w8, w0, #0xff -; CHECK-NEXT: movk w9, #43690, lsl #16 -; CHECK-NEXT: mov w10, #-1431655766 -; CHECK-NEXT: madd w8, w8, w9, w10 -; CHECK-NEXT: mov w9, #1431655765 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: mov w9, #171 +; CHECK-NEXT: mul w8, w8, w9 +; CHECK-NEXT: lsr w8, w8, #9 +; CHECK-NEXT: add w8, w8, w8, lsl #1 +; CHECK-NEXT: sub w8, w0, w8 +; CHECK-NEXT: and w8, w8, #0xff +; CHECK-NEXT: cmp w8, #2 // =2 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %urem = urem i8 %X, 3 %cmp = icmp eq i8 %urem, 2 diff --git a/llvm/test/CodeGen/AArch64/urem-seteq.ll b/llvm/test/CodeGen/AArch64/urem-seteq.ll --- a/llvm/test/CodeGen/AArch64/urem-seteq.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq.ll @@ -78,15 +78,14 @@ define i16 @test_urem_even(i16 %X) nounwind { ; CHECK-LABEL: test_urem_even: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #28087 -; CHECK-NEXT: and w8, w0, #0xffff -; CHECK-NEXT: movk w9, #46811, lsl #16 +; CHECK-NEXT: ubfx w8, w0, #1, #15 +; CHECK-NEXT: mov w9, #18725 ; CHECK-NEXT: mul w8, w8, w9 -; CHECK-NEXT: mov w9, #9362 -; CHECK-NEXT: ror w8, w8, #1 -; CHECK-NEXT: movk w9, #4681, lsl #16 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, hi +; CHECK-NEXT: lsr w8, w8, #17 +; CHECK-NEXT: mov w9, #14 +; CHECK-NEXT: msub w8, w8, w9, w0 +; CHECK-NEXT: tst w8, #0xffff +; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %urem = urem i16 %X, 14 %cmp = icmp ne i16 %urem, 0 diff --git 
a/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll b/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll --- a/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll @@ -4,44 +4,27 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) { ; CHECK-LABEL: fold_urem_vec_1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w11, #33437 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w10, v0.h[2] -; CHECK-NEXT: movk w11, #21399, lsl #16 -; CHECK-NEXT: umull x11, w10, w11 -; CHECK-NEXT: umov w8, v0.h[1] -; CHECK-NEXT: mov w9, #16913 -; CHECK-NEXT: mov w12, #98 -; CHECK-NEXT: lsr x11, x11, #37 -; CHECK-NEXT: movk w9, #8456, lsl #16 -; CHECK-NEXT: msub w10, w11, w12, w10 -; CHECK-NEXT: ubfx w12, w8, #2, #14 -; CHECK-NEXT: umull x9, w12, w9 -; CHECK-NEXT: mov w11, #124 -; CHECK-NEXT: lsr x9, x9, #34 -; CHECK-NEXT: msub w8, w9, w11, w8 -; CHECK-NEXT: mov w9, #8969 -; CHECK-NEXT: umov w12, v0.h[0] -; CHECK-NEXT: movk w9, #22765, lsl #16 -; CHECK-NEXT: umull x9, w12, w9 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: sub w11, w12, w9 -; CHECK-NEXT: add w9, w9, w11, lsr #1 -; CHECK-NEXT: mov w11, #95 -; CHECK-NEXT: lsr w9, w9, #6 -; CHECK-NEXT: msub w9, w9, w11, w12 -; CHECK-NEXT: umov w11, v0.h[3] -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: mov w9, #2287 -; CHECK-NEXT: movk w9, #16727, lsl #16 -; CHECK-NEXT: umull x9, w11, w9 -; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: mov w8, #1003 -; CHECK-NEXT: lsr x9, x9, #40 -; CHECK-NEXT: mov v0.h[2], w10 -; CHECK-NEXT: msub w8, w9, w8, w11 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: adrp x8, .LCPI0_1 +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_1] +; CHECK-NEXT: adrp x8, .LCPI0_2 +; CHECK-NEXT: neg v1.4h, v1.4h +; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI0_2] +; CHECK-NEXT: ushl v1.4h, v0.4h, v1.4h +; CHECK-NEXT: adrp x8, .LCPI0_3 +; CHECK-NEXT: umull v1.4s, v1.4h, v2.4h +; CHECK-NEXT: ldr d4, [x8, :lo12:.LCPI0_3] +; CHECK-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-NEXT: adrp x8, .LCPI0_4 +; CHECK-NEXT: sub v2.4h, v0.4h, v1.4h +; CHECK-NEXT: umull v2.4s, v2.4h, v3.4h +; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI0_4] +; CHECK-NEXT: shrn v2.4h, v2.4s, #16 +; CHECK-NEXT: add v1.4h, v2.4h, v1.4h +; CHECK-NEXT: neg v2.4h, v4.4h +; CHECK-NEXT: ushl v1.4h, v1.4h, v2.4h +; CHECK-NEXT: mls v0.4h, v1.4h, v3.4h ; CHECK-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -50,43 +33,13 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) { ; CHECK-LABEL: fold_urem_vec_2: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #8969 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[1] -; CHECK-NEXT: movk w9, #22765, lsl #16 -; CHECK-NEXT: umov w10, v0.h[0] -; CHECK-NEXT: umull x13, w8, w9 -; CHECK-NEXT: umov w11, v0.h[2] -; CHECK-NEXT: umull x14, w10, w9 -; CHECK-NEXT: lsr x13, x13, #32 -; CHECK-NEXT: umov w12, v0.h[3] -; CHECK-NEXT: umull x15, w11, w9 -; CHECK-NEXT: lsr x14, x14, #32 -; CHECK-NEXT: sub w16, w8, w13 -; CHECK-NEXT: umull x9, w12, w9 -; CHECK-NEXT: lsr x15, x15, #32 -; CHECK-NEXT: add w13, w13, w16, lsr #1 -; CHECK-NEXT: sub w16, w10, w14 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: add w14, w14, w16, lsr #1 -; CHECK-NEXT: sub w16, w11, w15 -; CHECK-NEXT: add w15, w15, w16, lsr #1 -; CHECK-NEXT: sub w16, w12, w9 -; CHECK-NEXT: add w9, w9, w16, lsr #1 -; CHECK-NEXT: mov w16, #95 -; CHECK-NEXT: lsr w13, w13, #6 -; CHECK-NEXT: msub w8, w13, w16, w8 -; CHECK-NEXT: lsr w13, w14, #6 -; CHECK-NEXT: msub 
w10, w13, w16, w10 -; CHECK-NEXT: lsr w13, w15, #6 -; CHECK-NEXT: fmov s0, w10 -; CHECK-NEXT: msub w11, w13, w16, w11 -; CHECK-NEXT: lsr w9, w9, #6 -; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: mov v0.h[2], w11 -; CHECK-NEXT: msub w8, w9, w16, w12 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov w8, #44151 +; CHECK-NEXT: dup v1.4h, w8 +; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h +; CHECK-NEXT: ushr v1.4s, v1.4s, #22 +; CHECK-NEXT: xtn v1.4h, v1.4s +; CHECK-NEXT: movi v2.4h, #95 +; CHECK-NEXT: mls v0.4h, v1.4h, v2.4h ; CHECK-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -97,47 +50,14 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) { ; CHECK-LABEL: combine_urem_udiv: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #8969 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: movk w8, #22765, lsl #16 -; CHECK-NEXT: umov w9, v0.h[1] -; CHECK-NEXT: umov w10, v0.h[0] -; CHECK-NEXT: umull x13, w9, w8 -; CHECK-NEXT: umov w11, v0.h[2] -; CHECK-NEXT: umull x14, w10, w8 -; CHECK-NEXT: lsr x13, x13, #32 -; CHECK-NEXT: umov w12, v0.h[3] -; CHECK-NEXT: umull x15, w11, w8 -; CHECK-NEXT: lsr x14, x14, #32 -; CHECK-NEXT: sub w16, w9, w13 -; CHECK-NEXT: umull x8, w12, w8 -; CHECK-NEXT: lsr x15, x15, #32 -; CHECK-NEXT: add w13, w13, w16, lsr #1 -; CHECK-NEXT: sub w16, w10, w14 -; CHECK-NEXT: lsr x8, x8, #32 -; CHECK-NEXT: add w14, w14, w16, lsr #1 -; CHECK-NEXT: sub w16, w11, w15 -; CHECK-NEXT: add w15, w15, w16, lsr #1 -; CHECK-NEXT: sub w16, w12, w8 -; CHECK-NEXT: add w8, w8, w16, lsr #1 -; CHECK-NEXT: mov w16, #95 -; CHECK-NEXT: lsr w14, w14, #6 -; CHECK-NEXT: lsr w13, w13, #6 -; CHECK-NEXT: msub w10, w14, w16, w10 -; CHECK-NEXT: lsr w15, w15, #6 -; CHECK-NEXT: msub w9, w13, w16, w9 -; CHECK-NEXT: fmov s0, w14 -; CHECK-NEXT: fmov s1, w10 -; CHECK-NEXT: lsr w8, w8, #6 -; CHECK-NEXT: msub w11, w15, w16, w11 -; CHECK-NEXT: mov v0.h[1], w13 -; CHECK-NEXT: mov v1.h[1], w9 -; CHECK-NEXT: msub w12, w8, w16, w12 -; CHECK-NEXT: mov v0.h[2], w15 -; CHECK-NEXT: mov v1.h[2], w11 -; CHECK-NEXT: mov v1.h[3], w12 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: add v0.4h, v1.4h, v0.4h +; CHECK-NEXT: mov w8, #44151 +; CHECK-NEXT: dup v2.4h, w8 +; CHECK-NEXT: umull v2.4s, v0.4h, v2.4h +; CHECK-NEXT: ushr v2.4s, v2.4s, #22 +; CHECK-NEXT: movi v1.4h, #95 +; CHECK-NEXT: xtn v2.4h, v2.4s +; CHECK-NEXT: mls v0.4h, v2.4h, v1.4h +; CHECK-NEXT: add v0.4h, v0.4h, v2.4h ; CHECK-NEXT: ret %1 = urem <4 x i16> %x, %2 = udiv <4 x i16> %x, @@ -150,28 +70,17 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) { ; CHECK-LABEL: dont_fold_urem_power_of_two: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #8969 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[3] -; CHECK-NEXT: movk w9, #22765, lsl #16 -; CHECK-NEXT: umull x9, w8, w9 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: sub w10, w8, w9 -; CHECK-NEXT: add w9, w9, w10, lsr #1 -; CHECK-NEXT: mov w10, #95 -; CHECK-NEXT: lsr w9, w9, #6 -; CHECK-NEXT: msub w8, w9, w10, w8 -; CHECK-NEXT: umov w9, v0.h[0] -; CHECK-NEXT: and w9, w9, #0x3f -; CHECK-NEXT: umov w10, v0.h[1] -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: umov w9, v0.h[2] -; CHECK-NEXT: and w10, w10, #0x1f -; CHECK-NEXT: and w9, w9, #0x7 -; CHECK-NEXT: mov v1.h[1], w10 -; CHECK-NEXT: mov v1.h[2], w9 -; CHECK-NEXT: mov v1.h[3], w8 -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: adrp x9, .LCPI3_1 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI3_0] +; CHECK-NEXT: ldr d2, [x9, :lo12:.LCPI3_1] +; CHECK-NEXT: adrp 
x8, .LCPI3_2 +; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI3_2] +; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h +; CHECK-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-NEXT: neg v2.4h, v2.4h +; CHECK-NEXT: ushl v1.4h, v1.4h, v2.4h +; CHECK-NEXT: mls v0.4h, v1.4h, v3.4h ; CHECK-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -181,34 +90,28 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { ; CHECK-LABEL: dont_fold_srem_one: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #17097 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[2] -; CHECK-NEXT: movk w9, #45590, lsl #16 -; CHECK-NEXT: umull x9, w8, w9 -; CHECK-NEXT: mov w10, #23 -; CHECK-NEXT: lsr x9, x9, #36 -; CHECK-NEXT: umov w11, v0.h[1] -; CHECK-NEXT: msub w8, w9, w10, w8 -; CHECK-NEXT: mov w9, #30865 -; CHECK-NEXT: movk w9, #51306, lsl #16 -; CHECK-NEXT: ubfx w10, w11, #1, #15 -; CHECK-NEXT: umull x9, w10, w9 -; CHECK-NEXT: mov w10, #654 -; CHECK-NEXT: lsr x9, x9, #40 -; CHECK-NEXT: msub w9, w9, w10, w11 -; CHECK-NEXT: mov w11, #47143 -; CHECK-NEXT: umov w10, v0.h[3] -; CHECK-NEXT: movk w11, #24749, lsl #16 -; CHECK-NEXT: movi d1, #0000000000000000 -; CHECK-NEXT: umull x11, w10, w11 -; CHECK-NEXT: mov v1.h[1], w9 -; CHECK-NEXT: mov w9, #5423 -; CHECK-NEXT: lsr x11, x11, #43 -; CHECK-NEXT: mov v1.h[2], w8 -; CHECK-NEXT: msub w8, w11, w9, w10 -; CHECK-NEXT: mov v1.h[3], w8 -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: adrp x8, .LCPI4_0 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI4_0] +; CHECK-NEXT: adrp x8, .LCPI4_1 +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI4_1] +; CHECK-NEXT: adrp x8, .LCPI4_2 +; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h +; CHECK-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-NEXT: sub v3.4h, v0.4h, v1.4h +; CHECK-NEXT: umull v2.4s, v3.4h, v2.4h +; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI4_2] +; CHECK-NEXT: shrn v2.4h, v2.4s, #16 +; CHECK-NEXT: add v1.4h, v2.4h, v1.4h +; CHECK-NEXT: adrp x8, .LCPI4_3 +; CHECK-NEXT: neg v3.4h, v3.4h +; CHECK-NEXT: movi d2, #0xffffffffffff0000 +; CHECK-NEXT: ushl v1.4h, v1.4h, v3.4h +; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI4_3] +; CHECK-NEXT: and v1.8b, v1.8b, v2.8b +; CHECK-NEXT: movi d2, #0x0000000000ffff +; CHECK-NEXT: and v2.8b, v0.8b, v2.8b +; CHECK-NEXT: orr v1.8b, v2.8b, v1.8b +; CHECK-NEXT: mls v0.4h, v1.4h, v3.4h ; CHECK-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -227,40 +130,62 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) { ; CHECK-LABEL: dont_fold_urem_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x10, #12109 -; CHECK-NEXT: movk x10, #52170, lsl #16 -; CHECK-NEXT: movk x10, #28749, lsl #32 +; CHECK-NEXT: mov x9, #12109 +; CHECK-NEXT: movk x9, #52170, lsl #16 +; CHECK-NEXT: movk x9, #28749, lsl #32 ; CHECK-NEXT: mov x8, v1.d[1] -; CHECK-NEXT: movk x10, #49499, lsl #48 -; CHECK-NEXT: umulh x10, x8, x10 -; CHECK-NEXT: mov w11, #5423 -; CHECK-NEXT: lsr x10, x10, #12 -; CHECK-NEXT: msub x8, x10, x11, x8 +; CHECK-NEXT: movk x9, #49499, lsl #48 +; CHECK-NEXT: umulh x8, x8, x9 +; CHECK-NEXT: mov x9, #17097 +; CHECK-NEXT: movk x9, #45590, lsl #16 +; CHECK-NEXT: movk x9, #34192, lsl #32 +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: movk x9, #25644, lsl #48 +; CHECK-NEXT: ushr v3.2d, v0.2d, #1 +; CHECK-NEXT: umulh x9, x10, x9 ; CHECK-NEXT: mov x10, #21445 +; CHECK-NEXT: fmov d4, x8 +; CHECK-NEXT: mov x8, v3.d[1] +; CHECK-NEXT: fmov d3, x9 ; CHECK-NEXT: movk x10, #1603, lsl #16 -; CHECK-NEXT: mov x12, v0.d[1] +; CHECK-NEXT: mov v3.d[1], v4.d[0] ; CHECK-NEXT: movk x10, #15432, lsl #32 +; CHECK-NEXT: sub v4.2d, v1.2d, v3.2d ; CHECK-NEXT: movk x10, #25653, lsl #48 -; CHECK-NEXT: lsr x11, 
x12, #1 -; CHECK-NEXT: umulh x10, x11, x10 -; CHECK-NEXT: mov w11, #654 -; CHECK-NEXT: lsr x10, x10, #7 -; CHECK-NEXT: msub x10, x10, x11, x12 -; CHECK-NEXT: mov x11, #17097 -; CHECK-NEXT: movk x11, #45590, lsl #16 -; CHECK-NEXT: movk x11, #34192, lsl #32 -; CHECK-NEXT: fmov x9, d1 -; CHECK-NEXT: movk x11, #25644, lsl #48 -; CHECK-NEXT: umulh x11, x9, x11 -; CHECK-NEXT: sub x12, x9, x11 -; CHECK-NEXT: add x11, x11, x12, lsr #1 -; CHECK-NEXT: mov w12, #23 -; CHECK-NEXT: lsr x11, x11, #4 -; CHECK-NEXT: msub x9, x11, x12, x9 -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: fmov d1, x9 -; CHECK-NEXT: mov v1.d[1], x8 -; CHECK-NEXT: mov v0.d[1], x10 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: umulh x8, x8, x10 +; CHECK-NEXT: lsr x9, x9, #1 +; CHECK-NEXT: fmov d2, xzr +; CHECK-NEXT: adrp x10, .LCPI6_0 +; CHECK-NEXT: fmov d4, x8 +; CHECK-NEXT: fmov d5, x9 +; CHECK-NEXT: mov v5.d[1], v2.d[0] +; CHECK-NEXT: mov v2.d[1], v4.d[0] +; CHECK-NEXT: ldr q4, [x10, :lo12:.LCPI6_0] +; CHECK-NEXT: add v3.2d, v5.2d, v3.2d +; CHECK-NEXT: adrp x8, .LCPI6_1 +; CHECK-NEXT: adrp x9, .LCPI6_2 +; CHECK-NEXT: neg v4.2d, v4.2d +; CHECK-NEXT: ushl v3.2d, v3.2d, v4.2d +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI6_1] +; CHECK-NEXT: mov x8, v3.d[1] +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI6_2] +; CHECK-NEXT: neg v4.2d, v4.2d +; CHECK-NEXT: mov w9, #23 +; CHECK-NEXT: ushl v2.2d, v2.2d, v4.2d +; CHECK-NEXT: mul x9, x10, x9 +; CHECK-NEXT: mov w10, #5423 +; CHECK-NEXT: bit v2.16b, v0.16b, v3.16b +; CHECK-NEXT: mul x8, x8, x10 +; CHECK-NEXT: mov w10, #654 +; CHECK-NEXT: fmov d3, x9 +; CHECK-NEXT: mov x9, v2.d[1] +; CHECK-NEXT: mul x9, x9, x10 +; CHECK-NEXT: mov v3.d[1], x8 +; CHECK-NEXT: mov v2.d[1], x9 +; CHECK-NEXT: sub v1.2d, v1.2d, v3.2d +; CHECK-NEXT: sub v0.2d, v0.2d, v2.2d ; CHECK-NEXT: ret %1 = urem <4 x i64> %x, ret <4 x i64> %1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -1,5 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update ; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GCN %s @@ -4926,127 +4925,34 @@ ; ; GCN-LABEL: udiv_i64_oddk_denom: ; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v0, 0x4f176a73 -; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GCN-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 -; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_movk_i32 s2, 0xfee0 -; GCN-NEXT: s_mov_b32 s3, 0x68958c89 -; GCN-NEXT: v_mov_b32_e32 v8, 0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mov_b32_e32 v7, 0 ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s2 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s3 -; GCN-NEXT: v_mul_lo_u32 v4, v1, s3 -; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s8, 0x38f83e5 +; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 
s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s4 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_mul_lo_u32 v3, v0, s3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v4, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v6, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v9, v1, v2 -; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: s_movk_i32 s4, 0x11e -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v1, v3 -; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GCN-NEXT: s_mov_b32 s10, -1 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v4, v0, s2 -; GCN-NEXT: v_mul_hi_u32 v5, v0, s3 -; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] -; GCN-NEXT: v_mul_lo_u32 v6, v2, s3 -; GCN-NEXT: s_mov_b32 s2, 0x976a7377 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_mul_lo_u32 v5, v0, s3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v10, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v5 -; GCN-NEXT: v_mul_hi_u32 v11, v2, v4 -; GCN-NEXT: s_movk_i32 s3, 0x11f -; GCN-NEXT: s_mov_b32 s9, s5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc -; GCN-NEXT: v_mul_lo_u32 v10, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v5 -; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v11, v7, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s6, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s6, v0 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_mov_b32 s5, 0x64c139ef +; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_mul_hi_u32 v4, s6, v1 -; GCN-NEXT: v_mul_hi_u32 v5, s7, v1 -; GCN-NEXT: v_mul_lo_u32 v1, s7, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s7, v1 +; GCN-NEXT: v_mul_hi_u32 v2, s6, v0 +; GCN-NEXT: s_mul_i32 s5, s7, s5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s5, v4 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mul_i32 s4, s6, s8 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, s4, v4 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v4, s7, v0 ; GCN-NEXT: v_mul_hi_u32 v0, s7, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v0, s3 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v4, v1, s2 -; GCN-NEXT: v_mov_b32_e32 v5, s3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_mul_lo_u32 v3, v0, s2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, s7, v2 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc -; GCN-NEXT: v_subrev_i32_e64 v5, s[0:1], s2, v3 -; 
GCN-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] -; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s4, v4 -; GCN-NEXT: s_mov_b32 s2, 0x976a7376 -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s2, v5 -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v4 -; GCN-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] -; GCN-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 -; GCN-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] -; GCN-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 -; GCN-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 -; GCN-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v6, s7 -; GCN-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s4, v2 -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s2, v3 -; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s3, v2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc +; GCN-NEXT: s_mul_i32 s4, s7, s8 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s4, v2 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, v0, v3, vcc +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 2, v0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm %r = udiv i64 %x, 1235195949943 store i64 %r, i64 addrspace(1)* %out @@ -5149,113 +5055,40 @@ ; ; GCN-LABEL: udiv_v2i64_mixed_pow2k_denom: ; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v0, 0x4f800000 -; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 -; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_movk_i32 s6, 0xf001 -; GCN-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GCN-NEXT: s_movk_i32 s0, 0xfff -; GCN-NEXT: v_mul_hi_u32 v3, v0, s6 -; GCN-NEXT: v_mul_lo_u32 v5, v1, s6 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s6 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: v_subrev_i32_e32 v3, vcc, v0, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v0, v4 -; GCN-NEXT: v_mul_lo_u32 v5, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v9, v1, v3 -; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v7, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v8, v1, v4 -; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_add_i32_e64 v0, s[2:3], v0, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v5, vcc -; GCN-NEXT: v_mul_hi_u32 v5, v0, s6 -; GCN-NEXT: v_addc_u32_e64 v3, vcc, v1, v4, s[2:3] -; GCN-NEXT: v_mul_lo_u32 v6, v3, s6 -; GCN-NEXT: v_mul_lo_u32 v8, v0, s6 -; GCN-NEXT: v_subrev_i32_e32 v5, vcc, v0, v5 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v5 -; GCN-NEXT: v_mul_hi_u32 
v9, v0, v8 -; GCN-NEXT: v_mul_hi_u32 v10, v0, v5 -; GCN-NEXT: v_mul_hi_u32 v11, v3, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, v7, v10, vcc -; GCN-NEXT: v_mul_lo_u32 v10, v3, v8 -; GCN-NEXT: v_mul_hi_u32 v8, v3, v8 -; GCN-NEXT: v_mul_lo_u32 v3, v3, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v9, v8, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v11, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[2:3] -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GCN-NEXT: s_mov_b32 s9, 0x10010011 +; GCN-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NEXT: s_mov_b32 s10, 0x100100 +; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mul_lo_u32 v3, s10, v1 -; GCN-NEXT: v_mul_hi_u32 v4, s10, v0 -; GCN-NEXT: v_mul_hi_u32 v5, s10, v1 -; GCN-NEXT: v_mul_hi_u32 v6, s11, v1 -; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v5, s11, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 -; GCN-NEXT: s_lshr_b64 s[2:3], s[8:9], 12 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v6, v2, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v7, v2, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v1, s0 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s0 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, s10, v4 -; GCN-NEXT: v_subb_u32_e32 v2, vcc, v3, v2, vcc -; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s0, v4 -; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v2, vcc -; GCN-NEXT: s_movk_i32 s0, 0xffe -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s0, v3 -; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GCN-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 2, v0 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v0 -; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v4 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc -; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 -; GCN-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc -; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mul_hi_u32 v3, s2, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s3, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s2, v0 +; GCN-NEXT: s_mul_i32 s9, s3, s9 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s9, v3 +; GCN-NEXT: s_mul_i32 s8, s2, s10 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GCN-NEXT: v_add_i32_e32 v3, vcc, s8, v3 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 +; GCN-NEXT: v_addc_u32_e64 v2, s[8:9], 0, 0, vcc +; GCN-NEXT: s_mul_i32 s8, s3, s10 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s8, v1 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, v0, v2, vcc ; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v3 +; GCN-NEXT: 
v_subb_u32_e32 v1, vcc, v1, v2, vcc +; GCN-NEXT: v_lshr_b64 v[0:1], v[0:1], 1 +; GCN-NEXT: s_lshr_b64 s[0:1], s[0:1], 12 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc +; GCN-NEXT: v_lshr_b64 v[2:3], v[0:1], 11 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm %r = udiv <2 x i64> %x, @@ -5309,126 +5142,41 @@ ; ; GCN-LABEL: urem_i64_oddk_denom: ; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v0, 0x4f1761f8 -; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GCN-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 -; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_movk_i32 s2, 0xfee0 -; GCN-NEXT: s_mov_b32 s3, 0x689e0837 -; GCN-NEXT: v_mov_b32_e32 v8, 0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mov_b32_e32 v7, 0 ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s2 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s3 -; GCN-NEXT: v_mul_lo_u32 v4, v1, s3 -; GCN-NEXT: s_movk_i32 s12, 0x11f -; GCN-NEXT: s_mov_b32 s13, 0x9761f7c9 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_mul_lo_u32 v3, v0, s3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v4, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v6, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v9, v1, v2 -; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: s_mov_b32 s10, 0xe3e10011 +; GCN-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s9, s5 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v1, v3 -; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GCN-NEXT: s_movk_i32 s5, 0x11e -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v4, v0, s2 -; GCN-NEXT: v_mul_hi_u32 v5, v0, s3 -; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] -; GCN-NEXT: v_mul_lo_u32 v6, v2, s3 -; GCN-NEXT: s_mov_b32 s8, s4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_mul_lo_u32 v5, v0, s3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v10, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v5 -; GCN-NEXT: v_mul_hi_u32 v11, v2, v4 -; GCN-NEXT: s_mov_b32 s4, 0x9761f7c8 -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc -; GCN-NEXT: v_mul_lo_u32 v10, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v5 -; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 -; GCN-NEXT: s_mov_b32 s10, -1 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v11, v7, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: 
v_mul_lo_u32 v2, s6, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s6, v0 -; GCN-NEXT: v_mul_hi_u32 v4, s6, v1 -; GCN-NEXT: v_mul_hi_u32 v5, s7, v1 -; GCN-NEXT: v_mul_lo_u32 v1, s7, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v4, s7, v0 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s4, 0xf6841139 +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mul_hi_u32 v3, s6, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s7, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 +; GCN-NEXT: s_mul_i32 s4, s7, s4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; GCN-NEXT: s_mul_i32 s1, s6, s10 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GCN-NEXT: v_add_i32_e32 v3, vcc, s1, v3 ; GCN-NEXT: v_mul_hi_u32 v0, s7, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v0, s12 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s13 -; GCN-NEXT: v_mul_lo_u32 v1, v1, s13 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s13 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, s7, v1 -; GCN-NEXT: v_mov_b32_e32 v3, s12 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc -; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s13, v0 -; GCN-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] -; GCN-NEXT: v_cmp_lt_u32_e64 s[2:3], s5, v5 -; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] -; GCN-NEXT: v_cmp_lt_u32_e64 s[2:3], s4, v4 -; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s13, v4 -; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] -; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, v5 -; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] -; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v5, s7 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s5, v1 -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s4, v0 -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s12, v1 -; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: s_mul_i32 s1, s7, s10 +; GCN-NEXT: v_addc_u32_e64 v2, s[8:9], 0, 0, vcc +; GCN-NEXT: v_add_i32_e32 v1, vcc, s1, v1 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, v0, v2, vcc +; GCN-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GCN-NEXT: s_mov_b32 s4, 0x9761f7c9 +; GCN-NEXT: v_mul_hi_u32 v1, v0, s4 +; GCN-NEXT: v_mul_lo_u32 v2, v0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_movk_i32 s5, 0x11f +; GCN-NEXT: v_mad_u32_u24 v1, v0, s5, v1 +; GCN-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v2 +; GCN-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm %r = urem i64 %x, 1235195393993 store i64 %r, i64 addrspace(1)* %out @@ -5575,122 +5323,44 @@ ; ; GCN-LABEL: sdiv_i64_oddk_denom: ; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v0, 0x4f800000 -; GCN-NEXT: v_madak_f32 v0, 0, v0, 
0x4996c7d8 -; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s2, 0xffed2705 -; GCN-NEXT: v_mov_b32_e32 v8, 0 -; GCN-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 -; GCN-NEXT: v_mul_lo_u32 v2, v1, s2 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s2 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s9, 0xfd81e19 +; GCN-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NEXT: s_mov_b32 s8, 0x6ca94220 +; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, s8 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v6, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v3, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v9, v1, v2 -; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 -; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc -; GCN-NEXT: s_mov_b32 s5, s9 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] -; GCN-NEXT: v_mul_lo_u32 v4, v2, s2 -; GCN-NEXT: v_mul_hi_u32 v5, s2, v0 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_mul_lo_u32 v5, v0, s2 -; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 -; GCN-NEXT: v_mul_lo_u32 v10, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v12, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v11, v0, v5 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v5 -; GCN-NEXT: v_mul_lo_u32 v5, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 -; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, v8, v12, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v7, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-NEXT: s_ashr_i32 s2, s11, 31 -; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] -; GCN-NEXT: s_add_u32 s0, s10, s2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: s_mov_b32 s3, s2 -; GCN-NEXT: s_addc_u32 s1, s11, s2 -; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: v_mul_hi_u32 v3, s6, v2 +; GCN-NEXT: v_mul_hi_u32 v4, s7, v2 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 +; GCN-NEXT: s_mul_i32 s5, s7, s9 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s5, v3 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mul_i32 s4, s6, s8 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v3, vcc, s4, v3 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 -; GCN-NEXT: v_mul_hi_u32 v4, s0, v1 -; GCN-NEXT: v_mul_hi_u32 v5, s1, v1 -; GCN-NEXT: v_mul_lo_u32 v1, s1, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 -; GCN-NEXT: v_mul_hi_u32 
v0, s1, v0 -; GCN-NEXT: s_mov_b32 s3, 0x12d8fb -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v1, s3 -; GCN-NEXT: v_mul_hi_u32 v3, s3, v0 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, s0, v4 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_subb_u32_e32 v2, vcc, v3, v2, vcc -; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s3, v4 -; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v2, vcc -; GCN-NEXT: s_mov_b32 s0, 0x12d8fa -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s0, v3 -; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GCN-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 2, v0 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v0 -; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v4 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc -; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GCN-NEXT: v_xor_b32_e32 v0, s2, v0 -; GCN-NEXT: v_xor_b32_e32 v1, s2, v1 -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v4, v1 +; GCN-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc +; GCN-NEXT: s_ashr_i32 s5, s7, 31 +; GCN-NEXT: v_mul_hi_u32 v0, s7, v0 +; GCN-NEXT: v_mul_hi_u32 v2, s5, v2 +; GCN-NEXT: s_mul_i32 s4, s7, s8 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s4, v1 +; GCN-NEXT: s_mul_i32 s4, s5, s8 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v2 +; GCN-NEXT: s_mul_i32 s5, s5, s9 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s5, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s5, v1 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc +; GCN-NEXT: v_ashr_i64 v[2:3], v[0:1], 19 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 31, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm %r = sdiv i64 %x, 1235195 store i64 %r, i64 addrspace(1)* %out @@ -5930,128 +5600,57 @@ ; ; GCN-LABEL: ssdiv_v2i64_mixed_pow2k_denom: ; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v0, 0x457ff000 -; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GCN-NEXT: v_mac_f32_e32 v0, 0, v1 -; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_movk_i32 s6, 0xf001 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GCN-NEXT: s_mov_b32 s10, 0x8008009 +; GCN-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NEXT: s_mov_b32 s8, 0x80080080 +; 
GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s0, s9, 31 -; GCN-NEXT: s_lshr_b32 s0, s0, 20 -; GCN-NEXT: v_mul_hi_u32 v2, s6, v0 -; GCN-NEXT: v_mul_lo_u32 v3, v1, s6 -; GCN-NEXT: s_add_u32 s2, s8, s0 -; GCN-NEXT: s_addc_u32 s3, s9, 0 -; GCN-NEXT: s_ashr_i32 s8, s11, 31 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_mul_lo_u32 v3, v0, s6 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GCN-NEXT: v_mul_lo_u32 v4, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v5, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 -; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc -; GCN-NEXT: v_mul_lo_u32 v6, v1, v3 -; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 -; GCN-NEXT: s_mov_b32 s9, s8 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc -; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_mov_b32_e32 v6, 0 -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc -; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] -; GCN-NEXT: v_mul_lo_u32 v5, v2, s6 -; GCN-NEXT: v_mul_hi_u32 v7, s6, v0 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GCN-NEXT: v_mul_lo_u32 v7, v0, s6 -; GCN-NEXT: v_subrev_i32_e32 v5, vcc, v0, v5 -; GCN-NEXT: v_mul_lo_u32 v10, v0, v5 -; GCN-NEXT: v_mul_hi_u32 v12, v0, v5 -; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v7 -; GCN-NEXT: v_mul_lo_u32 v7, v2, v7 -; GCN-NEXT: v_mul_hi_u32 v8, v2, v5 -; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v2, v5 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc +; GCN-NEXT: v_mul_hi_u32 v4, s2, v2 +; GCN-NEXT: v_mul_hi_u32 v3, s3, v2 +; GCN-NEXT: s_ashr_i32 s9, s1, 31 +; GCN-NEXT: v_mul_hi_u32 v1, s2, v0 +; GCN-NEXT: s_mul_i32 s11, s3, s10 +; GCN-NEXT: s_lshr_b32 s9, s9, 20 +; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s11, v4 +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_mul_i32 s9, s2, s8 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s9, v4 +; GCN-NEXT: s_ashr_i32 s11, s3, 31 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc +; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 +; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GCN-NEXT: s_mul_i32 s9, s3, s8 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v5, vcc +; GCN-NEXT: v_add_i32_e32 v1, vcc, s9, v1 +; GCN-NEXT: s_mul_i32 s8, s11, s8 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, v0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, s8, v2 +; GCN-NEXT: s_mul_i32 s8, s11, s10 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s8, v2 +; GCN-NEXT: v_mov_b32_e32 v3, s8 +; GCN-NEXT: v_mov_b32_e32 v4, s3 +; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s2, v3 +; GCN-NEXT: v_subb_u32_e32 v2, vcc, v2, v4, vcc ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[0:1] -; GCN-NEXT: s_add_u32 s0, s10, s8 -; GCN-NEXT: s_addc_u32 s1, s11, s8 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[8:9] -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: 
v_mul_lo_u32 v2, s0, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 -; GCN-NEXT: v_mul_hi_u32 v5, s0, v1 -; GCN-NEXT: v_mul_hi_u32 v7, s1, v1 -; GCN-NEXT: v_mul_lo_u32 v1, s1, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 -; GCN-NEXT: s_movk_i32 s9, 0xfff +; GCN-NEXT: v_addc_u32_e32 v2, vcc, v0, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s2, v1 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc +; GCN-NEXT: v_ashr_i64 v[2:3], v[0:1], 11 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 31, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v0 +; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v1, s9 -; GCN-NEXT: v_mul_hi_u32 v3, s9, v0 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s9 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, s0, v4 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_subb_u32_e32 v2, vcc, v3, v2, vcc -; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s9, v4 -; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v2, vcc -; GCN-NEXT: s_movk_i32 s0, 0xffe -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s0, v3 -; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GCN-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 2, v0 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v0 -; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v4 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc -; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GCN-NEXT: v_xor_b32_e32 v0, s8, v0 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s8, v0 -; GCN-NEXT: v_xor_b32_e32 v1, s8, v1 -; GCN-NEXT: v_mov_b32_e32 v3, s8 -; GCN-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm %r = sdiv <2 x i64> %x, @@ -6356,120 +5955,51 @@ ; ; GCN-LABEL: srem_i64_oddk_denom: ; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v0, 0x4f800000 -; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 -; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s2, 0xffed2705 -; GCN-NEXT: v_mov_b32_e32 v8, 0 -; GCN-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 -; GCN-NEXT: v_mul_lo_u32 v2, v1, s2 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s2 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s9, 0xfd81e19 +; GCN-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NEXT: s_mov_b32 s8, 0x6ca94220 +; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, s8 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v6, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v3, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v9, v1, v2 -; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 -; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc -; GCN-NEXT: s_mov_b32 s5, s9 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] -; GCN-NEXT: v_mul_lo_u32 v4, v2, s2 -; GCN-NEXT: v_mul_hi_u32 v5, s2, v0 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_mul_lo_u32 v5, v0, s2 -; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 -; GCN-NEXT: v_mul_lo_u32 v10, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v12, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v11, v0, v5 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v5 -; GCN-NEXT: v_mul_lo_u32 v5, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 -; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, v8, v12, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v7, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-NEXT: s_ashr_i32 s2, s11, 31 -; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] -; GCN-NEXT: s_add_u32 s0, s10, s2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: s_mov_b32 s3, s2 -; GCN-NEXT: s_addc_u32 s1, s11, s2 -; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: v_mul_hi_u32 v3, s6, v2 +; GCN-NEXT: v_mul_hi_u32 v4, s7, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 +; GCN-NEXT: s_mul_i32 s1, s7, s9 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s1, v3 +; GCN-NEXT: s_mul_i32 s0, s6, s8 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v3, vcc, s0, v3 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 -; GCN-NEXT: v_mul_hi_u32 v4, s0, v1 -; GCN-NEXT: v_mul_hi_u32 v5, s1, v1 -; GCN-NEXT: v_mul_lo_u32 v1, s1, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 -; GCN-NEXT: s_mov_b32 s3, 0x12d8fb -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc -; GCN-NEXT: v_mul_hi_u32 v2, s3, v0 -; GCN-NEXT: v_mul_lo_u32 v1, v1, s3 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s3 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 -; GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v4, v1 +; GCN-NEXT: v_addc_u32_e64 v3, s[0:1], 0, 0, vcc +; GCN-NEXT: s_ashr_i32 s1, s7, 31 +; GCN-NEXT: v_mul_hi_u32 v0, s7, v0 +; GCN-NEXT: v_mul_hi_u32 v2, s1, v2 +; 
GCN-NEXT: s_mul_i32 s0, s7, s8 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s0, v1 +; GCN-NEXT: s_mul_i32 s0, s1, s8 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; GCN-NEXT: s_mul_i32 s1, s1, s9 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s1, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s1, v1 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc +; GCN-NEXT: v_ashr_i64 v[2:3], v[0:1], 19 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 31, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-NEXT: s_mov_b32 s8, 0x12d8fb +; GCN-NEXT: v_mul_hi_u32 v2, v0, s8 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s8 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: v_mad_i32_i24 v1, v1, s8, v2 +; GCN-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s3, v0 -; GCN-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v1, vcc -; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s3, v2 -; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v3, vcc -; GCN-NEXT: s_mov_b32 s0, 0x12d8fa -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s0, v2 -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GCN-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v0 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v5, -1, v5, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GCN-NEXT: v_xor_b32_e32 v0, s2, v0 -; GCN-NEXT: v_xor_b32_e32 v1, s2, v1 -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm %r = srem i64 %x, 1235195 store i64 %r, i64 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -1424,110 +1424,33 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x) { ; GCN-LABEL: s_test_udiv_k_den_i64: ; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v0, 0x4f800000 -; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x41c00000 -; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_movk_i32 s2, 0xffe8 -; GCN-NEXT: v_mov_b32_e32 v8, 0 -; GCN-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: v_mul_hi_u32 v2, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v3, v1, s2 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s2 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v6, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v3, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v9, v1, v2 -; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v5, 
vcc, v6, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 -; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s10, 0xaaaaaaaa +; GCN-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, s8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GCN-NEXT: v_mul_hi_u32 v4, v0, s2 -; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] -; GCN-NEXT: v_mul_lo_u32 v5, v2, s2 -; GCN-NEXT: v_mul_lo_u32 v6, v0, s2 -; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 -; GCN-NEXT: s_mov_b32 s5, s9 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GCN-NEXT: v_mul_lo_u32 v5, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v6 -; GCN-NEXT: v_mul_hi_u32 v10, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v11, v2, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc -; GCN-NEXT: v_mul_lo_u32 v10, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v6 -; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v11, v7, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s4, 0xaaaaaaab +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mul_hi_u32 v3, s6, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s7, v2 +; GCN-NEXT: s_mul_i32 s4, s7, s4 +; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; GCN-NEXT: s_mul_i32 s1, s6, s10 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GCN-NEXT: v_add_i32_e32 v3, vcc, s1, v3 +; GCN-NEXT: v_mul_hi_u32 v3, s7, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 -; GCN-NEXT: v_mul_hi_u32 v4, s10, v1 -; GCN-NEXT: v_mul_hi_u32 v5, s11, v1 -; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v4, s11, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v1, 24 -; GCN-NEXT: v_mul_hi_u32 v3, v0, 24 -; GCN-NEXT: v_mul_lo_u32 v4, v0, 24 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, s10, v4 -; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: v_subb_u32_e32 v2, vcc, v3, v2, vcc -; GCN-NEXT: v_subrev_i32_e32 v3, vcc, 24, v4 -; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v2, vcc -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 23, v3 -; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GCN-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 2, v0 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v0 -; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], 23, v4 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc -; 
GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 -; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc -; GCN-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-NEXT: s_mul_i32 s1, s7, s10 +; GCN-NEXT: v_addc_u32_e64 v2, s[8:9], 0, 0, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, s1, v1 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc +; GCN-NEXT: v_lshr_b64 v[0:1], v[0:1], 4 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_udiv_k_den_i64: @@ -1618,102 +1541,25 @@ ; GCN-LABEL: v_test_udiv_k_den_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GCN-NEXT: v_madak_f32 v2, 0, v2, 0x41c00000 -; GCN-NEXT: v_rcp_f32_e32 v2, v2 -; GCN-NEXT: s_movk_i32 s6, 0xffe8 -; GCN-NEXT: v_mov_b32_e32 v10, 0 -; GCN-NEXT: v_mov_b32_e32 v9, 0 -; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; GCN-NEXT: v_trunc_f32_e32 v3, v3 -; GCN-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GCN-NEXT: v_mul_hi_u32 v4, v2, s6 -; GCN-NEXT: v_mul_lo_u32 v5, v3, s6 -; GCN-NEXT: v_mul_lo_u32 v6, v2, s6 -; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v2, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_mul_lo_u32 v7, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v8, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v11, v3, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v3, v4 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-NEXT: v_mul_lo_u32 v8, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v6, v3, v6 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v10, v5, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v11, v9, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_add_i32_e64 v2, s[4:5], v2, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v10, v6, vcc -; GCN-NEXT: v_mul_hi_u32 v6, v2, s6 -; GCN-NEXT: v_addc_u32_e64 v4, vcc, v3, v5, s[4:5] -; GCN-NEXT: v_mul_lo_u32 v7, v4, s6 -; GCN-NEXT: v_mul_lo_u32 v8, v2, s6 -; GCN-NEXT: v_subrev_i32_e32 v6, vcc, v2, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; GCN-NEXT: v_mul_lo_u32 v7, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v11, v2, v8 -; GCN-NEXT: v_mul_hi_u32 v12, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v13, v4, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v11, v7 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, v10, v12, vcc -; GCN-NEXT: v_mul_lo_u32 v12, v4, v8 -; GCN-NEXT: v_mul_hi_u32 v8, v4, v8 -; GCN-NEXT: v_mul_lo_u32 v4, v4, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v8, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v13, v9, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v10, v6, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e64 v3, vcc, v3, v6, s[4:5] -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_mul_lo_u32 v4, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v5, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v6, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v7, v1, v3 -; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 -; 
GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v10, v6, vcc -; GCN-NEXT: v_mul_lo_u32 v6, v1, v2 -; GCN-NEXT: v_mul_hi_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v9, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v10, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v4, v3, 24 -; GCN-NEXT: v_mul_hi_u32 v5, v2, 24 -; GCN-NEXT: v_mul_lo_u32 v6, v2, 24 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc -; GCN-NEXT: v_subrev_i32_e32 v4, vcc, 24, v0 -; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v1, vcc -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 23, v4 -; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GCN-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 2, v2 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v2 -; GCN-NEXT: v_cmp_lt_u32_e64 s[4:5], 23, v0 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v3, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_cndmask_b32_e64 v0, -1, v0, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc -; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v6, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v4, s[4:5] -; GCN-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] +; GCN-NEXT: s_mov_b32 s4, 0xaaaaaaab +; GCN-NEXT: v_mul_lo_u32 v3, v1, s4 +; GCN-NEXT: v_mul_hi_u32 v4, v0, s4 +; GCN-NEXT: s_mov_b32 s6, 0xaaaaaaaa +; GCN-NEXT: v_mul_hi_u32 v5, v1, s4 +; GCN-NEXT: v_mul_hi_u32 v2, v0, s6 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s6 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, s6 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s6 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GCN-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 4 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 4, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GCN-IR-LABEL: v_test_udiv_k_den_i64: diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -958,108 +958,41 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x) { ; GCN-LABEL: s_test_urem_k_den_i64: ; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v0, 0x4f800000 -; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x41c00000 -; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_movk_i32 s2, 0xffe8 -; GCN-NEXT: v_mov_b32_e32 v8, 0 -; GCN-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: v_mul_hi_u32 v2, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v3, v1, s2 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s2 -; GCN-NEXT: s_mov_b32 s10, -1 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: 
v_mul_lo_u32 v5, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v6, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v3, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v9, v1, v2 -; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 -; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc +; GCN-NEXT: s_mov_b32 s10, 0xaaaaaaaa +; GCN-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GCN-NEXT: v_mul_hi_u32 v4, v0, s2 -; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] -; GCN-NEXT: v_mul_lo_u32 v5, v2, s2 -; GCN-NEXT: v_mul_lo_u32 v6, v0, s2 -; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 -; GCN-NEXT: s_mov_b32 s9, s5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GCN-NEXT: v_mul_lo_u32 v5, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v6 -; GCN-NEXT: v_mul_hi_u32 v10, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v11, v2, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc -; GCN-NEXT: v_mul_lo_u32 v10, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v6 -; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v11, v7, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s6, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s6, v0 -; GCN-NEXT: v_mul_hi_u32 v4, s6, v1 -; GCN-NEXT: v_mul_hi_u32 v5, s7, v1 -; GCN-NEXT: v_mul_lo_u32 v1, s7, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v4, s7, v0 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s4, 0xaaaaaaab +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mul_hi_u32 v3, s6, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s7, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 +; GCN-NEXT: s_mul_i32 s4, s7, s4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; GCN-NEXT: s_mul_i32 s1, s6, s10 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GCN-NEXT: v_add_i32_e32 v3, vcc, s1, v3 ; GCN-NEXT: v_mul_hi_u32 v0, s7, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc -; GCN-NEXT: v_mul_hi_u32 v2, v0, 24 -; GCN-NEXT: v_mul_lo_u32 v1, v1, 24 -; GCN-NEXT: v_mul_lo_u32 v0, v0, 24 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, 24, v0 -; GCN-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v1, vcc -; GCN-NEXT: v_subrev_i32_e32 v4, vcc, 24, v2 -; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v3, vcc -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 23, v2 -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GCN-NEXT: 
v_cndmask_b32_e32 v6, -1, v6, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], 23, v0 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v5, -1, v5, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: s_mul_i32 s1, s7, s10 +; GCN-NEXT: v_addc_u32_e64 v2, s[8:9], 0, 0, vcc +; GCN-NEXT: v_add_i32_e32 v1, vcc, s1, v1 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, v0, v2, vcc +; GCN-NEXT: v_alignbit_b32 v1, v0, v1, 4 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 4, v0 +; GCN-NEXT: v_mul_hi_u32 v2, v1, 24 +; GCN-NEXT: v_mul_lo_u32 v0, v0, 24 +; GCN-NEXT: v_mul_lo_u32 v1, v1, 24 +; GCN-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v1 +; GCN-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_urem_k_den_i64: diff --git a/llvm/test/CodeGen/BPF/32-bit-subreg-alu.ll b/llvm/test/CodeGen/BPF/32-bit-subreg-alu.ll --- a/llvm/test/CodeGen/BPF/32-bit-subreg-alu.ll +++ b/llvm/test/CodeGen/BPF/32-bit-subreg-alu.ll @@ -190,7 +190,10 @@ define dso_local i32 @div_i(i32 %a) local_unnamed_addr #0 { entry: %div = udiv i32 %a, 15 -; CHECK: w{{[0-9]+}} /= 15 +; CHECK: [[REG1:r[0-9]+]] = w{{[0-9]+}} +; CHECK: [[REG2:r[0-9]+]] = 2290649225 ll +; CHECK: [[REG1]] *= [[REG2]] +; CHECK: [[REG1]] >>= 35 ret i32 %div } diff --git a/llvm/test/CodeGen/BPF/sdiv_error.ll b/llvm/test/CodeGen/BPF/sdiv_error.ll --- a/llvm/test/CodeGen/BPF/sdiv_error.ll +++ b/llvm/test/CodeGen/BPF/sdiv_error.ll @@ -3,7 +3,7 @@ ; CHECK: Unsupport signed division ; Function Attrs: norecurse nounwind readnone -define i32 @test(i32 %len) #0 { - %1 = srem i32 %len, 15 +define i32 @test(i32 %len, i32 %rhs) #0 { + %1 = srem i32 %len, %rhs ret i32 %1 } diff --git a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll --- a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll +++ b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll @@ -617,7 +617,6 @@ ; CHECK-LABEL: test_ds_cross_basic_blocks: ; CHECK: # %bb.0: ; CHECK-NEXT: cmplwi r4, 0 -; CHECK-NEXT: std r26, -48(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r27, -40(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r28, -32(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill @@ -629,59 +628,57 @@ ; CHECK-NEXT: addi r6, r3, 4009 ; CHECK-NEXT: addis r5, r2, .LC0@toc@ha ; CHECK-NEXT: ld r5, .LC0@toc@l(r5) -; CHECK-NEXT: iselgt r8, r4, r7 -; CHECK-NEXT: lis r4, -21846 +; CHECK-NEXT: iselgt r4, r4, r7 ; CHECK-NEXT: li r3, 0 -; CHECK-NEXT: li r9, -7 -; CHECK-NEXT: li r10, -6 +; CHECK-NEXT: li r8, -7 +; CHECK-NEXT: li r9, -6 +; CHECK-NEXT: li r10, 1 ; CHECK-NEXT: li r11, 1 ; CHECK-NEXT: li r12, 1 ; CHECK-NEXT: li r30, 1 ; CHECK-NEXT: ld r5, 0(r5) -; CHECK-NEXT: mtctr r8 -; CHECK-NEXT: li r8, -9 +; CHECK-NEXT: mtctr r4 +; CHECK-NEXT: li r4, -9 ; CHECK-NEXT: addi r5, r5, -1 -; CHECK-NEXT: ori r4, r4, 43691 ; CHECK-NEXT: li r29, 1 -; CHECK-NEXT: li r28, 1 ; CHECK-NEXT: b .LBB6_4 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB6_2: -; CHECK-NEXT: ldx r0, r6, r8 -; CHECK-NEXT: add r28, r0, r28 -; CHECK-NEXT: ld r0, 
-8(r6) +; CHECK-NEXT: ldx r0, r6, r4 ; CHECK-NEXT: add r29, r0, r29 +; CHECK-NEXT: ld r0, -8(r6) +; CHECK-NEXT: add r30, r0, r30 ; CHECK-NEXT: .LBB6_3: ; CHECK-NEXT: addi r6, r6, 1 -; CHECK-NEXT: mulld r0, r29, r28 -; CHECK-NEXT: mulld r0, r0, r30 +; CHECK-NEXT: mulld r0, r30, r29 ; CHECK-NEXT: mulld r0, r0, r12 ; CHECK-NEXT: mulld r0, r0, r11 +; CHECK-NEXT: mulld r0, r0, r10 ; CHECK-NEXT: maddld r3, r0, r7, r3 ; CHECK-NEXT: bdz .LBB6_9 ; CHECK-NEXT: .LBB6_4: ; CHECK-NEXT: lbzu r0, 1(r5) -; CHECK-NEXT: mulhwu r27, r0, r4 -; CHECK-NEXT: rlwinm r26, r27, 0, 0, 30 -; CHECK-NEXT: srwi r27, r27, 1 -; CHECK-NEXT: add r27, r27, r26 -; CHECK-NEXT: sub r0, r0, r27 +; CHECK-NEXT: mulli r28, r0, 171 +; CHECK-NEXT: rlwinm r27, r28, 24, 8, 30 +; CHECK-NEXT: srwi r28, r28, 9 +; CHECK-NEXT: add r28, r28, r27 +; CHECK-NEXT: sub r0, r0, r28 +; CHECK-NEXT: clrlwi r0, r0, 24 ; CHECK-NEXT: cmplwi r0, 1 ; CHECK-NEXT: beq cr0, .LBB6_2 ; CHECK-NEXT: # %bb.5: -; CHECK-NEXT: clrlwi r0, r0, 24 ; CHECK-NEXT: cmplwi r0, 2 ; CHECK-NEXT: bne cr0, .LBB6_7 ; CHECK-NEXT: # %bb.6: -; CHECK-NEXT: ldx r0, r6, r9 -; CHECK-NEXT: add r30, r0, r30 -; CHECK-NEXT: ld r0, -4(r6) +; CHECK-NEXT: ldx r0, r6, r8 ; CHECK-NEXT: add r12, r0, r12 +; CHECK-NEXT: ld r0, -4(r6) +; CHECK-NEXT: add r11, r0, r11 ; CHECK-NEXT: b .LBB6_3 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB6_7: -; CHECK-NEXT: ldx r0, r6, r10 -; CHECK-NEXT: add r11, r0, r11 +; CHECK-NEXT: ldx r0, r6, r9 +; CHECK-NEXT: add r10, r0, r10 ; CHECK-NEXT: ld r0, 0(r6) ; CHECK-NEXT: add r7, r0, r7 ; CHECK-NEXT: b .LBB6_3 @@ -692,7 +689,6 @@ ; CHECK-NEXT: ld r29, -24(r1) # 8-byte Folded Reload ; CHECK-NEXT: ld r28, -32(r1) # 8-byte Folded Reload ; CHECK-NEXT: ld r27, -40(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r26, -48(r1) # 8-byte Folded Reload ; CHECK-NEXT: blr %3 = sext i32 %1 to i64 %4 = icmp eq i32 %1, 0 diff --git a/llvm/test/CodeGen/PowerPC/srem-lkk.ll b/llvm/test/CodeGen/PowerPC/srem-lkk.ll --- a/llvm/test/CodeGen/PowerPC/srem-lkk.ll +++ b/llvm/test/CodeGen/PowerPC/srem-lkk.ll @@ -130,20 +130,93 @@ ; Don't fold i64 srem define i64 @dont_fold_srem_i64(i64 %x) { -; CHECK-LABEL: dont_fold_srem_i64: -; CHECK: # %bb.0: -; CHECK-NEXT: mflr 0 -; CHECK-NEXT: stw 0, 4(1) -; CHECK-NEXT: stwu 1, -16(1) -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset lr, 4 -; CHECK-NEXT: li 5, 0 -; CHECK-NEXT: li 6, 98 -; CHECK-NEXT: bl __moddi3 -; CHECK-NEXT: lwz 0, 20(1) -; CHECK-NEXT: addi 1, 1, 16 -; CHECK-NEXT: mtlr 0 -; CHECK-NEXT: blr +; CHECK64-LABEL: dont_fold_srem_i64: +; CHECK64: # %bb.0: +; CHECK64-NEXT: mflr 0 +; CHECK64-NEXT: stw 0, 4(1) +; CHECK64-NEXT: stwu 1, -32(1) +; CHECK64-NEXT: .cfi_def_cfa_offset 32 +; CHECK64-NEXT: .cfi_offset lr, 4 +; CHECK64-NEXT: .cfi_offset r29, -12 +; CHECK64-NEXT: .cfi_offset r30, -8 +; CHECK64-NEXT: stw 29, 20(1) # 4-byte Folded Spill +; CHECK64-NEXT: mr 29, 3 +; CHECK64-NEXT: srawi 3, 3, 31 +; CHECK64-NEXT: lis 5, -17388 +; CHECK64-NEXT: li 7, 0 +; CHECK64-NEXT: stw 30, 24(1) # 4-byte Folded Spill +; CHECK64-NEXT: mr 30, 4 +; CHECK64-NEXT: lis 4, 21399 +; CHECK64-NEXT: ori 9, 4, 33436 +; CHECK64-NEXT: ori 10, 5, 58849 +; CHECK64-NEXT: mr 4, 3 +; CHECK64-NEXT: mr 5, 29 +; CHECK64-NEXT: mr 6, 30 +; CHECK64-NEXT: li 8, 0 +; CHECK64-NEXT: bl __multi3 +; CHECK64-NEXT: rotlwi 4, 4, 27 +; CHECK64-NEXT: srwi 6, 3, 31 +; CHECK64-NEXT: rlwimi 4, 3, 27, 0, 4 +; CHECK64-NEXT: srawi 3, 3, 5 +; CHECK64-NEXT: addc 4, 4, 6 +; CHECK64-NEXT: li 5, 98 +; CHECK64-NEXT: addze 3, 3 +; CHECK64-NEXT: mulhwu 5, 4, 5 +; CHECK64-NEXT: mulli 4, 4, 98 +; 
CHECK64-NEXT: mulli 3, 3, 98 +; CHECK64-NEXT: add 3, 5, 3 +; CHECK64-NEXT: subc 4, 30, 4 +; CHECK64-NEXT: lwz 30, 24(1) # 4-byte Folded Reload +; CHECK64-NEXT: subfe 3, 3, 29 +; CHECK64-NEXT: lwz 29, 20(1) # 4-byte Folded Reload +; CHECK64-NEXT: lwz 0, 36(1) +; CHECK64-NEXT: addi 1, 1, 32 +; CHECK64-NEXT: mtlr 0 +; CHECK64-NEXT: blr +; +; CHECK32-LABEL: dont_fold_srem_i64: +; CHECK32: # %bb.0: +; CHECK32-NEXT: mflr 0 +; CHECK32-NEXT: stw 0, 4(1) +; CHECK32-NEXT: stwu 1, -32(1) +; CHECK32-NEXT: .cfi_def_cfa_offset 32 +; CHECK32-NEXT: .cfi_offset lr, 4 +; CHECK32-NEXT: .cfi_offset r29, -12 +; CHECK32-NEXT: .cfi_offset r30, -8 +; CHECK32-NEXT: stw 29, 20(1) # 4-byte Folded Spill +; CHECK32-NEXT: mr 29, 3 +; CHECK32-NEXT: stw 30, 24(1) # 4-byte Folded Spill +; CHECK32-NEXT: mr 30, 4 +; CHECK32-NEXT: srawi 3, 3, 31 +; CHECK32-NEXT: lis 4, 21399 +; CHECK32-NEXT: lis 5, -17388 +; CHECK32-NEXT: ori 9, 4, 33436 +; CHECK32-NEXT: ori 10, 5, 58849 +; CHECK32-NEXT: mr 5, 29 +; CHECK32-NEXT: mr 6, 30 +; CHECK32-NEXT: mr 4, 3 +; CHECK32-NEXT: li 7, 0 +; CHECK32-NEXT: li 8, 0 +; CHECK32-NEXT: bl __multi3 +; CHECK32-NEXT: rotlwi 4, 4, 27 +; CHECK32-NEXT: srwi 6, 3, 31 +; CHECK32-NEXT: rlwimi 4, 3, 27, 0, 4 +; CHECK32-NEXT: srawi 3, 3, 5 +; CHECK32-NEXT: addc 4, 4, 6 +; CHECK32-NEXT: li 5, 98 +; CHECK32-NEXT: addze 3, 3 +; CHECK32-NEXT: mulhwu 5, 4, 5 +; CHECK32-NEXT: mulli 4, 4, 98 +; CHECK32-NEXT: subc 4, 30, 4 +; CHECK32-NEXT: lwz 30, 24(1) # 4-byte Folded Reload +; CHECK32-NEXT: mulli 3, 3, 98 +; CHECK32-NEXT: add 3, 5, 3 +; CHECK32-NEXT: subfe 3, 3, 29 +; CHECK32-NEXT: lwz 29, 20(1) # 4-byte Folded Reload +; CHECK32-NEXT: lwz 0, 36(1) +; CHECK32-NEXT: addi 1, 1, 32 +; CHECK32-NEXT: mtlr 0 +; CHECK32-NEXT: blr %1 = srem i64 %x, 98 ret i64 %1 } diff --git a/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll b/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll --- a/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll @@ -11,233 +11,128 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) { ; P9LE-LABEL: fold_srem_vec_1: ; P9LE: # %bb.0: -; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: lis r4, -21386 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: ori r4, r4, 37253 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: mulhw r4, r3, r4 -; P9LE-NEXT: add r4, r4, r3 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 6 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: mulli r4, r4, 95 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: lis r4, 31710 -; P9LE-NEXT: mtvsrd v3, r3 -; P9LE-NEXT: li r3, 2 -; P9LE-NEXT: ori r4, r4, 63421 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: mulhw r4, r3, r4 -; P9LE-NEXT: sub r4, r4, r3 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 6 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: mulli r4, r4, -124 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: lis r4, 21399 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 4 -; P9LE-NEXT: ori r4, r4, 33437 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: vmrghh v3, v4, v3 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: mulhw r4, r3, r4 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 5 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: mulli r4, r4, 98 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: lis r4, -16728 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 6 -; P9LE-NEXT: ori r4, r4, 63249 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: mulhw r4, r3, r4 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 8 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: mulli r4, r4, -1003 -; P9LE-NEXT: sub 
r3, r3, r4 -; P9LE-NEXT: mtvsrd v2, r3 -; P9LE-NEXT: vmrghh v2, v2, v4 -; P9LE-NEXT: vmrglw v2, v2, v3 +; P9LE-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; P9LE-NEXT: vmrglh v4, v2, v2 +; P9LE-NEXT: vxor v5, v5, v5 +; P9LE-NEXT: addi r3, r3, .LCPI0_0@toc@l +; P9LE-NEXT: vextsh2w v4, v4 +; P9LE-NEXT: lxvx v3, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI0_1@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI0_1@toc@l +; P9LE-NEXT: vmuluwm v3, v4, v3 +; P9LE-NEXT: vspltisw v4, 8 +; P9LE-NEXT: vadduwm v4, v4, v4 +; P9LE-NEXT: vsrw v3, v3, v4 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI0_2@toc@ha +; P9LE-NEXT: vpkuwum v3, v3, v3 +; P9LE-NEXT: addi r3, r3, .LCPI0_2@toc@l +; P9LE-NEXT: vmladduhm v3, v2, v4, v3 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI0_3@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI0_3@toc@l +; P9LE-NEXT: vsrah v3, v3, v4 +; P9LE-NEXT: vspltish v4, 15 +; P9LE-NEXT: vsrh v4, v3, v4 +; P9LE-NEXT: vadduhm v3, v3, v4 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: vmladduhm v3, v3, v4, v5 +; P9LE-NEXT: vsubuhm v2, v2, v3 ; P9LE-NEXT: blr ; ; P9BE-LABEL: fold_srem_vec_1: ; P9BE: # %bb.0: -; P9BE-NEXT: li r3, 2 -; P9BE-NEXT: lis r4, 31710 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: ori r4, r4, 63421 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: mulhw r4, r3, r4 -; P9BE-NEXT: sub r4, r4, r3 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 6 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, -124 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: lis r4, -21386 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: ori r4, r4, 37253 -; P9BE-NEXT: mtvsrd v3, r3 -; P9BE-NEXT: li r3, 0 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: mulhw r4, r3, r4 -; P9BE-NEXT: add r4, r4, r3 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 6 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 95 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: lis r4, -16728 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: ori r4, r4, 63249 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: vmrghh v3, v4, v3 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: mulhw r4, r3, r4 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 8 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, -1003 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: lis r4, 21399 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: ori r4, r4, 33437 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 4 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: mulhw r4, r3, r4 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 5 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 98 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v2, r3 -; P9BE-NEXT: vmrghh v2, v2, v4 -; P9BE-NEXT: vmrghw v2, v3, v2 +; P9BE-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; P9BE-NEXT: vmrghh v4, v2, v2 +; P9BE-NEXT: vxor v5, v5, v5 +; P9BE-NEXT: addi r3, r3, .LCPI0_0@toc@l +; P9BE-NEXT: vextsh2w v4, v4 +; P9BE-NEXT: lxvx v3, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI0_1@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI0_1@toc@l +; P9BE-NEXT: vmuluwm v3, v4, v3 +; P9BE-NEXT: vspltisw v4, 8 +; P9BE-NEXT: vadduwm v4, v4, v4 +; P9BE-NEXT: vsrw v3, v3, v4 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI0_2@toc@ha +; P9BE-NEXT: vpkuwum v3, v3, v3 +; P9BE-NEXT: addi r3, r3, .LCPI0_2@toc@l +; P9BE-NEXT: vmladduhm v3, v2, v4, v3 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI0_3@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI0_3@toc@l +; P9BE-NEXT: vsrah v3, v3, v4 
+; P9BE-NEXT: vspltish v4, 15 +; P9BE-NEXT: vsrh v4, v3, v4 +; P9BE-NEXT: vadduhm v3, v3, v4 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: vmladduhm v3, v3, v4, v5 +; P9BE-NEXT: vsubuhm v2, v2, v3 ; P9BE-NEXT: blr ; ; P8LE-LABEL: fold_srem_vec_1: ; P8LE: # %bb.0: -; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r3, 21399 -; P8LE-NEXT: lis r8, -16728 -; P8LE-NEXT: lis r9, -21386 -; P8LE-NEXT: lis r10, 31710 -; P8LE-NEXT: ori r3, r3, 33437 -; P8LE-NEXT: ori r8, r8, 63249 -; P8LE-NEXT: ori r9, r9, 37253 -; P8LE-NEXT: ori r10, r10, 63421 -; P8LE-NEXT: mffprd r4, f0 -; P8LE-NEXT: rldicl r5, r4, 32, 48 -; P8LE-NEXT: rldicl r6, r4, 16, 48 -; P8LE-NEXT: clrldi r7, r4, 48 -; P8LE-NEXT: extsh r5, r5 -; P8LE-NEXT: extsh r6, r6 -; P8LE-NEXT: rldicl r4, r4, 48, 48 -; P8LE-NEXT: extsh r7, r7 -; P8LE-NEXT: mulhw r3, r5, r3 -; P8LE-NEXT: extsh r4, r4 -; P8LE-NEXT: mulhw r8, r6, r8 -; P8LE-NEXT: mulhw r9, r7, r9 -; P8LE-NEXT: mulhw r10, r4, r10 -; P8LE-NEXT: srwi r11, r3, 31 -; P8LE-NEXT: srawi r3, r3, 5 -; P8LE-NEXT: add r3, r3, r11 -; P8LE-NEXT: srwi r11, r8, 31 -; P8LE-NEXT: add r9, r9, r7 -; P8LE-NEXT: srawi r8, r8, 8 -; P8LE-NEXT: sub r10, r10, r4 -; P8LE-NEXT: add r8, r8, r11 -; P8LE-NEXT: srwi r11, r9, 31 -; P8LE-NEXT: srawi r9, r9, 6 -; P8LE-NEXT: mulli r3, r3, 98 -; P8LE-NEXT: add r9, r9, r11 -; P8LE-NEXT: srwi r11, r10, 31 -; P8LE-NEXT: srawi r10, r10, 6 -; P8LE-NEXT: mulli r8, r8, -1003 -; P8LE-NEXT: add r10, r10, r11 -; P8LE-NEXT: mulli r9, r9, 95 -; P8LE-NEXT: mulli r10, r10, -124 -; P8LE-NEXT: sub r3, r5, r3 -; P8LE-NEXT: mtvsrd v2, r3 -; P8LE-NEXT: sub r5, r6, r8 -; P8LE-NEXT: sub r3, r7, r9 -; P8LE-NEXT: mtvsrd v3, r5 -; P8LE-NEXT: sub r4, r4, r10 -; P8LE-NEXT: mtvsrd v4, r3 -; P8LE-NEXT: mtvsrd v5, r4 -; P8LE-NEXT: vmrghh v2, v3, v2 -; P8LE-NEXT: vmrghh v3, v5, v4 -; P8LE-NEXT: vmrglw v2, v2, v3 +; P8LE-NEXT: vmrglh v3, v2, v2 +; P8LE-NEXT: vspltisw v4, 8 +; P8LE-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI0_0@toc@l +; P8LE-NEXT: lvx v5, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI0_1@toc@ha +; P8LE-NEXT: vadduwm v4, v4, v4 +; P8LE-NEXT: addi r3, r3, .LCPI0_1@toc@l +; P8LE-NEXT: vslw v3, v3, v4 +; P8LE-NEXT: vsraw v3, v3, v4 +; P8LE-NEXT: vmuluwm v3, v3, v5 +; P8LE-NEXT: vxor v5, v5, v5 +; P8LE-NEXT: vsrw v3, v3, v4 +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI0_2@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI0_2@toc@l +; P8LE-NEXT: vpkuwum v3, v3, v3 +; P8LE-NEXT: vmladduhm v3, v2, v4, v3 +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI0_3@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI0_3@toc@l +; P8LE-NEXT: vsrah v3, v3, v4 +; P8LE-NEXT: vspltish v4, 15 +; P8LE-NEXT: vsrh v4, v3, v4 +; P8LE-NEXT: vadduhm v3, v3, v4 +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: vmladduhm v3, v3, v4, v5 +; P8LE-NEXT: vsubuhm v2, v2, v3 ; P8LE-NEXT: blr ; ; P8BE-LABEL: fold_srem_vec_1: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, -16728 -; P8BE-NEXT: lis r8, 21399 -; P8BE-NEXT: lis r9, 31710 -; P8BE-NEXT: lis r10, -21386 -; P8BE-NEXT: ori r3, r3, 63249 -; P8BE-NEXT: ori r8, r8, 33437 -; P8BE-NEXT: ori r9, r9, 63421 -; P8BE-NEXT: ori r10, r10, 37253 -; P8BE-NEXT: clrldi r5, r4, 48 -; P8BE-NEXT: rldicl r6, r4, 48, 48 -; P8BE-NEXT: rldicl r7, r4, 32, 48 -; P8BE-NEXT: extsh r5, r5 -; P8BE-NEXT: extsh r6, r6 -; P8BE-NEXT: rldicl r4, r4, 16, 48 -; P8BE-NEXT: extsh r7, r7 -; P8BE-NEXT: mulhw r3, r5, r3 -; P8BE-NEXT: extsh r4, r4 -; P8BE-NEXT: mulhw r8, r6, r8 -; P8BE-NEXT: mulhw r9, r7, r9 -; P8BE-NEXT: mulhw r10, r4, r10 -; P8BE-NEXT: srwi r11, r3, 31 -; P8BE-NEXT: 
srawi r3, r3, 8 -; P8BE-NEXT: add r3, r3, r11 -; P8BE-NEXT: srwi r11, r8, 31 -; P8BE-NEXT: sub r9, r9, r7 -; P8BE-NEXT: srawi r8, r8, 5 -; P8BE-NEXT: add r10, r10, r4 -; P8BE-NEXT: add r8, r8, r11 -; P8BE-NEXT: srwi r11, r9, 31 -; P8BE-NEXT: srawi r9, r9, 6 -; P8BE-NEXT: mulli r3, r3, -1003 -; P8BE-NEXT: add r9, r9, r11 -; P8BE-NEXT: srwi r11, r10, 31 -; P8BE-NEXT: srawi r10, r10, 6 -; P8BE-NEXT: mulli r8, r8, 98 -; P8BE-NEXT: add r10, r10, r11 -; P8BE-NEXT: mulli r9, r9, -124 -; P8BE-NEXT: mulli r10, r10, 95 -; P8BE-NEXT: sub r3, r5, r3 -; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: sub r5, r6, r8 -; P8BE-NEXT: mtvsrd v2, r3 -; P8BE-NEXT: sub r6, r7, r9 -; P8BE-NEXT: sldi r3, r5, 48 -; P8BE-NEXT: sub r4, r4, r10 -; P8BE-NEXT: mtvsrd v3, r3 -; P8BE-NEXT: sldi r3, r6, 48 -; P8BE-NEXT: sldi r4, r4, 48 -; P8BE-NEXT: mtvsrd v4, r3 -; P8BE-NEXT: mtvsrd v5, r4 -; P8BE-NEXT: vmrghh v2, v3, v2 -; P8BE-NEXT: vmrghh v3, v5, v4 -; P8BE-NEXT: vmrghw v2, v3, v2 +; P8BE-NEXT: vmrghh v3, v2, v2 +; P8BE-NEXT: vspltisw v4, 8 +; P8BE-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI0_0@toc@l +; P8BE-NEXT: lxvw4x v5, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI0_1@toc@ha +; P8BE-NEXT: vadduwm v4, v4, v4 +; P8BE-NEXT: addi r3, r3, .LCPI0_1@toc@l +; P8BE-NEXT: vslw v3, v3, v4 +; P8BE-NEXT: vsraw v3, v3, v4 +; P8BE-NEXT: vmuluwm v3, v3, v5 +; P8BE-NEXT: vxor v5, v5, v5 +; P8BE-NEXT: vsrw v3, v3, v4 +; P8BE-NEXT: lxvw4x v4, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI0_2@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI0_2@toc@l +; P8BE-NEXT: vpkuwum v3, v3, v3 +; P8BE-NEXT: vmladduhm v3, v2, v4, v3 +; P8BE-NEXT: lxvw4x v4, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI0_3@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI0_3@toc@l +; P8BE-NEXT: lxvw4x v0, 0, r3 +; P8BE-NEXT: vsrah v3, v3, v4 +; P8BE-NEXT: vspltish v4, 15 +; P8BE-NEXT: vsrh v4, v3, v4 +; P8BE-NEXT: vadduhm v3, v3, v4 +; P8BE-NEXT: vmladduhm v3, v3, v0, v5 +; P8BE-NEXT: vsubuhm v2, v2, v3 ; P8BE-NEXT: blr %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -246,217 +141,108 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) { ; P9LE-LABEL: fold_srem_vec_2: ; P9LE: # %bb.0: -; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: lis r4, -21386 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: ori r4, r4, 37253 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: mulhw r5, r3, r4 -; P9LE-NEXT: add r5, r5, r3 -; P9LE-NEXT: srwi r6, r5, 31 -; P9LE-NEXT: srawi r5, r5, 6 -; P9LE-NEXT: add r5, r5, r6 -; P9LE-NEXT: mulli r5, r5, 95 -; P9LE-NEXT: sub r3, r3, r5 -; P9LE-NEXT: mtvsrd v3, r3 -; P9LE-NEXT: li r3, 2 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: mulhw r5, r3, r4 -; P9LE-NEXT: add r5, r5, r3 -; P9LE-NEXT: srwi r6, r5, 31 -; P9LE-NEXT: srawi r5, r5, 6 -; P9LE-NEXT: add r5, r5, r6 -; P9LE-NEXT: mulli r5, r5, 95 -; P9LE-NEXT: sub r3, r3, r5 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 4 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: vmrghh v3, v4, v3 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: mulhw r5, r3, r4 -; P9LE-NEXT: add r5, r5, r3 -; P9LE-NEXT: srwi r6, r5, 31 -; P9LE-NEXT: srawi r5, r5, 6 -; P9LE-NEXT: add r5, r5, r6 -; P9LE-NEXT: mulli r5, r5, 95 -; P9LE-NEXT: sub r3, r3, r5 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 6 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: mulhw r4, r3, r4 -; P9LE-NEXT: add r4, r4, r3 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 6 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: mulli r4, r4, 95 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: mtvsrd v2, r3 -; P9LE-NEXT: vmrghh v2, v2, v4 -; P9LE-NEXT: vmrglw 
v2, v2, v3 +; P9LE-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; P9LE-NEXT: vmrglh v4, v2, v2 +; P9LE-NEXT: vxor v5, v5, v5 +; P9LE-NEXT: addi r3, r3, .LCPI1_0@toc@l +; P9LE-NEXT: vextsh2w v4, v4 +; P9LE-NEXT: lxvx v3, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI1_1@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI1_1@toc@l +; P9LE-NEXT: vmuluwm v3, v4, v3 +; P9LE-NEXT: vspltisw v4, 8 +; P9LE-NEXT: vadduwm v4, v4, v4 +; P9LE-NEXT: vsrw v3, v3, v4 +; P9LE-NEXT: vspltish v4, 6 +; P9LE-NEXT: vpkuwum v3, v3, v3 +; P9LE-NEXT: vadduhm v3, v3, v2 +; P9LE-NEXT: vsrah v3, v3, v4 +; P9LE-NEXT: vspltish v4, 15 +; P9LE-NEXT: vsrh v4, v3, v4 +; P9LE-NEXT: vadduhm v3, v3, v4 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: vmladduhm v3, v3, v4, v5 +; P9LE-NEXT: vsubuhm v2, v2, v3 ; P9LE-NEXT: blr ; ; P9BE-LABEL: fold_srem_vec_2: ; P9BE: # %bb.0: -; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: lis r4, -21386 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: ori r4, r4, 37253 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: mulhw r5, r3, r4 -; P9BE-NEXT: add r5, r5, r3 -; P9BE-NEXT: srwi r6, r5, 31 -; P9BE-NEXT: srawi r5, r5, 6 -; P9BE-NEXT: add r5, r5, r6 -; P9BE-NEXT: mulli r5, r5, 95 -; P9BE-NEXT: sub r3, r3, r5 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v3, r3 -; P9BE-NEXT: li r3, 4 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: mulhw r5, r3, r4 -; P9BE-NEXT: add r5, r5, r3 -; P9BE-NEXT: srwi r6, r5, 31 -; P9BE-NEXT: srawi r5, r5, 6 -; P9BE-NEXT: add r5, r5, r6 -; P9BE-NEXT: mulli r5, r5, 95 -; P9BE-NEXT: sub r3, r3, r5 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 2 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: vmrghh v3, v4, v3 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: mulhw r5, r3, r4 -; P9BE-NEXT: add r5, r5, r3 -; P9BE-NEXT: srwi r6, r5, 31 -; P9BE-NEXT: srawi r5, r5, 6 -; P9BE-NEXT: add r5, r5, r6 -; P9BE-NEXT: mulli r5, r5, 95 -; P9BE-NEXT: sub r3, r3, r5 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 0 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: mulhw r4, r3, r4 -; P9BE-NEXT: add r4, r4, r3 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 6 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 95 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v2, r3 -; P9BE-NEXT: vmrghh v2, v2, v4 -; P9BE-NEXT: vmrghw v2, v2, v3 +; P9BE-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; P9BE-NEXT: vmrghh v4, v2, v2 +; P9BE-NEXT: vxor v5, v5, v5 +; P9BE-NEXT: addi r3, r3, .LCPI1_0@toc@l +; P9BE-NEXT: vextsh2w v4, v4 +; P9BE-NEXT: lxvx v3, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI1_1@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI1_1@toc@l +; P9BE-NEXT: vmuluwm v3, v4, v3 +; P9BE-NEXT: vspltisw v4, 8 +; P9BE-NEXT: vadduwm v4, v4, v4 +; P9BE-NEXT: vsrw v3, v3, v4 +; P9BE-NEXT: vspltish v4, 6 +; P9BE-NEXT: vpkuwum v3, v3, v3 +; P9BE-NEXT: vadduhm v3, v3, v2 +; P9BE-NEXT: vsrah v3, v3, v4 +; P9BE-NEXT: vspltish v4, 15 +; P9BE-NEXT: vsrh v4, v3, v4 +; P9BE-NEXT: vadduhm v3, v3, v4 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: vmladduhm v3, v3, v4, v5 +; P9BE-NEXT: vsubuhm v2, v2, v3 ; P9BE-NEXT: blr ; ; P8LE-LABEL: fold_srem_vec_2: ; P8LE: # %bb.0: -; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r3, -21386 -; P8LE-NEXT: ori r3, r3, 37253 -; P8LE-NEXT: mffprd r4, f0 -; P8LE-NEXT: clrldi r5, r4, 48 -; P8LE-NEXT: rldicl r6, r4, 48, 48 -; P8LE-NEXT: extsh r5, r5 -; P8LE-NEXT: rldicl r7, r4, 32, 48 -; P8LE-NEXT: extsh r6, r6 -; P8LE-NEXT: mulhw r8, r5, r3 -; P8LE-NEXT: rldicl r4, r4, 16, 48 -; P8LE-NEXT: extsh r7, r7 
-; P8LE-NEXT: mulhw r9, r6, r3 -; P8LE-NEXT: extsh r4, r4 -; P8LE-NEXT: mulhw r10, r7, r3 -; P8LE-NEXT: mulhw r3, r4, r3 -; P8LE-NEXT: add r8, r8, r5 -; P8LE-NEXT: add r9, r9, r6 -; P8LE-NEXT: srwi r11, r8, 31 -; P8LE-NEXT: srawi r8, r8, 6 -; P8LE-NEXT: add r10, r10, r7 -; P8LE-NEXT: add r3, r3, r4 -; P8LE-NEXT: add r8, r8, r11 -; P8LE-NEXT: srwi r11, r9, 31 -; P8LE-NEXT: srawi r9, r9, 6 -; P8LE-NEXT: mulli r8, r8, 95 -; P8LE-NEXT: add r9, r9, r11 -; P8LE-NEXT: srwi r11, r10, 31 -; P8LE-NEXT: srawi r10, r10, 6 -; P8LE-NEXT: mulli r9, r9, 95 -; P8LE-NEXT: add r10, r10, r11 -; P8LE-NEXT: srwi r11, r3, 31 -; P8LE-NEXT: srawi r3, r3, 6 -; P8LE-NEXT: mulli r10, r10, 95 -; P8LE-NEXT: sub r5, r5, r8 -; P8LE-NEXT: add r3, r3, r11 -; P8LE-NEXT: mtvsrd v2, r5 -; P8LE-NEXT: mulli r3, r3, 95 -; P8LE-NEXT: sub r6, r6, r9 -; P8LE-NEXT: mtvsrd v3, r6 -; P8LE-NEXT: sub r5, r7, r10 -; P8LE-NEXT: mtvsrd v4, r5 -; P8LE-NEXT: sub r3, r4, r3 -; P8LE-NEXT: vmrghh v2, v3, v2 -; P8LE-NEXT: mtvsrd v5, r3 -; P8LE-NEXT: vmrghh v3, v5, v4 -; P8LE-NEXT: vmrglw v2, v3, v2 +; P8LE-NEXT: vmrglh v3, v2, v2 +; P8LE-NEXT: vspltisw v4, 8 +; P8LE-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI1_0@toc@l +; P8LE-NEXT: lvx v5, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI1_1@toc@ha +; P8LE-NEXT: vadduwm v4, v4, v4 +; P8LE-NEXT: addi r3, r3, .LCPI1_1@toc@l +; P8LE-NEXT: vslw v3, v3, v4 +; P8LE-NEXT: vsraw v3, v3, v4 +; P8LE-NEXT: vmuluwm v3, v3, v5 +; P8LE-NEXT: vxor v5, v5, v5 +; P8LE-NEXT: vsrw v3, v3, v4 +; P8LE-NEXT: vspltish v4, 6 +; P8LE-NEXT: vpkuwum v3, v3, v3 +; P8LE-NEXT: vadduhm v3, v3, v2 +; P8LE-NEXT: vsrah v3, v3, v4 +; P8LE-NEXT: vspltish v4, 15 +; P8LE-NEXT: vsrh v4, v3, v4 +; P8LE-NEXT: vadduhm v3, v3, v4 +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: vmladduhm v3, v3, v4, v5 +; P8LE-NEXT: vsubuhm v2, v2, v3 ; P8LE-NEXT: blr ; ; P8BE-LABEL: fold_srem_vec_2: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, -21386 -; P8BE-NEXT: ori r3, r3, 37253 -; P8BE-NEXT: clrldi r5, r4, 48 -; P8BE-NEXT: rldicl r6, r4, 48, 48 -; P8BE-NEXT: extsh r5, r5 -; P8BE-NEXT: rldicl r7, r4, 32, 48 -; P8BE-NEXT: extsh r6, r6 -; P8BE-NEXT: mulhw r8, r5, r3 -; P8BE-NEXT: rldicl r4, r4, 16, 48 -; P8BE-NEXT: extsh r7, r7 -; P8BE-NEXT: mulhw r9, r6, r3 -; P8BE-NEXT: extsh r4, r4 -; P8BE-NEXT: mulhw r10, r7, r3 -; P8BE-NEXT: mulhw r3, r4, r3 -; P8BE-NEXT: add r8, r8, r5 -; P8BE-NEXT: add r9, r9, r6 -; P8BE-NEXT: srwi r11, r8, 31 -; P8BE-NEXT: srawi r8, r8, 6 -; P8BE-NEXT: add r10, r10, r7 -; P8BE-NEXT: add r3, r3, r4 -; P8BE-NEXT: add r8, r8, r11 -; P8BE-NEXT: srwi r11, r9, 31 -; P8BE-NEXT: srawi r9, r9, 6 -; P8BE-NEXT: mulli r8, r8, 95 -; P8BE-NEXT: add r9, r9, r11 -; P8BE-NEXT: srwi r11, r10, 31 -; P8BE-NEXT: srawi r10, r10, 6 -; P8BE-NEXT: mulli r9, r9, 95 -; P8BE-NEXT: add r10, r10, r11 -; P8BE-NEXT: srwi r11, r3, 31 -; P8BE-NEXT: srawi r3, r3, 6 -; P8BE-NEXT: mulli r10, r10, 95 -; P8BE-NEXT: sub r5, r5, r8 -; P8BE-NEXT: add r3, r3, r11 -; P8BE-NEXT: sldi r5, r5, 48 -; P8BE-NEXT: mulli r3, r3, 95 -; P8BE-NEXT: sub r6, r6, r9 -; P8BE-NEXT: mtvsrd v2, r5 -; P8BE-NEXT: sldi r6, r6, 48 -; P8BE-NEXT: sub r7, r7, r10 -; P8BE-NEXT: mtvsrd v3, r6 -; P8BE-NEXT: sub r3, r4, r3 -; P8BE-NEXT: sldi r4, r7, 48 -; P8BE-NEXT: vmrghh v2, v3, v2 -; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: mtvsrd v4, r4 -; P8BE-NEXT: mtvsrd v5, r3 -; P8BE-NEXT: vmrghh v3, v5, v4 -; P8BE-NEXT: vmrghw v2, v3, v2 +; P8BE-NEXT: vmrghh v3, v2, v2 +; P8BE-NEXT: vspltisw v4, 8 +; P8BE-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; P8BE-NEXT: addi r3, r3, 
.LCPI1_0@toc@l +; P8BE-NEXT: lxvw4x v5, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI1_1@toc@ha +; P8BE-NEXT: vadduwm v4, v4, v4 +; P8BE-NEXT: addi r3, r3, .LCPI1_1@toc@l +; P8BE-NEXT: lxvw4x v0, 0, r3 +; P8BE-NEXT: vslw v3, v3, v4 +; P8BE-NEXT: vsraw v3, v3, v4 +; P8BE-NEXT: vmuluwm v3, v3, v5 +; P8BE-NEXT: vxor v5, v5, v5 +; P8BE-NEXT: vsrw v3, v3, v4 +; P8BE-NEXT: vspltish v4, 6 +; P8BE-NEXT: vpkuwum v3, v3, v3 +; P8BE-NEXT: vadduhm v3, v3, v2 +; P8BE-NEXT: vsrah v3, v3, v4 +; P8BE-NEXT: vspltish v4, 15 +; P8BE-NEXT: vsrh v4, v3, v4 +; P8BE-NEXT: vadduhm v3, v3, v4 +; P8BE-NEXT: vmladduhm v3, v3, v0, v5 +; P8BE-NEXT: vsubuhm v2, v2, v3 ; P8BE-NEXT: blr %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -467,257 +253,112 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) { ; P9LE-LABEL: combine_srem_sdiv: ; P9LE: # %bb.0: -; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: lis r4, -21386 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: ori r4, r4, 37253 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: mulhw r5, r3, r4 -; P9LE-NEXT: add r5, r5, r3 -; P9LE-NEXT: srwi r6, r5, 31 -; P9LE-NEXT: srawi r5, r5, 6 -; P9LE-NEXT: add r5, r5, r6 -; P9LE-NEXT: mulli r6, r5, 95 -; P9LE-NEXT: sub r3, r3, r6 -; P9LE-NEXT: mtvsrd v3, r3 -; P9LE-NEXT: li r3, 2 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r6, r3 -; P9LE-NEXT: mulhw r7, r6, r4 -; P9LE-NEXT: add r6, r7, r6 -; P9LE-NEXT: srwi r7, r6, 31 -; P9LE-NEXT: srawi r6, r6, 6 -; P9LE-NEXT: add r6, r6, r7 -; P9LE-NEXT: mulli r7, r6, 95 -; P9LE-NEXT: sub r3, r3, r7 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 4 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: vmrghh v3, v4, v3 -; P9LE-NEXT: extsh r7, r3 -; P9LE-NEXT: mulhw r8, r7, r4 -; P9LE-NEXT: add r7, r8, r7 -; P9LE-NEXT: srwi r8, r7, 31 -; P9LE-NEXT: srawi r7, r7, 6 -; P9LE-NEXT: add r7, r7, r8 -; P9LE-NEXT: mulli r8, r7, 95 -; P9LE-NEXT: sub r3, r3, r8 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 6 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r8, r3 -; P9LE-NEXT: mulhw r4, r8, r4 -; P9LE-NEXT: add r4, r4, r8 -; P9LE-NEXT: srwi r8, r4, 31 -; P9LE-NEXT: srawi r4, r4, 6 -; P9LE-NEXT: add r4, r4, r8 -; P9LE-NEXT: mulli r8, r4, 95 -; P9LE-NEXT: mtvsrd v5, r4 -; P9LE-NEXT: sub r3, r3, r8 -; P9LE-NEXT: mtvsrd v2, r3 -; P9LE-NEXT: vmrghh v2, v2, v4 -; P9LE-NEXT: mtvsrd v4, r6 -; P9LE-NEXT: vmrglw v2, v2, v3 -; P9LE-NEXT: mtvsrd v3, r5 -; P9LE-NEXT: vmrghh v3, v4, v3 -; P9LE-NEXT: mtvsrd v4, r7 -; P9LE-NEXT: vmrghh v4, v5, v4 -; P9LE-NEXT: vmrglw v3, v4, v3 +; P9LE-NEXT: addis r3, r2, .LCPI2_1@toc@ha +; P9LE-NEXT: vmrglh v4, v2, v2 +; P9LE-NEXT: addi r3, r3, .LCPI2_1@toc@l +; P9LE-NEXT: vextsh2w v4, v4 +; P9LE-NEXT: lxvx v3, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI2_0@toc@l +; P9LE-NEXT: lxvx v5, 0, r3 +; P9LE-NEXT: vmuluwm v3, v4, v3 +; P9LE-NEXT: vspltisw v4, 8 +; P9LE-NEXT: vadduwm v4, v4, v4 +; P9LE-NEXT: vsrw v3, v3, v4 +; P9LE-NEXT: vspltish v4, 6 +; P9LE-NEXT: vpkuwum v3, v3, v3 +; P9LE-NEXT: vadduhm v3, v3, v2 +; P9LE-NEXT: vsrah v3, v3, v4 +; P9LE-NEXT: vspltish v4, 15 +; P9LE-NEXT: vsrh v4, v3, v4 +; P9LE-NEXT: vadduhm v3, v3, v4 +; P9LE-NEXT: vxor v4, v4, v4 +; P9LE-NEXT: vmladduhm v4, v3, v5, v4 +; P9LE-NEXT: vsubuhm v2, v2, v4 ; P9LE-NEXT: vadduhm v2, v2, v3 ; P9LE-NEXT: blr ; ; P9BE-LABEL: combine_srem_sdiv: ; P9BE: # %bb.0: -; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: lis r5, -21386 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: ori r5, r5, 37253 -; P9BE-NEXT: extsh r4, r3 -; P9BE-NEXT: mulhw r6, r4, r5 -; P9BE-NEXT: add r4, r6, r4 -; P9BE-NEXT: srwi r6, r4, 31 -; 
P9BE-NEXT: srawi r4, r4, 6 -; P9BE-NEXT: add r4, r4, r6 -; P9BE-NEXT: mulli r6, r4, 95 -; P9BE-NEXT: sub r3, r3, r6 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v3, r3 -; P9BE-NEXT: li r3, 4 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: extsh r6, r3 -; P9BE-NEXT: mulhw r7, r6, r5 -; P9BE-NEXT: add r6, r7, r6 -; P9BE-NEXT: srwi r7, r6, 31 -; P9BE-NEXT: srawi r6, r6, 6 -; P9BE-NEXT: add r6, r6, r7 -; P9BE-NEXT: mulli r7, r6, 95 -; P9BE-NEXT: sub r3, r3, r7 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 2 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: vmrghh v3, v4, v3 -; P9BE-NEXT: extsh r7, r3 -; P9BE-NEXT: mulhw r8, r7, r5 -; P9BE-NEXT: add r7, r8, r7 -; P9BE-NEXT: srwi r8, r7, 31 -; P9BE-NEXT: srawi r7, r7, 6 -; P9BE-NEXT: add r7, r7, r8 -; P9BE-NEXT: mulli r8, r7, 95 -; P9BE-NEXT: sub r3, r3, r8 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 0 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: mulhw r5, r3, r5 -; P9BE-NEXT: add r5, r5, r3 -; P9BE-NEXT: srwi r8, r5, 31 -; P9BE-NEXT: srawi r5, r5, 6 -; P9BE-NEXT: add r5, r5, r8 -; P9BE-NEXT: mulli r8, r5, 95 -; P9BE-NEXT: sub r3, r3, r8 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v2, r3 -; P9BE-NEXT: sldi r3, r4, 48 -; P9BE-NEXT: vmrghh v2, v2, v4 -; P9BE-NEXT: vmrghw v2, v2, v3 -; P9BE-NEXT: mtvsrd v3, r3 -; P9BE-NEXT: sldi r3, r6, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: sldi r3, r7, 48 -; P9BE-NEXT: vmrghh v3, v4, v3 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: sldi r3, r5, 48 -; P9BE-NEXT: mtvsrd v5, r3 -; P9BE-NEXT: vmrghh v4, v5, v4 -; P9BE-NEXT: vmrghw v3, v4, v3 +; P9BE-NEXT: addis r3, r2, .LCPI2_1@toc@ha +; P9BE-NEXT: vmrghh v4, v2, v2 +; P9BE-NEXT: addi r3, r3, .LCPI2_1@toc@l +; P9BE-NEXT: vextsh2w v4, v4 +; P9BE-NEXT: lxvx v3, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI2_0@toc@l +; P9BE-NEXT: lxvx v5, 0, r3 +; P9BE-NEXT: vmuluwm v3, v4, v3 +; P9BE-NEXT: vspltisw v4, 8 +; P9BE-NEXT: vadduwm v4, v4, v4 +; P9BE-NEXT: vsrw v3, v3, v4 +; P9BE-NEXT: vspltish v4, 6 +; P9BE-NEXT: vpkuwum v3, v3, v3 +; P9BE-NEXT: vadduhm v3, v3, v2 +; P9BE-NEXT: vsrah v3, v3, v4 +; P9BE-NEXT: vspltish v4, 15 +; P9BE-NEXT: vsrh v4, v3, v4 +; P9BE-NEXT: vadduhm v3, v3, v4 +; P9BE-NEXT: vxor v4, v4, v4 +; P9BE-NEXT: vmladduhm v4, v3, v5, v4 +; P9BE-NEXT: vsubuhm v2, v2, v4 ; P9BE-NEXT: vadduhm v2, v2, v3 ; P9BE-NEXT: blr ; ; P8LE-LABEL: combine_srem_sdiv: ; P8LE: # %bb.0: -; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r3, -21386 -; P8LE-NEXT: ori r3, r3, 37253 -; P8LE-NEXT: mffprd r4, f0 -; P8LE-NEXT: clrldi r5, r4, 48 -; P8LE-NEXT: rldicl r6, r4, 48, 48 -; P8LE-NEXT: rldicl r7, r4, 32, 48 -; P8LE-NEXT: extsh r5, r5 -; P8LE-NEXT: extsh r8, r6 -; P8LE-NEXT: extsh r9, r7 -; P8LE-NEXT: mulhw r10, r5, r3 -; P8LE-NEXT: mulhw r11, r8, r3 -; P8LE-NEXT: rldicl r4, r4, 16, 48 -; P8LE-NEXT: mulhw r12, r9, r3 -; P8LE-NEXT: extsh r0, r4 -; P8LE-NEXT: mulhw r3, r0, r3 -; P8LE-NEXT: add r10, r10, r5 -; P8LE-NEXT: add r8, r11, r8 -; P8LE-NEXT: srwi r11, r10, 31 -; P8LE-NEXT: add r9, r12, r9 -; P8LE-NEXT: srawi r10, r10, 6 -; P8LE-NEXT: srawi r12, r8, 6 -; P8LE-NEXT: srwi r8, r8, 31 -; P8LE-NEXT: add r10, r10, r11 -; P8LE-NEXT: add r3, r3, r0 -; P8LE-NEXT: srawi r11, r9, 6 -; P8LE-NEXT: srwi r9, r9, 31 -; P8LE-NEXT: add r8, r12, r8 -; P8LE-NEXT: mtvsrd v2, r10 -; P8LE-NEXT: mulli r12, r10, 95 -; P8LE-NEXT: add r9, r11, r9 -; P8LE-NEXT: srwi r11, r3, 31 -; P8LE-NEXT: mtvsrd v3, r8 -; P8LE-NEXT: srawi r3, r3, 6 -; P8LE-NEXT: mulli 
r10, r8, 95 -; P8LE-NEXT: mtvsrd v4, r9 -; P8LE-NEXT: add r3, r3, r11 -; P8LE-NEXT: mulli r8, r9, 95 -; P8LE-NEXT: vmrghh v2, v3, v2 -; P8LE-NEXT: mulli r9, r3, 95 -; P8LE-NEXT: sub r5, r5, r12 -; P8LE-NEXT: sub r6, r6, r10 -; P8LE-NEXT: mtvsrd v3, r5 -; P8LE-NEXT: mtvsrd v5, r6 -; P8LE-NEXT: sub r5, r7, r8 -; P8LE-NEXT: sub r4, r4, r9 -; P8LE-NEXT: mtvsrd v0, r5 -; P8LE-NEXT: mtvsrd v1, r4 -; P8LE-NEXT: vmrghh v3, v5, v3 -; P8LE-NEXT: mtvsrd v5, r3 -; P8LE-NEXT: vmrghh v0, v1, v0 -; P8LE-NEXT: vmrghh v4, v5, v4 -; P8LE-NEXT: vmrglw v3, v0, v3 -; P8LE-NEXT: vmrglw v2, v4, v2 -; P8LE-NEXT: vadduhm v2, v3, v2 +; P8LE-NEXT: vmrglh v3, v2, v2 +; P8LE-NEXT: vspltisw v4, 8 +; P8LE-NEXT: addis r3, r2, .LCPI2_1@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI2_1@toc@l +; P8LE-NEXT: lvx v5, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; P8LE-NEXT: vadduwm v4, v4, v4 +; P8LE-NEXT: addi r3, r3, .LCPI2_0@toc@l +; P8LE-NEXT: lvx v0, 0, r3 +; P8LE-NEXT: vslw v3, v3, v4 +; P8LE-NEXT: vsraw v3, v3, v4 +; P8LE-NEXT: vmuluwm v3, v3, v5 +; P8LE-NEXT: vxor v5, v5, v5 +; P8LE-NEXT: vsrw v3, v3, v4 +; P8LE-NEXT: vspltish v4, 6 +; P8LE-NEXT: vpkuwum v3, v3, v3 +; P8LE-NEXT: vadduhm v3, v3, v2 +; P8LE-NEXT: vsrah v3, v3, v4 +; P8LE-NEXT: vspltish v4, 15 +; P8LE-NEXT: vsrh v4, v3, v4 +; P8LE-NEXT: vadduhm v3, v3, v4 +; P8LE-NEXT: vmladduhm v4, v3, v0, v5 +; P8LE-NEXT: vsubuhm v2, v2, v4 +; P8LE-NEXT: vadduhm v2, v2, v3 ; P8LE-NEXT: blr ; ; P8BE-LABEL: combine_srem_sdiv: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r5, v2 -; P8BE-NEXT: lis r4, -21386 -; P8BE-NEXT: ori r4, r4, 37253 -; P8BE-NEXT: clrldi r3, r5, 48 -; P8BE-NEXT: rldicl r6, r5, 48, 48 -; P8BE-NEXT: extsh r8, r3 -; P8BE-NEXT: rldicl r7, r5, 32, 48 -; P8BE-NEXT: extsh r9, r6 -; P8BE-NEXT: rldicl r5, r5, 16, 48 -; P8BE-NEXT: mulhw r11, r8, r4 -; P8BE-NEXT: extsh r10, r7 -; P8BE-NEXT: extsh r5, r5 -; P8BE-NEXT: mulhw r12, r9, r4 -; P8BE-NEXT: mulhw r0, r10, r4 -; P8BE-NEXT: mulhw r4, r5, r4 -; P8BE-NEXT: add r8, r11, r8 -; P8BE-NEXT: add r9, r12, r9 -; P8BE-NEXT: srawi r11, r8, 6 -; P8BE-NEXT: srwi r8, r8, 31 -; P8BE-NEXT: add r10, r0, r10 -; P8BE-NEXT: add r4, r4, r5 -; P8BE-NEXT: add r8, r11, r8 -; P8BE-NEXT: srawi r12, r9, 6 -; P8BE-NEXT: srwi r9, r9, 31 -; P8BE-NEXT: srawi r0, r10, 6 -; P8BE-NEXT: srawi r11, r4, 6 -; P8BE-NEXT: srwi r10, r10, 31 -; P8BE-NEXT: add r9, r12, r9 -; P8BE-NEXT: srwi r4, r4, 31 -; P8BE-NEXT: mulli r12, r8, 95 -; P8BE-NEXT: add r10, r0, r10 -; P8BE-NEXT: add r4, r11, r4 -; P8BE-NEXT: mulli r0, r9, 95 -; P8BE-NEXT: sldi r9, r9, 48 -; P8BE-NEXT: sldi r8, r8, 48 -; P8BE-NEXT: mtvsrd v3, r9 -; P8BE-NEXT: mulli r9, r4, 95 -; P8BE-NEXT: mtvsrd v2, r8 -; P8BE-NEXT: mulli r8, r10, 95 -; P8BE-NEXT: sldi r10, r10, 48 -; P8BE-NEXT: sub r3, r3, r12 -; P8BE-NEXT: mtvsrd v4, r10 -; P8BE-NEXT: sub r6, r6, r0 -; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: vmrghh v2, v3, v2 -; P8BE-NEXT: sldi r6, r6, 48 -; P8BE-NEXT: mtvsrd v3, r3 -; P8BE-NEXT: sub r3, r5, r9 -; P8BE-NEXT: sub r7, r7, r8 -; P8BE-NEXT: mtvsrd v5, r6 -; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: sldi r5, r7, 48 -; P8BE-NEXT: mtvsrd v1, r3 -; P8BE-NEXT: sldi r3, r4, 48 -; P8BE-NEXT: mtvsrd v0, r5 -; P8BE-NEXT: vmrghh v3, v5, v3 -; P8BE-NEXT: mtvsrd v5, r3 -; P8BE-NEXT: vmrghh v0, v1, v0 -; P8BE-NEXT: vmrghh v4, v5, v4 -; P8BE-NEXT: vmrghw v3, v0, v3 -; P8BE-NEXT: vmrghw v2, v4, v2 -; P8BE-NEXT: vadduhm v2, v3, v2 +; P8BE-NEXT: vmrghh v3, v2, v2 +; P8BE-NEXT: vspltisw v4, 8 +; P8BE-NEXT: addis r3, r2, .LCPI2_1@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI2_1@toc@l +; P8BE-NEXT: lxvw4x v5, 0, r3 +; 
P8BE-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; P8BE-NEXT: vadduwm v4, v4, v4 +; P8BE-NEXT: addi r3, r3, .LCPI2_0@toc@l +; P8BE-NEXT: lxvw4x v0, 0, r3 +; P8BE-NEXT: vslw v3, v3, v4 +; P8BE-NEXT: vsraw v3, v3, v4 +; P8BE-NEXT: vmuluwm v3, v3, v5 +; P8BE-NEXT: vxor v5, v5, v5 +; P8BE-NEXT: vsrw v3, v3, v4 +; P8BE-NEXT: vspltish v4, 6 +; P8BE-NEXT: vpkuwum v3, v3, v3 +; P8BE-NEXT: vadduhm v3, v3, v2 +; P8BE-NEXT: vsrah v3, v3, v4 +; P8BE-NEXT: vspltish v4, 15 +; P8BE-NEXT: vsrh v4, v3, v4 +; P8BE-NEXT: vadduhm v3, v3, v4 +; P8BE-NEXT: vmladduhm v4, v3, v0, v5 +; P8BE-NEXT: vsubuhm v2, v2, v4 +; P8BE-NEXT: vadduhm v2, v2, v3 ; P8BE-NEXT: blr %1 = srem <4 x i16> %x, %2 = sdiv <4 x i16> %x, @@ -729,181 +370,116 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { ; P9LE-LABEL: dont_fold_srem_power_of_two: ; P9LE: # %bb.0: -; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: srawi r4, r3, 6 -; P9LE-NEXT: addze r4, r4 -; P9LE-NEXT: slwi r4, r4, 6 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: mtvsrd v3, r3 -; P9LE-NEXT: li r3, 2 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: srawi r4, r3, 5 -; P9LE-NEXT: addze r4, r4 -; P9LE-NEXT: slwi r4, r4, 5 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: lis r4, -21386 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 6 -; P9LE-NEXT: ori r4, r4, 37253 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: vmrghh v3, v4, v3 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: mulhw r4, r3, r4 -; P9LE-NEXT: add r4, r4, r3 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 6 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: mulli r4, r4, 95 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 4 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: srawi r4, r3, 3 -; P9LE-NEXT: addze r4, r4 -; P9LE-NEXT: slwi r4, r4, 3 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: mtvsrd v2, r3 -; P9LE-NEXT: vmrghh v2, v4, v2 -; P9LE-NEXT: vmrglw v2, v2, v3 +; P9LE-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; P9LE-NEXT: vmrglh v3, v2, v2 +; P9LE-NEXT: vxor v5, v5, v5 +; P9LE-NEXT: addi r3, r3, .LCPI3_0@toc@l +; P9LE-NEXT: vextsh2w v3, v3 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI3_1@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI3_1@toc@l +; P9LE-NEXT: vmuluwm v3, v3, v4 +; P9LE-NEXT: vspltisw v4, 8 +; P9LE-NEXT: vadduwm v4, v4, v4 +; P9LE-NEXT: vsrw v3, v3, v4 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI3_2@toc@ha +; P9LE-NEXT: vpkuwum v3, v3, v3 +; P9LE-NEXT: addi r3, r3, .LCPI3_2@toc@l +; P9LE-NEXT: vadduhm v3, v3, v2 +; P9LE-NEXT: vsrah v3, v3, v4 +; P9LE-NEXT: vspltish v4, 15 +; P9LE-NEXT: vsrh v4, v3, v4 +; P9LE-NEXT: vadduhm v3, v3, v4 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: vmladduhm v3, v3, v4, v5 +; P9LE-NEXT: vsubuhm v2, v2, v3 ; P9LE-NEXT: blr ; ; P9BE-LABEL: dont_fold_srem_power_of_two: ; P9BE: # %bb.0: -; P9BE-NEXT: li r3, 2 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: srawi r4, r3, 5 -; P9BE-NEXT: addze r4, r4 -; P9BE-NEXT: slwi r4, r4, 5 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v3, r3 -; P9BE-NEXT: li r3, 0 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: srawi r4, r3, 6 -; P9BE-NEXT: addze r4, r4 -; P9BE-NEXT: slwi r4, r4, 6 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: lis r4, -21386 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: ori r4, r4, 37253 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: vmrghh v3, 
v4, v3 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: mulhw r4, r3, r4 -; P9BE-NEXT: add r4, r4, r3 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 6 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 95 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 4 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: srawi r4, r3, 3 -; P9BE-NEXT: addze r4, r4 -; P9BE-NEXT: slwi r4, r4, 3 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v2, r3 -; P9BE-NEXT: vmrghh v2, v2, v4 -; P9BE-NEXT: vmrghw v2, v3, v2 +; P9BE-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; P9BE-NEXT: vmrghh v3, v2, v2 +; P9BE-NEXT: vxor v5, v5, v5 +; P9BE-NEXT: addi r3, r3, .LCPI3_0@toc@l +; P9BE-NEXT: vextsh2w v3, v3 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI3_1@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI3_1@toc@l +; P9BE-NEXT: vmuluwm v3, v3, v4 +; P9BE-NEXT: vspltisw v4, 8 +; P9BE-NEXT: vadduwm v4, v4, v4 +; P9BE-NEXT: vsrw v3, v3, v4 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI3_2@toc@ha +; P9BE-NEXT: vpkuwum v3, v3, v3 +; P9BE-NEXT: addi r3, r3, .LCPI3_2@toc@l +; P9BE-NEXT: vadduhm v3, v3, v2 +; P9BE-NEXT: vsrah v3, v3, v4 +; P9BE-NEXT: vspltish v4, 15 +; P9BE-NEXT: vsrh v4, v3, v4 +; P9BE-NEXT: vadduhm v3, v3, v4 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: vmladduhm v3, v3, v4, v5 +; P9BE-NEXT: vsubuhm v2, v2, v3 ; P9BE-NEXT: blr ; ; P8LE-LABEL: dont_fold_srem_power_of_two: ; P8LE: # %bb.0: -; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r3, -21386 -; P8LE-NEXT: ori r3, r3, 37253 -; P8LE-NEXT: mffprd r4, f0 -; P8LE-NEXT: rldicl r5, r4, 16, 48 -; P8LE-NEXT: clrldi r6, r4, 48 -; P8LE-NEXT: extsh r5, r5 -; P8LE-NEXT: extsh r6, r6 -; P8LE-NEXT: mulhw r3, r5, r3 -; P8LE-NEXT: rldicl r7, r4, 48, 48 -; P8LE-NEXT: srawi r8, r6, 6 -; P8LE-NEXT: extsh r7, r7 -; P8LE-NEXT: addze r8, r8 -; P8LE-NEXT: rldicl r4, r4, 32, 48 -; P8LE-NEXT: srawi r9, r7, 5 -; P8LE-NEXT: extsh r4, r4 -; P8LE-NEXT: slwi r8, r8, 6 -; P8LE-NEXT: add r3, r3, r5 -; P8LE-NEXT: addze r9, r9 -; P8LE-NEXT: sub r6, r6, r8 -; P8LE-NEXT: srwi r10, r3, 31 -; P8LE-NEXT: srawi r3, r3, 6 -; P8LE-NEXT: slwi r8, r9, 5 -; P8LE-NEXT: mtvsrd v2, r6 -; P8LE-NEXT: add r3, r3, r10 -; P8LE-NEXT: srawi r9, r4, 3 -; P8LE-NEXT: sub r6, r7, r8 -; P8LE-NEXT: mulli r3, r3, 95 -; P8LE-NEXT: addze r7, r9 -; P8LE-NEXT: mtvsrd v3, r6 -; P8LE-NEXT: vmrghh v2, v3, v2 -; P8LE-NEXT: sub r3, r5, r3 -; P8LE-NEXT: slwi r5, r7, 3 -; P8LE-NEXT: sub r4, r4, r5 -; P8LE-NEXT: mtvsrd v4, r3 -; P8LE-NEXT: mtvsrd v5, r4 -; P8LE-NEXT: vmrghh v3, v4, v5 -; P8LE-NEXT: vmrglw v2, v3, v2 +; P8LE-NEXT: vmrglh v3, v2, v2 +; P8LE-NEXT: vspltisw v4, 8 +; P8LE-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI3_0@toc@l +; P8LE-NEXT: lvx v5, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI3_1@toc@ha +; P8LE-NEXT: vadduwm v4, v4, v4 +; P8LE-NEXT: addi r3, r3, .LCPI3_1@toc@l +; P8LE-NEXT: vslw v3, v3, v4 +; P8LE-NEXT: vsraw v3, v3, v4 +; P8LE-NEXT: vmuluwm v3, v3, v5 +; P8LE-NEXT: vxor v5, v5, v5 +; P8LE-NEXT: vsrw v3, v3, v4 +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI3_2@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI3_2@toc@l +; P8LE-NEXT: vpkuwum v3, v3, v3 +; P8LE-NEXT: vadduhm v3, v3, v2 +; P8LE-NEXT: vsrah v3, v3, v4 +; P8LE-NEXT: vspltish v4, 15 +; P8LE-NEXT: vsrh v4, v3, v4 +; P8LE-NEXT: vadduhm v3, v3, v4 +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: vmladduhm v3, v3, v4, v5 +; P8LE-NEXT: vsubuhm v2, v2, v3 ; P8LE-NEXT: blr ; ; P8BE-LABEL: 
dont_fold_srem_power_of_two: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, -21386 -; P8BE-NEXT: ori r3, r3, 37253 -; P8BE-NEXT: clrldi r5, r4, 48 -; P8BE-NEXT: rldicl r6, r4, 32, 48 -; P8BE-NEXT: extsh r5, r5 -; P8BE-NEXT: extsh r6, r6 -; P8BE-NEXT: mulhw r3, r5, r3 -; P8BE-NEXT: rldicl r7, r4, 16, 48 -; P8BE-NEXT: srawi r8, r6, 5 -; P8BE-NEXT: extsh r7, r7 -; P8BE-NEXT: addze r8, r8 -; P8BE-NEXT: rldicl r4, r4, 48, 48 -; P8BE-NEXT: srawi r9, r7, 6 -; P8BE-NEXT: extsh r4, r4 -; P8BE-NEXT: slwi r8, r8, 5 -; P8BE-NEXT: add r3, r3, r5 -; P8BE-NEXT: addze r9, r9 -; P8BE-NEXT: sub r6, r6, r8 -; P8BE-NEXT: srwi r10, r3, 31 -; P8BE-NEXT: srawi r3, r3, 6 -; P8BE-NEXT: slwi r8, r9, 6 -; P8BE-NEXT: add r3, r3, r10 -; P8BE-NEXT: srawi r9, r4, 3 -; P8BE-NEXT: sub r7, r7, r8 -; P8BE-NEXT: mulli r3, r3, 95 -; P8BE-NEXT: sldi r6, r6, 48 -; P8BE-NEXT: addze r8, r9 -; P8BE-NEXT: mtvsrd v2, r6 -; P8BE-NEXT: slwi r6, r8, 3 -; P8BE-NEXT: sub r4, r4, r6 -; P8BE-NEXT: sldi r4, r4, 48 -; P8BE-NEXT: sub r3, r5, r3 -; P8BE-NEXT: sldi r5, r7, 48 -; P8BE-NEXT: mtvsrd v5, r4 -; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: mtvsrd v3, r5 -; P8BE-NEXT: mtvsrd v4, r3 -; P8BE-NEXT: vmrghh v2, v3, v2 -; P8BE-NEXT: vmrghh v3, v5, v4 -; P8BE-NEXT: vmrghw v2, v2, v3 +; P8BE-NEXT: vmrghh v3, v2, v2 +; P8BE-NEXT: vspltisw v4, 8 +; P8BE-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI3_0@toc@l +; P8BE-NEXT: lxvw4x v5, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI3_1@toc@ha +; P8BE-NEXT: vadduwm v4, v4, v4 +; P8BE-NEXT: addi r3, r3, .LCPI3_1@toc@l +; P8BE-NEXT: vslw v3, v3, v4 +; P8BE-NEXT: vsraw v3, v3, v4 +; P8BE-NEXT: vmuluwm v3, v3, v5 +; P8BE-NEXT: vxor v5, v5, v5 +; P8BE-NEXT: vsrw v3, v3, v4 +; P8BE-NEXT: lxvw4x v4, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI3_2@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI3_2@toc@l +; P8BE-NEXT: lxvw4x v0, 0, r3 +; P8BE-NEXT: vpkuwum v3, v3, v3 +; P8BE-NEXT: vadduhm v3, v3, v2 +; P8BE-NEXT: vsrah v3, v3, v4 +; P8BE-NEXT: vspltish v4, 15 +; P8BE-NEXT: vsrh v4, v3, v4 +; P8BE-NEXT: vadduhm v3, v3, v4 +; P8BE-NEXT: vmladduhm v3, v3, v0, v5 +; P8BE-NEXT: vsubuhm v2, v2, v3 ; P8BE-NEXT: blr %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -913,195 +489,146 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { ; P9LE-LABEL: dont_fold_srem_one: ; P9LE: # %bb.0: -; P9LE-NEXT: li r3, 2 -; P9LE-NEXT: lis r4, -14230 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: ori r4, r4, 30865 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: mulhw r4, r3, r4 -; P9LE-NEXT: add r4, r4, r3 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 9 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: mulli r4, r4, 654 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: lis r4, -19946 -; P9LE-NEXT: mtvsrd v3, r3 -; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: ori r4, r4, 17097 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 4 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: vmrghh v3, v3, v4 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: mulhw r4, r3, r4 -; P9LE-NEXT: add r4, r4, r3 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 4 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: mulli r4, r4, 23 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: lis r4, 24749 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 6 -; P9LE-NEXT: ori r4, r4, 47143 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: mulhw r4, r3, r4 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 11 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: mulli r4, r4, 5423 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: mtvsrd v2, r3 -; P9LE-NEXT: vmrghh v2, v2, v4 -; 
P9LE-NEXT: vmrglw v2, v2, v3 +; P9LE-NEXT: addis r3, r2, .LCPI4_0@toc@ha +; P9LE-NEXT: vmrglh v5, v2, v2 +; P9LE-NEXT: vspltisw v3, -16 +; P9LE-NEXT: addi r3, r3, .LCPI4_0@toc@l +; P9LE-NEXT: vextsh2w v5, v5 +; P9LE-NEXT: vsrw v3, v3, v3 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI4_1@toc@ha +; P9LE-NEXT: xxland v3, v2, v3 +; P9LE-NEXT: addi r3, r3, .LCPI4_1@toc@l +; P9LE-NEXT: vmuluwm v4, v5, v4 +; P9LE-NEXT: vspltisw v5, 8 +; P9LE-NEXT: vadduwm v5, v5, v5 +; P9LE-NEXT: vsrw v4, v4, v5 +; P9LE-NEXT: vxor v5, v5, v5 +; P9LE-NEXT: vpkuwum v4, v4, v4 +; P9LE-NEXT: vadduhm v3, v4, v3 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI4_2@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI4_2@toc@l +; P9LE-NEXT: lxvx vs0, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI4_3@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI4_3@toc@l +; P9LE-NEXT: vsrah v3, v3, v4 +; P9LE-NEXT: vspltish v4, 15 +; P9LE-NEXT: vsrh v4, v3, v4 +; P9LE-NEXT: xxland v4, v4, vs0 +; P9LE-NEXT: vadduhm v3, v3, v4 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: vmladduhm v3, v3, v4, v5 +; P9LE-NEXT: vsubuhm v2, v2, v3 ; P9LE-NEXT: blr ; ; P9BE-LABEL: dont_fold_srem_one: ; P9BE: # %bb.0: -; P9BE-NEXT: li r3, 4 -; P9BE-NEXT: lis r4, -19946 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: ori r4, r4, 17097 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: mulhw r4, r3, r4 -; P9BE-NEXT: add r4, r4, r3 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 4 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 23 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: lis r4, 24749 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: ori r4, r4, 47143 -; P9BE-NEXT: mtvsrd v3, r3 -; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: mulhw r4, r3, r4 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 11 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 5423 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: lis r4, -14230 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: ori r4, r4, 30865 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 2 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: vmrghh v3, v3, v4 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: mulhw r4, r3, r4 -; P9BE-NEXT: add r4, r4, r3 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 9 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 654 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v2, r3 -; P9BE-NEXT: li r3, 0 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: vmrghh v2, v4, v2 -; P9BE-NEXT: vmrghw v2, v2, v3 +; P9BE-NEXT: addis r3, r2, .LCPI4_0@toc@ha +; P9BE-NEXT: vmrghh v5, v2, v2 +; P9BE-NEXT: addi r3, r3, .LCPI4_0@toc@l +; P9BE-NEXT: vextsh2w v5, v5 +; P9BE-NEXT: lxvx vs0, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI4_1@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI4_1@toc@l +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI4_2@toc@ha +; P9BE-NEXT: xxland v3, v2, vs0 +; P9BE-NEXT: addi r3, r3, .LCPI4_2@toc@l +; P9BE-NEXT: vmuluwm v4, v5, v4 +; P9BE-NEXT: vspltisw v5, 8 +; P9BE-NEXT: vadduwm v5, v5, v5 +; P9BE-NEXT: vsrw v4, v4, v5 +; P9BE-NEXT: vxor v5, v5, v5 +; P9BE-NEXT: vpkuwum v4, v4, v4 +; P9BE-NEXT: vadduhm v3, v4, v3 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI4_3@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI4_3@toc@l +; P9BE-NEXT: lxvx vs0, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI4_4@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI4_4@toc@l +; P9BE-NEXT: vsrah v3, v3, v4 +; P9BE-NEXT: vspltish v4, 15 +; P9BE-NEXT: vsrh v4, v3, v4 +; P9BE-NEXT: xxland v4, v4, vs0 +; P9BE-NEXT: 
vadduhm v3, v3, v4 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: vmladduhm v3, v3, v4, v5 +; P9BE-NEXT: vsubuhm v2, v2, v3 ; P9BE-NEXT: blr ; ; P8LE-LABEL: dont_fold_srem_one: ; P8LE: # %bb.0: -; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r5, 24749 -; P8LE-NEXT: lis r6, -19946 -; P8LE-NEXT: lis r8, -14230 -; P8LE-NEXT: ori r5, r5, 47143 -; P8LE-NEXT: ori r6, r6, 17097 -; P8LE-NEXT: ori r8, r8, 30865 -; P8LE-NEXT: mffprd r3, f0 -; P8LE-NEXT: rldicl r4, r3, 16, 48 -; P8LE-NEXT: rldicl r7, r3, 32, 48 -; P8LE-NEXT: rldicl r3, r3, 48, 48 -; P8LE-NEXT: extsh r4, r4 -; P8LE-NEXT: extsh r7, r7 -; P8LE-NEXT: extsh r3, r3 -; P8LE-NEXT: mulhw r5, r4, r5 -; P8LE-NEXT: mulhw r6, r7, r6 -; P8LE-NEXT: mulhw r8, r3, r8 -; P8LE-NEXT: srwi r9, r5, 31 -; P8LE-NEXT: srawi r5, r5, 11 -; P8LE-NEXT: add r6, r6, r7 -; P8LE-NEXT: add r8, r8, r3 -; P8LE-NEXT: add r5, r5, r9 -; P8LE-NEXT: srwi r9, r6, 31 -; P8LE-NEXT: srawi r6, r6, 4 -; P8LE-NEXT: add r6, r6, r9 -; P8LE-NEXT: srwi r9, r8, 31 -; P8LE-NEXT: srawi r8, r8, 9 -; P8LE-NEXT: mulli r5, r5, 5423 -; P8LE-NEXT: add r8, r8, r9 -; P8LE-NEXT: mulli r6, r6, 23 -; P8LE-NEXT: li r9, 0 -; P8LE-NEXT: mulli r8, r8, 654 -; P8LE-NEXT: mtvsrd v2, r9 -; P8LE-NEXT: sub r4, r4, r5 -; P8LE-NEXT: sub r5, r7, r6 -; P8LE-NEXT: mtvsrd v3, r4 -; P8LE-NEXT: sub r3, r3, r8 -; P8LE-NEXT: mtvsrd v4, r5 -; P8LE-NEXT: mtvsrd v5, r3 -; P8LE-NEXT: vmrghh v3, v3, v4 -; P8LE-NEXT: vmrghh v2, v5, v2 -; P8LE-NEXT: vmrglw v2, v3, v2 +; P8LE-NEXT: vmrglh v3, v2, v2 +; P8LE-NEXT: vspltisw v4, 8 +; P8LE-NEXT: addis r3, r2, .LCPI4_0@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI4_0@toc@l +; P8LE-NEXT: vxor v0, v0, v0 +; P8LE-NEXT: lvx v5, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI4_1@toc@ha +; P8LE-NEXT: vadduwm v4, v4, v4 +; P8LE-NEXT: addi r3, r3, .LCPI4_1@toc@l +; P8LE-NEXT: vslw v3, v3, v4 +; P8LE-NEXT: vsraw v3, v3, v4 +; P8LE-NEXT: vmuluwm v3, v3, v5 +; P8LE-NEXT: vsrw v3, v3, v4 +; P8LE-NEXT: vspltisw v4, -16 +; P8LE-NEXT: vsrw v4, v4, v4 +; P8LE-NEXT: vpkuwum v3, v3, v3 +; P8LE-NEXT: xxland v4, v2, v4 +; P8LE-NEXT: vadduhm v3, v3, v4 +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI4_2@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI4_2@toc@l +; P8LE-NEXT: vsrah v3, v3, v4 +; P8LE-NEXT: vspltish v4, 15 +; P8LE-NEXT: lvx v5, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI4_3@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI4_3@toc@l +; P8LE-NEXT: vsrh v4, v3, v4 +; P8LE-NEXT: xxland v4, v4, v5 +; P8LE-NEXT: vadduhm v3, v3, v4 +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: vmladduhm v3, v3, v4, v0 +; P8LE-NEXT: vsubuhm v2, v2, v3 ; P8LE-NEXT: blr ; ; P8BE-LABEL: dont_fold_srem_one: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r3, v2 -; P8BE-NEXT: lis r5, 24749 -; P8BE-NEXT: lis r6, -19946 -; P8BE-NEXT: lis r8, -14230 -; P8BE-NEXT: ori r5, r5, 47143 -; P8BE-NEXT: ori r6, r6, 17097 -; P8BE-NEXT: ori r8, r8, 30865 -; P8BE-NEXT: clrldi r4, r3, 48 -; P8BE-NEXT: rldicl r7, r3, 48, 48 -; P8BE-NEXT: rldicl r3, r3, 32, 48 -; P8BE-NEXT: extsh r4, r4 -; P8BE-NEXT: extsh r7, r7 -; P8BE-NEXT: extsh r3, r3 -; P8BE-NEXT: mulhw r5, r4, r5 -; P8BE-NEXT: mulhw r6, r7, r6 -; P8BE-NEXT: mulhw r8, r3, r8 -; P8BE-NEXT: srwi r9, r5, 31 -; P8BE-NEXT: srawi r5, r5, 11 -; P8BE-NEXT: add r6, r6, r7 -; P8BE-NEXT: add r8, r8, r3 -; P8BE-NEXT: add r5, r5, r9 -; P8BE-NEXT: srwi r9, r6, 31 -; P8BE-NEXT: srawi r6, r6, 4 -; P8BE-NEXT: add r6, r6, r9 -; P8BE-NEXT: srwi r9, r8, 31 -; P8BE-NEXT: srawi r8, r8, 9 -; P8BE-NEXT: mulli r5, r5, 5423 -; P8BE-NEXT: add r8, r8, r9 -; P8BE-NEXT: mulli r6, r6, 23 -; P8BE-NEXT: li r9, 0 -; P8BE-NEXT: mulli r8, r8, 654 -; 
P8BE-NEXT: sub r4, r4, r5 -; P8BE-NEXT: sldi r5, r9, 48 -; P8BE-NEXT: mtvsrd v2, r5 -; P8BE-NEXT: sub r5, r7, r6 -; P8BE-NEXT: sldi r4, r4, 48 -; P8BE-NEXT: sub r3, r3, r8 -; P8BE-NEXT: mtvsrd v3, r4 -; P8BE-NEXT: sldi r4, r5, 48 -; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: mtvsrd v4, r4 -; P8BE-NEXT: mtvsrd v5, r3 -; P8BE-NEXT: vmrghh v3, v4, v3 -; P8BE-NEXT: vmrghh v2, v2, v5 -; P8BE-NEXT: vmrghw v2, v2, v3 +; P8BE-NEXT: vmrghh v3, v2, v2 +; P8BE-NEXT: vspltisw v4, 8 +; P8BE-NEXT: addis r3, r2, .LCPI4_1@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI4_1@toc@l +; P8BE-NEXT: lxvw4x v5, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI4_0@toc@ha +; P8BE-NEXT: vadduwm v4, v4, v4 +; P8BE-NEXT: addi r3, r3, .LCPI4_0@toc@l +; P8BE-NEXT: lxvw4x vs0, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI4_2@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI4_2@toc@l +; P8BE-NEXT: vslw v3, v3, v4 +; P8BE-NEXT: vsraw v3, v3, v4 +; P8BE-NEXT: vmuluwm v3, v3, v5 +; P8BE-NEXT: lxvw4x v5, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI4_3@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI4_3@toc@l +; P8BE-NEXT: vsrw v3, v3, v4 +; P8BE-NEXT: xxland v4, v2, vs0 +; P8BE-NEXT: lxvw4x vs0, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI4_4@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI4_4@toc@l +; P8BE-NEXT: lxvw4x v0, 0, r3 +; P8BE-NEXT: vpkuwum v3, v3, v3 +; P8BE-NEXT: vadduhm v3, v3, v4 +; P8BE-NEXT: vspltish v4, 15 +; P8BE-NEXT: vsrah v3, v3, v5 +; P8BE-NEXT: vxor v5, v5, v5 +; P8BE-NEXT: vsrh v4, v3, v4 +; P8BE-NEXT: xxland v4, v4, vs0 +; P8BE-NEXT: vadduhm v3, v3, v4 +; P8BE-NEXT: vmladduhm v3, v3, v0, v5 +; P8BE-NEXT: vsubuhm v2, v2, v3 ; P8BE-NEXT: blr %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -1111,175 +638,144 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { ; P9LE-LABEL: dont_fold_urem_i16_smax: ; P9LE: # %bb.0: -; P9LE-NEXT: li r3, 4 -; P9LE-NEXT: lis r4, -19946 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: ori r4, r4, 17097 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: mulhw r4, r3, r4 -; P9LE-NEXT: add r4, r4, r3 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 4 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: mulli r4, r4, 23 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: lis r4, 24749 -; P9LE-NEXT: mtvsrd v3, r3 -; P9LE-NEXT: li r3, 6 -; P9LE-NEXT: ori r4, r4, 47143 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: mulhw r4, r3, r4 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 11 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: mulli r4, r4, 5423 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 2 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: vmrghh v3, v4, v3 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: srawi r4, r3, 15 -; P9LE-NEXT: addze r4, r4 -; P9LE-NEXT: slwi r4, r4, 15 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: mtvsrd v2, r3 -; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: vmrghh v2, v2, v4 -; P9LE-NEXT: vmrglw v2, v3, v2 +; P9LE-NEXT: addis r3, r2, .LCPI5_0@toc@ha +; P9LE-NEXT: vmrglh v3, v2, v2 +; P9LE-NEXT: vxor v5, v5, v5 +; P9LE-NEXT: addi r3, r3, .LCPI5_0@toc@l +; P9LE-NEXT: vextsh2w v3, v3 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI5_1@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI5_1@toc@l +; P9LE-NEXT: vmuluwm v3, v3, v4 +; P9LE-NEXT: vspltisw v4, 8 +; P9LE-NEXT: vadduwm v4, v4, v4 +; P9LE-NEXT: vsrw v3, v3, v4 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI5_2@toc@ha +; P9LE-NEXT: vpkuwum v3, v3, v3 +; P9LE-NEXT: addi r3, r3, .LCPI5_2@toc@l +; P9LE-NEXT: vmladduhm v3, v2, v4, v3 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: addis r3, r2, 
.LCPI5_3@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI5_3@toc@l +; P9LE-NEXT: lxvx vs0, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI5_4@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI5_4@toc@l +; P9LE-NEXT: vsrah v3, v3, v4 +; P9LE-NEXT: vspltish v4, 15 +; P9LE-NEXT: vsrh v4, v3, v4 +; P9LE-NEXT: xxland v4, v4, vs0 +; P9LE-NEXT: vadduhm v3, v3, v4 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: vmladduhm v3, v3, v4, v5 +; P9LE-NEXT: vsubuhm v2, v2, v3 ; P9LE-NEXT: blr ; ; P9BE-LABEL: dont_fold_urem_i16_smax: ; P9BE: # %bb.0: -; P9BE-NEXT: li r3, 4 -; P9BE-NEXT: lis r4, -19946 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: ori r4, r4, 17097 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: mulhw r4, r3, r4 -; P9BE-NEXT: add r4, r4, r3 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 4 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 23 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: lis r4, 24749 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: ori r4, r4, 47143 -; P9BE-NEXT: mtvsrd v3, r3 -; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: mulhw r4, r3, r4 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 11 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 5423 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 2 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: vmrghh v3, v3, v4 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: srawi r4, r3, 15 -; P9BE-NEXT: addze r4, r4 -; P9BE-NEXT: slwi r4, r4, 15 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v2, r3 -; P9BE-NEXT: li r3, 0 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: vmrghh v2, v4, v2 -; P9BE-NEXT: vmrghw v2, v2, v3 +; P9BE-NEXT: addis r3, r2, .LCPI5_0@toc@ha +; P9BE-NEXT: vmrghh v3, v2, v2 +; P9BE-NEXT: vxor v5, v5, v5 +; P9BE-NEXT: addi r3, r3, .LCPI5_0@toc@l +; P9BE-NEXT: vextsh2w v3, v3 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI5_1@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI5_1@toc@l +; P9BE-NEXT: vmuluwm v3, v3, v4 +; P9BE-NEXT: vspltisw v4, 8 +; P9BE-NEXT: vadduwm v4, v4, v4 +; P9BE-NEXT: vsrw v3, v3, v4 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI5_2@toc@ha +; P9BE-NEXT: vpkuwum v3, v3, v3 +; P9BE-NEXT: addi r3, r3, .LCPI5_2@toc@l +; P9BE-NEXT: vmladduhm v3, v2, v4, v3 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI5_3@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI5_3@toc@l +; P9BE-NEXT: lxvx vs0, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI5_4@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI5_4@toc@l +; P9BE-NEXT: vsrah v3, v3, v4 +; P9BE-NEXT: vspltish v4, 15 +; P9BE-NEXT: vsrh v4, v3, v4 +; P9BE-NEXT: xxland v4, v4, vs0 +; P9BE-NEXT: vadduhm v3, v3, v4 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: vmladduhm v3, v3, v4, v5 +; P9BE-NEXT: vsubuhm v2, v2, v3 ; P9BE-NEXT: blr ; ; P8LE-LABEL: dont_fold_urem_i16_smax: ; P8LE: # %bb.0: -; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r4, 24749 -; P8LE-NEXT: lis r5, -19946 -; P8LE-NEXT: ori r4, r4, 47143 -; P8LE-NEXT: ori r5, r5, 17097 -; P8LE-NEXT: mffprd r3, f0 -; P8LE-NEXT: rldicl r6, r3, 16, 48 -; P8LE-NEXT: rldicl r7, r3, 32, 48 -; P8LE-NEXT: extsh r6, r6 -; P8LE-NEXT: extsh r7, r7 -; P8LE-NEXT: mulhw r4, r6, r4 -; P8LE-NEXT: mulhw r5, r7, r5 -; P8LE-NEXT: rldicl r3, r3, 48, 48 -; P8LE-NEXT: extsh r3, r3 -; P8LE-NEXT: srwi r8, r4, 31 -; P8LE-NEXT: srawi r4, r4, 11 -; P8LE-NEXT: add r5, r5, r7 -; P8LE-NEXT: add r4, r4, r8 -; P8LE-NEXT: srwi r8, r5, 31 -; P8LE-NEXT: srawi r5, r5, 4 -; P8LE-NEXT: mulli r4, r4, 5423 -; 
P8LE-NEXT: add r5, r5, r8 -; P8LE-NEXT: srawi r9, r3, 15 -; P8LE-NEXT: li r8, 0 -; P8LE-NEXT: mulli r5, r5, 23 -; P8LE-NEXT: mtvsrd v2, r8 -; P8LE-NEXT: sub r4, r6, r4 -; P8LE-NEXT: addze r6, r9 -; P8LE-NEXT: slwi r6, r6, 15 -; P8LE-NEXT: mtvsrd v3, r4 -; P8LE-NEXT: sub r5, r7, r5 -; P8LE-NEXT: sub r3, r3, r6 -; P8LE-NEXT: mtvsrd v4, r5 -; P8LE-NEXT: mtvsrd v5, r3 -; P8LE-NEXT: vmrghh v3, v3, v4 -; P8LE-NEXT: vmrghh v2, v5, v2 -; P8LE-NEXT: vmrglw v2, v3, v2 +; P8LE-NEXT: vmrglh v3, v2, v2 +; P8LE-NEXT: vspltisw v4, 8 +; P8LE-NEXT: addis r3, r2, .LCPI5_0@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI5_0@toc@l +; P8LE-NEXT: vxor v0, v0, v0 +; P8LE-NEXT: lvx v5, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI5_1@toc@ha +; P8LE-NEXT: vadduwm v4, v4, v4 +; P8LE-NEXT: addi r3, r3, .LCPI5_1@toc@l +; P8LE-NEXT: vslw v3, v3, v4 +; P8LE-NEXT: vsraw v3, v3, v4 +; P8LE-NEXT: vmuluwm v3, v3, v5 +; P8LE-NEXT: vsrw v3, v3, v4 +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI5_2@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI5_2@toc@l +; P8LE-NEXT: vpkuwum v3, v3, v3 +; P8LE-NEXT: vmladduhm v3, v2, v4, v3 +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI5_3@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI5_3@toc@l +; P8LE-NEXT: lvx v5, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI5_4@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI5_4@toc@l +; P8LE-NEXT: vsrah v3, v3, v4 +; P8LE-NEXT: vspltish v4, 15 +; P8LE-NEXT: vsrh v4, v3, v4 +; P8LE-NEXT: xxland v4, v4, v5 +; P8LE-NEXT: vadduhm v3, v3, v4 +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: vmladduhm v3, v3, v4, v0 +; P8LE-NEXT: vsubuhm v2, v2, v3 ; P8LE-NEXT: blr ; ; P8BE-LABEL: dont_fold_urem_i16_smax: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r3, v2 -; P8BE-NEXT: lis r4, 24749 -; P8BE-NEXT: lis r5, -19946 -; P8BE-NEXT: ori r4, r4, 47143 -; P8BE-NEXT: ori r5, r5, 17097 -; P8BE-NEXT: clrldi r6, r3, 48 -; P8BE-NEXT: rldicl r7, r3, 48, 48 -; P8BE-NEXT: extsh r6, r6 -; P8BE-NEXT: extsh r7, r7 -; P8BE-NEXT: mulhw r4, r6, r4 -; P8BE-NEXT: mulhw r5, r7, r5 -; P8BE-NEXT: rldicl r3, r3, 32, 48 -; P8BE-NEXT: extsh r3, r3 -; P8BE-NEXT: srwi r8, r4, 31 -; P8BE-NEXT: srawi r4, r4, 11 -; P8BE-NEXT: add r5, r5, r7 -; P8BE-NEXT: add r4, r4, r8 -; P8BE-NEXT: srwi r8, r5, 31 -; P8BE-NEXT: srawi r5, r5, 4 -; P8BE-NEXT: mulli r4, r4, 5423 -; P8BE-NEXT: add r5, r5, r8 -; P8BE-NEXT: li r8, 0 -; P8BE-NEXT: mulli r5, r5, 23 -; P8BE-NEXT: srawi r9, r3, 15 -; P8BE-NEXT: sub r4, r6, r4 -; P8BE-NEXT: sldi r6, r8, 48 -; P8BE-NEXT: addze r8, r9 -; P8BE-NEXT: mtvsrd v2, r6 -; P8BE-NEXT: slwi r6, r8, 15 -; P8BE-NEXT: sldi r4, r4, 48 -; P8BE-NEXT: sub r5, r7, r5 -; P8BE-NEXT: sub r3, r3, r6 -; P8BE-NEXT: mtvsrd v3, r4 -; P8BE-NEXT: sldi r4, r5, 48 -; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: mtvsrd v4, r4 -; P8BE-NEXT: mtvsrd v5, r3 -; P8BE-NEXT: vmrghh v3, v4, v3 -; P8BE-NEXT: vmrghh v2, v2, v5 -; P8BE-NEXT: vmrghw v2, v2, v3 +; P8BE-NEXT: vmrghh v3, v2, v2 +; P8BE-NEXT: vspltisw v4, 8 +; P8BE-NEXT: addis r3, r2, .LCPI5_0@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI5_0@toc@l +; P8BE-NEXT: lxvw4x v5, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI5_1@toc@ha +; P8BE-NEXT: vadduwm v4, v4, v4 +; P8BE-NEXT: addi r3, r3, .LCPI5_1@toc@l +; P8BE-NEXT: vslw v3, v3, v4 +; P8BE-NEXT: vsraw v3, v3, v4 +; P8BE-NEXT: vmuluwm v3, v3, v5 +; P8BE-NEXT: vxor v5, v5, v5 +; P8BE-NEXT: vsrw v3, v3, v4 +; P8BE-NEXT: lxvw4x v4, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI5_2@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI5_2@toc@l +; P8BE-NEXT: vpkuwum v3, v3, v3 +; P8BE-NEXT: vmladduhm v3, v2, v4, v3 +; P8BE-NEXT: lxvw4x v4, 0, r3 +; P8BE-NEXT: addis r3, r2, 
.LCPI5_3@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI5_3@toc@l +; P8BE-NEXT: lxvw4x vs0, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI5_4@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI5_4@toc@l +; P8BE-NEXT: lxvw4x v0, 0, r3 +; P8BE-NEXT: vsrah v3, v3, v4 +; P8BE-NEXT: vspltish v4, 15 +; P8BE-NEXT: vsrh v4, v3, v4 +; P8BE-NEXT: xxland v4, v4, vs0 +; P8BE-NEXT: vadduhm v3, v3, v4 +; P8BE-NEXT: vmladduhm v3, v3, v0, v5 +; P8BE-NEXT: vsubuhm v2, v2, v3 ; P8BE-NEXT: blr %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -1289,89 +785,182 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) { ; P9LE-LABEL: dont_fold_srem_i64: ; P9LE: # %bb.0: -; P9LE-NEXT: lis r4, 24749 -; P9LE-NEXT: mfvsrd r3, v3 -; P9LE-NEXT: ori r4, r4, 47142 -; P9LE-NEXT: sldi r4, r4, 32 -; P9LE-NEXT: oris r4, r4, 58853 -; P9LE-NEXT: ori r4, r4, 6055 -; P9LE-NEXT: mulhd r4, r3, r4 -; P9LE-NEXT: rldicl r5, r4, 1, 63 -; P9LE-NEXT: sradi r4, r4, 11 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: lis r5, -19946 -; P9LE-NEXT: mulli r4, r4, 5423 -; P9LE-NEXT: ori r5, r5, 17096 -; P9LE-NEXT: sldi r5, r5, 32 -; P9LE-NEXT: oris r5, r5, 22795 -; P9LE-NEXT: sub r3, r3, r4 +; P9LE-NEXT: lis r3, 24749 +; P9LE-NEXT: mfvsrd r4, v3 +; P9LE-NEXT: ori r3, r3, 47142 +; P9LE-NEXT: sradi r5, r4, 63 +; P9LE-NEXT: sldi r3, r3, 32 +; P9LE-NEXT: oris r3, r3, 58853 +; P9LE-NEXT: ori r3, r3, 6055 +; P9LE-NEXT: mulhdu r6, r4, r3 +; P9LE-NEXT: maddld r5, r5, r3, r6 +; P9LE-NEXT: lis r6, -19946 +; P9LE-NEXT: mulld r3, r4, r3 ; P9LE-NEXT: mfvsrld r4, v3 +; P9LE-NEXT: ori r6, r6, 17096 +; P9LE-NEXT: sldi r6, r6, 32 +; P9LE-NEXT: mtvsrdd v4, r5, r3 +; P9LE-NEXT: sradi r3, r4, 63 +; P9LE-NEXT: oris r5, r6, 22795 ; P9LE-NEXT: ori r5, r5, 8549 -; P9LE-NEXT: mulhd r5, r4, r5 -; P9LE-NEXT: add r5, r5, r4 -; P9LE-NEXT: rldicl r6, r5, 1, 63 -; P9LE-NEXT: sradi r5, r5, 4 -; P9LE-NEXT: add r5, r5, r6 -; P9LE-NEXT: mulli r5, r5, 23 -; P9LE-NEXT: sub r4, r4, r5 -; P9LE-NEXT: mtvsrdd v3, r3, r4 -; P9LE-NEXT: lis r4, 25653 -; P9LE-NEXT: mfvsrd r3, v2 -; P9LE-NEXT: ori r4, r4, 15432 -; P9LE-NEXT: sldi r4, r4, 32 -; P9LE-NEXT: oris r4, r4, 1603 -; P9LE-NEXT: ori r4, r4, 21445 -; P9LE-NEXT: mulhd r4, r3, r4 -; P9LE-NEXT: rldicl r5, r4, 1, 63 -; P9LE-NEXT: sradi r4, r4, 8 -; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: mulhdu r6, r4, r5 +; P9LE-NEXT: sub r6, r6, r4 +; P9LE-NEXT: mulld r4, r4, r5 +; P9LE-NEXT: maddld r3, r3, r5, r6 +; P9LE-NEXT: lis r6, 25653 +; P9LE-NEXT: mfvsrd r5, v2 +; P9LE-NEXT: ori r6, r6, 15432 +; P9LE-NEXT: sldi r6, r6, 32 +; P9LE-NEXT: mtvsrdd v5, r3, r4 +; P9LE-NEXT: addis r3, r2, .LCPI6_0@toc@ha +; P9LE-NEXT: addis r4, r2, .LCPI6_1@toc@ha +; P9LE-NEXT: oris r6, r6, 1603 +; P9LE-NEXT: addi r3, r3, .LCPI6_0@toc@l +; P9LE-NEXT: addi r4, r4, .LCPI6_1@toc@l +; P9LE-NEXT: ori r6, r6, 21445 +; P9LE-NEXT: lxvx v0, 0, r3 +; P9LE-NEXT: lxvx vs0, 0, r4 +; P9LE-NEXT: addis r3, r2, .LCPI6_2@toc@ha +; P9LE-NEXT: addis r4, r2, .LCPI6_3@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI6_2@toc@l +; P9LE-NEXT: addi r4, r4, .LCPI6_3@toc@l +; P9LE-NEXT: lxvx v6, 0, r4 +; P9LE-NEXT: vspltb v1, v0, 15 +; P9LE-NEXT: vsro v4, v4, v0 +; P9LE-NEXT: vsro v5, v5, v0 +; P9LE-NEXT: vsr v4, v4, v1 +; P9LE-NEXT: vsr v5, v5, v1 +; P9LE-NEXT: xxmrgld v4, v4, v5 +; P9LE-NEXT: xxland v5, v3, vs0 +; P9LE-NEXT: vaddudm v4, v4, v5 +; P9LE-NEXT: lxvx v5, 0, r3 +; P9LE-NEXT: vsrad v6, v4, v6 +; P9LE-NEXT: vsrd v4, v4, v5 +; P9LE-NEXT: vaddudm v4, v6, v4 +; P9LE-NEXT: xxlxor v6, v6, v6 +; P9LE-NEXT: mfvsrld r3, v4 +; P9LE-NEXT: mfvsrd r4, v4 +; P9LE-NEXT: mulli r4, r4, 5423 +; P9LE-NEXT: mulli r3, r3, 23 +; P9LE-NEXT: mtvsrdd v4, 
r4, r3 +; P9LE-NEXT: mulhdu r3, r5, r6 +; P9LE-NEXT: sradi r4, r5, 63 +; P9LE-NEXT: vsubudm v3, v3, v4 +; P9LE-NEXT: maddld r3, r4, r6, r3 +; P9LE-NEXT: mulld r4, r5, r6 +; P9LE-NEXT: mtvsrdd v4, r3, r4 +; P9LE-NEXT: addis r3, r2, .LCPI6_4@toc@ha +; P9LE-NEXT: vsro v4, v4, v0 +; P9LE-NEXT: vsro v0, v6, v0 +; P9LE-NEXT: addi r3, r3, .LCPI6_4@toc@l +; P9LE-NEXT: vsr v4, v4, v1 +; P9LE-NEXT: vsr v0, v0, v1 +; P9LE-NEXT: xxmrgld v4, v4, v0 +; P9LE-NEXT: xxland v0, v2, vs0 +; P9LE-NEXT: vaddudm v4, v4, v0 +; P9LE-NEXT: lxvx v0, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI6_5@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI6_5@toc@l +; P9LE-NEXT: lxvx vs0, 0, r3 +; P9LE-NEXT: vsrad v0, v4, v0 +; P9LE-NEXT: vsrd v4, v4, v5 +; P9LE-NEXT: xxland v4, v4, vs0 +; P9LE-NEXT: vaddudm v4, v0, v4 +; P9LE-NEXT: mfvsrd r4, v4 +; P9LE-NEXT: mfvsrld r3, v4 ; P9LE-NEXT: mulli r4, r4, 654 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: li r4, 0 -; P9LE-NEXT: mtvsrdd v2, r3, r4 +; P9LE-NEXT: mtvsrdd v4, r4, r3 +; P9LE-NEXT: vsubudm v2, v2, v4 ; P9LE-NEXT: blr ; ; P9BE-LABEL: dont_fold_srem_i64: ; P9BE: # %bb.0: -; P9BE-NEXT: lis r4, 24749 -; P9BE-NEXT: mfvsrld r3, v3 -; P9BE-NEXT: ori r4, r4, 47142 -; P9BE-NEXT: sldi r4, r4, 32 -; P9BE-NEXT: oris r4, r4, 58853 -; P9BE-NEXT: ori r4, r4, 6055 -; P9BE-NEXT: mulhd r4, r3, r4 -; P9BE-NEXT: rldicl r5, r4, 1, 63 -; P9BE-NEXT: sradi r4, r4, 11 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: lis r5, -19946 -; P9BE-NEXT: ori r5, r5, 17096 -; P9BE-NEXT: mulli r4, r4, 5423 -; P9BE-NEXT: sldi r5, r5, 32 -; P9BE-NEXT: oris r5, r5, 22795 -; P9BE-NEXT: sub r3, r3, r4 +; P9BE-NEXT: lis r3, 24749 +; P9BE-NEXT: mfvsrld r4, v3 +; P9BE-NEXT: ori r3, r3, 47142 +; P9BE-NEXT: sradi r5, r4, 63 +; P9BE-NEXT: sldi r3, r3, 32 +; P9BE-NEXT: oris r3, r3, 58853 +; P9BE-NEXT: ori r3, r3, 6055 +; P9BE-NEXT: mulhdu r6, r4, r3 +; P9BE-NEXT: maddld r5, r5, r3, r6 +; P9BE-NEXT: lis r6, -19946 +; P9BE-NEXT: mulld r3, r4, r3 ; P9BE-NEXT: mfvsrd r4, v3 +; P9BE-NEXT: ori r6, r6, 17096 +; P9BE-NEXT: sldi r6, r6, 32 +; P9BE-NEXT: mtvsrdd v4, r5, r3 +; P9BE-NEXT: sradi r3, r4, 63 +; P9BE-NEXT: oris r5, r6, 22795 ; P9BE-NEXT: ori r5, r5, 8549 -; P9BE-NEXT: mulhd r5, r4, r5 -; P9BE-NEXT: add r5, r5, r4 -; P9BE-NEXT: rldicl r6, r5, 1, 63 -; P9BE-NEXT: sradi r5, r5, 4 -; P9BE-NEXT: add r5, r5, r6 -; P9BE-NEXT: mulli r5, r5, 23 -; P9BE-NEXT: sub r4, r4, r5 -; P9BE-NEXT: mtvsrdd v3, r4, r3 -; P9BE-NEXT: lis r4, 25653 -; P9BE-NEXT: mfvsrld r3, v2 -; P9BE-NEXT: ori r4, r4, 15432 -; P9BE-NEXT: sldi r4, r4, 32 -; P9BE-NEXT: oris r4, r4, 1603 -; P9BE-NEXT: ori r4, r4, 21445 -; P9BE-NEXT: mulhd r4, r3, r4 -; P9BE-NEXT: rldicl r5, r4, 1, 63 -; P9BE-NEXT: sradi r4, r4, 8 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 654 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: mtvsrdd v2, 0, r3 +; P9BE-NEXT: mulhdu r6, r4, r5 +; P9BE-NEXT: sub r6, r6, r4 +; P9BE-NEXT: mulld r4, r4, r5 +; P9BE-NEXT: maddld r3, r3, r5, r6 +; P9BE-NEXT: lis r6, 25653 +; P9BE-NEXT: mfvsrld r5, v2 +; P9BE-NEXT: ori r6, r6, 15432 +; P9BE-NEXT: sldi r6, r6, 32 +; P9BE-NEXT: mtvsrdd v5, r3, r4 +; P9BE-NEXT: addis r3, r2, .LCPI6_0@toc@ha +; P9BE-NEXT: addis r4, r2, .LCPI6_1@toc@ha +; P9BE-NEXT: oris r6, r6, 1603 +; P9BE-NEXT: addi r3, r3, .LCPI6_0@toc@l +; P9BE-NEXT: addi r4, r4, .LCPI6_1@toc@l +; P9BE-NEXT: ori r6, r6, 21445 +; P9BE-NEXT: lxvx v0, 0, r3 +; P9BE-NEXT: lxvx vs0, 0, r4 +; P9BE-NEXT: addis r3, r2, .LCPI6_2@toc@ha +; P9BE-NEXT: addis r4, r2, .LCPI6_3@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI6_2@toc@l +; P9BE-NEXT: addi r4, r4, .LCPI6_3@toc@l +; P9BE-NEXT: lxvx 
v6, 0, r4 +; P9BE-NEXT: vspltb v1, v0, 15 +; P9BE-NEXT: vsro v4, v4, v0 +; P9BE-NEXT: vsro v5, v5, v0 +; P9BE-NEXT: vsr v4, v4, v1 +; P9BE-NEXT: vsr v5, v5, v1 +; P9BE-NEXT: xxmrgld v4, v5, v4 +; P9BE-NEXT: xxland v5, v3, vs0 +; P9BE-NEXT: vaddudm v4, v4, v5 +; P9BE-NEXT: lxvx v5, 0, r3 +; P9BE-NEXT: vsrad v6, v4, v6 +; P9BE-NEXT: vsrd v4, v4, v5 +; P9BE-NEXT: vaddudm v4, v6, v4 +; P9BE-NEXT: xxlxor v6, v6, v6 +; P9BE-NEXT: mfvsrld r3, v4 +; P9BE-NEXT: mfvsrd r4, v4 +; P9BE-NEXT: mulli r4, r4, 23 +; P9BE-NEXT: mulli r3, r3, 5423 +; P9BE-NEXT: mtvsrdd v4, r4, r3 +; P9BE-NEXT: mulhdu r3, r5, r6 +; P9BE-NEXT: sradi r4, r5, 63 +; P9BE-NEXT: vsubudm v3, v3, v4 +; P9BE-NEXT: maddld r3, r4, r6, r3 +; P9BE-NEXT: mulld r4, r5, r6 +; P9BE-NEXT: mtvsrdd v4, r3, r4 +; P9BE-NEXT: addis r3, r2, .LCPI6_4@toc@ha +; P9BE-NEXT: vsro v4, v4, v0 +; P9BE-NEXT: vsro v0, v6, v0 +; P9BE-NEXT: addi r3, r3, .LCPI6_4@toc@l +; P9BE-NEXT: vsr v4, v4, v1 +; P9BE-NEXT: vsr v0, v0, v1 +; P9BE-NEXT: xxmrgld v4, v0, v4 +; P9BE-NEXT: xxland v0, v2, vs0 +; P9BE-NEXT: vaddudm v4, v4, v0 +; P9BE-NEXT: lxvx v0, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI6_5@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI6_5@toc@l +; P9BE-NEXT: lxvx vs0, 0, r3 +; P9BE-NEXT: vsrad v0, v4, v0 +; P9BE-NEXT: vsrd v4, v4, v5 +; P9BE-NEXT: xxland v4, v4, vs0 +; P9BE-NEXT: vaddudm v4, v0, v4 +; P9BE-NEXT: mfvsrld r3, v4 +; P9BE-NEXT: mfvsrd r4, v4 +; P9BE-NEXT: mulli r3, r3, 654 +; P9BE-NEXT: mtvsrdd v4, r4, r3 +; P9BE-NEXT: vsubudm v2, v2, v4 ; P9BE-NEXT: blr ; ; P8LE-LABEL: dont_fold_srem_i64: @@ -1390,37 +979,79 @@ ; P8LE-NEXT: sldi r5, r5, 32 ; P8LE-NEXT: oris r3, r3, 58853 ; P8LE-NEXT: oris r4, r4, 22795 -; P8LE-NEXT: mffprd r8, f0 +; P8LE-NEXT: mffprd r10, f0 +; P8LE-NEXT: sradi r8, r6, 63 ; P8LE-NEXT: oris r5, r5, 1603 ; P8LE-NEXT: ori r3, r3, 6055 ; P8LE-NEXT: ori r4, r4, 8549 +; P8LE-NEXT: sradi r9, r7, 63 ; P8LE-NEXT: ori r5, r5, 21445 -; P8LE-NEXT: mulhd r3, r6, r3 -; P8LE-NEXT: mulhd r5, r7, r5 -; P8LE-NEXT: mulhd r4, r8, r4 -; P8LE-NEXT: rldicl r9, r3, 1, 63 -; P8LE-NEXT: sradi r3, r3, 11 -; P8LE-NEXT: add r3, r3, r9 -; P8LE-NEXT: rldicl r9, r5, 1, 63 -; P8LE-NEXT: add r4, r4, r8 -; P8LE-NEXT: sradi r5, r5, 8 +; P8LE-NEXT: mulld r8, r8, r3 +; P8LE-NEXT: mulhdu r3, r6, r3 +; P8LE-NEXT: mulld r6, r9, r5 +; P8LE-NEXT: sradi r9, r10, 63 +; P8LE-NEXT: mulhdu r11, r10, r4 +; P8LE-NEXT: mulhdu r5, r7, r5 +; P8LE-NEXT: addis r7, r2, .LCPI6_0@toc@ha +; P8LE-NEXT: mulld r4, r9, r4 +; P8LE-NEXT: addi r7, r7, .LCPI6_0@toc@l +; P8LE-NEXT: lxvd2x vs0, 0, r7 +; P8LE-NEXT: add r3, r3, r8 +; P8LE-NEXT: li r7, 0 +; P8LE-NEXT: mtfprd f2, r3 +; P8LE-NEXT: sub r3, r11, r10 +; P8LE-NEXT: add r5, r5, r6 +; P8LE-NEXT: mtfprd f1, r7 +; P8LE-NEXT: addis r6, r2, .LCPI6_3@toc@ha +; P8LE-NEXT: add r3, r3, r4 +; P8LE-NEXT: addis r4, r2, .LCPI6_1@toc@ha +; P8LE-NEXT: xxswapd v4, vs0 +; P8LE-NEXT: mtfprd f0, r5 +; P8LE-NEXT: addi r6, r6, .LCPI6_3@toc@l +; P8LE-NEXT: mtfprd f4, r3 +; P8LE-NEXT: addis r3, r2, .LCPI6_2@toc@ha +; P8LE-NEXT: addi r4, r4, .LCPI6_1@toc@l +; P8LE-NEXT: lxvd2x vs3, 0, r6 +; P8LE-NEXT: addi r3, r3, .LCPI6_2@toc@l +; P8LE-NEXT: xxmrghd v5, vs0, vs1 +; P8LE-NEXT: lxvd2x vs0, 0, r4 +; P8LE-NEXT: addis r4, r2, .LCPI6_4@toc@ha +; P8LE-NEXT: xxmrghd v0, vs2, vs4 +; P8LE-NEXT: lxvd2x vs1, 0, r3 +; P8LE-NEXT: xxland v1, v2, v4 +; P8LE-NEXT: addi r4, r4, .LCPI6_4@toc@l +; P8LE-NEXT: xxland v4, v3, v4 +; P8LE-NEXT: xxswapd v6, vs3 +; P8LE-NEXT: lxvd2x vs2, 0, r4 +; P8LE-NEXT: vaddudm v5, v5, v1 +; P8LE-NEXT: xxswapd v1, vs0 +; P8LE-NEXT: vaddudm v4, v0, v4 +; 
P8LE-NEXT: xxswapd v0, vs1 +; P8LE-NEXT: vsrad v6, v5, v6 +; P8LE-NEXT: xxswapd v7, vs2 +; P8LE-NEXT: vsrd v5, v5, v1 +; P8LE-NEXT: vsrd v1, v4, v1 +; P8LE-NEXT: vsrad v4, v4, v0 +; P8LE-NEXT: xxland v5, v5, v7 +; P8LE-NEXT: vaddudm v4, v4, v1 +; P8LE-NEXT: vaddudm v5, v6, v5 +; P8LE-NEXT: xxswapd vs0, v4 +; P8LE-NEXT: mfvsrd r3, v4 +; P8LE-NEXT: mfvsrd r5, v5 +; P8LE-NEXT: xxswapd vs1, v5 ; P8LE-NEXT: mulli r3, r3, 5423 -; P8LE-NEXT: add r5, r5, r9 -; P8LE-NEXT: rldicl r9, r4, 1, 63 -; P8LE-NEXT: sradi r4, r4, 4 +; P8LE-NEXT: mffprd r4, f0 ; P8LE-NEXT: mulli r5, r5, 654 -; P8LE-NEXT: add r4, r4, r9 +; P8LE-NEXT: mffprd r6, f1 ; P8LE-NEXT: mulli r4, r4, 23 -; P8LE-NEXT: sub r3, r6, r3 -; P8LE-NEXT: mtfprd f0, r3 -; P8LE-NEXT: sub r5, r7, r5 -; P8LE-NEXT: mtfprd f1, r5 -; P8LE-NEXT: sub r3, r8, r4 -; P8LE-NEXT: li r4, 0 -; P8LE-NEXT: mtfprd f2, r3 -; P8LE-NEXT: mtfprd f3, r4 -; P8LE-NEXT: xxmrghd v3, vs0, vs2 -; P8LE-NEXT: xxmrghd v2, vs1, vs3 +; P8LE-NEXT: mtfprd f0, r6 +; P8LE-NEXT: mtfprd f1, r3 +; P8LE-NEXT: mtfprd f3, r5 +; P8LE-NEXT: mtfprd f2, r4 +; P8LE-NEXT: xxmrghd v5, vs3, vs0 +; P8LE-NEXT: xxmrghd v4, vs1, vs2 +; P8LE-NEXT: vsubudm v2, v2, v5 +; P8LE-NEXT: vsubudm v3, v3, v4 ; P8LE-NEXT: blr ; ; P8BE-LABEL: dont_fold_srem_i64: @@ -1428,49 +1059,86 @@ ; P8BE-NEXT: lis r4, -19946 ; P8BE-NEXT: lis r3, 24749 ; P8BE-NEXT: xxswapd vs0, v3 +; P8BE-NEXT: mfvsrd r6, v3 ; P8BE-NEXT: lis r5, 25653 -; P8BE-NEXT: xxswapd vs1, v2 ; P8BE-NEXT: ori r4, r4, 17096 ; P8BE-NEXT: ori r3, r3, 47142 +; P8BE-NEXT: xxswapd vs1, v2 ; P8BE-NEXT: ori r5, r5, 15432 -; P8BE-NEXT: mfvsrd r6, v3 ; P8BE-NEXT: sldi r4, r4, 32 ; P8BE-NEXT: sldi r3, r3, 32 ; P8BE-NEXT: oris r4, r4, 22795 -; P8BE-NEXT: sldi r5, r5, 32 ; P8BE-NEXT: oris r3, r3, 58853 -; P8BE-NEXT: mffprd r7, f0 +; P8BE-NEXT: mffprd r8, f0 +; P8BE-NEXT: sradi r7, r6, 63 ; P8BE-NEXT: ori r4, r4, 8549 ; P8BE-NEXT: ori r3, r3, 6055 +; P8BE-NEXT: mffprd r9, f1 +; P8BE-NEXT: sldi r5, r5, 32 +; P8BE-NEXT: mulld r7, r7, r4 +; P8BE-NEXT: mulhdu r4, r6, r4 ; P8BE-NEXT: oris r5, r5, 1603 -; P8BE-NEXT: mffprd r8, f1 -; P8BE-NEXT: mulhd r4, r6, r4 -; P8BE-NEXT: mulhd r3, r7, r3 +; P8BE-NEXT: mulhdu r10, r8, r3 +; P8BE-NEXT: sradi r8, r8, 63 ; P8BE-NEXT: ori r5, r5, 21445 -; P8BE-NEXT: mulhd r5, r8, r5 -; P8BE-NEXT: add r4, r4, r6 -; P8BE-NEXT: rldicl r9, r3, 1, 63 -; P8BE-NEXT: sradi r3, r3, 11 -; P8BE-NEXT: rldicl r10, r4, 1, 63 -; P8BE-NEXT: sradi r4, r4, 4 -; P8BE-NEXT: add r3, r3, r9 -; P8BE-NEXT: rldicl r9, r5, 1, 63 -; P8BE-NEXT: add r4, r4, r10 -; P8BE-NEXT: sradi r5, r5, 8 -; P8BE-NEXT: mulli r3, r3, 5423 -; P8BE-NEXT: add r5, r5, r9 -; P8BE-NEXT: mulli r4, r4, 23 -; P8BE-NEXT: mulli r5, r5, 654 -; P8BE-NEXT: sub r3, r7, r3 -; P8BE-NEXT: sub r4, r6, r4 +; P8BE-NEXT: mulld r3, r8, r3 +; P8BE-NEXT: sradi r8, r9, 63 +; P8BE-NEXT: mulhdu r9, r9, r5 +; P8BE-NEXT: mulld r5, r8, r5 +; P8BE-NEXT: sub r4, r4, r6 +; P8BE-NEXT: li r6, 0 +; P8BE-NEXT: mtfprd f0, r6 +; P8BE-NEXT: add r4, r4, r7 +; P8BE-NEXT: addis r6, r2, .LCPI6_0@toc@ha +; P8BE-NEXT: mtfprd f1, r4 +; P8BE-NEXT: addi r4, r6, .LCPI6_0@toc@l +; P8BE-NEXT: add r3, r10, r3 +; P8BE-NEXT: lxvw4x vs2, 0, r4 +; P8BE-NEXT: add r4, r9, r5 +; P8BE-NEXT: mtfprd f3, r3 +; P8BE-NEXT: addis r3, r2, .LCPI6_2@toc@ha +; P8BE-NEXT: addis r5, r2, .LCPI6_3@toc@ha +; P8BE-NEXT: mtfprd f4, r4 +; P8BE-NEXT: addis r4, r2, .LCPI6_1@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI6_2@toc@l +; P8BE-NEXT: xxmrghd v4, vs1, vs3 +; P8BE-NEXT: addi r4, r4, .LCPI6_1@toc@l +; P8BE-NEXT: lxvd2x v1, 0, r3 +; P8BE-NEXT: addi r3, r5, 
.LCPI6_3@toc@l +; P8BE-NEXT: xxmrghd v0, vs0, vs4 +; P8BE-NEXT: xxland v5, v3, vs2 +; P8BE-NEXT: xxland v6, v2, vs2 +; P8BE-NEXT: vaddudm v4, v4, v5 +; P8BE-NEXT: lxvd2x v5, 0, r4 +; P8BE-NEXT: vaddudm v0, v0, v6 +; P8BE-NEXT: lxvd2x v6, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI6_4@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI6_4@toc@l +; P8BE-NEXT: vsrad v1, v4, v1 +; P8BE-NEXT: lxvw4x vs0, 0, r3 +; P8BE-NEXT: vsrd v4, v4, v5 +; P8BE-NEXT: vsrd v5, v0, v5 +; P8BE-NEXT: vsrad v6, v0, v6 +; P8BE-NEXT: xxland v5, v5, vs0 +; P8BE-NEXT: vaddudm v4, v1, v4 +; P8BE-NEXT: vaddudm v5, v6, v5 +; P8BE-NEXT: mfvsrd r3, v4 +; P8BE-NEXT: xxswapd vs0, v4 +; P8BE-NEXT: xxswapd vs1, v5 +; P8BE-NEXT: mulli r3, r3, 23 +; P8BE-NEXT: mffprd r4, f0 +; P8BE-NEXT: mffprd r5, f1 +; P8BE-NEXT: mulli r4, r4, 5423 ; P8BE-NEXT: mtfprd f0, r3 -; P8BE-NEXT: sub r3, r8, r5 +; P8BE-NEXT: mulli r3, r5, 654 +; P8BE-NEXT: mfvsrd r5, v5 ; P8BE-NEXT: mtfprd f1, r4 -; P8BE-NEXT: li r4, 0 -; P8BE-NEXT: mtfprd f2, r3 -; P8BE-NEXT: mtfprd f3, r4 -; P8BE-NEXT: xxmrghd v3, vs1, vs0 -; P8BE-NEXT: xxmrghd v2, vs3, vs2 +; P8BE-NEXT: mtfprd f2, r5 +; P8BE-NEXT: mtfprd f3, r3 +; P8BE-NEXT: xxmrghd v4, vs0, vs1 +; P8BE-NEXT: xxmrghd v5, vs2, vs3 +; P8BE-NEXT: vsubudm v3, v3, v4 +; P8BE-NEXT: vsubudm v2, v2, v5 ; P8BE-NEXT: blr %1 = srem <4 x i64> %x, ret <4 x i64> %1 diff --git a/llvm/test/CodeGen/PowerPC/urem-lkk.ll b/llvm/test/CodeGen/PowerPC/urem-lkk.ll --- a/llvm/test/CodeGen/PowerPC/urem-lkk.ll +++ b/llvm/test/CodeGen/PowerPC/urem-lkk.ll @@ -87,20 +87,89 @@ ; Don't fold i64 urem define i64 @dont_fold_urem_i64(i64 %x) { -; CHECK-LABEL: dont_fold_urem_i64: -; CHECK: # %bb.0: -; CHECK-NEXT: mflr 0 -; CHECK-NEXT: stw 0, 4(1) -; CHECK-NEXT: stwu 1, -16(1) -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset lr, 4 -; CHECK-NEXT: li 5, 0 -; CHECK-NEXT: li 6, 98 -; CHECK-NEXT: bl __umoddi3 -; CHECK-NEXT: lwz 0, 20(1) -; CHECK-NEXT: addi 1, 1, 16 -; CHECK-NEXT: mtlr 0 -; CHECK-NEXT: blr +; CHECK64-LABEL: dont_fold_urem_i64: +; CHECK64: # %bb.0: +; CHECK64-NEXT: mflr 0 +; CHECK64-NEXT: stw 0, 4(1) +; CHECK64-NEXT: stwu 1, -32(1) +; CHECK64-NEXT: .cfi_def_cfa_offset 32 +; CHECK64-NEXT: .cfi_offset lr, 4 +; CHECK64-NEXT: .cfi_offset r29, -12 +; CHECK64-NEXT: .cfi_offset r30, -8 +; CHECK64-NEXT: rotlwi 6, 4, 31 +; CHECK64-NEXT: stw 29, 20(1) # 4-byte Folded Spill +; CHECK64-NEXT: mr 29, 3 +; CHECK64-NEXT: rlwimi 6, 3, 31, 0, 0 +; CHECK64-NEXT: srwi 5, 3, 1 +; CHECK64-NEXT: stw 30, 24(1) # 4-byte Folded Spill +; CHECK64-NEXT: mr 30, 4 +; CHECK64-NEXT: lis 3, 21399 +; CHECK64-NEXT: lis 4, -17388 +; CHECK64-NEXT: ori 9, 3, 33436 +; CHECK64-NEXT: ori 10, 4, 58849 +; CHECK64-NEXT: li 3, 0 +; CHECK64-NEXT: li 4, 0 +; CHECK64-NEXT: li 7, 0 +; CHECK64-NEXT: li 8, 0 +; CHECK64-NEXT: bl __multi3 +; CHECK64-NEXT: rotlwi 4, 4, 28 +; CHECK64-NEXT: li 5, 98 +; CHECK64-NEXT: rlwimi 4, 3, 28, 0, 3 +; CHECK64-NEXT: srwi 3, 3, 4 +; CHECK64-NEXT: mulhwu 5, 4, 5 +; CHECK64-NEXT: mulli 3, 3, 98 +; CHECK64-NEXT: mulli 4, 4, 98 +; CHECK64-NEXT: add 3, 5, 3 +; CHECK64-NEXT: subc 4, 30, 4 +; CHECK64-NEXT: lwz 30, 24(1) # 4-byte Folded Reload +; CHECK64-NEXT: subfe 3, 3, 29 +; CHECK64-NEXT: lwz 29, 20(1) # 4-byte Folded Reload +; CHECK64-NEXT: lwz 0, 36(1) +; CHECK64-NEXT: addi 1, 1, 32 +; CHECK64-NEXT: mtlr 0 +; CHECK64-NEXT: blr +; +; CHECK32-LABEL: dont_fold_urem_i64: +; CHECK32: # %bb.0: +; CHECK32-NEXT: mflr 0 +; CHECK32-NEXT: stw 0, 4(1) +; CHECK32-NEXT: stwu 1, -32(1) +; CHECK32-NEXT: .cfi_def_cfa_offset 32 +; CHECK32-NEXT: .cfi_offset lr, 4 +; CHECK32-NEXT: 
.cfi_offset r29, -12 +; CHECK32-NEXT: .cfi_offset r30, -8 +; CHECK32-NEXT: rotlwi 6, 4, 31 +; CHECK32-NEXT: stw 29, 20(1) # 4-byte Folded Spill +; CHECK32-NEXT: mr 29, 3 +; CHECK32-NEXT: stw 30, 24(1) # 4-byte Folded Spill +; CHECK32-NEXT: mr 30, 4 +; CHECK32-NEXT: rlwimi 6, 3, 31, 0, 0 +; CHECK32-NEXT: srwi 5, 3, 1 +; CHECK32-NEXT: lis 3, 21399 +; CHECK32-NEXT: lis 4, -17388 +; CHECK32-NEXT: ori 9, 3, 33436 +; CHECK32-NEXT: ori 10, 4, 58849 +; CHECK32-NEXT: li 3, 0 +; CHECK32-NEXT: li 4, 0 +; CHECK32-NEXT: li 7, 0 +; CHECK32-NEXT: li 8, 0 +; CHECK32-NEXT: bl __multi3 +; CHECK32-NEXT: rotlwi 4, 4, 28 +; CHECK32-NEXT: li 5, 98 +; CHECK32-NEXT: rlwimi 4, 3, 28, 0, 3 +; CHECK32-NEXT: srwi 3, 3, 4 +; CHECK32-NEXT: mulhwu 5, 4, 5 +; CHECK32-NEXT: mulli 3, 3, 98 +; CHECK32-NEXT: add 3, 5, 3 +; CHECK32-NEXT: mulli 4, 4, 98 +; CHECK32-NEXT: subc 4, 30, 4 +; CHECK32-NEXT: subfe 3, 3, 29 +; CHECK32-NEXT: lwz 30, 24(1) # 4-byte Folded Reload +; CHECK32-NEXT: lwz 29, 20(1) # 4-byte Folded Reload +; CHECK32-NEXT: lwz 0, 36(1) +; CHECK32-NEXT: addi 1, 1, 32 +; CHECK32-NEXT: mtlr 0 +; CHECK32-NEXT: blr %1 = urem i64 %x, 98 ret i64 %1 } diff --git a/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll --- a/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll @@ -11,209 +11,156 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) { ; P9LE-LABEL: fold_urem_vec_1: ; P9LE: # %bb.0: -; P9LE-NEXT: li r3, 4 -; P9LE-NEXT: lis r4, 21399 -; P9LE-NEXT: lis r5, 8456 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: ori r4, r4, 33437 -; P9LE-NEXT: ori r5, r5, 16913 -; P9LE-NEXT: clrlwi r3, r3, 16 -; P9LE-NEXT: mulhwu r4, r3, r4 -; P9LE-NEXT: srwi r4, r4, 5 -; P9LE-NEXT: mulli r4, r4, 98 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: lis r4, 16727 -; P9LE-NEXT: mtvsrd v3, r3 -; P9LE-NEXT: li r3, 6 -; P9LE-NEXT: ori r4, r4, 2287 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: clrlwi r3, r3, 16 -; P9LE-NEXT: mulhwu r4, r3, r4 -; P9LE-NEXT: srwi r4, r4, 8 -; P9LE-NEXT: mulli r4, r4, 1003 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 2 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: vmrghh v3, v4, v3 -; P9LE-NEXT: clrlwi r4, r3, 16 -; P9LE-NEXT: rlwinm r3, r3, 30, 18, 31 -; P9LE-NEXT: mulhwu r3, r3, r5 -; P9LE-NEXT: srwi r3, r3, 2 -; P9LE-NEXT: mulli r3, r3, 124 -; P9LE-NEXT: sub r3, r4, r3 -; P9LE-NEXT: lis r4, 22765 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: ori r4, r4, 8969 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: clrlwi r3, r3, 16 -; P9LE-NEXT: mulhwu r4, r3, r4 -; P9LE-NEXT: sub r5, r3, r4 -; P9LE-NEXT: srwi r5, r5, 1 -; P9LE-NEXT: add r4, r5, r4 -; P9LE-NEXT: srwi r4, r4, 6 -; P9LE-NEXT: mulli r4, r4, 95 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: mtvsrd v2, r3 -; P9LE-NEXT: vmrghh v2, v4, v2 -; P9LE-NEXT: vmrglw v2, v3, v2 +; P9LE-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; P9LE-NEXT: xxlxor v4, v4, v4 +; P9LE-NEXT: addi r3, r3, .LCPI0_0@toc@l +; P9LE-NEXT: lxvx v3, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI0_1@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI0_1@toc@l +; P9LE-NEXT: lxvx v5, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI0_2@toc@ha +; P9LE-NEXT: vsrh v3, v2, v3 +; P9LE-NEXT: addi r3, r3, .LCPI0_2@toc@l +; P9LE-NEXT: vmrglh v3, v4, v3 +; P9LE-NEXT: vmuluwm v3, v3, v5 +; P9LE-NEXT: vspltisw v5, 8 +; P9LE-NEXT: vadduwm v5, v5, v5 +; P9LE-NEXT: vsrw v3, v3, v5 +; P9LE-NEXT: vpkuwum v0, v3, v3 +; P9LE-NEXT: vsubuhm v0, v2, v0 +; P9LE-NEXT: vmrglh v4, v4, v0 +; P9LE-NEXT: lxvx v0, 0, r3 +; 
P9LE-NEXT: addis r3, r2, .LCPI0_3@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI0_3@toc@l +; P9LE-NEXT: vmuluwm v4, v4, v0 +; P9LE-NEXT: vsrw v4, v4, v5 +; P9LE-NEXT: vxor v5, v5, v5 +; P9LE-NEXT: vadduhm v3, v4, v3 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI0_4@toc@ha +; P9LE-NEXT: vpkuwum v3, v3, v3 +; P9LE-NEXT: addi r3, r3, .LCPI0_4@toc@l +; P9LE-NEXT: vsrh v3, v3, v4 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: vmladduhm v3, v3, v4, v5 +; P9LE-NEXT: vsubuhm v2, v2, v3 ; P9LE-NEXT: blr ; ; P9BE-LABEL: fold_urem_vec_1: ; P9BE: # %bb.0: -; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: lis r4, 16727 -; P9BE-NEXT: lis r5, 8456 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: ori r4, r4, 2287 -; P9BE-NEXT: ori r5, r5, 16913 -; P9BE-NEXT: clrlwi r3, r3, 16 -; P9BE-NEXT: mulhwu r4, r3, r4 -; P9BE-NEXT: srwi r4, r4, 8 -; P9BE-NEXT: mulli r4, r4, 1003 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: lis r4, 21399 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: ori r4, r4, 33437 -; P9BE-NEXT: mtvsrd v3, r3 -; P9BE-NEXT: li r3, 4 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: clrlwi r3, r3, 16 -; P9BE-NEXT: mulhwu r4, r3, r4 -; P9BE-NEXT: srwi r4, r4, 5 -; P9BE-NEXT: mulli r4, r4, 98 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 2 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: vmrghh v3, v4, v3 -; P9BE-NEXT: clrlwi r4, r3, 16 -; P9BE-NEXT: rlwinm r3, r3, 30, 18, 31 -; P9BE-NEXT: mulhwu r3, r3, r5 -; P9BE-NEXT: srwi r3, r3, 2 -; P9BE-NEXT: mulli r3, r3, 124 -; P9BE-NEXT: sub r3, r4, r3 -; P9BE-NEXT: lis r4, 22765 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: ori r4, r4, 8969 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 0 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: clrlwi r3, r3, 16 -; P9BE-NEXT: mulhwu r4, r3, r4 -; P9BE-NEXT: sub r5, r3, r4 -; P9BE-NEXT: srwi r5, r5, 1 -; P9BE-NEXT: add r4, r5, r4 -; P9BE-NEXT: srwi r4, r4, 6 -; P9BE-NEXT: mulli r4, r4, 95 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v2, r3 -; P9BE-NEXT: vmrghh v2, v2, v4 -; P9BE-NEXT: vmrghw v2, v2, v3 +; P9BE-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; P9BE-NEXT: xxlxor v5, v5, v5 +; P9BE-NEXT: addi r3, r3, .LCPI0_0@toc@l +; P9BE-NEXT: lxvx v3, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI0_1@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI0_1@toc@l +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI0_2@toc@ha +; P9BE-NEXT: vsrh v3, v2, v3 +; P9BE-NEXT: addi r3, r3, .LCPI0_2@toc@l +; P9BE-NEXT: lxvx v0, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI0_3@toc@ha +; P9BE-NEXT: vperm v3, v5, v3, v4 +; P9BE-NEXT: addi r3, r3, .LCPI0_3@toc@l +; P9BE-NEXT: vmuluwm v3, v3, v0 +; P9BE-NEXT: vspltisw v0, 8 +; P9BE-NEXT: vadduwm v0, v0, v0 +; P9BE-NEXT: vsrw v3, v3, v0 +; P9BE-NEXT: vpkuwum v1, v3, v3 +; P9BE-NEXT: vsubuhm v1, v2, v1 +; P9BE-NEXT: vperm v4, v5, v1, v4 +; P9BE-NEXT: lxvx v5, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI0_4@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI0_4@toc@l +; P9BE-NEXT: vmuluwm v4, v4, v5 +; P9BE-NEXT: vxor v5, v5, v5 +; P9BE-NEXT: vsrw v4, v4, v0 +; P9BE-NEXT: vadduhm v3, v4, v3 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI0_5@toc@ha +; P9BE-NEXT: vpkuwum v3, v3, v3 +; P9BE-NEXT: addi r3, r3, .LCPI0_5@toc@l +; P9BE-NEXT: vsrh v3, v3, v4 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: vmladduhm v3, v3, v4, v5 +; P9BE-NEXT: vsubuhm v2, v2, v3 ; P9BE-NEXT: blr ; ; P8LE-LABEL: fold_urem_vec_1: ; P8LE: # %bb.0: -; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r3, 22765 -; P8LE-NEXT: lis r7, 21399 -; P8LE-NEXT: lis r9, 16727 -; 
P8LE-NEXT: lis r10, 8456 -; P8LE-NEXT: ori r3, r3, 8969 -; P8LE-NEXT: ori r7, r7, 33437 -; P8LE-NEXT: ori r9, r9, 2287 -; P8LE-NEXT: ori r10, r10, 16913 -; P8LE-NEXT: mffprd r4, f0 -; P8LE-NEXT: clrldi r6, r4, 48 -; P8LE-NEXT: rldicl r5, r4, 32, 48 -; P8LE-NEXT: clrlwi r6, r6, 16 -; P8LE-NEXT: rldicl r8, r4, 16, 48 -; P8LE-NEXT: clrlwi r5, r5, 16 -; P8LE-NEXT: mulhwu r3, r6, r3 -; P8LE-NEXT: rldicl r4, r4, 48, 48 -; P8LE-NEXT: clrlwi r8, r8, 16 -; P8LE-NEXT: rlwinm r11, r4, 30, 18, 31 -; P8LE-NEXT: mulhwu r7, r5, r7 -; P8LE-NEXT: clrlwi r4, r4, 16 -; P8LE-NEXT: mulhwu r9, r8, r9 -; P8LE-NEXT: mulhwu r10, r11, r10 -; P8LE-NEXT: sub r11, r6, r3 -; P8LE-NEXT: srwi r11, r11, 1 -; P8LE-NEXT: srwi r7, r7, 5 -; P8LE-NEXT: add r3, r11, r3 -; P8LE-NEXT: srwi r9, r9, 8 -; P8LE-NEXT: srwi r10, r10, 2 -; P8LE-NEXT: srwi r3, r3, 6 -; P8LE-NEXT: mulli r7, r7, 98 -; P8LE-NEXT: mulli r9, r9, 1003 -; P8LE-NEXT: mulli r3, r3, 95 -; P8LE-NEXT: mulli r10, r10, 124 -; P8LE-NEXT: sub r5, r5, r7 -; P8LE-NEXT: sub r7, r8, r9 -; P8LE-NEXT: sub r3, r6, r3 -; P8LE-NEXT: mtvsrd v2, r5 -; P8LE-NEXT: sub r4, r4, r10 -; P8LE-NEXT: mtvsrd v3, r7 -; P8LE-NEXT: mtvsrd v4, r3 -; P8LE-NEXT: mtvsrd v5, r4 -; P8LE-NEXT: vmrghh v2, v3, v2 -; P8LE-NEXT: vmrghh v3, v5, v4 -; P8LE-NEXT: vmrglw v2, v2, v3 +; P8LE-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; P8LE-NEXT: xxlxor v4, v4, v4 +; P8LE-NEXT: vspltisw v5, 8 +; P8LE-NEXT: addi r3, r3, .LCPI0_0@toc@l +; P8LE-NEXT: lvx v3, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI0_1@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI0_1@toc@l +; P8LE-NEXT: vadduwm v5, v5, v5 +; P8LE-NEXT: vsrh v3, v2, v3 +; P8LE-NEXT: lvx v0, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI0_2@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI0_2@toc@l +; P8LE-NEXT: vmrglh v3, v4, v3 +; P8LE-NEXT: vmuluwm v3, v3, v0 +; P8LE-NEXT: vsrw v3, v3, v5 +; P8LE-NEXT: vpkuwum v0, v3, v3 +; P8LE-NEXT: vsubuhm v0, v2, v0 +; P8LE-NEXT: vmrglh v4, v4, v0 +; P8LE-NEXT: lvx v0, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI0_3@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI0_3@toc@l +; P8LE-NEXT: vmuluwm v4, v4, v0 +; P8LE-NEXT: vsrw v4, v4, v5 +; P8LE-NEXT: vadduhm v3, v4, v3 +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI0_4@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI0_4@toc@l +; P8LE-NEXT: vpkuwum v3, v3, v3 +; P8LE-NEXT: lvx v5, 0, r3 +; P8LE-NEXT: vsrh v3, v3, v4 +; P8LE-NEXT: vxor v4, v4, v4 +; P8LE-NEXT: vmladduhm v3, v3, v5, v4 +; P8LE-NEXT: vsubuhm v2, v2, v3 ; P8LE-NEXT: blr ; ; P8BE-LABEL: fold_urem_vec_1: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, 22765 -; P8BE-NEXT: lis r7, 16727 -; P8BE-NEXT: lis r9, 21399 -; P8BE-NEXT: lis r10, 8456 -; P8BE-NEXT: ori r3, r3, 8969 -; P8BE-NEXT: ori r7, r7, 2287 -; P8BE-NEXT: ori r9, r9, 33437 -; P8BE-NEXT: ori r10, r10, 16913 -; P8BE-NEXT: rldicl r6, r4, 16, 48 -; P8BE-NEXT: clrldi r5, r4, 48 -; P8BE-NEXT: clrlwi r6, r6, 16 -; P8BE-NEXT: rldicl r8, r4, 48, 48 -; P8BE-NEXT: clrlwi r5, r5, 16 -; P8BE-NEXT: mulhwu r3, r6, r3 -; P8BE-NEXT: rldicl r4, r4, 32, 48 -; P8BE-NEXT: clrlwi r8, r8, 16 -; P8BE-NEXT: mulhwu r7, r5, r7 -; P8BE-NEXT: rlwinm r11, r4, 30, 18, 31 -; P8BE-NEXT: clrlwi r4, r4, 16 -; P8BE-NEXT: mulhwu r9, r8, r9 -; P8BE-NEXT: mulhwu r10, r11, r10 -; P8BE-NEXT: sub r11, r6, r3 -; P8BE-NEXT: srwi r11, r11, 1 -; P8BE-NEXT: srwi r7, r7, 8 -; P8BE-NEXT: add r3, r11, r3 -; P8BE-NEXT: srwi r9, r9, 5 -; P8BE-NEXT: srwi r10, r10, 2 -; P8BE-NEXT: mulli r7, r7, 1003 -; P8BE-NEXT: srwi r3, r3, 6 -; P8BE-NEXT: mulli r9, r9, 98 -; P8BE-NEXT: mulli r3, r3, 95 -; P8BE-NEXT: mulli r10, r10, 124 -; 
P8BE-NEXT: sub r5, r5, r7 -; P8BE-NEXT: sub r7, r8, r9 -; P8BE-NEXT: sldi r5, r5, 48 -; P8BE-NEXT: sub r3, r6, r3 -; P8BE-NEXT: sub r4, r4, r10 -; P8BE-NEXT: mtvsrd v2, r5 -; P8BE-NEXT: sldi r5, r7, 48 -; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: sldi r4, r4, 48 -; P8BE-NEXT: mtvsrd v3, r5 -; P8BE-NEXT: mtvsrd v4, r3 -; P8BE-NEXT: mtvsrd v5, r4 -; P8BE-NEXT: vmrghh v2, v3, v2 -; P8BE-NEXT: vmrghh v3, v4, v5 -; P8BE-NEXT: vmrghw v2, v3, v2 +; P8BE-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; P8BE-NEXT: xxlxor v5, v5, v5 +; P8BE-NEXT: vspltisw v0, 8 +; P8BE-NEXT: addi r3, r3, .LCPI0_0@toc@l +; P8BE-NEXT: lxvw4x v3, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI0_1@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI0_1@toc@l +; P8BE-NEXT: vadduwm v0, v0, v0 +; P8BE-NEXT: lxvw4x v4, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI0_2@toc@ha +; P8BE-NEXT: vsrh v3, v2, v3 +; P8BE-NEXT: addi r3, r3, .LCPI0_2@toc@l +; P8BE-NEXT: lxvw4x v1, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI0_3@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI0_3@toc@l +; P8BE-NEXT: vperm v3, v5, v3, v4 +; P8BE-NEXT: vmuluwm v3, v3, v1 +; P8BE-NEXT: vsrw v3, v3, v0 +; P8BE-NEXT: vpkuwum v1, v3, v3 +; P8BE-NEXT: vsubuhm v1, v2, v1 +; P8BE-NEXT: vperm v4, v5, v1, v4 +; P8BE-NEXT: lxvw4x v5, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI0_4@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI0_4@toc@l +; P8BE-NEXT: vmuluwm v4, v4, v5 +; P8BE-NEXT: vsrw v4, v4, v0 +; P8BE-NEXT: vadduhm v3, v4, v3 +; P8BE-NEXT: lxvw4x v4, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI0_5@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI0_5@toc@l +; P8BE-NEXT: vpkuwum v3, v3, v3 +; P8BE-NEXT: lxvw4x v5, 0, r3 +; P8BE-NEXT: vsrh v3, v3, v4 +; P8BE-NEXT: vxor v4, v4, v4 +; P8BE-NEXT: vmladduhm v3, v3, v5, v4 +; P8BE-NEXT: vsubuhm v2, v2, v3 ; P8BE-NEXT: blr %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -222,217 +169,88 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) { ; P9LE-LABEL: fold_urem_vec_2: ; P9LE: # %bb.0: -; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: lis r4, 22765 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: ori r4, r4, 8969 -; P9LE-NEXT: clrlwi r3, r3, 16 -; P9LE-NEXT: mulhwu r5, r3, r4 -; P9LE-NEXT: sub r6, r3, r5 -; P9LE-NEXT: srwi r6, r6, 1 -; P9LE-NEXT: add r5, r6, r5 -; P9LE-NEXT: srwi r5, r5, 6 -; P9LE-NEXT: mulli r5, r5, 95 -; P9LE-NEXT: sub r3, r3, r5 -; P9LE-NEXT: mtvsrd v3, r3 -; P9LE-NEXT: li r3, 2 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: clrlwi r3, r3, 16 -; P9LE-NEXT: mulhwu r5, r3, r4 -; P9LE-NEXT: sub r6, r3, r5 -; P9LE-NEXT: srwi r6, r6, 1 -; P9LE-NEXT: add r5, r6, r5 -; P9LE-NEXT: srwi r5, r5, 6 -; P9LE-NEXT: mulli r5, r5, 95 -; P9LE-NEXT: sub r3, r3, r5 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 4 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: vmrghh v3, v4, v3 -; P9LE-NEXT: clrlwi r3, r3, 16 -; P9LE-NEXT: mulhwu r5, r3, r4 -; P9LE-NEXT: sub r6, r3, r5 -; P9LE-NEXT: srwi r6, r6, 1 -; P9LE-NEXT: add r5, r6, r5 -; P9LE-NEXT: srwi r5, r5, 6 -; P9LE-NEXT: mulli r5, r5, 95 -; P9LE-NEXT: sub r3, r3, r5 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 6 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: clrlwi r3, r3, 16 -; P9LE-NEXT: mulhwu r4, r3, r4 -; P9LE-NEXT: sub r5, r3, r4 -; P9LE-NEXT: srwi r5, r5, 1 -; P9LE-NEXT: add r4, r5, r4 -; P9LE-NEXT: srwi r4, r4, 6 -; P9LE-NEXT: mulli r4, r4, 95 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: mtvsrd v2, r3 -; P9LE-NEXT: vmrghh v2, v2, v4 -; P9LE-NEXT: vmrglw v2, v2, v3 +; P9LE-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; P9LE-NEXT: xxlxor v3, v3, v3 +; P9LE-NEXT: vxor v5, v5, v5 +; P9LE-NEXT: vmrglh v3, v3, v2 +; P9LE-NEXT: addi r3, r3, .LCPI1_0@toc@l +; 
P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI1_1@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI1_1@toc@l +; P9LE-NEXT: vmuluwm v3, v3, v4 +; P9LE-NEXT: vspltisw v4, 11 +; P9LE-NEXT: vadduwm v4, v4, v4 +; P9LE-NEXT: vsrw v3, v3, v4 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: vpkuwum v3, v3, v3 +; P9LE-NEXT: vmladduhm v3, v3, v4, v5 +; P9LE-NEXT: vsubuhm v2, v2, v3 ; P9LE-NEXT: blr ; ; P9BE-LABEL: fold_urem_vec_2: ; P9BE: # %bb.0: -; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: lis r4, 22765 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: ori r4, r4, 8969 -; P9BE-NEXT: clrlwi r3, r3, 16 -; P9BE-NEXT: mulhwu r5, r3, r4 -; P9BE-NEXT: sub r6, r3, r5 -; P9BE-NEXT: srwi r6, r6, 1 -; P9BE-NEXT: add r5, r6, r5 -; P9BE-NEXT: srwi r5, r5, 6 -; P9BE-NEXT: mulli r5, r5, 95 -; P9BE-NEXT: sub r3, r3, r5 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v3, r3 -; P9BE-NEXT: li r3, 4 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: clrlwi r3, r3, 16 -; P9BE-NEXT: mulhwu r5, r3, r4 -; P9BE-NEXT: sub r6, r3, r5 -; P9BE-NEXT: srwi r6, r6, 1 -; P9BE-NEXT: add r5, r6, r5 -; P9BE-NEXT: srwi r5, r5, 6 -; P9BE-NEXT: mulli r5, r5, 95 -; P9BE-NEXT: sub r3, r3, r5 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 2 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: vmrghh v3, v4, v3 -; P9BE-NEXT: clrlwi r3, r3, 16 -; P9BE-NEXT: mulhwu r5, r3, r4 -; P9BE-NEXT: sub r6, r3, r5 -; P9BE-NEXT: srwi r6, r6, 1 -; P9BE-NEXT: add r5, r6, r5 -; P9BE-NEXT: srwi r5, r5, 6 -; P9BE-NEXT: mulli r5, r5, 95 -; P9BE-NEXT: sub r3, r3, r5 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 0 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: clrlwi r3, r3, 16 -; P9BE-NEXT: mulhwu r4, r3, r4 -; P9BE-NEXT: sub r5, r3, r4 -; P9BE-NEXT: srwi r5, r5, 1 -; P9BE-NEXT: add r4, r5, r4 -; P9BE-NEXT: srwi r4, r4, 6 -; P9BE-NEXT: mulli r4, r4, 95 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v2, r3 -; P9BE-NEXT: vmrghh v2, v2, v4 -; P9BE-NEXT: vmrghw v2, v2, v3 +; P9BE-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; P9BE-NEXT: xxlxor v4, v4, v4 +; P9BE-NEXT: vxor v5, v5, v5 +; P9BE-NEXT: addi r3, r3, .LCPI1_0@toc@l +; P9BE-NEXT: lxvx v3, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI1_1@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI1_1@toc@l +; P9BE-NEXT: vperm v3, v4, v2, v3 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI1_2@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI1_2@toc@l +; P9BE-NEXT: vmuluwm v3, v3, v4 +; P9BE-NEXT: vspltisw v4, 11 +; P9BE-NEXT: vadduwm v4, v4, v4 +; P9BE-NEXT: vsrw v3, v3, v4 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: vpkuwum v3, v3, v3 +; P9BE-NEXT: vmladduhm v3, v3, v4, v5 +; P9BE-NEXT: vsubuhm v2, v2, v3 ; P9BE-NEXT: blr ; ; P8LE-LABEL: fold_urem_vec_2: ; P8LE: # %bb.0: -; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r3, 22765 -; P8LE-NEXT: ori r3, r3, 8969 -; P8LE-NEXT: mffprd r4, f0 -; P8LE-NEXT: clrldi r5, r4, 48 -; P8LE-NEXT: rldicl r6, r4, 48, 48 -; P8LE-NEXT: clrlwi r5, r5, 16 -; P8LE-NEXT: rldicl r7, r4, 32, 48 -; P8LE-NEXT: clrlwi r6, r6, 16 -; P8LE-NEXT: mulhwu r8, r5, r3 -; P8LE-NEXT: rldicl r4, r4, 16, 48 -; P8LE-NEXT: clrlwi r7, r7, 16 -; P8LE-NEXT: mulhwu r9, r6, r3 -; P8LE-NEXT: clrlwi r4, r4, 16 -; P8LE-NEXT: mulhwu r10, r7, r3 -; P8LE-NEXT: mulhwu r3, r4, r3 -; P8LE-NEXT: sub r11, r5, r8 -; P8LE-NEXT: sub r12, r6, r9 -; P8LE-NEXT: srwi r11, r11, 1 -; P8LE-NEXT: add r8, r11, r8 -; P8LE-NEXT: sub r11, r7, r10 -; P8LE-NEXT: srwi r12, r12, 1 -; P8LE-NEXT: add r9, r12, r9 -; P8LE-NEXT: sub r12, r4, r3 -; P8LE-NEXT: srwi r11, r11, 1 -; 
P8LE-NEXT: srwi r8, r8, 6 -; P8LE-NEXT: add r10, r11, r10 -; P8LE-NEXT: srwi r11, r12, 1 -; P8LE-NEXT: srwi r9, r9, 6 -; P8LE-NEXT: add r3, r11, r3 -; P8LE-NEXT: mulli r8, r8, 95 -; P8LE-NEXT: srwi r10, r10, 6 -; P8LE-NEXT: srwi r3, r3, 6 -; P8LE-NEXT: mulli r9, r9, 95 -; P8LE-NEXT: mulli r10, r10, 95 -; P8LE-NEXT: mulli r3, r3, 95 -; P8LE-NEXT: sub r5, r5, r8 -; P8LE-NEXT: sub r6, r6, r9 -; P8LE-NEXT: mtvsrd v2, r5 -; P8LE-NEXT: sub r5, r7, r10 -; P8LE-NEXT: sub r3, r4, r3 -; P8LE-NEXT: mtvsrd v3, r6 -; P8LE-NEXT: mtvsrd v4, r5 -; P8LE-NEXT: mtvsrd v5, r3 -; P8LE-NEXT: vmrghh v2, v3, v2 -; P8LE-NEXT: vmrghh v3, v5, v4 -; P8LE-NEXT: vmrglw v2, v3, v2 +; P8LE-NEXT: xxlxor v3, v3, v3 +; P8LE-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; P8LE-NEXT: vspltisw v4, 11 +; P8LE-NEXT: addi r3, r3, .LCPI1_0@toc@l +; P8LE-NEXT: vmrglh v3, v3, v2 +; P8LE-NEXT: lvx v5, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI1_1@toc@ha +; P8LE-NEXT: vadduwm v4, v4, v4 +; P8LE-NEXT: addi r3, r3, .LCPI1_1@toc@l +; P8LE-NEXT: vmuluwm v3, v3, v5 +; P8LE-NEXT: lvx v5, 0, r3 +; P8LE-NEXT: vsrw v3, v3, v4 +; P8LE-NEXT: vxor v4, v4, v4 +; P8LE-NEXT: vpkuwum v3, v3, v3 +; P8LE-NEXT: vmladduhm v3, v3, v5, v4 +; P8LE-NEXT: vsubuhm v2, v2, v3 ; P8LE-NEXT: blr ; ; P8BE-LABEL: fold_urem_vec_2: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, 22765 -; P8BE-NEXT: ori r3, r3, 8969 -; P8BE-NEXT: clrldi r5, r4, 48 -; P8BE-NEXT: rldicl r6, r4, 48, 48 -; P8BE-NEXT: clrlwi r5, r5, 16 -; P8BE-NEXT: rldicl r7, r4, 32, 48 -; P8BE-NEXT: clrlwi r6, r6, 16 -; P8BE-NEXT: mulhwu r8, r5, r3 -; P8BE-NEXT: rldicl r4, r4, 16, 48 -; P8BE-NEXT: clrlwi r7, r7, 16 -; P8BE-NEXT: mulhwu r9, r6, r3 -; P8BE-NEXT: clrlwi r4, r4, 16 -; P8BE-NEXT: mulhwu r10, r7, r3 -; P8BE-NEXT: mulhwu r3, r4, r3 -; P8BE-NEXT: sub r11, r5, r8 -; P8BE-NEXT: sub r12, r6, r9 -; P8BE-NEXT: srwi r11, r11, 1 -; P8BE-NEXT: add r8, r11, r8 -; P8BE-NEXT: sub r11, r7, r10 -; P8BE-NEXT: srwi r12, r12, 1 -; P8BE-NEXT: add r9, r12, r9 -; P8BE-NEXT: sub r12, r4, r3 -; P8BE-NEXT: srwi r11, r11, 1 -; P8BE-NEXT: srwi r8, r8, 6 -; P8BE-NEXT: add r10, r11, r10 -; P8BE-NEXT: srwi r11, r12, 1 -; P8BE-NEXT: srwi r9, r9, 6 -; P8BE-NEXT: add r3, r11, r3 -; P8BE-NEXT: srwi r10, r10, 6 -; P8BE-NEXT: srwi r3, r3, 6 -; P8BE-NEXT: mulli r8, r8, 95 -; P8BE-NEXT: mulli r9, r9, 95 -; P8BE-NEXT: mulli r10, r10, 95 -; P8BE-NEXT: mulli r3, r3, 95 -; P8BE-NEXT: sub r5, r5, r8 -; P8BE-NEXT: sub r6, r6, r9 -; P8BE-NEXT: sub r7, r7, r10 -; P8BE-NEXT: sub r3, r4, r3 -; P8BE-NEXT: sldi r5, r5, 48 -; P8BE-NEXT: sldi r6, r6, 48 -; P8BE-NEXT: sldi r4, r7, 48 -; P8BE-NEXT: mtvsrd v2, r5 -; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: mtvsrd v3, r6 -; P8BE-NEXT: mtvsrd v4, r4 -; P8BE-NEXT: mtvsrd v5, r3 -; P8BE-NEXT: vmrghh v2, v3, v2 -; P8BE-NEXT: vmrghh v3, v5, v4 -; P8BE-NEXT: vmrghw v2, v3, v2 +; P8BE-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; P8BE-NEXT: xxlxor v4, v4, v4 +; P8BE-NEXT: addi r3, r3, .LCPI1_0@toc@l +; P8BE-NEXT: lxvw4x v3, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI1_1@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI1_1@toc@l +; P8BE-NEXT: lxvw4x v5, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI1_2@toc@ha +; P8BE-NEXT: vperm v3, v4, v2, v3 +; P8BE-NEXT: vspltisw v4, 11 +; P8BE-NEXT: addi r3, r3, .LCPI1_2@toc@l +; P8BE-NEXT: vadduwm v4, v4, v4 +; P8BE-NEXT: vmuluwm v3, v3, v5 +; P8BE-NEXT: lxvw4x v5, 0, r3 +; P8BE-NEXT: vsrw v3, v3, v4 +; P8BE-NEXT: vxor v4, v4, v4 +; P8BE-NEXT: vpkuwum v3, v3, v3 +; P8BE-NEXT: vmladduhm v3, v3, v5, v4 +; P8BE-NEXT: vsubuhm v2, v2, v3 ; P8BE-NEXT: blr %1 = urem <4 x i16> %x, ret <4 x i16> 
%1 @@ -443,259 +261,92 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) { ; P9LE-LABEL: combine_urem_udiv: ; P9LE: # %bb.0: -; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: lis r4, 22765 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: ori r4, r4, 8969 -; P9LE-NEXT: clrlwi r3, r3, 16 -; P9LE-NEXT: mulhwu r5, r3, r4 -; P9LE-NEXT: sub r6, r3, r5 -; P9LE-NEXT: srwi r6, r6, 1 -; P9LE-NEXT: add r5, r6, r5 -; P9LE-NEXT: srwi r5, r5, 6 -; P9LE-NEXT: mulli r6, r5, 95 -; P9LE-NEXT: sub r3, r3, r6 -; P9LE-NEXT: mtvsrd v3, r3 -; P9LE-NEXT: li r3, 2 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: clrlwi r6, r3, 16 -; P9LE-NEXT: mulhwu r7, r6, r4 -; P9LE-NEXT: sub r6, r6, r7 -; P9LE-NEXT: srwi r6, r6, 1 -; P9LE-NEXT: add r6, r6, r7 -; P9LE-NEXT: srwi r6, r6, 6 -; P9LE-NEXT: mulli r7, r6, 95 -; P9LE-NEXT: sub r3, r3, r7 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 4 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: vmrghh v3, v4, v3 -; P9LE-NEXT: clrlwi r7, r3, 16 -; P9LE-NEXT: mulhwu r8, r7, r4 -; P9LE-NEXT: sub r7, r7, r8 -; P9LE-NEXT: srwi r7, r7, 1 -; P9LE-NEXT: add r7, r7, r8 -; P9LE-NEXT: srwi r7, r7, 6 -; P9LE-NEXT: mulli r8, r7, 95 -; P9LE-NEXT: sub r3, r3, r8 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 6 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: clrlwi r8, r3, 16 -; P9LE-NEXT: mulhwu r4, r8, r4 -; P9LE-NEXT: sub r8, r8, r4 -; P9LE-NEXT: srwi r8, r8, 1 -; P9LE-NEXT: add r4, r8, r4 -; P9LE-NEXT: srwi r4, r4, 6 -; P9LE-NEXT: mulli r8, r4, 95 -; P9LE-NEXT: mtvsrd v5, r4 -; P9LE-NEXT: sub r3, r3, r8 -; P9LE-NEXT: mtvsrd v2, r3 -; P9LE-NEXT: vmrghh v2, v2, v4 -; P9LE-NEXT: mtvsrd v4, r6 -; P9LE-NEXT: vmrglw v2, v2, v3 -; P9LE-NEXT: mtvsrd v3, r5 -; P9LE-NEXT: vmrghh v3, v4, v3 -; P9LE-NEXT: mtvsrd v4, r7 -; P9LE-NEXT: vmrghh v4, v5, v4 -; P9LE-NEXT: vmrglw v3, v4, v3 +; P9LE-NEXT: addis r3, r2, .LCPI2_1@toc@ha +; P9LE-NEXT: xxlxor v4, v4, v4 +; P9LE-NEXT: vspltisw v5, 11 +; P9LE-NEXT: addi r3, r3, .LCPI2_1@toc@l +; P9LE-NEXT: vmrglh v4, v4, v2 +; P9LE-NEXT: vadduwm v5, v5, v5 +; P9LE-NEXT: lxvx v3, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI2_0@toc@l +; P9LE-NEXT: vmuluwm v3, v4, v3 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: vsrw v3, v3, v5 +; P9LE-NEXT: vxor v5, v5, v5 +; P9LE-NEXT: vpkuwum v3, v3, v3 +; P9LE-NEXT: vmladduhm v4, v3, v4, v5 +; P9LE-NEXT: vsubuhm v2, v2, v4 ; P9LE-NEXT: vadduhm v2, v2, v3 ; P9LE-NEXT: blr ; ; P9BE-LABEL: combine_urem_udiv: ; P9BE: # %bb.0: -; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: lis r5, 22765 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: ori r5, r5, 8969 -; P9BE-NEXT: clrlwi r4, r3, 16 -; P9BE-NEXT: mulhwu r6, r4, r5 -; P9BE-NEXT: sub r4, r4, r6 -; P9BE-NEXT: srwi r4, r4, 1 -; P9BE-NEXT: add r4, r4, r6 -; P9BE-NEXT: srwi r4, r4, 6 -; P9BE-NEXT: mulli r6, r4, 95 -; P9BE-NEXT: sub r3, r3, r6 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v3, r3 -; P9BE-NEXT: li r3, 4 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: clrlwi r6, r3, 16 -; P9BE-NEXT: mulhwu r7, r6, r5 -; P9BE-NEXT: sub r6, r6, r7 -; P9BE-NEXT: srwi r6, r6, 1 -; P9BE-NEXT: add r6, r6, r7 -; P9BE-NEXT: srwi r6, r6, 6 -; P9BE-NEXT: mulli r7, r6, 95 -; P9BE-NEXT: sub r3, r3, r7 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 2 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: vmrghh v3, v4, v3 -; P9BE-NEXT: clrlwi r7, r3, 16 -; P9BE-NEXT: mulhwu r8, r7, r5 -; P9BE-NEXT: sub r7, r7, r8 -; P9BE-NEXT: srwi r7, r7, 1 -; P9BE-NEXT: add r7, r7, r8 -; P9BE-NEXT: srwi r7, r7, 6 -; P9BE-NEXT: mulli r8, r7, 95 -; P9BE-NEXT: sub r3, r3, r8 -; 
P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 0 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: clrlwi r3, r3, 16 -; P9BE-NEXT: mulhwu r5, r3, r5 -; P9BE-NEXT: sub r8, r3, r5 -; P9BE-NEXT: srwi r8, r8, 1 -; P9BE-NEXT: add r5, r8, r5 -; P9BE-NEXT: srwi r5, r5, 6 -; P9BE-NEXT: mulli r8, r5, 95 -; P9BE-NEXT: sub r3, r3, r8 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v2, r3 -; P9BE-NEXT: sldi r3, r4, 48 -; P9BE-NEXT: vmrghh v2, v2, v4 -; P9BE-NEXT: vmrghw v2, v2, v3 -; P9BE-NEXT: mtvsrd v3, r3 -; P9BE-NEXT: sldi r3, r6, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: sldi r3, r7, 48 -; P9BE-NEXT: vmrghh v3, v4, v3 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: sldi r3, r5, 48 -; P9BE-NEXT: mtvsrd v5, r3 -; P9BE-NEXT: vmrghh v4, v5, v4 -; P9BE-NEXT: vmrghw v3, v4, v3 +; P9BE-NEXT: addis r3, r2, .LCPI2_1@toc@ha +; P9BE-NEXT: xxlxor v4, v4, v4 +; P9BE-NEXT: vspltisw v5, 11 +; P9BE-NEXT: addi r3, r3, .LCPI2_1@toc@l +; P9BE-NEXT: vadduwm v5, v5, v5 +; P9BE-NEXT: lxvx v3, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI2_2@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI2_2@toc@l +; P9BE-NEXT: vperm v3, v4, v2, v3 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI2_0@toc@l +; P9BE-NEXT: vmuluwm v3, v3, v4 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: vsrw v3, v3, v5 +; P9BE-NEXT: vxor v5, v5, v5 +; P9BE-NEXT: vpkuwum v3, v3, v3 +; P9BE-NEXT: vmladduhm v4, v3, v4, v5 +; P9BE-NEXT: vsubuhm v2, v2, v4 ; P9BE-NEXT: vadduhm v2, v2, v3 ; P9BE-NEXT: blr ; ; P8LE-LABEL: combine_urem_udiv: ; P8LE: # %bb.0: -; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r3, 22765 -; P8LE-NEXT: std r30, -16(r1) # 8-byte Folded Spill -; P8LE-NEXT: ori r3, r3, 8969 -; P8LE-NEXT: mffprd r4, f0 -; P8LE-NEXT: clrldi r5, r4, 48 -; P8LE-NEXT: rldicl r6, r4, 48, 48 -; P8LE-NEXT: clrlwi r5, r5, 16 -; P8LE-NEXT: clrlwi r8, r6, 16 -; P8LE-NEXT: rldicl r7, r4, 32, 48 -; P8LE-NEXT: rldicl r4, r4, 16, 48 -; P8LE-NEXT: mulhwu r9, r5, r3 -; P8LE-NEXT: mulhwu r11, r8, r3 -; P8LE-NEXT: clrlwi r10, r7, 16 -; P8LE-NEXT: clrlwi r12, r4, 16 -; P8LE-NEXT: mulhwu r0, r10, r3 -; P8LE-NEXT: mulhwu r3, r12, r3 -; P8LE-NEXT: sub r30, r5, r9 -; P8LE-NEXT: sub r8, r8, r11 -; P8LE-NEXT: srwi r30, r30, 1 -; P8LE-NEXT: srwi r8, r8, 1 -; P8LE-NEXT: sub r10, r10, r0 -; P8LE-NEXT: add r9, r30, r9 -; P8LE-NEXT: add r8, r8, r11 -; P8LE-NEXT: sub r11, r12, r3 -; P8LE-NEXT: srwi r10, r10, 1 -; P8LE-NEXT: ld r30, -16(r1) # 8-byte Folded Reload -; P8LE-NEXT: srwi r9, r9, 6 -; P8LE-NEXT: srwi r11, r11, 1 -; P8LE-NEXT: srwi r8, r8, 6 -; P8LE-NEXT: add r10, r10, r0 -; P8LE-NEXT: mulli r12, r9, 95 -; P8LE-NEXT: add r3, r11, r3 -; P8LE-NEXT: mtvsrd v2, r9 -; P8LE-NEXT: srwi r10, r10, 6 -; P8LE-NEXT: mulli r9, r8, 95 -; P8LE-NEXT: srwi r3, r3, 6 -; P8LE-NEXT: mtvsrd v3, r8 -; P8LE-NEXT: mulli r8, r10, 95 -; P8LE-NEXT: mtvsrd v4, r10 -; P8LE-NEXT: mulli r10, r3, 95 -; P8LE-NEXT: vmrghh v2, v3, v2 -; P8LE-NEXT: sub r5, r5, r12 -; P8LE-NEXT: sub r6, r6, r9 -; P8LE-NEXT: mtvsrd v3, r5 -; P8LE-NEXT: mtvsrd v5, r6 -; P8LE-NEXT: sub r5, r7, r8 -; P8LE-NEXT: sub r4, r4, r10 -; P8LE-NEXT: mtvsrd v0, r5 -; P8LE-NEXT: mtvsrd v1, r4 -; P8LE-NEXT: vmrghh v3, v5, v3 -; P8LE-NEXT: mtvsrd v5, r3 -; P8LE-NEXT: vmrghh v0, v1, v0 -; P8LE-NEXT: vmrghh v4, v5, v4 -; P8LE-NEXT: vmrglw v3, v0, v3 -; P8LE-NEXT: vmrglw v2, v4, v2 -; P8LE-NEXT: vadduhm v2, v3, v2 +; P8LE-NEXT: xxlxor v3, v3, v3 +; P8LE-NEXT: addis r3, r2, .LCPI2_1@toc@ha +; P8LE-NEXT: vspltisw v4, 11 +; P8LE-NEXT: addi r3, r3, .LCPI2_1@toc@l +; P8LE-NEXT: vmrglh v3, v3, 
v2 +; P8LE-NEXT: lvx v5, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; P8LE-NEXT: vadduwm v4, v4, v4 +; P8LE-NEXT: addi r3, r3, .LCPI2_0@toc@l +; P8LE-NEXT: vmuluwm v3, v3, v5 +; P8LE-NEXT: lvx v5, 0, r3 +; P8LE-NEXT: vsrw v3, v3, v4 +; P8LE-NEXT: vxor v4, v4, v4 +; P8LE-NEXT: vpkuwum v3, v3, v3 +; P8LE-NEXT: vmladduhm v4, v3, v5, v4 +; P8LE-NEXT: vsubuhm v2, v2, v4 +; P8LE-NEXT: vadduhm v2, v2, v3 ; P8LE-NEXT: blr ; ; P8BE-LABEL: combine_urem_udiv: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r5, v2 -; P8BE-NEXT: lis r4, 22765 -; P8BE-NEXT: ori r4, r4, 8969 -; P8BE-NEXT: clrldi r3, r5, 48 -; P8BE-NEXT: rldicl r6, r5, 48, 48 -; P8BE-NEXT: clrlwi r8, r3, 16 -; P8BE-NEXT: rldicl r7, r5, 32, 48 -; P8BE-NEXT: clrlwi r9, r6, 16 -; P8BE-NEXT: rldicl r5, r5, 16, 48 -; P8BE-NEXT: mulhwu r10, r8, r4 -; P8BE-NEXT: clrlwi r11, r7, 16 -; P8BE-NEXT: mulhwu r12, r9, r4 -; P8BE-NEXT: clrlwi r5, r5, 16 -; P8BE-NEXT: mulhwu r0, r11, r4 -; P8BE-NEXT: mulhwu r4, r5, r4 -; P8BE-NEXT: sub r8, r8, r10 -; P8BE-NEXT: sub r9, r9, r12 -; P8BE-NEXT: srwi r8, r8, 1 -; P8BE-NEXT: add r8, r8, r10 -; P8BE-NEXT: sub r10, r11, r0 -; P8BE-NEXT: srwi r9, r9, 1 -; P8BE-NEXT: sub r11, r5, r4 -; P8BE-NEXT: add r9, r9, r12 -; P8BE-NEXT: srwi r8, r8, 6 -; P8BE-NEXT: srwi r11, r11, 1 -; P8BE-NEXT: srwi r10, r10, 1 -; P8BE-NEXT: srwi r9, r9, 6 -; P8BE-NEXT: mulli r12, r8, 95 -; P8BE-NEXT: add r4, r11, r4 -; P8BE-NEXT: add r10, r10, r0 -; P8BE-NEXT: mulli r11, r9, 95 -; P8BE-NEXT: srwi r4, r4, 6 -; P8BE-NEXT: srwi r10, r10, 6 -; P8BE-NEXT: sldi r9, r9, 48 -; P8BE-NEXT: sldi r8, r8, 48 -; P8BE-NEXT: mtvsrd v3, r9 -; P8BE-NEXT: mulli r9, r4, 95 -; P8BE-NEXT: mtvsrd v2, r8 -; P8BE-NEXT: mulli r8, r10, 95 -; P8BE-NEXT: sub r3, r3, r12 -; P8BE-NEXT: sub r6, r6, r11 -; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: vmrghh v2, v3, v2 -; P8BE-NEXT: sldi r6, r6, 48 -; P8BE-NEXT: sldi r10, r10, 48 -; P8BE-NEXT: mtvsrd v3, r3 -; P8BE-NEXT: sub r3, r5, r9 -; P8BE-NEXT: sub r7, r7, r8 -; P8BE-NEXT: mtvsrd v5, r6 -; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: sldi r5, r7, 48 -; P8BE-NEXT: mtvsrd v1, r3 -; P8BE-NEXT: sldi r3, r4, 48 -; P8BE-NEXT: mtvsrd v4, r10 -; P8BE-NEXT: mtvsrd v0, r5 -; P8BE-NEXT: vmrghh v3, v5, v3 -; P8BE-NEXT: mtvsrd v5, r3 -; P8BE-NEXT: vmrghh v0, v1, v0 -; P8BE-NEXT: vmrghh v4, v5, v4 -; P8BE-NEXT: vmrghw v3, v0, v3 -; P8BE-NEXT: vmrghw v2, v4, v2 -; P8BE-NEXT: vadduhm v2, v3, v2 +; P8BE-NEXT: addis r3, r2, .LCPI2_1@toc@ha +; P8BE-NEXT: xxlxor v4, v4, v4 +; P8BE-NEXT: addi r3, r3, .LCPI2_1@toc@l +; P8BE-NEXT: lxvw4x v3, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI2_2@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI2_2@toc@l +; P8BE-NEXT: lxvw4x v5, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; P8BE-NEXT: vperm v3, v4, v2, v3 +; P8BE-NEXT: vspltisw v4, 11 +; P8BE-NEXT: addi r3, r3, .LCPI2_0@toc@l +; P8BE-NEXT: vadduwm v4, v4, v4 +; P8BE-NEXT: vmuluwm v3, v3, v5 +; P8BE-NEXT: lxvw4x v5, 0, r3 +; P8BE-NEXT: vsrw v3, v3, v4 +; P8BE-NEXT: vxor v4, v4, v4 +; P8BE-NEXT: vpkuwum v3, v3, v3 +; P8BE-NEXT: vmladduhm v4, v3, v5, v4 +; P8BE-NEXT: vsubuhm v2, v2, v4 +; P8BE-NEXT: vadduhm v2, v2, v3 ; P8BE-NEXT: blr %1 = urem <4 x i16> %x, %2 = udiv <4 x i16> %x, @@ -707,133 +358,104 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) { ; P9LE-LABEL: dont_fold_urem_power_of_two: ; P9LE: # %bb.0: -; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: lis r4, 22765 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: ori r4, r4, 8969 -; P9LE-NEXT: clrlwi r3, r3, 26 -; P9LE-NEXT: mtvsrd v3, r3 -; P9LE-NEXT: li r3, 2 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: 
clrlwi r3, r3, 27 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 6 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: vmrghh v3, v4, v3 -; P9LE-NEXT: clrlwi r3, r3, 16 -; P9LE-NEXT: mulhwu r4, r3, r4 -; P9LE-NEXT: sub r5, r3, r4 -; P9LE-NEXT: srwi r5, r5, 1 -; P9LE-NEXT: add r4, r5, r4 -; P9LE-NEXT: srwi r4, r4, 6 -; P9LE-NEXT: mulli r4, r4, 95 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 4 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: clrlwi r3, r3, 29 -; P9LE-NEXT: mtvsrd v2, r3 -; P9LE-NEXT: vmrghh v2, v4, v2 -; P9LE-NEXT: vmrglw v2, v2, v3 +; P9LE-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; P9LE-NEXT: xxlxor v3, v3, v3 +; P9LE-NEXT: vxor v5, v5, v5 +; P9LE-NEXT: vmrglh v3, v3, v2 +; P9LE-NEXT: addi r3, r3, .LCPI3_0@toc@l +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI3_1@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI3_1@toc@l +; P9LE-NEXT: vmuluwm v3, v3, v4 +; P9LE-NEXT: vspltisw v4, 8 +; P9LE-NEXT: vadduwm v4, v4, v4 +; P9LE-NEXT: vsrw v3, v3, v4 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI3_2@toc@ha +; P9LE-NEXT: vpkuwum v3, v3, v3 +; P9LE-NEXT: addi r3, r3, .LCPI3_2@toc@l +; P9LE-NEXT: vsrh v3, v3, v4 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: vmladduhm v3, v3, v4, v5 +; P9LE-NEXT: vsubuhm v2, v2, v3 ; P9LE-NEXT: blr ; ; P9BE-LABEL: dont_fold_urem_power_of_two: ; P9BE: # %bb.0: -; P9BE-NEXT: li r3, 2 -; P9BE-NEXT: lis r4, 22765 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: ori r4, r4, 8969 -; P9BE-NEXT: clrlwi r3, r3, 27 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v3, r3 -; P9BE-NEXT: li r3, 0 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: clrlwi r3, r3, 26 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: vmrghh v3, v4, v3 -; P9BE-NEXT: clrlwi r3, r3, 16 -; P9BE-NEXT: mulhwu r4, r3, r4 -; P9BE-NEXT: sub r5, r3, r4 -; P9BE-NEXT: srwi r5, r5, 1 -; P9BE-NEXT: add r4, r5, r4 -; P9BE-NEXT: srwi r4, r4, 6 -; P9BE-NEXT: mulli r4, r4, 95 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 4 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: clrlwi r3, r3, 29 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v2, r3 -; P9BE-NEXT: vmrghh v2, v2, v4 -; P9BE-NEXT: vmrghw v2, v3, v2 +; P9BE-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; P9BE-NEXT: xxlxor v4, v4, v4 +; P9BE-NEXT: vxor v5, v5, v5 +; P9BE-NEXT: addi r3, r3, .LCPI3_0@toc@l +; P9BE-NEXT: lxvx v3, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI3_1@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI3_1@toc@l +; P9BE-NEXT: vperm v3, v4, v2, v3 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI3_2@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI3_2@toc@l +; P9BE-NEXT: vmuluwm v3, v3, v4 +; P9BE-NEXT: vspltisw v4, 8 +; P9BE-NEXT: vadduwm v4, v4, v4 +; P9BE-NEXT: vsrw v3, v3, v4 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI3_3@toc@ha +; P9BE-NEXT: vpkuwum v3, v3, v3 +; P9BE-NEXT: addi r3, r3, .LCPI3_3@toc@l +; P9BE-NEXT: vsrh v3, v3, v4 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: vmladduhm v3, v3, v4, v5 +; P9BE-NEXT: vsubuhm v2, v2, v3 ; P9BE-NEXT: blr ; ; P8LE-LABEL: dont_fold_urem_power_of_two: ; P8LE: # %bb.0: -; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r3, 22765 -; P8LE-NEXT: ori r3, r3, 8969 -; P8LE-NEXT: mffprd r4, f0 -; P8LE-NEXT: rldicl r5, r4, 16, 48 -; P8LE-NEXT: rldicl r7, r4, 48, 48 -; P8LE-NEXT: clrlwi r5, r5, 16 -; P8LE-NEXT: mulhwu r3, r5, r3 -; P8LE-NEXT: sub r6, r5, r3 -; P8LE-NEXT: srwi r6, r6, 1 -; P8LE-NEXT: add r3, r6, r3 -; 
P8LE-NEXT: clrldi r6, r4, 48 -; P8LE-NEXT: srwi r3, r3, 6 -; P8LE-NEXT: clrlwi r6, r6, 26 -; P8LE-NEXT: mulli r3, r3, 95 -; P8LE-NEXT: rldicl r4, r4, 32, 48 -; P8LE-NEXT: mtvsrd v2, r6 -; P8LE-NEXT: clrlwi r6, r7, 27 -; P8LE-NEXT: clrlwi r4, r4, 29 -; P8LE-NEXT: mtvsrd v3, r6 -; P8LE-NEXT: mtvsrd v5, r4 -; P8LE-NEXT: vmrghh v2, v3, v2 -; P8LE-NEXT: sub r3, r5, r3 -; P8LE-NEXT: mtvsrd v4, r3 -; P8LE-NEXT: vmrghh v3, v4, v5 -; P8LE-NEXT: vmrglw v2, v3, v2 +; P8LE-NEXT: xxlxor v3, v3, v3 +; P8LE-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; P8LE-NEXT: vspltisw v4, 8 +; P8LE-NEXT: addi r3, r3, .LCPI3_0@toc@l +; P8LE-NEXT: vmrglh v3, v3, v2 +; P8LE-NEXT: lvx v5, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI3_1@toc@ha +; P8LE-NEXT: vadduwm v4, v4, v4 +; P8LE-NEXT: addi r3, r3, .LCPI3_1@toc@l +; P8LE-NEXT: vmuluwm v3, v3, v5 +; P8LE-NEXT: vsrw v3, v3, v4 +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI3_2@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI3_2@toc@l +; P8LE-NEXT: lvx v5, 0, r3 +; P8LE-NEXT: vpkuwum v3, v3, v3 +; P8LE-NEXT: vsrh v3, v3, v4 +; P8LE-NEXT: vxor v4, v4, v4 +; P8LE-NEXT: vmladduhm v3, v3, v5, v4 +; P8LE-NEXT: vsubuhm v2, v2, v3 ; P8LE-NEXT: blr ; ; P8BE-LABEL: dont_fold_urem_power_of_two: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, 22765 -; P8BE-NEXT: ori r3, r3, 8969 -; P8BE-NEXT: clrldi r5, r4, 48 -; P8BE-NEXT: rldicl r7, r4, 16, 48 -; P8BE-NEXT: clrlwi r5, r5, 16 -; P8BE-NEXT: clrlwi r7, r7, 26 -; P8BE-NEXT: mulhwu r3, r5, r3 -; P8BE-NEXT: sub r6, r5, r3 -; P8BE-NEXT: srwi r6, r6, 1 -; P8BE-NEXT: add r3, r6, r3 -; P8BE-NEXT: rldicl r6, r4, 32, 48 -; P8BE-NEXT: srwi r3, r3, 6 -; P8BE-NEXT: rldicl r4, r4, 48, 48 -; P8BE-NEXT: clrlwi r6, r6, 27 -; P8BE-NEXT: mulli r3, r3, 95 -; P8BE-NEXT: sldi r6, r6, 48 -; P8BE-NEXT: clrlwi r4, r4, 29 -; P8BE-NEXT: mtvsrd v2, r6 -; P8BE-NEXT: sldi r6, r7, 48 -; P8BE-NEXT: sldi r4, r4, 48 -; P8BE-NEXT: mtvsrd v3, r6 -; P8BE-NEXT: mtvsrd v5, r4 -; P8BE-NEXT: sub r3, r5, r3 -; P8BE-NEXT: vmrghh v2, v3, v2 -; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: mtvsrd v4, r3 -; P8BE-NEXT: vmrghh v3, v5, v4 -; P8BE-NEXT: vmrghw v2, v2, v3 +; P8BE-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; P8BE-NEXT: xxlxor v4, v4, v4 +; P8BE-NEXT: addi r3, r3, .LCPI3_0@toc@l +; P8BE-NEXT: lxvw4x v3, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI3_1@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI3_1@toc@l +; P8BE-NEXT: lxvw4x v5, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI3_2@toc@ha +; P8BE-NEXT: vperm v3, v4, v2, v3 +; P8BE-NEXT: vspltisw v4, 8 +; P8BE-NEXT: addi r3, r3, .LCPI3_2@toc@l +; P8BE-NEXT: vadduwm v4, v4, v4 +; P8BE-NEXT: vmuluwm v3, v3, v5 +; P8BE-NEXT: vsrw v3, v3, v4 +; P8BE-NEXT: lxvw4x v4, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI3_3@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI3_3@toc@l +; P8BE-NEXT: lxvw4x v5, 0, r3 +; P8BE-NEXT: vpkuwum v3, v3, v3 +; P8BE-NEXT: vsrh v3, v3, v4 +; P8BE-NEXT: vxor v4, v4, v4 +; P8BE-NEXT: vmladduhm v3, v3, v5, v4 +; P8BE-NEXT: vsubuhm v2, v2, v3 ; P8BE-NEXT: blr %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -843,167 +465,156 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) { ; P9LE-LABEL: dont_fold_urem_one: ; P9LE: # %bb.0: -; P9LE-NEXT: li r3, 4 -; P9LE-NEXT: lis r4, -19946 -; P9LE-NEXT: lis r5, -14230 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: ori r4, r4, 17097 -; P9LE-NEXT: ori r5, r5, 30865 -; P9LE-NEXT: clrlwi r3, r3, 16 -; P9LE-NEXT: mulhwu r4, r3, r4 -; P9LE-NEXT: srwi r4, r4, 4 -; P9LE-NEXT: mulli r4, r4, 23 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: lis r4, 24749 -; P9LE-NEXT: mtvsrd v3, r3 -; P9LE-NEXT: li r3, 6 -; 
P9LE-NEXT: ori r4, r4, 47143 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: clrlwi r3, r3, 16 -; P9LE-NEXT: mulhwu r4, r3, r4 -; P9LE-NEXT: srwi r4, r4, 11 -; P9LE-NEXT: mulli r4, r4, 5423 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 2 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: vmrghh v3, v4, v3 -; P9LE-NEXT: clrlwi r4, r3, 16 -; P9LE-NEXT: rlwinm r3, r3, 31, 17, 31 -; P9LE-NEXT: mulhwu r3, r3, r5 -; P9LE-NEXT: srwi r3, r3, 8 -; P9LE-NEXT: mulli r3, r3, 654 -; P9LE-NEXT: sub r3, r4, r3 -; P9LE-NEXT: mtvsrd v2, r3 -; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: vmrghh v2, v2, v4 -; P9LE-NEXT: vmrglw v2, v3, v2 +; P9LE-NEXT: addis r3, r2, .LCPI4_0@toc@ha +; P9LE-NEXT: xxlxor v3, v3, v3 +; P9LE-NEXT: addi r3, r3, .LCPI4_0@toc@l +; P9LE-NEXT: vmrglh v4, v3, v2 +; P9LE-NEXT: lxvx v5, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI4_1@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI4_1@toc@l +; P9LE-NEXT: vmuluwm v4, v4, v5 +; P9LE-NEXT: vspltisw v5, 8 +; P9LE-NEXT: vadduwm v5, v5, v5 +; P9LE-NEXT: vsrw v4, v4, v5 +; P9LE-NEXT: vpkuwum v0, v4, v4 +; P9LE-NEXT: vsubuhm v0, v2, v0 +; P9LE-NEXT: vmrglh v3, v3, v0 +; P9LE-NEXT: lxvx v0, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI4_2@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI4_2@toc@l +; P9LE-NEXT: vmuluwm v3, v3, v0 +; P9LE-NEXT: vsrw v3, v3, v5 +; P9LE-NEXT: vxor v5, v5, v5 +; P9LE-NEXT: vadduhm v3, v3, v4 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI4_3@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI4_3@toc@l +; P9LE-NEXT: vpkuwum v3, v3, v3 +; P9LE-NEXT: lxvx vs0, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI4_4@toc@ha +; P9LE-NEXT: vsrh v3, v3, v4 +; P9LE-NEXT: addi r3, r3, .LCPI4_4@toc@l +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: xxsel v3, v3, v2, vs0 +; P9LE-NEXT: vmladduhm v3, v3, v4, v5 +; P9LE-NEXT: vsubuhm v2, v2, v3 ; P9LE-NEXT: blr ; ; P9BE-LABEL: dont_fold_urem_one: ; P9BE: # %bb.0: -; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: lis r4, 24749 -; P9BE-NEXT: lis r5, -14230 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: ori r4, r4, 47143 -; P9BE-NEXT: ori r5, r5, 30865 -; P9BE-NEXT: clrlwi r3, r3, 16 -; P9BE-NEXT: mulhwu r4, r3, r4 -; P9BE-NEXT: srwi r4, r4, 11 -; P9BE-NEXT: mulli r4, r4, 5423 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: lis r4, -19946 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: ori r4, r4, 17097 -; P9BE-NEXT: mtvsrd v3, r3 -; P9BE-NEXT: li r3, 4 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: clrlwi r3, r3, 16 -; P9BE-NEXT: mulhwu r4, r3, r4 -; P9BE-NEXT: srwi r4, r4, 4 -; P9BE-NEXT: mulli r4, r4, 23 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 2 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: vmrghh v3, v4, v3 -; P9BE-NEXT: clrlwi r4, r3, 16 -; P9BE-NEXT: rlwinm r3, r3, 31, 17, 31 -; P9BE-NEXT: mulhwu r3, r3, r5 -; P9BE-NEXT: srwi r3, r3, 8 -; P9BE-NEXT: mulli r3, r3, 654 -; P9BE-NEXT: sub r3, r4, r3 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v2, r3 -; P9BE-NEXT: li r3, 0 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: vmrghh v2, v4, v2 -; P9BE-NEXT: vmrghw v2, v2, v3 +; P9BE-NEXT: addis r3, r2, .LCPI4_0@toc@ha +; P9BE-NEXT: xxlxor v4, v4, v4 +; P9BE-NEXT: addi r3, r3, .LCPI4_0@toc@l +; P9BE-NEXT: lxvx v3, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI4_1@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI4_1@toc@l +; P9BE-NEXT: lxvx v0, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI4_2@toc@ha +; P9BE-NEXT: vperm v5, v4, v2, v3 +; P9BE-NEXT: addi r3, r3, .LCPI4_2@toc@l +; P9BE-NEXT: vmuluwm v5, v5, v0 +; P9BE-NEXT: vspltisw v0, 8 
+; P9BE-NEXT: vadduwm v0, v0, v0 +; P9BE-NEXT: vsrw v5, v5, v0 +; P9BE-NEXT: vpkuwum v1, v5, v5 +; P9BE-NEXT: vsubuhm v1, v2, v1 +; P9BE-NEXT: vperm v3, v4, v1, v3 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI4_3@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI4_3@toc@l +; P9BE-NEXT: vmuluwm v3, v3, v4 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI4_4@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI4_4@toc@l +; P9BE-NEXT: lxvx vs0, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI4_5@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI4_5@toc@l +; P9BE-NEXT: vsrw v3, v3, v0 +; P9BE-NEXT: vadduhm v3, v3, v5 +; P9BE-NEXT: vxor v5, v5, v5 +; P9BE-NEXT: vpkuwum v3, v3, v3 +; P9BE-NEXT: vsrh v3, v3, v4 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: xxsel v3, v3, v2, vs0 +; P9BE-NEXT: vmladduhm v3, v3, v4, v5 +; P9BE-NEXT: vsubuhm v2, v2, v3 ; P9BE-NEXT: blr ; ; P8LE-LABEL: dont_fold_urem_one: ; P8LE: # %bb.0: -; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r3, -14230 -; P8LE-NEXT: lis r7, -19946 -; P8LE-NEXT: lis r9, 24749 -; P8LE-NEXT: ori r3, r3, 30865 -; P8LE-NEXT: ori r7, r7, 17097 -; P8LE-NEXT: mffprd r4, f0 -; P8LE-NEXT: rldicl r5, r4, 48, 48 -; P8LE-NEXT: rldicl r6, r4, 32, 48 -; P8LE-NEXT: rldicl r4, r4, 16, 48 -; P8LE-NEXT: rlwinm r8, r5, 31, 17, 31 -; P8LE-NEXT: clrlwi r6, r6, 16 -; P8LE-NEXT: clrlwi r5, r5, 16 -; P8LE-NEXT: mulhwu r3, r8, r3 -; P8LE-NEXT: ori r8, r9, 47143 -; P8LE-NEXT: clrlwi r4, r4, 16 -; P8LE-NEXT: li r9, 0 -; P8LE-NEXT: mulhwu r7, r6, r7 -; P8LE-NEXT: mulhwu r8, r4, r8 -; P8LE-NEXT: mtvsrd v2, r9 -; P8LE-NEXT: srwi r3, r3, 8 -; P8LE-NEXT: srwi r7, r7, 4 -; P8LE-NEXT: mulli r3, r3, 654 -; P8LE-NEXT: srwi r8, r8, 11 -; P8LE-NEXT: mulli r7, r7, 23 -; P8LE-NEXT: mulli r8, r8, 5423 -; P8LE-NEXT: sub r3, r5, r3 -; P8LE-NEXT: sub r5, r6, r7 -; P8LE-NEXT: mtvsrd v3, r3 -; P8LE-NEXT: sub r3, r4, r8 -; P8LE-NEXT: mtvsrd v4, r5 -; P8LE-NEXT: mtvsrd v5, r3 -; P8LE-NEXT: vmrghh v2, v3, v2 -; P8LE-NEXT: vmrghh v3, v5, v4 -; P8LE-NEXT: vmrglw v2, v3, v2 +; P8LE-NEXT: xxlxor v3, v3, v3 +; P8LE-NEXT: addis r3, r2, .LCPI4_0@toc@ha +; P8LE-NEXT: vspltisw v5, 8 +; P8LE-NEXT: addi r3, r3, .LCPI4_0@toc@l +; P8LE-NEXT: vmrglh v4, v3, v2 +; P8LE-NEXT: lvx v0, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI4_1@toc@ha +; P8LE-NEXT: vadduwm v5, v5, v5 +; P8LE-NEXT: addi r3, r3, .LCPI4_1@toc@l +; P8LE-NEXT: vmuluwm v4, v4, v0 +; P8LE-NEXT: vsrw v4, v4, v5 +; P8LE-NEXT: vpkuwum v0, v4, v4 +; P8LE-NEXT: vsubuhm v0, v2, v0 +; P8LE-NEXT: vmrglh v3, v3, v0 +; P8LE-NEXT: lvx v0, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI4_2@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI4_2@toc@l +; P8LE-NEXT: vmuluwm v3, v3, v0 +; P8LE-NEXT: vsrw v3, v3, v5 +; P8LE-NEXT: vxor v5, v5, v5 +; P8LE-NEXT: vadduhm v3, v3, v4 +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI4_3@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI4_3@toc@l +; P8LE-NEXT: vpkuwum v3, v3, v3 +; P8LE-NEXT: vsrh v3, v3, v4 +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI4_4@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI4_4@toc@l +; P8LE-NEXT: xxsel v3, v3, v2, v4 +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: vmladduhm v3, v3, v4, v5 +; P8LE-NEXT: vsubuhm v2, v2, v3 ; P8LE-NEXT: blr ; ; P8BE-LABEL: dont_fold_urem_one: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, 24749 -; P8BE-NEXT: lis r7, -19946 -; P8BE-NEXT: lis r8, -14230 -; P8BE-NEXT: ori r3, r3, 47143 -; P8BE-NEXT: ori r7, r7, 17097 -; P8BE-NEXT: ori r8, r8, 30865 -; P8BE-NEXT: clrldi r5, r4, 48 -; P8BE-NEXT: rldicl r6, r4, 48, 48 -; P8BE-NEXT: rldicl r4, r4, 32, 48 -; P8BE-NEXT: clrlwi r5, r5, 
16 -; P8BE-NEXT: clrlwi r6, r6, 16 -; P8BE-NEXT: mulhwu r3, r5, r3 -; P8BE-NEXT: rlwinm r9, r4, 31, 17, 31 -; P8BE-NEXT: clrlwi r4, r4, 16 -; P8BE-NEXT: mulhwu r7, r6, r7 -; P8BE-NEXT: mulhwu r8, r9, r8 -; P8BE-NEXT: li r9, 0 -; P8BE-NEXT: srwi r3, r3, 11 -; P8BE-NEXT: srwi r7, r7, 4 -; P8BE-NEXT: mulli r3, r3, 5423 -; P8BE-NEXT: srwi r8, r8, 8 -; P8BE-NEXT: mulli r7, r7, 23 -; P8BE-NEXT: mulli r8, r8, 654 -; P8BE-NEXT: sub r3, r5, r3 -; P8BE-NEXT: sldi r5, r9, 48 -; P8BE-NEXT: mtvsrd v2, r5 -; P8BE-NEXT: sub r5, r6, r7 -; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: sub r4, r4, r8 -; P8BE-NEXT: sldi r5, r5, 48 -; P8BE-NEXT: mtvsrd v3, r3 -; P8BE-NEXT: sldi r3, r4, 48 -; P8BE-NEXT: mtvsrd v4, r5 -; P8BE-NEXT: mtvsrd v5, r3 -; P8BE-NEXT: vmrghh v3, v4, v3 -; P8BE-NEXT: vmrghh v2, v2, v5 -; P8BE-NEXT: vmrghw v2, v2, v3 +; P8BE-NEXT: addis r3, r2, .LCPI4_0@toc@ha +; P8BE-NEXT: xxlxor v4, v4, v4 +; P8BE-NEXT: vspltisw v1, 8 +; P8BE-NEXT: addi r3, r3, .LCPI4_0@toc@l +; P8BE-NEXT: lxvw4x v3, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI4_1@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI4_1@toc@l +; P8BE-NEXT: lxvw4x v0, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI4_2@toc@ha +; P8BE-NEXT: vperm v5, v4, v2, v3 +; P8BE-NEXT: addi r3, r3, .LCPI4_2@toc@l +; P8BE-NEXT: vmuluwm v5, v5, v0 +; P8BE-NEXT: vadduwm v0, v1, v1 +; P8BE-NEXT: vsrw v5, v5, v0 +; P8BE-NEXT: vpkuwum v1, v5, v5 +; P8BE-NEXT: vsubuhm v1, v2, v1 +; P8BE-NEXT: vperm v3, v4, v1, v3 +; P8BE-NEXT: lxvw4x v4, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI4_3@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI4_3@toc@l +; P8BE-NEXT: vmuluwm v3, v3, v4 +; P8BE-NEXT: lxvw4x v4, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI4_4@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI4_4@toc@l +; P8BE-NEXT: vsrw v3, v3, v0 +; P8BE-NEXT: lxvw4x vs0, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI4_5@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI4_5@toc@l +; P8BE-NEXT: vadduhm v3, v3, v5 +; P8BE-NEXT: lxvw4x v5, 0, r3 +; P8BE-NEXT: vpkuwum v3, v3, v3 +; P8BE-NEXT: vsrh v3, v3, v4 +; P8BE-NEXT: vxor v4, v4, v4 +; P8BE-NEXT: xxsel v3, v3, v2, vs0 +; P8BE-NEXT: vmladduhm v3, v3, v5, v4 +; P8BE-NEXT: vsubuhm v2, v2, v3 ; P8BE-NEXT: blr %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -1022,129 +633,251 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) { ; P9LE-LABEL: dont_fold_urem_i64: ; P9LE: # %bb.0: -; P9LE-NEXT: lis r4, 25644 -; P9LE-NEXT: mfvsrld r3, v3 -; P9LE-NEXT: ori r4, r4, 34192 +; P9LE-NEXT: lis r3, 25644 +; P9LE-NEXT: mfvsrld r4, v3 +; P9LE-NEXT: xxlxor v5, v5, v5 +; P9LE-NEXT: ori r3, r3, 34192 +; P9LE-NEXT: sldi r3, r3, 32 +; P9LE-NEXT: oris r3, r3, 45590 +; P9LE-NEXT: ori r3, r3, 17097 +; P9LE-NEXT: mulld r5, r4, r3 +; P9LE-NEXT: mulhdu r3, r4, r3 +; P9LE-NEXT: addis r4, r2, .LCPI6_0@toc@ha +; P9LE-NEXT: addi r4, r4, .LCPI6_0@toc@l +; P9LE-NEXT: lxvx v0, 0, r4 +; P9LE-NEXT: mtvsrdd v4, r3, r5 +; P9LE-NEXT: lis r3, -16037 +; P9LE-NEXT: mfvsrd r5, v3 +; P9LE-NEXT: ori r3, r3, 28749 +; P9LE-NEXT: sldi r3, r3, 32 +; P9LE-NEXT: vspltb v1, v0, 15 +; P9LE-NEXT: vsro v4, v4, v0 +; P9LE-NEXT: vsro v5, v5, v0 +; P9LE-NEXT: oris r3, r3, 52170 +; P9LE-NEXT: vsr v4, v4, v1 +; P9LE-NEXT: vsr v5, v5, v1 +; P9LE-NEXT: ori r3, r3, 12109 +; P9LE-NEXT: mulld r4, r5, r3 +; P9LE-NEXT: mulhdu r3, r5, r3 +; P9LE-NEXT: mtvsrdd v6, r3, r4 +; P9LE-NEXT: vsro v6, v6, v0 +; P9LE-NEXT: vsr v6, v6, v1 +; P9LE-NEXT: xxmrgld v6, v6, v4 +; P9LE-NEXT: vsubudm v4, v3, v4 +; P9LE-NEXT: mfvsrld r3, v4 +; P9LE-NEXT: sldi r4, r3, 63 +; P9LE-NEXT: rldicl r3, r3, 63, 1 +; P9LE-NEXT: mtvsrdd v4, r3, r4 +; P9LE-NEXT: addis r3, r2, .LCPI6_1@toc@ha +; 
P9LE-NEXT: vsro v4, v4, v0 +; P9LE-NEXT: addi r3, r3, .LCPI6_1@toc@l +; P9LE-NEXT: vsr v4, v4, v1 +; P9LE-NEXT: xxmrgld v4, v5, v4 +; P9LE-NEXT: vaddudm v4, v4, v6 +; P9LE-NEXT: lxvx v6, 0, r3 +; P9LE-NEXT: vsrd v4, v4, v6 +; P9LE-NEXT: mfvsrld r3, v4 +; P9LE-NEXT: mfvsrd r4, v4 +; P9LE-NEXT: mulli r4, r4, 5423 +; P9LE-NEXT: mulli r3, r3, 23 +; P9LE-NEXT: mtvsrdd v4, r4, r3 +; P9LE-NEXT: addis r3, r2, .LCPI6_2@toc@ha +; P9LE-NEXT: lis r4, 25653 +; P9LE-NEXT: addi r3, r3, .LCPI6_2@toc@l +; P9LE-NEXT: vsubudm v3, v3, v4 +; P9LE-NEXT: ori r4, r4, 15432 +; P9LE-NEXT: lxvx v4, 0, r3 ; P9LE-NEXT: sldi r4, r4, 32 -; P9LE-NEXT: oris r4, r4, 45590 -; P9LE-NEXT: ori r4, r4, 17097 -; P9LE-NEXT: mulhdu r4, r3, r4 -; P9LE-NEXT: sub r5, r3, r4 -; P9LE-NEXT: rldicl r5, r5, 63, 1 -; P9LE-NEXT: add r4, r5, r4 -; P9LE-NEXT: lis r5, -16037 -; P9LE-NEXT: rldicl r4, r4, 60, 4 -; P9LE-NEXT: ori r5, r5, 28749 -; P9LE-NEXT: mulli r4, r4, 23 -; P9LE-NEXT: sldi r5, r5, 32 -; P9LE-NEXT: oris r5, r5, 52170 -; P9LE-NEXT: ori r5, r5, 12109 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: mfvsrd r4, v3 -; P9LE-NEXT: mulhdu r5, r4, r5 -; P9LE-NEXT: rldicl r5, r5, 52, 12 -; P9LE-NEXT: mulli r5, r5, 5423 -; P9LE-NEXT: sub r4, r4, r5 -; P9LE-NEXT: lis r5, 25653 -; P9LE-NEXT: ori r5, r5, 15432 -; P9LE-NEXT: mtvsrdd v3, r4, r3 -; P9LE-NEXT: mfvsrd r3, v2 -; P9LE-NEXT: sldi r5, r5, 32 -; P9LE-NEXT: rldicl r4, r3, 63, 1 -; P9LE-NEXT: oris r5, r5, 1603 -; P9LE-NEXT: ori r5, r5, 21445 -; P9LE-NEXT: mulhdu r4, r4, r5 -; P9LE-NEXT: rldicl r4, r4, 57, 7 +; P9LE-NEXT: oris r4, r4, 1603 +; P9LE-NEXT: ori r4, r4, 21445 +; P9LE-NEXT: vsrd v4, v2, v4 +; P9LE-NEXT: mfvsrd r3, v4 +; P9LE-NEXT: mulld r5, r3, r4 +; P9LE-NEXT: mulhdu r3, r3, r4 +; P9LE-NEXT: mtvsrdd v4, r3, r5 +; P9LE-NEXT: addis r3, r2, .LCPI6_3@toc@ha +; P9LE-NEXT: vsro v4, v4, v0 +; P9LE-NEXT: addi r3, r3, .LCPI6_3@toc@l +; P9LE-NEXT: vsr v4, v4, v1 +; P9LE-NEXT: xxmrgld v4, v4, v5 +; P9LE-NEXT: xxspltd v5, v5, 1 +; P9LE-NEXT: xxlor v4, v5, v4 +; P9LE-NEXT: lxvx v5, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI6_4@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI6_4@toc@l +; P9LE-NEXT: vsrd v4, v4, v5 +; P9LE-NEXT: lxvx v5, 0, r3 +; P9LE-NEXT: xxsel vs0, v4, v2, v5 +; P9LE-NEXT: mffprd r4, f0 +; P9LE-NEXT: mfvsrld r3, vs0 ; P9LE-NEXT: mulli r4, r4, 654 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: li r4, 0 -; P9LE-NEXT: mtvsrdd v2, r3, r4 +; P9LE-NEXT: mtvsrdd v4, r4, r3 +; P9LE-NEXT: vsubudm v2, v2, v4 ; P9LE-NEXT: blr ; ; P9BE-LABEL: dont_fold_urem_i64: ; P9BE: # %bb.0: +; P9BE-NEXT: lis r3, -16037 +; P9BE-NEXT: mfvsrld r4, v3 +; P9BE-NEXT: xxlxor v5, v5, v5 +; P9BE-NEXT: ori r3, r3, 28749 +; P9BE-NEXT: sldi r3, r3, 32 +; P9BE-NEXT: oris r3, r3, 52170 +; P9BE-NEXT: ori r3, r3, 12109 +; P9BE-NEXT: mulld r5, r4, r3 +; P9BE-NEXT: mulhdu r3, r4, r3 ; P9BE-NEXT: lis r4, 25644 -; P9BE-NEXT: mfvsrd r3, v3 ; P9BE-NEXT: ori r4, r4, 34192 ; P9BE-NEXT: sldi r4, r4, 32 +; P9BE-NEXT: mtvsrdd v4, r3, r5 +; P9BE-NEXT: mfvsrd r3, v3 ; P9BE-NEXT: oris r4, r4, 45590 ; P9BE-NEXT: ori r4, r4, 17097 -; P9BE-NEXT: mulhdu r4, r3, r4 -; P9BE-NEXT: sub r5, r3, r4 -; P9BE-NEXT: rldicl r5, r5, 63, 1 -; P9BE-NEXT: add r4, r5, r4 -; P9BE-NEXT: lis r5, -16037 -; P9BE-NEXT: rldicl r4, r4, 60, 4 -; P9BE-NEXT: ori r5, r5, 28749 +; P9BE-NEXT: mulld r5, r3, r4 +; P9BE-NEXT: mulhdu r3, r3, r4 +; P9BE-NEXT: addis r4, r2, .LCPI6_0@toc@ha +; P9BE-NEXT: addi r4, r4, .LCPI6_0@toc@l +; P9BE-NEXT: lxvx v0, 0, r4 +; P9BE-NEXT: vspltb v1, v0, 15 +; P9BE-NEXT: mtvsrdd v6, r3, r5 +; P9BE-NEXT: vsro v4, v4, v0 +; P9BE-NEXT: vsro v6, v6, v0 +; 
P9BE-NEXT: vsro v5, v5, v0 +; P9BE-NEXT: vsr v4, v4, v1 +; P9BE-NEXT: vsr v6, v6, v1 +; P9BE-NEXT: vsr v5, v5, v1 +; P9BE-NEXT: xxmrgld v4, v6, v4 +; P9BE-NEXT: vsubudm v6, v3, v4 +; P9BE-NEXT: mfvsrd r3, v6 +; P9BE-NEXT: sldi r4, r3, 63 +; P9BE-NEXT: rldicl r3, r3, 63, 1 +; P9BE-NEXT: mtvsrdd v6, r3, r4 +; P9BE-NEXT: addis r3, r2, .LCPI6_1@toc@ha +; P9BE-NEXT: vsro v6, v6, v0 +; P9BE-NEXT: addi r3, r3, .LCPI6_1@toc@l +; P9BE-NEXT: vsr v6, v6, v1 +; P9BE-NEXT: xxmrgld v6, v6, v5 +; P9BE-NEXT: vaddudm v4, v6, v4 +; P9BE-NEXT: lxvx v6, 0, r3 +; P9BE-NEXT: vsrd v4, v4, v6 +; P9BE-NEXT: mfvsrld r3, v4 +; P9BE-NEXT: mfvsrd r4, v4 ; P9BE-NEXT: mulli r4, r4, 23 -; P9BE-NEXT: sldi r5, r5, 32 -; P9BE-NEXT: oris r5, r5, 52170 -; P9BE-NEXT: ori r5, r5, 12109 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: mfvsrld r4, v3 -; P9BE-NEXT: mulhdu r5, r4, r5 -; P9BE-NEXT: rldicl r5, r5, 52, 12 -; P9BE-NEXT: mulli r5, r5, 5423 -; P9BE-NEXT: sub r4, r4, r5 -; P9BE-NEXT: lis r5, 25653 -; P9BE-NEXT: ori r5, r5, 15432 -; P9BE-NEXT: mtvsrdd v3, r3, r4 -; P9BE-NEXT: mfvsrld r3, v2 -; P9BE-NEXT: sldi r5, r5, 32 -; P9BE-NEXT: rldicl r4, r3, 63, 1 -; P9BE-NEXT: oris r5, r5, 1603 -; P9BE-NEXT: ori r5, r5, 21445 -; P9BE-NEXT: mulhdu r4, r4, r5 -; P9BE-NEXT: rldicl r4, r4, 57, 7 -; P9BE-NEXT: mulli r4, r4, 654 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: mtvsrdd v2, 0, r3 +; P9BE-NEXT: mulli r3, r3, 5423 +; P9BE-NEXT: mtvsrdd v4, r4, r3 +; P9BE-NEXT: addis r3, r2, .LCPI6_2@toc@ha +; P9BE-NEXT: lis r4, 25653 +; P9BE-NEXT: addi r3, r3, .LCPI6_2@toc@l +; P9BE-NEXT: vsubudm v3, v3, v4 +; P9BE-NEXT: ori r4, r4, 15432 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: sldi r4, r4, 32 +; P9BE-NEXT: oris r4, r4, 1603 +; P9BE-NEXT: ori r4, r4, 21445 +; P9BE-NEXT: vsrd v4, v2, v4 +; P9BE-NEXT: mfvsrld r3, v4 +; P9BE-NEXT: mulld r5, r3, r4 +; P9BE-NEXT: mulhdu r3, r3, r4 +; P9BE-NEXT: mtvsrdd v4, r3, r5 +; P9BE-NEXT: addis r3, r2, .LCPI6_3@toc@ha +; P9BE-NEXT: vsro v4, v4, v0 +; P9BE-NEXT: addi r3, r3, .LCPI6_3@toc@l +; P9BE-NEXT: vsr v4, v4, v1 +; P9BE-NEXT: xxmrgld v4, v5, v4 +; P9BE-NEXT: xxspltd v5, v5, 1 +; P9BE-NEXT: xxlor v4, v5, v4 +; P9BE-NEXT: lxvx v5, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI6_4@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI6_4@toc@l +; P9BE-NEXT: vsrd v4, v4, v5 +; P9BE-NEXT: lxvx v5, 0, r3 +; P9BE-NEXT: xxsel vs0, v4, v2, v5 +; P9BE-NEXT: mfvsrld r3, vs0 +; P9BE-NEXT: mffprd r4, f0 +; P9BE-NEXT: mulli r3, r3, 654 +; P9BE-NEXT: mtvsrdd v4, r4, r3 +; P9BE-NEXT: vsubudm v2, v2, v4 ; P9BE-NEXT: blr ; ; P8LE-LABEL: dont_fold_urem_i64: ; P8LE: # %bb.0: ; P8LE-NEXT: lis r3, 25644 ; P8LE-NEXT: xxswapd vs0, v3 -; P8LE-NEXT: lis r4, -16037 -; P8LE-NEXT: lis r5, 25653 -; P8LE-NEXT: mfvsrd r6, v2 +; P8LE-NEXT: addis r4, r2, .LCPI6_1@toc@ha +; P8LE-NEXT: xxlxor vs2, vs2, vs2 ; P8LE-NEXT: ori r3, r3, 34192 -; P8LE-NEXT: ori r4, r4, 28749 -; P8LE-NEXT: ori r5, r5, 15432 -; P8LE-NEXT: mfvsrd r8, v3 +; P8LE-NEXT: addi r4, r4, .LCPI6_1@toc@l ; P8LE-NEXT: sldi r3, r3, 32 -; P8LE-NEXT: sldi r4, r4, 32 ; P8LE-NEXT: oris r3, r3, 45590 -; P8LE-NEXT: mffprd r7, f0 -; P8LE-NEXT: sldi r5, r5, 32 -; P8LE-NEXT: oris r4, r4, 52170 +; P8LE-NEXT: mffprd r5, f0 +; P8LE-NEXT: lxvd2x vs0, 0, r4 +; P8LE-NEXT: li r4, 0 ; P8LE-NEXT: ori r3, r3, 17097 -; P8LE-NEXT: oris r5, r5, 1603 -; P8LE-NEXT: ori r4, r4, 12109 -; P8LE-NEXT: mulhdu r3, r7, r3 -; P8LE-NEXT: rldicl r9, r6, 63, 1 -; P8LE-NEXT: ori r5, r5, 21445 -; P8LE-NEXT: mulhdu r4, r8, r4 -; P8LE-NEXT: mulhdu r5, r9, r5 -; P8LE-NEXT: sub r9, r7, r3 -; P8LE-NEXT: rldicl r9, r9, 63, 1 -; P8LE-NEXT: rldicl r4, 
r4, 52, 12 -; P8LE-NEXT: add r3, r9, r3 -; P8LE-NEXT: rldicl r5, r5, 57, 7 -; P8LE-NEXT: mulli r4, r4, 5423 -; P8LE-NEXT: rldicl r3, r3, 60, 4 -; P8LE-NEXT: mulli r5, r5, 654 -; P8LE-NEXT: mulli r3, r3, 23 -; P8LE-NEXT: sub r4, r8, r4 -; P8LE-NEXT: sub r5, r6, r5 +; P8LE-NEXT: mtfprd f1, r4 +; P8LE-NEXT: lis r4, -16037 +; P8LE-NEXT: mulhdu r3, r5, r3 +; P8LE-NEXT: mfvsrd r5, v3 +; P8LE-NEXT: xxswapd v4, vs0 +; P8LE-NEXT: mtfprd f0, r3 +; P8LE-NEXT: ori r3, r4, 28749 +; P8LE-NEXT: lis r4, 25653 +; P8LE-NEXT: sldi r3, r3, 32 +; P8LE-NEXT: vsrd v4, v2, v4 +; P8LE-NEXT: ori r4, r4, 15432 +; P8LE-NEXT: xxmrghd v5, vs1, vs0 +; P8LE-NEXT: oris r3, r3, 52170 +; P8LE-NEXT: sldi r4, r4, 32 +; P8LE-NEXT: ori r3, r3, 12109 +; P8LE-NEXT: mfvsrd r6, v4 +; P8LE-NEXT: oris r4, r4, 1603 +; P8LE-NEXT: mulhdu r3, r5, r3 +; P8LE-NEXT: ori r4, r4, 21445 +; P8LE-NEXT: addis r5, r2, .LCPI6_2@toc@ha +; P8LE-NEXT: vsubudm v4, v3, v5 +; P8LE-NEXT: mulhdu r4, r6, r4 +; P8LE-NEXT: addi r5, r5, .LCPI6_2@toc@l +; P8LE-NEXT: xxswapd vs0, v4 +; P8LE-NEXT: mtfprd f1, r3 +; P8LE-NEXT: mffprd r3, f0 +; P8LE-NEXT: xxspltd v4, vs1, 0 +; P8LE-NEXT: lxvd2x vs1, 0, r5 +; P8LE-NEXT: addis r5, r2, .LCPI6_3@toc@ha ; P8LE-NEXT: mtfprd f0, r4 -; P8LE-NEXT: sub r3, r7, r3 -; P8LE-NEXT: li r4, 0 -; P8LE-NEXT: mtfprd f1, r5 -; P8LE-NEXT: mtfprd f2, r3 -; P8LE-NEXT: mtfprd f3, r4 -; P8LE-NEXT: xxmrghd v3, vs0, vs2 -; P8LE-NEXT: xxmrghd v2, vs1, vs3 +; P8LE-NEXT: addis r4, r2, .LCPI6_0@toc@ha +; P8LE-NEXT: rldicl r3, r3, 63, 1 +; P8LE-NEXT: addi r4, r4, .LCPI6_0@toc@l +; P8LE-NEXT: mtfprd f3, r3 +; P8LE-NEXT: xxspltd v0, vs0, 0 +; P8LE-NEXT: lxvd2x vs0, 0, r4 +; P8LE-NEXT: addi r3, r5, .LCPI6_3@toc@l +; P8LE-NEXT: xxmrgld v4, v4, v5 +; P8LE-NEXT: lxvd2x vs4, 0, r3 +; P8LE-NEXT: xxswapd v1, vs1 +; P8LE-NEXT: xxpermdi v5, vs2, vs3, 2 +; P8LE-NEXT: xxmrgld v0, v0, vs2 +; P8LE-NEXT: xxswapd v6, vs0 +; P8LE-NEXT: vaddudm v4, v5, v4 +; P8LE-NEXT: xxswapd v5, vs4 +; P8LE-NEXT: vsrd v0, v0, v1 +; P8LE-NEXT: vsrd v4, v4, v6 +; P8LE-NEXT: xxsel vs0, v0, v2, v5 +; P8LE-NEXT: xxswapd vs1, v4 +; P8LE-NEXT: mfvsrd r3, v4 +; P8LE-NEXT: mffprd r5, f0 +; P8LE-NEXT: xxswapd vs2, vs0 +; P8LE-NEXT: mulli r3, r3, 5423 +; P8LE-NEXT: mffprd r4, f1 +; P8LE-NEXT: mulli r5, r5, 654 +; P8LE-NEXT: mulli r4, r4, 23 +; P8LE-NEXT: mffprd r6, f2 +; P8LE-NEXT: mtfprd f1, r3 +; P8LE-NEXT: mtfprd f0, r6 +; P8LE-NEXT: mtfprd f3, r5 +; P8LE-NEXT: mtfprd f2, r4 +; P8LE-NEXT: xxmrghd v5, vs3, vs0 +; P8LE-NEXT: xxmrghd v4, vs1, vs2 +; P8LE-NEXT: vsubudm v2, v2, v5 +; P8LE-NEXT: vsubudm v3, v3, v4 ; P8LE-NEXT: blr ; ; P8BE-LABEL: dont_fold_urem_i64: @@ -1152,46 +885,73 @@ ; P8BE-NEXT: lis r3, 25644 ; P8BE-NEXT: lis r4, -16037 ; P8BE-NEXT: xxswapd vs0, v3 -; P8BE-NEXT: xxswapd vs1, v2 -; P8BE-NEXT: lis r5, 25653 +; P8BE-NEXT: addis r5, r2, .LCPI6_1@toc@ha +; P8BE-NEXT: mfvsrd r6, v3 ; P8BE-NEXT: ori r3, r3, 34192 ; P8BE-NEXT: ori r4, r4, 28749 -; P8BE-NEXT: mfvsrd r6, v3 -; P8BE-NEXT: ori r5, r5, 15432 +; P8BE-NEXT: addi r5, r5, .LCPI6_1@toc@l ; P8BE-NEXT: sldi r3, r3, 32 ; P8BE-NEXT: sldi r4, r4, 32 +; P8BE-NEXT: lxvd2x v4, 0, r5 ; P8BE-NEXT: oris r3, r3, 45590 -; P8BE-NEXT: sldi r5, r5, 32 -; P8BE-NEXT: mffprd r7, f0 ; P8BE-NEXT: oris r4, r4, 52170 +; P8BE-NEXT: mffprd r7, f0 ; P8BE-NEXT: ori r3, r3, 17097 -; P8BE-NEXT: mffprd r8, f1 -; P8BE-NEXT: oris r5, r5, 1603 ; P8BE-NEXT: ori r4, r4, 12109 ; P8BE-NEXT: mulhdu r3, r6, r3 -; P8BE-NEXT: ori r5, r5, 21445 ; P8BE-NEXT: mulhdu r4, r7, r4 -; P8BE-NEXT: rldicl r9, r8, 63, 1 -; P8BE-NEXT: mulhdu r5, r9, r5 -; P8BE-NEXT: sub r9, 
r6, r3 -; P8BE-NEXT: rldicl r9, r9, 63, 1 -; P8BE-NEXT: rldicl r4, r4, 52, 12 -; P8BE-NEXT: add r3, r9, r3 -; P8BE-NEXT: mulli r4, r4, 5423 -; P8BE-NEXT: rldicl r5, r5, 57, 7 -; P8BE-NEXT: rldicl r3, r3, 60, 4 -; P8BE-NEXT: mulli r5, r5, 654 +; P8BE-NEXT: vsrd v4, v2, v4 +; P8BE-NEXT: xxswapd vs1, v4 +; P8BE-NEXT: mtfprd f0, r3 +; P8BE-NEXT: lis r3, 25653 +; P8BE-NEXT: mtfprd f2, r4 +; P8BE-NEXT: ori r3, r3, 15432 +; P8BE-NEXT: sldi r3, r3, 32 +; P8BE-NEXT: mffprd r4, f1 +; P8BE-NEXT: xxmrghd v4, vs0, vs2 +; P8BE-NEXT: oris r3, r3, 1603 +; P8BE-NEXT: ori r3, r3, 21445 +; P8BE-NEXT: mulhdu r3, r4, r3 +; P8BE-NEXT: vsubudm v5, v3, v4 +; P8BE-NEXT: mfvsrd r4, v5 +; P8BE-NEXT: mtfprd f0, r3 +; P8BE-NEXT: rldicl r3, r4, 63, 1 +; P8BE-NEXT: addis r4, r2, .LCPI6_3@toc@ha +; P8BE-NEXT: mtfprd f1, r3 +; P8BE-NEXT: xxspltd v5, vs0, 0 +; P8BE-NEXT: addis r3, r2, .LCPI6_2@toc@ha +; P8BE-NEXT: addi r4, r4, .LCPI6_3@toc@l +; P8BE-NEXT: xxlxor vs0, vs0, vs0 +; P8BE-NEXT: addi r3, r3, .LCPI6_2@toc@l +; P8BE-NEXT: lxvd2x v6, 0, r4 +; P8BE-NEXT: xxspltd v0, vs1, 0 +; P8BE-NEXT: lxvd2x v1, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI6_0@toc@ha +; P8BE-NEXT: xxmrgld v5, vs0, v5 +; P8BE-NEXT: addi r3, r3, .LCPI6_0@toc@l +; P8BE-NEXT: xxmrgld v0, v0, vs0 +; P8BE-NEXT: vsrd v5, v5, v1 +; P8BE-NEXT: lxvd2x v1, 0, r3 +; P8BE-NEXT: vaddudm v4, v0, v4 +; P8BE-NEXT: xxsel vs0, v5, v2, v6 +; P8BE-NEXT: vsrd v4, v4, v1 +; P8BE-NEXT: xxswapd vs2, vs0 +; P8BE-NEXT: mffprd r6, f0 +; P8BE-NEXT: xxswapd vs1, v4 +; P8BE-NEXT: mfvsrd r3, v4 +; P8BE-NEXT: mtfprd f0, r6 +; P8BE-NEXT: mffprd r5, f2 ; P8BE-NEXT: mulli r3, r3, 23 -; P8BE-NEXT: sub r4, r7, r4 -; P8BE-NEXT: mtfprd f0, r4 -; P8BE-NEXT: sub r4, r8, r5 -; P8BE-NEXT: sub r3, r6, r3 -; P8BE-NEXT: mtfprd f1, r4 -; P8BE-NEXT: li r4, 0 -; P8BE-NEXT: mtfprd f2, r3 -; P8BE-NEXT: mtfprd f3, r4 -; P8BE-NEXT: xxmrghd v3, vs2, vs0 -; P8BE-NEXT: xxmrghd v2, vs3, vs1 +; P8BE-NEXT: mffprd r4, f1 +; P8BE-NEXT: mulli r5, r5, 654 +; P8BE-NEXT: mulli r4, r4, 5423 +; P8BE-NEXT: mtfprd f1, r3 +; P8BE-NEXT: mtfprd f3, r5 +; P8BE-NEXT: mtfprd f2, r4 +; P8BE-NEXT: xxmrghd v5, vs0, vs3 +; P8BE-NEXT: xxmrghd v4, vs1, vs2 +; P8BE-NEXT: vsubudm v2, v2, v5 +; P8BE-NEXT: vsubudm v3, v3, v4 ; P8BE-NEXT: blr %1 = urem <4 x i64> %x, ret <4 x i64> %1 diff --git a/llvm/test/CodeGen/RISCV/div.ll b/llvm/test/CodeGen/RISCV/div.ll --- a/llvm/test/CodeGen/RISCV/div.ll +++ b/llvm/test/CodeGen/RISCV/div.ll @@ -49,8 +49,12 @@ ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw ra, 12(sp) -; RV32I-NEXT: addi a1, zero, 5 -; RV32I-NEXT: call __udivsi3 +; RV32I-NEXT: lui a1, 838861 +; RV32I-NEXT: addi a2, a1, -819 +; RV32I-NEXT: mv a1, zero +; RV32I-NEXT: mv a3, zero +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: srli a0, a1, 2 ; RV32I-NEXT: lw ra, 12(sp) ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret @@ -69,8 +73,12 @@ ; RV64I-NEXT: sd ra, 8(sp) ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: srli a0, a0, 32 -; RV64I-NEXT: addi a1, zero, 5 -; RV64I-NEXT: call __udivdi3 +; RV64I-NEXT: lui a1, 205 +; RV64I-NEXT: addiw a1, a1, -819 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, -819 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 34 ; RV64I-NEXT: ld ra, 8(sp) ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret @@ -79,16 +87,12 @@ ; RV64IM: # %bb.0: ; RV64IM-NEXT: slli a0, a0, 32 ; RV64IM-NEXT: srli a0, a0, 32 -; RV64IM-NEXT: lui a1, 1035469 +; RV64IM-NEXT: lui a1, 205 ; RV64IM-NEXT: addiw a1, a1, -819 ; RV64IM-NEXT: slli a1, a1, 12 ; RV64IM-NEXT: addi a1, a1, -819 -; RV64IM-NEXT: slli a1, a1, 12 
-; RV64IM-NEXT: addi a1, a1, -819 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, -819 -; RV64IM-NEXT: mulhu a0, a0, a1 -; RV64IM-NEXT: srli a0, a0, 2 +; RV64IM-NEXT: mul a0, a0, a1 +; RV64IM-NEXT: srli a0, a0, 34 ; RV64IM-NEXT: ret %1 = udiv i32 %a, 5 ret i32 %1 @@ -157,32 +161,78 @@ define i64 @udiv64_constant(i64 %a) nounwind { ; RV32I-LABEL: udiv64_constant: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -16 -; RV32I-NEXT: sw ra, 12(sp) -; RV32I-NEXT: addi a2, zero, 5 -; RV32I-NEXT: mv a3, zero -; RV32I-NEXT: call __udivdi3 -; RV32I-NEXT: lw ra, 12(sp) -; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: addi sp, sp, -64 +; RV32I-NEXT: sw ra, 60(sp) +; RV32I-NEXT: sw zero, 20(sp) +; RV32I-NEXT: sw zero, 16(sp) +; RV32I-NEXT: sw zero, 36(sp) +; RV32I-NEXT: sw zero, 32(sp) +; RV32I-NEXT: sw a1, 28(sp) +; RV32I-NEXT: sw a0, 24(sp) +; RV32I-NEXT: lui a0, 838861 +; RV32I-NEXT: addi a1, a0, -820 +; RV32I-NEXT: sw a1, 12(sp) +; RV32I-NEXT: addi a3, a0, -819 +; RV32I-NEXT: addi a0, sp, 40 +; RV32I-NEXT: addi a1, sp, 24 +; RV32I-NEXT: addi a2, sp, 8 +; RV32I-NEXT: sw a3, 8(sp) +; RV32I-NEXT: call __multi3 +; RV32I-NEXT: lw a1, 52(sp) +; RV32I-NEXT: lw a0, 48(sp) +; RV32I-NEXT: slli a2, a1, 30 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: srli a1, a1, 2 +; RV32I-NEXT: lw ra, 60(sp) +; RV32I-NEXT: addi sp, sp, 64 ; RV32I-NEXT: ret ; ; RV32IM-LABEL: udiv64_constant: ; RV32IM: # %bb.0: -; RV32IM-NEXT: addi sp, sp, -16 -; RV32IM-NEXT: sw ra, 12(sp) -; RV32IM-NEXT: addi a2, zero, 5 -; RV32IM-NEXT: mv a3, zero -; RV32IM-NEXT: call __udivdi3 -; RV32IM-NEXT: lw ra, 12(sp) -; RV32IM-NEXT: addi sp, sp, 16 +; RV32IM-NEXT: addi sp, sp, -64 +; RV32IM-NEXT: sw ra, 60(sp) +; RV32IM-NEXT: sw zero, 20(sp) +; RV32IM-NEXT: sw zero, 16(sp) +; RV32IM-NEXT: sw zero, 36(sp) +; RV32IM-NEXT: sw zero, 32(sp) +; RV32IM-NEXT: sw a1, 28(sp) +; RV32IM-NEXT: sw a0, 24(sp) +; RV32IM-NEXT: lui a0, 838861 +; RV32IM-NEXT: addi a1, a0, -820 +; RV32IM-NEXT: sw a1, 12(sp) +; RV32IM-NEXT: addi a3, a0, -819 +; RV32IM-NEXT: addi a0, sp, 40 +; RV32IM-NEXT: addi a1, sp, 24 +; RV32IM-NEXT: addi a2, sp, 8 +; RV32IM-NEXT: sw a3, 8(sp) +; RV32IM-NEXT: call __multi3 +; RV32IM-NEXT: lw a1, 52(sp) +; RV32IM-NEXT: lw a0, 48(sp) +; RV32IM-NEXT: slli a2, a1, 30 +; RV32IM-NEXT: srli a0, a0, 2 +; RV32IM-NEXT: or a0, a0, a2 +; RV32IM-NEXT: srli a1, a1, 2 +; RV32IM-NEXT: lw ra, 60(sp) +; RV32IM-NEXT: addi sp, sp, 64 ; RV32IM-NEXT: ret ; ; RV64I-LABEL: udiv64_constant: ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) -; RV64I-NEXT: addi a1, zero, 5 -; RV64I-NEXT: call __udivdi3 +; RV64I-NEXT: lui a1, 1035469 +; RV64I-NEXT: addiw a1, a1, -819 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, -819 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, -819 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a2, a1, -819 +; RV64I-NEXT: mv a1, zero +; RV64I-NEXT: mv a3, zero +; RV64I-NEXT: call __multi3 +; RV64I-NEXT: srli a0, a1, 2 ; RV64I-NEXT: ld ra, 8(sp) ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret @@ -243,8 +293,14 @@ ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw ra, 12(sp) -; RV32I-NEXT: addi a1, zero, 5 -; RV32I-NEXT: call __divsi3 +; RV32I-NEXT: srai a1, a0, 31 +; RV32I-NEXT: lui a2, 419430 +; RV32I-NEXT: addi a2, a2, 1639 +; RV32I-NEXT: mv a3, zero +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: srli a0, a1, 31 +; RV32I-NEXT: srai a1, a1, 1 +; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: lw ra, 12(sp) ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret @@ -264,8 
+320,12 @@ ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) ; RV64I-NEXT: sext.w a0, a0 -; RV64I-NEXT: addi a1, zero, 5 -; RV64I-NEXT: call __divdi3 +; RV64I-NEXT: lui a1, 419430 +; RV64I-NEXT: addiw a1, a1, 1639 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a1, a0, 63 +; RV64I-NEXT: srai a0, a0, 33 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: ld ra, 8(sp) ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret @@ -273,17 +333,11 @@ ; RV64IM-LABEL: sdiv_constant: ; RV64IM: # %bb.0: ; RV64IM-NEXT: sext.w a0, a0 -; RV64IM-NEXT: lui a1, 13107 -; RV64IM-NEXT: addiw a1, a1, 819 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, 819 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, 819 -; RV64IM-NEXT: slli a1, a1, 13 -; RV64IM-NEXT: addi a1, a1, 1639 -; RV64IM-NEXT: mulh a0, a0, a1 +; RV64IM-NEXT: lui a1, 419430 +; RV64IM-NEXT: addiw a1, a1, 1639 +; RV64IM-NEXT: mul a0, a0, a1 ; RV64IM-NEXT: srli a1, a0, 63 -; RV64IM-NEXT: srai a0, a0, 1 +; RV64IM-NEXT: srai a0, a0, 33 ; RV64IM-NEXT: add a0, a0, a1 ; RV64IM-NEXT: ret %1 = sdiv i32 %a, 5 @@ -367,32 +421,90 @@ define i64 @sdiv64_constant(i64 %a) nounwind { ; RV32I-LABEL: sdiv64_constant: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -16 -; RV32I-NEXT: sw ra, 12(sp) -; RV32I-NEXT: addi a2, zero, 5 -; RV32I-NEXT: mv a3, zero -; RV32I-NEXT: call __divdi3 -; RV32I-NEXT: lw ra, 12(sp) -; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: addi sp, sp, -64 +; RV32I-NEXT: sw ra, 60(sp) +; RV32I-NEXT: sw zero, 20(sp) +; RV32I-NEXT: sw zero, 16(sp) +; RV32I-NEXT: sw a1, 28(sp) +; RV32I-NEXT: sw a0, 24(sp) +; RV32I-NEXT: lui a0, 419430 +; RV32I-NEXT: addi a2, a0, 1638 +; RV32I-NEXT: sw a2, 12(sp) +; RV32I-NEXT: addi a0, a0, 1639 +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: srai a3, a1, 31 +; RV32I-NEXT: sw a3, 36(sp) +; RV32I-NEXT: addi a0, sp, 40 +; RV32I-NEXT: addi a1, sp, 24 +; RV32I-NEXT: addi a2, sp, 8 +; RV32I-NEXT: sw a3, 32(sp) +; RV32I-NEXT: call __multi3 +; RV32I-NEXT: lw a1, 52(sp) +; RV32I-NEXT: lw a0, 48(sp) +; RV32I-NEXT: slli a2, a1, 31 +; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: or a2, a0, a2 +; RV32I-NEXT: srli a0, a1, 31 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: sltu a2, a0, a2 +; RV32I-NEXT: srai a1, a1, 1 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: lw ra, 60(sp) +; RV32I-NEXT: addi sp, sp, 64 ; RV32I-NEXT: ret ; ; RV32IM-LABEL: sdiv64_constant: ; RV32IM: # %bb.0: -; RV32IM-NEXT: addi sp, sp, -16 -; RV32IM-NEXT: sw ra, 12(sp) -; RV32IM-NEXT: addi a2, zero, 5 -; RV32IM-NEXT: mv a3, zero -; RV32IM-NEXT: call __divdi3 -; RV32IM-NEXT: lw ra, 12(sp) -; RV32IM-NEXT: addi sp, sp, 16 +; RV32IM-NEXT: addi sp, sp, -64 +; RV32IM-NEXT: sw ra, 60(sp) +; RV32IM-NEXT: sw zero, 20(sp) +; RV32IM-NEXT: sw zero, 16(sp) +; RV32IM-NEXT: sw a1, 28(sp) +; RV32IM-NEXT: sw a0, 24(sp) +; RV32IM-NEXT: lui a0, 419430 +; RV32IM-NEXT: addi a2, a0, 1638 +; RV32IM-NEXT: sw a2, 12(sp) +; RV32IM-NEXT: addi a0, a0, 1639 +; RV32IM-NEXT: sw a0, 8(sp) +; RV32IM-NEXT: srai a3, a1, 31 +; RV32IM-NEXT: sw a3, 36(sp) +; RV32IM-NEXT: addi a0, sp, 40 +; RV32IM-NEXT: addi a1, sp, 24 +; RV32IM-NEXT: addi a2, sp, 8 +; RV32IM-NEXT: sw a3, 32(sp) +; RV32IM-NEXT: call __multi3 +; RV32IM-NEXT: lw a1, 52(sp) +; RV32IM-NEXT: lw a0, 48(sp) +; RV32IM-NEXT: slli a2, a1, 31 +; RV32IM-NEXT: srli a0, a0, 1 +; RV32IM-NEXT: or a2, a0, a2 +; RV32IM-NEXT: srli a0, a1, 31 +; RV32IM-NEXT: add a0, a2, a0 +; RV32IM-NEXT: sltu a2, a0, a2 +; RV32IM-NEXT: srai a1, a1, 1 +; RV32IM-NEXT: add a1, a1, a2 +; RV32IM-NEXT: lw ra, 60(sp) +; RV32IM-NEXT: addi sp, sp, 64 ; RV32IM-NEXT: ret ; ; 
RV64I-LABEL: sdiv64_constant: ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) -; RV64I-NEXT: addi a1, zero, 5 -; RV64I-NEXT: call __divdi3 +; RV64I-NEXT: srai a1, a0, 63 +; RV64I-NEXT: lui a2, 13107 +; RV64I-NEXT: addiw a2, a2, 819 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 819 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 819 +; RV64I-NEXT: slli a2, a2, 13 +; RV64I-NEXT: addi a2, a2, 1639 +; RV64I-NEXT: mv a3, zero +; RV64I-NEXT: call __multi3 +; RV64I-NEXT: srli a0, a1, 63 +; RV64I-NEXT: srai a1, a1, 1 +; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: ld ra, 8(sp) ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/srem-lkk.ll b/llvm/test/CodeGen/RISCV/srem-lkk.ll --- a/llvm/test/CodeGen/RISCV/srem-lkk.ll +++ b/llvm/test/CodeGen/RISCV/srem-lkk.ll @@ -13,8 +13,22 @@ ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: srai a1, a0, 31 +; RV32I-NEXT: lui a0, 706409 +; RV32I-NEXT: addi a2, a0, 389 +; RV32I-NEXT: addi a3, zero, -1 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: add a0, a1, s0 +; RV32I-NEXT: srli a1, a0, 31 +; RV32I-NEXT: srai a0, a0, 6 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s0, a0 +; RV32I-NEXT: lw s0, 8(sp) ; RV32I-NEXT: lw ra, 12(sp) ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret @@ -37,32 +51,39 @@ ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: sd s0, 0(sp) +; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: lui a1, 706409 +; RV64I-NEXT: addiw a1, a1, 389 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: addw a0, a0, s0 +; RV64I-NEXT: srliw a1, a0, 31 +; RV64I-NEXT: srli a0, a0, 6 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: subw a0, s0, a0 +; RV64I-NEXT: ld s0, 0(sp) ; RV64I-NEXT: ld ra, 8(sp) ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; ; RV64IM-LABEL: fold_srem_positive_odd: ; RV64IM: # %bb.0: -; RV64IM-NEXT: sext.w a0, a0 -; RV64IM-NEXT: lui a1, 1045903 -; RV64IM-NEXT: addiw a1, a1, -733 -; RV64IM-NEXT: slli a1, a1, 15 -; RV64IM-NEXT: addi a1, a1, 1035 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, -905 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, -1767 -; RV64IM-NEXT: mulh a1, a0, a1 -; RV64IM-NEXT: add a1, a1, a0 -; RV64IM-NEXT: srli a2, a1, 63 -; RV64IM-NEXT: srai a1, a1, 6 +; RV64IM-NEXT: sext.w a1, a0 +; RV64IM-NEXT: lui a2, 706409 +; RV64IM-NEXT: addiw a2, a2, 389 +; RV64IM-NEXT: mul a1, a1, a2 +; RV64IM-NEXT: srli a1, a1, 32 +; RV64IM-NEXT: addw a1, a1, a0 +; RV64IM-NEXT: srliw a2, a1, 31 +; RV64IM-NEXT: srli a1, a1, 6 ; RV64IM-NEXT: add a1, a1, a2 ; RV64IM-NEXT: addi a2, zero, 95 ; RV64IM-NEXT: mul a1, a1, a2 -; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: subw a0, a0, a1 ; RV64IM-NEXT: ret %1 = srem i32 %x, 95 ret i32 %1 @@ -74,8 +95,21 @@ ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: srai a1, a0, 31 +; RV32I-NEXT: lui a0, 253241 +; RV32I-NEXT: addi a2, a0, -15 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a3, zero +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: srli a0, a1, 31 +; RV32I-NEXT: srai a1, a1, 8 +; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: addi a1, zero, 
1060 -; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s0, a0 +; RV32I-NEXT: lw s0, 8(sp) ; RV32I-NEXT: lw ra, 12(sp) ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret @@ -97,29 +131,35 @@ ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: sd s0, 0(sp) +; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: lui a1, 253241 +; RV64I-NEXT: addiw a1, a1, -15 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a1, a0, 63 +; RV64I-NEXT: srai a0, a0, 40 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: addi a1, zero, 1060 -; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: subw a0, s0, a0 +; RV64I-NEXT: ld s0, 0(sp) ; RV64I-NEXT: ld ra, 8(sp) ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; ; RV64IM-LABEL: fold_srem_positive_even: ; RV64IM: # %bb.0: -; RV64IM-NEXT: sext.w a0, a0 -; RV64IM-NEXT: lui a1, 506482 -; RV64IM-NEXT: addiw a1, a1, -31 -; RV64IM-NEXT: slli a1, a1, 13 -; RV64IM-NEXT: addi a1, a1, 711 -; RV64IM-NEXT: slli a1, a1, 19 -; RV64IM-NEXT: addi a1, a1, 1979 -; RV64IM-NEXT: mulh a1, a0, a1 +; RV64IM-NEXT: sext.w a1, a0 +; RV64IM-NEXT: lui a2, 253241 +; RV64IM-NEXT: addiw a2, a2, -15 +; RV64IM-NEXT: mul a1, a1, a2 ; RV64IM-NEXT: srli a2, a1, 63 -; RV64IM-NEXT: srai a1, a1, 9 +; RV64IM-NEXT: srai a1, a1, 40 ; RV64IM-NEXT: add a1, a1, a2 ; RV64IM-NEXT: addi a2, zero, 1060 ; RV64IM-NEXT: mul a1, a1, a2 -; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: subw a0, a0, a1 ; RV64IM-NEXT: ret %1 = srem i32 %x, 1060 ret i32 %1 @@ -131,8 +171,21 @@ ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: srai a1, a0, 31 +; RV32I-NEXT: lui a0, 677296 +; RV32I-NEXT: addi a2, a0, -91 +; RV32I-NEXT: addi a3, zero, -1 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: srli a0, a1, 31 +; RV32I-NEXT: srai a1, a1, 8 +; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: addi a1, zero, -723 -; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s0, a0 +; RV32I-NEXT: lw s0, 8(sp) ; RV32I-NEXT: lw ra, 12(sp) ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret @@ -154,32 +207,35 @@ ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: sd s0, 0(sp) +; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: lui a1, 677296 +; RV64I-NEXT: addiw a1, a1, -91 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a1, a0, 63 +; RV64I-NEXT: srai a0, a0, 40 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: addi a1, zero, -723 -; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: subw a0, s0, a0 +; RV64I-NEXT: ld s0, 0(sp) ; RV64I-NEXT: ld ra, 8(sp) ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; ; RV64IM-LABEL: fold_srem_negative_odd: ; RV64IM: # %bb.0: -; RV64IM-NEXT: sext.w a0, a0 -; RV64IM-NEXT: lui a1, 4781 -; RV64IM-NEXT: addiw a1, a1, 2045 -; RV64IM-NEXT: slli a1, a1, 13 -; RV64IM-NEXT: addi a1, a1, 1371 -; RV64IM-NEXT: slli a1, a1, 13 -; RV64IM-NEXT: addi a1, a1, -11 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, -1355 -; RV64IM-NEXT: mulh a1, a0, a1 -; RV64IM-NEXT: sub a1, a1, a0 +; RV64IM-NEXT: sext.w a1, a0 +; RV64IM-NEXT: lui a2, 677296 +; RV64IM-NEXT: addiw a2, a2, -91 +; RV64IM-NEXT: mul a1, a1, a2 ; RV64IM-NEXT: srli a2, a1, 63 -; RV64IM-NEXT: srai a1, a1, 9 +; RV64IM-NEXT: srai a1, a1, 40 ; RV64IM-NEXT: add a1, a1, a2 ; RV64IM-NEXT: addi a2, zero, -723 ; RV64IM-NEXT: mul a1, a1, a2 -; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: 
subw a0, a0, a1 ; RV64IM-NEXT: ret %1 = srem i32 %x, -723 ret i32 %1 @@ -191,9 +247,22 @@ ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: srai a1, a0, 31 +; RV32I-NEXT: lui a0, 1036895 +; RV32I-NEXT: addi a2, a0, 999 +; RV32I-NEXT: addi a3, zero, -1 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: srli a0, a1, 31 +; RV32I-NEXT: srai a1, a1, 8 +; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: lui a1, 1048570 ; RV32I-NEXT: addi a1, a1, 1595 -; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s0, a0 +; RV32I-NEXT: lw s0, 8(sp) ; RV32I-NEXT: lw ra, 12(sp) ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret @@ -216,33 +285,37 @@ ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: sd s0, 0(sp) +; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: lui a1, 1036895 +; RV64I-NEXT: addiw a1, a1, 999 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a1, a0, 63 +; RV64I-NEXT: srai a0, a0, 40 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 1048570 ; RV64I-NEXT: addiw a1, a1, 1595 -; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: subw a0, s0, a0 +; RV64I-NEXT: ld s0, 0(sp) ; RV64I-NEXT: ld ra, 8(sp) ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; ; RV64IM-LABEL: fold_srem_negative_even: ; RV64IM: # %bb.0: -; RV64IM-NEXT: sext.w a0, a0 -; RV64IM-NEXT: lui a1, 1036895 -; RV64IM-NEXT: addiw a1, a1, 999 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, 11 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, -523 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, -481 -; RV64IM-NEXT: mulh a1, a0, a1 +; RV64IM-NEXT: sext.w a1, a0 +; RV64IM-NEXT: lui a2, 1036895 +; RV64IM-NEXT: addiw a2, a2, 999 +; RV64IM-NEXT: mul a1, a1, a2 ; RV64IM-NEXT: srli a2, a1, 63 -; RV64IM-NEXT: srai a1, a1, 12 +; RV64IM-NEXT: srai a1, a1, 40 ; RV64IM-NEXT: add a1, a1, a2 ; RV64IM-NEXT: lui a2, 1048570 ; RV64IM-NEXT: addiw a2, a2, 1595 ; RV64IM-NEXT: mul a1, a1, a2 -; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: subw a0, a0, a1 ; RV64IM-NEXT: ret %1 = srem i32 %x, -22981 ret i32 %1 @@ -258,13 +331,21 @@ ; RV32I-NEXT: sw s0, 8(sp) ; RV32I-NEXT: sw s1, 4(sp) ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: srai a1, a0, 31 +; RV32I-NEXT: lui a0, 706409 +; RV32I-NEXT: addi a2, a0, 389 +; RV32I-NEXT: addi a3, zero, -1 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: call __divsi3 -; RV32I-NEXT: add a0, s1, a0 +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: add a0, a1, s0 +; RV32I-NEXT: srli a1, a0, 31 +; RV32I-NEXT: srai a0, a0, 6 +; RV32I-NEXT: add s1, a0, a1 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s0, a0 +; RV32I-NEXT: add a0, a0, s1 ; RV32I-NEXT: lw s1, 4(sp) ; RV32I-NEXT: lw s0, 8(sp) ; RV32I-NEXT: lw ra, 12(sp) @@ -292,15 +373,22 @@ ; RV64I-NEXT: sd ra, 24(sp) ; RV64I-NEXT: sd s0, 16(sp) ; RV64I-NEXT: sd s1, 8(sp) -; RV64I-NEXT: sext.w s0, a0 -; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: lui a1, 706409 +; RV64I-NEXT: addiw a1, a1, 389 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: addw a1, a0, s0 +; RV64I-NEXT: srliw a1, a1, 31 +; RV64I-NEXT: add a0, a0, s0 +; RV64I-NEXT: 
sraiw a0, a0, 6 +; RV64I-NEXT: add s1, a0, a1 ; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: call __divdi3 -; RV64I-NEXT: addw a0, s1, a0 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub a0, s0, a0 +; RV64I-NEXT: addw a0, a0, s1 ; RV64I-NEXT: ld s1, 8(sp) ; RV64I-NEXT: ld s0, 16(sp) ; RV64I-NEXT: ld ra, 24(sp) @@ -310,18 +398,14 @@ ; RV64IM-LABEL: combine_srem_sdiv: ; RV64IM: # %bb.0: ; RV64IM-NEXT: sext.w a1, a0 -; RV64IM-NEXT: lui a2, 1045903 -; RV64IM-NEXT: addiw a2, a2, -733 -; RV64IM-NEXT: slli a2, a2, 15 -; RV64IM-NEXT: addi a2, a2, 1035 -; RV64IM-NEXT: slli a2, a2, 12 -; RV64IM-NEXT: addi a2, a2, -905 -; RV64IM-NEXT: slli a2, a2, 12 -; RV64IM-NEXT: addi a2, a2, -1767 -; RV64IM-NEXT: mulh a2, a1, a2 -; RV64IM-NEXT: add a1, a2, a1 -; RV64IM-NEXT: srli a2, a1, 63 -; RV64IM-NEXT: srai a1, a1, 6 +; RV64IM-NEXT: lui a2, 706409 +; RV64IM-NEXT: addiw a2, a2, 389 +; RV64IM-NEXT: mul a1, a1, a2 +; RV64IM-NEXT: srli a1, a1, 32 +; RV64IM-NEXT: addw a2, a1, a0 +; RV64IM-NEXT: srliw a2, a2, 31 +; RV64IM-NEXT: add a1, a1, a0 +; RV64IM-NEXT: sraiw a1, a1, 6 ; RV64IM-NEXT: add a1, a1, a2 ; RV64IM-NEXT: addi a2, zero, 95 ; RV64IM-NEXT: mul a2, a1, a2 @@ -450,32 +534,127 @@ define i64 @dont_fold_srem_i64(i64 %x) nounwind { ; RV32I-LABEL: dont_fold_srem_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -16 -; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: addi sp, sp, -64 +; RV32I-NEXT: sw ra, 60(sp) +; RV32I-NEXT: sw s0, 56(sp) +; RV32I-NEXT: sw s1, 52(sp) +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: sw zero, 12(sp) +; RV32I-NEXT: sw zero, 8(sp) +; RV32I-NEXT: sw a1, 20(sp) +; RV32I-NEXT: sw a0, 16(sp) +; RV32I-NEXT: lui a0, 342392 +; RV32I-NEXT: addi a0, a0, 668 +; RV32I-NEXT: sw a0, 4(sp) +; RV32I-NEXT: lui a0, 770382 +; RV32I-NEXT: addi a0, a0, 1505 +; RV32I-NEXT: sw a0, 0(sp) +; RV32I-NEXT: srai a3, a1, 31 +; RV32I-NEXT: sw a3, 28(sp) +; RV32I-NEXT: addi a0, sp, 32 +; RV32I-NEXT: addi a1, sp, 16 +; RV32I-NEXT: mv a2, sp +; RV32I-NEXT: sw a3, 24(sp) +; RV32I-NEXT: call __multi3 +; RV32I-NEXT: lw a1, 44(sp) +; RV32I-NEXT: lw a0, 40(sp) +; RV32I-NEXT: slli a2, a1, 27 +; RV32I-NEXT: srli a0, a0, 5 +; RV32I-NEXT: or a2, a0, a2 +; RV32I-NEXT: srli a0, a1, 31 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: sltu a2, a0, a2 +; RV32I-NEXT: srai a1, a1, 5 +; RV32I-NEXT: add a1, a1, a2 ; RV32I-NEXT: addi a2, zero, 98 ; RV32I-NEXT: mv a3, zero -; RV32I-NEXT: call __moddi3 -; RV32I-NEXT: lw ra, 12(sp) -; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: sltu a2, s1, a0 +; RV32I-NEXT: sub a1, s0, a1 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: sub a0, s1, a0 +; RV32I-NEXT: lw s1, 52(sp) +; RV32I-NEXT: lw s0, 56(sp) +; RV32I-NEXT: lw ra, 60(sp) +; RV32I-NEXT: addi sp, sp, 64 ; RV32I-NEXT: ret ; ; RV32IM-LABEL: dont_fold_srem_i64: ; RV32IM: # %bb.0: -; RV32IM-NEXT: addi sp, sp, -16 -; RV32IM-NEXT: sw ra, 12(sp) -; RV32IM-NEXT: addi a2, zero, 98 -; RV32IM-NEXT: mv a3, zero -; RV32IM-NEXT: call __moddi3 -; RV32IM-NEXT: lw ra, 12(sp) -; RV32IM-NEXT: addi sp, sp, 16 +; RV32IM-NEXT: addi sp, sp, -64 +; RV32IM-NEXT: sw ra, 60(sp) +; RV32IM-NEXT: sw s0, 56(sp) +; RV32IM-NEXT: sw s1, 52(sp) +; RV32IM-NEXT: mv s0, a1 +; RV32IM-NEXT: mv s1, a0 +; RV32IM-NEXT: sw zero, 12(sp) +; RV32IM-NEXT: sw zero, 8(sp) +; RV32IM-NEXT: sw a1, 20(sp) +; RV32IM-NEXT: sw a0, 16(sp) +; RV32IM-NEXT: lui a0, 342392 +; RV32IM-NEXT: addi a0, a0, 668 +; RV32IM-NEXT: sw a0, 4(sp) +; RV32IM-NEXT: lui a0, 770382 +; RV32IM-NEXT: addi a0, a0, 1505 +; RV32IM-NEXT: sw a0, 
0(sp) +; RV32IM-NEXT: srai a3, a1, 31 +; RV32IM-NEXT: sw a3, 28(sp) +; RV32IM-NEXT: addi a0, sp, 32 +; RV32IM-NEXT: addi a1, sp, 16 +; RV32IM-NEXT: mv a2, sp +; RV32IM-NEXT: sw a3, 24(sp) +; RV32IM-NEXT: call __multi3 +; RV32IM-NEXT: lw a0, 44(sp) +; RV32IM-NEXT: lw a1, 40(sp) +; RV32IM-NEXT: slli a2, a0, 27 +; RV32IM-NEXT: srli a1, a1, 5 +; RV32IM-NEXT: or a1, a1, a2 +; RV32IM-NEXT: srli a2, a0, 31 +; RV32IM-NEXT: add a2, a1, a2 +; RV32IM-NEXT: sltu a1, a2, a1 +; RV32IM-NEXT: srai a0, a0, 5 +; RV32IM-NEXT: add a0, a0, a1 +; RV32IM-NEXT: addi a1, zero, 98 +; RV32IM-NEXT: mul a0, a0, a1 +; RV32IM-NEXT: mulhu a3, a2, a1 +; RV32IM-NEXT: add a0, a3, a0 +; RV32IM-NEXT: sub a0, s0, a0 +; RV32IM-NEXT: mul a2, a2, a1 +; RV32IM-NEXT: sltu a1, s1, a2 +; RV32IM-NEXT: sub a1, a0, a1 +; RV32IM-NEXT: sub a0, s1, a2 +; RV32IM-NEXT: lw s1, 52(sp) +; RV32IM-NEXT: lw s0, 56(sp) +; RV32IM-NEXT: lw ra, 60(sp) +; RV32IM-NEXT: addi sp, sp, 64 ; RV32IM-NEXT: ret ; ; RV64I-LABEL: dont_fold_srem_i64: ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: sd s0, 0(sp) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: srai a1, a0, 63 +; RV64I-NEXT: lui a0, 2675 +; RV64I-NEXT: addiw a0, a0, -251 +; RV64I-NEXT: slli a0, a0, 13 +; RV64I-NEXT: addi a0, a0, 1839 +; RV64I-NEXT: slli a0, a0, 13 +; RV64I-NEXT: addi a0, a0, 167 +; RV64I-NEXT: slli a0, a0, 13 +; RV64I-NEXT: addi a2, a0, 1505 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a3, zero +; RV64I-NEXT: call __multi3 +; RV64I-NEXT: srli a0, a1, 63 +; RV64I-NEXT: srai a1, a1, 5 +; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: addi a1, zero, 98 -; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub a0, s0, a0 +; RV64I-NEXT: ld s0, 0(sp) ; RV64I-NEXT: ld ra, 8(sp) ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll --- a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll @@ -19,30 +19,66 @@ ; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: sw s4, 8(sp) ; RV32I-NEXT: sw s5, 4(sp) -; RV32I-NEXT: lh s2, 12(a1) -; RV32I-NEXT: lh s3, 8(a1) -; RV32I-NEXT: lh s0, 4(a1) -; RV32I-NEXT: lh a2, 0(a1) -; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lh s0, 0(a1) +; RV32I-NEXT: lui a0, 1048571 +; RV32I-NEXT: addi a1, a0, -905 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: add a0, a0, s0 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 22 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s3, s0, a0 +; RV32I-NEXT: lh s0, 12(s1) +; RV32I-NEXT: lui a0, 8 +; RV32I-NEXT: addi a1, a0, -687 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: sub a0, a0, s0 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 25 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: addi a1, zero, -1003 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s4, s0, a0 +; RV32I-NEXT: lh s0, 4(s1) +; RV32I-NEXT: lui a0, 1048572 +; RV32I-NEXT: addi a1, a0, -529 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srai a0, a0, 21 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: addi 
a1, zero, -124 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s5, s0, a0 +; RV32I-NEXT: lh s0, 8(s1) +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a1, a0, -1421 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: mv s5, a0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srai a0, a0, 18 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: addi a1, zero, 98 -; RV32I-NEXT: mv a0, s3 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: addi a1, zero, -1003 -; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: sh a0, 6(s1) -; RV32I-NEXT: sh s0, 4(s1) -; RV32I-NEXT: sh s5, 2(s1) -; RV32I-NEXT: sh s4, 0(s1) +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s0, a0 +; RV32I-NEXT: sh a0, 4(s2) +; RV32I-NEXT: sh s5, 2(s2) +; RV32I-NEXT: sh s4, 6(s2) +; RV32I-NEXT: sh s3, 0(s2) ; RV32I-NEXT: lw s5, 4(sp) ; RV32I-NEXT: lw s4, 8(sp) ; RV32I-NEXT: lw s3, 12(sp) @@ -55,52 +91,60 @@ ; ; RV32IM-LABEL: fold_srem_vec_1: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lh a6, 12(a1) -; RV32IM-NEXT: lh a3, 8(a1) -; RV32IM-NEXT: lh a4, 0(a1) -; RV32IM-NEXT: lh a1, 4(a1) -; RV32IM-NEXT: lui a5, 706409 -; RV32IM-NEXT: addi a5, a5, 389 -; RV32IM-NEXT: mulh a5, a4, a5 -; RV32IM-NEXT: add a5, a5, a4 -; RV32IM-NEXT: srli a2, a5, 31 -; RV32IM-NEXT: srli a5, a5, 6 -; RV32IM-NEXT: add a2, a5, a2 -; RV32IM-NEXT: addi a5, zero, 95 -; RV32IM-NEXT: mul a2, a2, a5 -; RV32IM-NEXT: sub a2, a4, a2 -; RV32IM-NEXT: lui a4, 507375 -; RV32IM-NEXT: addi a4, a4, 1981 -; RV32IM-NEXT: mulh a4, a1, a4 -; RV32IM-NEXT: sub a4, a4, a1 -; RV32IM-NEXT: srli a5, a4, 31 -; RV32IM-NEXT: srli a4, a4, 6 -; RV32IM-NEXT: add a4, a4, a5 -; RV32IM-NEXT: addi a5, zero, -124 -; RV32IM-NEXT: mul a4, a4, a5 -; RV32IM-NEXT: sub a1, a1, a4 -; RV32IM-NEXT: lui a4, 342392 -; RV32IM-NEXT: addi a4, a4, 669 -; RV32IM-NEXT: mulh a4, a3, a4 -; RV32IM-NEXT: srli a5, a4, 31 -; RV32IM-NEXT: srli a4, a4, 5 -; RV32IM-NEXT: add a4, a4, a5 -; RV32IM-NEXT: addi a5, zero, 98 -; RV32IM-NEXT: mul a4, a4, a5 +; RV32IM-NEXT: lh a2, 0(a1) +; RV32IM-NEXT: lui a3, 1048571 +; RV32IM-NEXT: addi a3, a3, -905 +; RV32IM-NEXT: mul a3, a2, a3 +; RV32IM-NEXT: srli a3, a3, 16 +; RV32IM-NEXT: add a3, a3, a2 +; RV32IM-NEXT: slli a3, a3, 16 +; RV32IM-NEXT: srai a3, a3, 22 +; RV32IM-NEXT: srli a4, a3, 15 +; RV32IM-NEXT: andi a4, a4, 1 +; RV32IM-NEXT: add a3, a3, a4 +; RV32IM-NEXT: addi a4, zero, 95 +; RV32IM-NEXT: mul a3, a3, a4 +; RV32IM-NEXT: lh a4, 12(a1) +; RV32IM-NEXT: sub a6, a2, a3 +; RV32IM-NEXT: lui a3, 8 +; RV32IM-NEXT: addi a3, a3, -687 +; RV32IM-NEXT: mul a3, a4, a3 +; RV32IM-NEXT: srli a3, a3, 16 ; RV32IM-NEXT: sub a3, a3, a4 -; RV32IM-NEXT: lui a4, 780943 -; RV32IM-NEXT: addi a4, a4, 1809 -; RV32IM-NEXT: mulh a4, a6, a4 -; RV32IM-NEXT: srli a5, a4, 31 -; RV32IM-NEXT: srli a4, a4, 8 -; RV32IM-NEXT: add a4, a4, a5 +; RV32IM-NEXT: slli a3, a3, 16 +; RV32IM-NEXT: srai a3, a3, 25 +; RV32IM-NEXT: srli a5, a3, 15 +; RV32IM-NEXT: andi a5, a5, 1 +; RV32IM-NEXT: add a3, a3, a5 ; RV32IM-NEXT: addi a5, zero, -1003 +; RV32IM-NEXT: mul a3, a3, a5 +; RV32IM-NEXT: lh a5, 4(a1) +; RV32IM-NEXT: sub a3, a4, a3 +; RV32IM-NEXT: lui a4, 1048572 +; RV32IM-NEXT: addi a4, a4, -529 +; RV32IM-NEXT: mul a4, a5, a4 +; RV32IM-NEXT: srai a4, a4, 21 +; RV32IM-NEXT: srli a2, a4, 15 +; RV32IM-NEXT: andi a2, a2, 1 +; RV32IM-NEXT: add a2, a4, a2 +; RV32IM-NEXT: addi a4, zero, -124 +; RV32IM-NEXT: mul a2, a2, a4 +; RV32IM-NEXT: lh a1, 8(a1) +; RV32IM-NEXT: sub a2, a5, a2 +; RV32IM-NEXT: lui a4, 1 +; RV32IM-NEXT: addi a4, a4, -1421 +; 
RV32IM-NEXT: mul a4, a1, a4 +; RV32IM-NEXT: srai a4, a4, 18 +; RV32IM-NEXT: srli a5, a4, 15 +; RV32IM-NEXT: andi a5, a5, 1 +; RV32IM-NEXT: add a4, a4, a5 +; RV32IM-NEXT: addi a5, zero, 98 ; RV32IM-NEXT: mul a4, a4, a5 -; RV32IM-NEXT: sub a4, a6, a4 -; RV32IM-NEXT: sh a4, 6(a0) -; RV32IM-NEXT: sh a3, 4(a0) -; RV32IM-NEXT: sh a1, 2(a0) -; RV32IM-NEXT: sh a2, 0(a0) +; RV32IM-NEXT: sub a1, a1, a4 +; RV32IM-NEXT: sh a1, 4(a0) +; RV32IM-NEXT: sh a2, 2(a0) +; RV32IM-NEXT: sh a3, 6(a0) +; RV32IM-NEXT: sh a6, 0(a0) ; RV32IM-NEXT: ret ; ; RV64I-LABEL: fold_srem_vec_1: @@ -113,30 +157,66 @@ ; RV64I-NEXT: sd s3, 24(sp) ; RV64I-NEXT: sd s4, 16(sp) ; RV64I-NEXT: sd s5, 8(sp) -; RV64I-NEXT: lh s2, 24(a1) -; RV64I-NEXT: lh s3, 16(a1) -; RV64I-NEXT: lh s0, 8(a1) -; RV64I-NEXT: lh a2, 0(a1) -; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s2, a0 +; RV64I-NEXT: lh s0, 0(a1) +; RV64I-NEXT: lui a0, 1048571 +; RV64I-NEXT: addiw a1, a0, -905 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: add a0, a0, s0 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 54 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, a2 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: mv s4, a0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s3, s0, a0 +; RV64I-NEXT: lh s0, 24(s1) +; RV64I-NEXT: lui a0, 8 +; RV64I-NEXT: addiw a1, a0, -687 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: sub a0, a0, s0 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 57 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: addi a1, zero, -1003 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s4, s0, a0 +; RV64I-NEXT: lh s0, 8(s1) +; RV64I-NEXT: lui a0, 1048572 +; RV64I-NEXT: addiw a1, a0, -529 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srai a0, a0, 21 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: addi a1, zero, -124 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s5, s0, a0 +; RV64I-NEXT: lh s0, 16(s1) +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: addiw a1, a0, -1421 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: mv s5, a0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srai a0, a0, 18 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: addi a1, zero, 98 -; RV64I-NEXT: mv a0, s3 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: addi a1, zero, -1003 -; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: sh a0, 6(s1) -; RV64I-NEXT: sh s0, 4(s1) -; RV64I-NEXT: sh s5, 2(s1) -; RV64I-NEXT: sh s4, 0(s1) +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub a0, s0, a0 +; RV64I-NEXT: sh a0, 4(s2) +; RV64I-NEXT: sh s5, 2(s2) +; RV64I-NEXT: sh s4, 6(s2) +; RV64I-NEXT: sh s3, 0(s2) ; RV64I-NEXT: ld s5, 8(sp) ; RV64I-NEXT: ld s4, 16(sp) ; RV64I-NEXT: ld s3, 24(sp) @@ -149,76 +229,60 @@ ; ; RV64IM-LABEL: fold_srem_vec_1: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lh a6, 24(a1) -; RV64IM-NEXT: lh a3, 16(a1) -; RV64IM-NEXT: lh a4, 8(a1) -; RV64IM-NEXT: lh a1, 0(a1) -; RV64IM-NEXT: lui a5, 1045903 -; RV64IM-NEXT: addiw a5, a5, -733 -; RV64IM-NEXT: slli a5, a5, 15 -; RV64IM-NEXT: addi a5, a5, 1035 -; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: addi a5, a5, -905 -; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: addi a5, a5, -1767 -; 
RV64IM-NEXT: mulh a5, a1, a5 -; RV64IM-NEXT: add a5, a5, a1 -; RV64IM-NEXT: srli a2, a5, 63 -; RV64IM-NEXT: srli a5, a5, 6 -; RV64IM-NEXT: add a2, a5, a2 -; RV64IM-NEXT: addi a5, zero, 95 -; RV64IM-NEXT: mul a2, a2, a5 -; RV64IM-NEXT: sub a1, a1, a2 -; RV64IM-NEXT: lui a2, 248 -; RV64IM-NEXT: addiw a2, a2, -1057 -; RV64IM-NEXT: slli a2, a2, 15 -; RV64IM-NEXT: addi a2, a2, -1057 -; RV64IM-NEXT: slli a2, a2, 15 -; RV64IM-NEXT: addi a2, a2, -1057 -; RV64IM-NEXT: slli a2, a2, 13 -; RV64IM-NEXT: addi a2, a2, -265 -; RV64IM-NEXT: mulh a2, a4, a2 -; RV64IM-NEXT: sub a2, a2, a4 -; RV64IM-NEXT: srli a5, a2, 63 -; RV64IM-NEXT: srli a2, a2, 6 -; RV64IM-NEXT: add a2, a2, a5 -; RV64IM-NEXT: addi a5, zero, -124 -; RV64IM-NEXT: mul a2, a2, a5 -; RV64IM-NEXT: sub a2, a4, a2 -; RV64IM-NEXT: lui a4, 2675 -; RV64IM-NEXT: addiw a4, a4, -251 -; RV64IM-NEXT: slli a4, a4, 13 -; RV64IM-NEXT: addi a4, a4, 1839 -; RV64IM-NEXT: slli a4, a4, 13 -; RV64IM-NEXT: addi a4, a4, 167 -; RV64IM-NEXT: slli a4, a4, 13 -; RV64IM-NEXT: addi a4, a4, 1505 -; RV64IM-NEXT: mulh a4, a3, a4 -; RV64IM-NEXT: srli a5, a4, 63 -; RV64IM-NEXT: srli a4, a4, 5 -; RV64IM-NEXT: add a4, a4, a5 -; RV64IM-NEXT: addi a5, zero, 98 -; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: lh a2, 0(a1) +; RV64IM-NEXT: lui a3, 1048571 +; RV64IM-NEXT: addiw a3, a3, -905 +; RV64IM-NEXT: mul a3, a2, a3 +; RV64IM-NEXT: srli a3, a3, 16 +; RV64IM-NEXT: add a3, a3, a2 +; RV64IM-NEXT: slli a3, a3, 48 +; RV64IM-NEXT: srai a3, a3, 54 +; RV64IM-NEXT: srli a4, a3, 15 +; RV64IM-NEXT: andi a4, a4, 1 +; RV64IM-NEXT: add a3, a3, a4 +; RV64IM-NEXT: addi a4, zero, 95 +; RV64IM-NEXT: mul a3, a3, a4 +; RV64IM-NEXT: lh a4, 24(a1) +; RV64IM-NEXT: sub a6, a2, a3 +; RV64IM-NEXT: lui a3, 8 +; RV64IM-NEXT: addiw a3, a3, -687 +; RV64IM-NEXT: mul a3, a4, a3 +; RV64IM-NEXT: srli a3, a3, 16 ; RV64IM-NEXT: sub a3, a3, a4 -; RV64IM-NEXT: lui a4, 1040212 -; RV64IM-NEXT: addiw a4, a4, 1977 -; RV64IM-NEXT: slli a4, a4, 12 -; RV64IM-NEXT: addi a4, a4, -1907 -; RV64IM-NEXT: slli a4, a4, 12 -; RV64IM-NEXT: addi a4, a4, -453 -; RV64IM-NEXT: slli a4, a4, 12 -; RV64IM-NEXT: addi a4, a4, -1213 -; RV64IM-NEXT: mulh a4, a6, a4 -; RV64IM-NEXT: srli a5, a4, 63 -; RV64IM-NEXT: srli a4, a4, 7 -; RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: slli a3, a3, 48 +; RV64IM-NEXT: srai a3, a3, 57 +; RV64IM-NEXT: srli a5, a3, 15 +; RV64IM-NEXT: andi a5, a5, 1 +; RV64IM-NEXT: add a3, a3, a5 ; RV64IM-NEXT: addi a5, zero, -1003 +; RV64IM-NEXT: mul a3, a3, a5 +; RV64IM-NEXT: lh a5, 8(a1) +; RV64IM-NEXT: sub a3, a4, a3 +; RV64IM-NEXT: lui a4, 1048572 +; RV64IM-NEXT: addiw a4, a4, -529 +; RV64IM-NEXT: mul a4, a5, a4 +; RV64IM-NEXT: srai a4, a4, 21 +; RV64IM-NEXT: srli a2, a4, 15 +; RV64IM-NEXT: andi a2, a2, 1 +; RV64IM-NEXT: add a2, a4, a2 +; RV64IM-NEXT: addi a4, zero, -124 +; RV64IM-NEXT: mul a2, a2, a4 +; RV64IM-NEXT: lh a1, 16(a1) +; RV64IM-NEXT: sub a2, a5, a2 +; RV64IM-NEXT: lui a4, 1 +; RV64IM-NEXT: addiw a4, a4, -1421 +; RV64IM-NEXT: mul a4, a1, a4 +; RV64IM-NEXT: srai a4, a4, 18 +; RV64IM-NEXT: srli a5, a4, 15 +; RV64IM-NEXT: andi a5, a5, 1 +; RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: addi a5, zero, 98 ; RV64IM-NEXT: mul a4, a4, a5 -; RV64IM-NEXT: sub a4, a6, a4 -; RV64IM-NEXT: sh a4, 6(a0) -; RV64IM-NEXT: sh a3, 4(a0) +; RV64IM-NEXT: sub a1, a1, a4 +; RV64IM-NEXT: sh a1, 4(a0) ; RV64IM-NEXT: sh a2, 2(a0) -; RV64IM-NEXT: sh a1, 0(a0) +; RV64IM-NEXT: sh a3, 6(a0) +; RV64IM-NEXT: sh a6, 0(a0) ; RV64IM-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -235,30 +299,72 @@ ; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: sw 
s4, 8(sp) ; RV32I-NEXT: sw s5, 4(sp) -; RV32I-NEXT: lh s2, 12(a1) -; RV32I-NEXT: lh s3, 8(a1) -; RV32I-NEXT: lh s0, 4(a1) -; RV32I-NEXT: lh a2, 0(a1) -; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: sw s6, 0(sp) +; RV32I-NEXT: mv s3, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lh s1, 0(a1) +; RV32I-NEXT: lui a0, 1048571 +; RV32I-NEXT: addi s0, a0, -905 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: add a0, a0, s1 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 22 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s4, s1, a0 +; RV32I-NEXT: lh s1, 4(s3) +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: add a0, a0, s1 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 22 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: mv s5, a0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s5, s1, a0 +; RV32I-NEXT: lh s1, 8(s3) +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: add a0, a0, s1 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 22 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: mv a0, s3 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s6, s1, a0 +; RV32I-NEXT: lh s1, 12(s3) +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: add a0, a0, s1 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 22 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: sh a0, 6(s1) -; RV32I-NEXT: sh s0, 4(s1) -; RV32I-NEXT: sh s5, 2(s1) -; RV32I-NEXT: sh s4, 0(s1) +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s1, a0 +; RV32I-NEXT: sh a0, 6(s2) +; RV32I-NEXT: sh s6, 4(s2) +; RV32I-NEXT: sh s5, 2(s2) +; RV32I-NEXT: sh s4, 0(s2) +; RV32I-NEXT: lw s6, 0(sp) ; RV32I-NEXT: lw s5, 4(sp) ; RV32I-NEXT: lw s4, 8(sp) ; RV32I-NEXT: lw s3, 12(sp) @@ -271,45 +377,57 @@ ; ; RV32IM-LABEL: fold_srem_vec_2: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lh a6, 12(a1) -; RV32IM-NEXT: lh a3, 8(a1) -; RV32IM-NEXT: lh a4, 0(a1) -; RV32IM-NEXT: lh a1, 4(a1) -; RV32IM-NEXT: lui a5, 706409 -; RV32IM-NEXT: addi a5, a5, 389 -; RV32IM-NEXT: mulh a2, a4, a5 -; RV32IM-NEXT: add a2, a2, a4 -; RV32IM-NEXT: srli a7, a2, 31 -; RV32IM-NEXT: srli a2, a2, 6 -; RV32IM-NEXT: add a2, a2, a7 -; RV32IM-NEXT: addi a7, zero, 95 -; RV32IM-NEXT: mul a2, a2, a7 -; RV32IM-NEXT: sub t0, a4, a2 -; RV32IM-NEXT: mulh a4, a1, a5 -; RV32IM-NEXT: add a4, a4, a1 -; RV32IM-NEXT: srli a2, a4, 31 -; RV32IM-NEXT: srli a4, a4, 6 -; RV32IM-NEXT: add a2, a4, a2 -; RV32IM-NEXT: mul a2, a2, a7 -; RV32IM-NEXT: sub a1, a1, a2 -; RV32IM-NEXT: mulh a2, a3, a5 -; RV32IM-NEXT: add a2, a2, a3 -; RV32IM-NEXT: srli a4, a2, 31 -; RV32IM-NEXT: srli a2, a2, 6 -; RV32IM-NEXT: add a2, a2, a4 -; RV32IM-NEXT: mul a2, a2, a7 -; RV32IM-NEXT: sub a2, a3, a2 -; RV32IM-NEXT: 
mulh a3, a6, a5 -; RV32IM-NEXT: add a3, a3, a6 -; RV32IM-NEXT: srli a4, a3, 31 -; RV32IM-NEXT: srli a3, a3, 6 +; RV32IM-NEXT: lh a2, 0(a1) +; RV32IM-NEXT: lui a3, 1048571 +; RV32IM-NEXT: addi a3, a3, -905 +; RV32IM-NEXT: mul a4, a2, a3 +; RV32IM-NEXT: srli a4, a4, 16 +; RV32IM-NEXT: add a4, a4, a2 +; RV32IM-NEXT: slli a4, a4, 16 +; RV32IM-NEXT: srai a4, a4, 22 +; RV32IM-NEXT: srli a5, a4, 15 +; RV32IM-NEXT: andi a5, a5, 1 +; RV32IM-NEXT: add a4, a4, a5 +; RV32IM-NEXT: lh a5, 4(a1) +; RV32IM-NEXT: addi a6, zero, 95 +; RV32IM-NEXT: mul a4, a4, a6 +; RV32IM-NEXT: sub a7, a2, a4 +; RV32IM-NEXT: mul a4, a5, a3 +; RV32IM-NEXT: srli a4, a4, 16 +; RV32IM-NEXT: add a4, a4, a5 +; RV32IM-NEXT: slli a4, a4, 16 +; RV32IM-NEXT: srai a4, a4, 22 +; RV32IM-NEXT: srli a2, a4, 15 +; RV32IM-NEXT: andi t0, a2, 1 +; RV32IM-NEXT: lh a2, 8(a1) +; RV32IM-NEXT: add a4, a4, t0 +; RV32IM-NEXT: mul a4, a4, a6 +; RV32IM-NEXT: sub t0, a5, a4 +; RV32IM-NEXT: mul a5, a2, a3 +; RV32IM-NEXT: srli a5, a5, 16 +; RV32IM-NEXT: add a5, a5, a2 +; RV32IM-NEXT: slli a5, a5, 16 +; RV32IM-NEXT: srai a5, a5, 22 +; RV32IM-NEXT: srli a4, a5, 15 +; RV32IM-NEXT: andi a4, a4, 1 +; RV32IM-NEXT: lh a1, 12(a1) +; RV32IM-NEXT: add a4, a5, a4 +; RV32IM-NEXT: mul a4, a4, a6 +; RV32IM-NEXT: sub a2, a2, a4 +; RV32IM-NEXT: mul a3, a1, a3 +; RV32IM-NEXT: srli a3, a3, 16 +; RV32IM-NEXT: add a3, a3, a1 +; RV32IM-NEXT: slli a3, a3, 16 +; RV32IM-NEXT: srai a3, a3, 22 +; RV32IM-NEXT: srli a4, a3, 15 +; RV32IM-NEXT: andi a4, a4, 1 ; RV32IM-NEXT: add a3, a3, a4 -; RV32IM-NEXT: mul a3, a3, a7 -; RV32IM-NEXT: sub a3, a6, a3 -; RV32IM-NEXT: sh a3, 6(a0) +; RV32IM-NEXT: mul a3, a3, a6 +; RV32IM-NEXT: sub a1, a1, a3 +; RV32IM-NEXT: sh a1, 6(a0) ; RV32IM-NEXT: sh a2, 4(a0) -; RV32IM-NEXT: sh a1, 2(a0) -; RV32IM-NEXT: sh t0, 0(a0) +; RV32IM-NEXT: sh t0, 2(a0) +; RV32IM-NEXT: sh a7, 0(a0) ; RV32IM-NEXT: ret ; ; RV64I-LABEL: fold_srem_vec_2: @@ -322,30 +440,72 @@ ; RV64I-NEXT: sd s3, 24(sp) ; RV64I-NEXT: sd s4, 16(sp) ; RV64I-NEXT: sd s5, 8(sp) -; RV64I-NEXT: lh s2, 24(a1) -; RV64I-NEXT: lh s3, 16(a1) -; RV64I-NEXT: lh s0, 8(a1) -; RV64I-NEXT: lh a2, 0(a1) -; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: sd s6, 0(sp) +; RV64I-NEXT: mv s3, a1 +; RV64I-NEXT: mv s2, a0 +; RV64I-NEXT: lh s1, 0(a1) +; RV64I-NEXT: lui a0, 1048571 +; RV64I-NEXT: addiw s0, a0, -905 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: add a0, a0, s1 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 54 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, a2 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: mv s4, a0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s4, s1, a0 +; RV64I-NEXT: lh s1, 8(s3) +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: add a0, a0, s1 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 54 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: mv s5, a0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s5, s1, a0 +; RV64I-NEXT: lh s1, 16(s3) +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: add a0, a0, s1 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 54 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, 
a1, 1 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, s3 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s6, s1, a0 +; RV64I-NEXT: lh s1, 24(s3) +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: add a0, a0, s1 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 54 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: sh a0, 6(s1) -; RV64I-NEXT: sh s0, 4(s1) -; RV64I-NEXT: sh s5, 2(s1) -; RV64I-NEXT: sh s4, 0(s1) +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub a0, s1, a0 +; RV64I-NEXT: sh a0, 6(s2) +; RV64I-NEXT: sh s6, 4(s2) +; RV64I-NEXT: sh s5, 2(s2) +; RV64I-NEXT: sh s4, 0(s2) +; RV64I-NEXT: ld s6, 0(sp) ; RV64I-NEXT: ld s5, 8(sp) ; RV64I-NEXT: ld s4, 16(sp) ; RV64I-NEXT: ld s3, 24(sp) @@ -358,51 +518,57 @@ ; ; RV64IM-LABEL: fold_srem_vec_2: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lh a6, 24(a1) -; RV64IM-NEXT: lh a7, 16(a1) -; RV64IM-NEXT: lh a4, 8(a1) -; RV64IM-NEXT: lh a1, 0(a1) -; RV64IM-NEXT: lui a5, 1045903 -; RV64IM-NEXT: addiw a5, a5, -733 -; RV64IM-NEXT: slli a5, a5, 15 -; RV64IM-NEXT: addi a5, a5, 1035 -; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: addi a5, a5, -905 -; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: addi a5, a5, -1767 -; RV64IM-NEXT: mulh a2, a1, a5 -; RV64IM-NEXT: add a2, a2, a1 -; RV64IM-NEXT: srli a3, a2, 63 -; RV64IM-NEXT: srli a2, a2, 6 -; RV64IM-NEXT: add a2, a2, a3 -; RV64IM-NEXT: addi a3, zero, 95 -; RV64IM-NEXT: mul a2, a2, a3 -; RV64IM-NEXT: sub t0, a1, a2 -; RV64IM-NEXT: mulh a2, a4, a5 -; RV64IM-NEXT: add a2, a2, a4 -; RV64IM-NEXT: srli a1, a2, 63 -; RV64IM-NEXT: srli a2, a2, 6 -; RV64IM-NEXT: add a1, a2, a1 -; RV64IM-NEXT: mul a1, a1, a3 -; RV64IM-NEXT: sub a1, a4, a1 -; RV64IM-NEXT: mulh a2, a7, a5 -; RV64IM-NEXT: add a2, a2, a7 -; RV64IM-NEXT: srli a4, a2, 63 -; RV64IM-NEXT: srli a2, a2, 6 -; RV64IM-NEXT: add a2, a2, a4 -; RV64IM-NEXT: mul a2, a2, a3 -; RV64IM-NEXT: sub a2, a7, a2 -; RV64IM-NEXT: mulh a4, a6, a5 -; RV64IM-NEXT: add a4, a4, a6 -; RV64IM-NEXT: srli a5, a4, 63 -; RV64IM-NEXT: srli a4, a4, 6 +; RV64IM-NEXT: lh a2, 0(a1) +; RV64IM-NEXT: lui a3, 1048571 +; RV64IM-NEXT: addiw a3, a3, -905 +; RV64IM-NEXT: mul a4, a2, a3 +; RV64IM-NEXT: srli a4, a4, 16 +; RV64IM-NEXT: add a4, a4, a2 +; RV64IM-NEXT: slli a4, a4, 48 +; RV64IM-NEXT: srai a4, a4, 54 +; RV64IM-NEXT: srli a5, a4, 15 +; RV64IM-NEXT: andi a5, a5, 1 ; RV64IM-NEXT: add a4, a4, a5 -; RV64IM-NEXT: mul a3, a4, a3 -; RV64IM-NEXT: sub a3, a6, a3 -; RV64IM-NEXT: sh a3, 6(a0) +; RV64IM-NEXT: lh a5, 8(a1) +; RV64IM-NEXT: addi a6, zero, 95 +; RV64IM-NEXT: mul a4, a4, a6 +; RV64IM-NEXT: sub a7, a2, a4 +; RV64IM-NEXT: mul a4, a5, a3 +; RV64IM-NEXT: srli a4, a4, 16 +; RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: slli a4, a4, 48 +; RV64IM-NEXT: srai a4, a4, 54 +; RV64IM-NEXT: srli a2, a4, 15 +; RV64IM-NEXT: andi t0, a2, 1 +; RV64IM-NEXT: lh a2, 16(a1) +; RV64IM-NEXT: add a4, a4, t0 +; RV64IM-NEXT: mul a4, a4, a6 +; RV64IM-NEXT: sub t0, a5, a4 +; RV64IM-NEXT: mul a5, a2, a3 +; RV64IM-NEXT: srli a5, a5, 16 +; RV64IM-NEXT: add a5, a5, a2 +; RV64IM-NEXT: slli a5, a5, 48 +; RV64IM-NEXT: srai a5, a5, 54 +; RV64IM-NEXT: srli a4, a5, 15 +; RV64IM-NEXT: andi a4, a4, 1 +; RV64IM-NEXT: lh a1, 24(a1) +; RV64IM-NEXT: add a4, a5, a4 +; RV64IM-NEXT: mul a4, a4, a6 +; RV64IM-NEXT: sub a2, a2, a4 +; RV64IM-NEXT: 
mul a3, a1, a3 +; RV64IM-NEXT: srli a3, a3, 16 +; RV64IM-NEXT: add a3, a3, a1 +; RV64IM-NEXT: slli a3, a3, 48 +; RV64IM-NEXT: srai a3, a3, 54 +; RV64IM-NEXT: srli a4, a3, 15 +; RV64IM-NEXT: andi a4, a4, 1 +; RV64IM-NEXT: add a3, a3, a4 +; RV64IM-NEXT: mul a3, a3, a6 +; RV64IM-NEXT: sub a1, a1, a3 +; RV64IM-NEXT: sh a1, 6(a0) ; RV64IM-NEXT: sh a2, 4(a0) -; RV64IM-NEXT: sh a1, 2(a0) -; RV64IM-NEXT: sh t0, 0(a0) +; RV64IM-NEXT: sh t0, 2(a0) +; RV64IM-NEXT: sh a7, 0(a0) ; RV64IM-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -425,50 +591,78 @@ ; RV32I-NEXT: sw s7, 12(sp) ; RV32I-NEXT: sw s8, 8(sp) ; RV32I-NEXT: sw s9, 4(sp) -; RV32I-NEXT: lh s2, 0(a1) -; RV32I-NEXT: lh s3, 4(a1) -; RV32I-NEXT: lh s4, 8(a1) +; RV32I-NEXT: mv s4, a1 +; RV32I-NEXT: mv s3, a0 ; RV32I-NEXT: lh s1, 12(a1) -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: lui a0, 1048571 +; RV32I-NEXT: addi s0, a0, -905 ; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: mv s5, a0 -; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: mv a0, s4 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: mv s6, a0 -; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: mv a0, s3 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: mv s7, a0 +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: add a0, a0, s1 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 22 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add s2, a0, a1 ; RV32I-NEXT: addi a1, zero, 95 ; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: mv s8, a0 -; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s7, s1, a0 +; RV32I-NEXT: lh s1, 8(s4) ; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: call __divsi3 -; RV32I-NEXT: mv s9, a0 +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: add a0, a0, s1 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 22 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add s5, a0, a1 ; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: mv a0, s4 -; RV32I-NEXT: call __divsi3 -; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: mv a0, s5 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s8, s1, a0 +; RV32I-NEXT: lh s1, 4(s4) +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: add a0, a0, s1 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 22 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add s6, a0, a1 ; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: mv a0, s3 -; RV32I-NEXT: call __divsi3 -; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: mv a0, s6 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s9, s1, a0 +; RV32I-NEXT: lh s1, 0(s4) +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: add a0, a0, s1 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 22 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add s0, a0, a1 ; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: call __divsi3 -; RV32I-NEXT: add a0, s8, a0 -; RV32I-NEXT: add a1, s7, s1 -; RV32I-NEXT: add a2, s6, s4 -; RV32I-NEXT: add a3, s5, s9 -; RV32I-NEXT: sh a3, 6(s0) -; RV32I-NEXT: sh a2, 4(s0) -; RV32I-NEXT: sh a1, 2(s0) -; RV32I-NEXT: sh a0, 0(s0) +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s1, a0 +; RV32I-NEXT: add a0, a0, s0 +; RV32I-NEXT: add a1, 
s9, s6 +; RV32I-NEXT: add a2, s8, s5 +; RV32I-NEXT: add a3, s7, s2 +; RV32I-NEXT: sh a3, 6(s3) +; RV32I-NEXT: sh a2, 4(s3) +; RV32I-NEXT: sh a1, 2(s3) +; RV32I-NEXT: sh a0, 0(s3) ; RV32I-NEXT: lw s9, 4(sp) ; RV32I-NEXT: lw s8, 8(sp) ; RV32I-NEXT: lw s7, 12(sp) @@ -485,49 +679,61 @@ ; ; RV32IM-LABEL: combine_srem_sdiv: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lh a6, 0(a1) -; RV32IM-NEXT: lh a3, 4(a1) -; RV32IM-NEXT: lh a4, 12(a1) -; RV32IM-NEXT: lh a1, 8(a1) -; RV32IM-NEXT: lui a5, 706409 -; RV32IM-NEXT: addi a5, a5, 389 -; RV32IM-NEXT: mulh a2, a4, a5 -; RV32IM-NEXT: add a2, a2, a4 -; RV32IM-NEXT: srli a7, a2, 31 -; RV32IM-NEXT: srai a2, a2, 6 -; RV32IM-NEXT: add t0, a2, a7 -; RV32IM-NEXT: addi a7, zero, 95 -; RV32IM-NEXT: mul a2, t0, a7 -; RV32IM-NEXT: sub t1, a4, a2 -; RV32IM-NEXT: mulh a4, a1, a5 -; RV32IM-NEXT: add a4, a4, a1 -; RV32IM-NEXT: srli a2, a4, 31 -; RV32IM-NEXT: srai a4, a4, 6 -; RV32IM-NEXT: add a2, a4, a2 -; RV32IM-NEXT: mul a4, a2, a7 -; RV32IM-NEXT: sub t2, a1, a4 -; RV32IM-NEXT: mulh a4, a3, a5 -; RV32IM-NEXT: add a4, a4, a3 -; RV32IM-NEXT: srli a1, a4, 31 -; RV32IM-NEXT: srai a4, a4, 6 -; RV32IM-NEXT: add a1, a4, a1 -; RV32IM-NEXT: mul a4, a1, a7 -; RV32IM-NEXT: sub a3, a3, a4 -; RV32IM-NEXT: mulh a4, a6, a5 -; RV32IM-NEXT: add a4, a4, a6 -; RV32IM-NEXT: srli a5, a4, 31 -; RV32IM-NEXT: srai a4, a4, 6 +; RV32IM-NEXT: lh a2, 12(a1) +; RV32IM-NEXT: lui a3, 1048571 +; RV32IM-NEXT: addi a3, a3, -905 +; RV32IM-NEXT: mul a4, a2, a3 +; RV32IM-NEXT: srli a4, a4, 16 +; RV32IM-NEXT: add a4, a4, a2 +; RV32IM-NEXT: slli a4, a4, 16 +; RV32IM-NEXT: srai a4, a4, 22 +; RV32IM-NEXT: srli a5, a4, 15 +; RV32IM-NEXT: andi a5, a5, 1 +; RV32IM-NEXT: add a7, a4, a5 +; RV32IM-NEXT: lh a5, 8(a1) +; RV32IM-NEXT: addi a6, zero, 95 +; RV32IM-NEXT: mul a4, a7, a6 +; RV32IM-NEXT: sub t0, a2, a4 +; RV32IM-NEXT: mul a4, a5, a3 +; RV32IM-NEXT: srli a4, a4, 16 ; RV32IM-NEXT: add a4, a4, a5 -; RV32IM-NEXT: mul a5, a4, a7 -; RV32IM-NEXT: sub a5, a6, a5 +; RV32IM-NEXT: slli a4, a4, 16 +; RV32IM-NEXT: srai a4, a4, 22 +; RV32IM-NEXT: srli a2, a4, 15 +; RV32IM-NEXT: andi t1, a2, 1 +; RV32IM-NEXT: lh a2, 4(a1) +; RV32IM-NEXT: add t1, a4, t1 +; RV32IM-NEXT: mul a4, t1, a6 +; RV32IM-NEXT: sub t2, a5, a4 +; RV32IM-NEXT: mul a5, a2, a3 +; RV32IM-NEXT: srli a5, a5, 16 +; RV32IM-NEXT: add a5, a5, a2 +; RV32IM-NEXT: slli a5, a5, 16 +; RV32IM-NEXT: srai a5, a5, 22 +; RV32IM-NEXT: srli a4, a5, 15 +; RV32IM-NEXT: andi a4, a4, 1 +; RV32IM-NEXT: lh a1, 0(a1) ; RV32IM-NEXT: add a4, a5, a4 -; RV32IM-NEXT: add a1, a3, a1 -; RV32IM-NEXT: add a2, t2, a2 -; RV32IM-NEXT: add a3, t1, t0 -; RV32IM-NEXT: sh a3, 6(a0) -; RV32IM-NEXT: sh a2, 4(a0) -; RV32IM-NEXT: sh a1, 2(a0) -; RV32IM-NEXT: sh a4, 0(a0) +; RV32IM-NEXT: mul a5, a4, a6 +; RV32IM-NEXT: sub a2, a2, a5 +; RV32IM-NEXT: mul a3, a1, a3 +; RV32IM-NEXT: srli a3, a3, 16 +; RV32IM-NEXT: add a3, a3, a1 +; RV32IM-NEXT: slli a3, a3, 16 +; RV32IM-NEXT: srai a3, a3, 22 +; RV32IM-NEXT: srli a5, a3, 15 +; RV32IM-NEXT: andi a5, a5, 1 +; RV32IM-NEXT: add a3, a3, a5 +; RV32IM-NEXT: mul a5, a3, a6 +; RV32IM-NEXT: sub a1, a1, a5 +; RV32IM-NEXT: add a1, a1, a3 +; RV32IM-NEXT: add a2, a2, a4 +; RV32IM-NEXT: add a3, t2, t1 +; RV32IM-NEXT: add a4, t0, a7 +; RV32IM-NEXT: sh a4, 6(a0) +; RV32IM-NEXT: sh a3, 4(a0) +; RV32IM-NEXT: sh a2, 2(a0) +; RV32IM-NEXT: sh a1, 0(a0) ; RV32IM-NEXT: ret ; ; RV64I-LABEL: combine_srem_sdiv: @@ -544,50 +750,78 @@ ; RV64I-NEXT: sd s7, 24(sp) ; RV64I-NEXT: sd s8, 16(sp) ; RV64I-NEXT: sd s9, 8(sp) -; RV64I-NEXT: lh s2, 0(a1) -; RV64I-NEXT: lh s3, 8(a1) -; RV64I-NEXT: lh 
s4, 16(a1) +; RV64I-NEXT: mv s4, a1 +; RV64I-NEXT: mv s3, a0 ; RV64I-NEXT: lh s1, 24(a1) -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: lui a0, 1048571 +; RV64I-NEXT: addiw s0, a0, -905 ; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: mv s5, a0 -; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, s4 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: mv s6, a0 -; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, s3 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: mv s7, a0 +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: add a0, a0, s1 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 54 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: add s2, a0, a1 ; RV64I-NEXT: addi a1, zero, 95 ; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: mv s8, a0 -; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s7, s1, a0 +; RV64I-NEXT: lh s1, 16(s4) ; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: call __divdi3 -; RV64I-NEXT: mv s9, a0 +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: add a0, a0, s1 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 54 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: add s5, a0, a1 ; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, s4 -; RV64I-NEXT: call __divdi3 -; RV64I-NEXT: mv s4, a0 +; RV64I-NEXT: mv a0, s5 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s8, s1, a0 +; RV64I-NEXT: lh s1, 8(s4) +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: add a0, a0, s1 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 54 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: add s6, a0, a1 ; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, s3 -; RV64I-NEXT: call __divdi3 -; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: mv a0, s6 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s9, s1, a0 +; RV64I-NEXT: lh s1, 0(s4) +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: add a0, a0, s1 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 54 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: add s0, a0, a1 ; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: call __divdi3 -; RV64I-NEXT: add a0, s8, a0 -; RV64I-NEXT: add a1, s7, s1 -; RV64I-NEXT: add a2, s6, s4 -; RV64I-NEXT: add a3, s5, s9 -; RV64I-NEXT: sh a3, 6(s0) -; RV64I-NEXT: sh a2, 4(s0) -; RV64I-NEXT: sh a1, 2(s0) -; RV64I-NEXT: sh a0, 0(s0) +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub a0, s1, a0 +; RV64I-NEXT: add a0, a0, s0 +; RV64I-NEXT: add a1, s9, s6 +; RV64I-NEXT: add a2, s8, s5 +; RV64I-NEXT: add a3, s7, s2 +; RV64I-NEXT: sh a3, 6(s3) +; RV64I-NEXT: sh a2, 4(s3) +; RV64I-NEXT: sh a1, 2(s3) +; RV64I-NEXT: sh a0, 0(s3) ; RV64I-NEXT: ld s9, 8(sp) ; RV64I-NEXT: ld s8, 16(sp) ; RV64I-NEXT: ld s7, 24(sp) @@ -604,55 +838,61 @@ ; ; RV64IM-LABEL: combine_srem_sdiv: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lh a6, 0(a1) -; RV64IM-NEXT: lh a7, 8(a1) -; RV64IM-NEXT: lh a4, 16(a1) -; RV64IM-NEXT: lh a1, 24(a1) -; RV64IM-NEXT: lui a5, 1045903 -; RV64IM-NEXT: addiw a5, a5, -733 -; RV64IM-NEXT: slli a5, a5, 15 -; RV64IM-NEXT: addi a5, a5, 1035 -; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: addi a5, a5, -905 -; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: 
addi a5, a5, -1767 -; RV64IM-NEXT: mulh a2, a1, a5 -; RV64IM-NEXT: add a2, a2, a1 -; RV64IM-NEXT: srli a3, a2, 63 -; RV64IM-NEXT: srai a2, a2, 6 -; RV64IM-NEXT: add t3, a2, a3 -; RV64IM-NEXT: addi t0, zero, 95 -; RV64IM-NEXT: mul a3, t3, t0 -; RV64IM-NEXT: sub t1, a1, a3 -; RV64IM-NEXT: mulh a3, a4, a5 -; RV64IM-NEXT: add a3, a3, a4 -; RV64IM-NEXT: srli a1, a3, 63 -; RV64IM-NEXT: srai a3, a3, 6 -; RV64IM-NEXT: add a1, a3, a1 -; RV64IM-NEXT: mul a3, a1, t0 -; RV64IM-NEXT: sub t2, a4, a3 -; RV64IM-NEXT: mulh a4, a7, a5 -; RV64IM-NEXT: add a4, a4, a7 -; RV64IM-NEXT: srli a3, a4, 63 -; RV64IM-NEXT: srai a4, a4, 6 -; RV64IM-NEXT: add a3, a4, a3 -; RV64IM-NEXT: mul a4, a3, t0 -; RV64IM-NEXT: sub a4, a7, a4 -; RV64IM-NEXT: mulh a5, a6, a5 -; RV64IM-NEXT: add a5, a5, a6 -; RV64IM-NEXT: srli a2, a5, 63 -; RV64IM-NEXT: srai a5, a5, 6 -; RV64IM-NEXT: add a2, a5, a2 -; RV64IM-NEXT: mul a5, a2, t0 -; RV64IM-NEXT: sub a5, a6, a5 -; RV64IM-NEXT: add a2, a5, a2 -; RV64IM-NEXT: add a3, a4, a3 -; RV64IM-NEXT: add a1, t2, a1 -; RV64IM-NEXT: add a4, t1, t3 +; RV64IM-NEXT: lh a2, 24(a1) +; RV64IM-NEXT: lui a3, 1048571 +; RV64IM-NEXT: addiw a3, a3, -905 +; RV64IM-NEXT: mul a4, a2, a3 +; RV64IM-NEXT: srli a4, a4, 16 +; RV64IM-NEXT: add a4, a4, a2 +; RV64IM-NEXT: slli a4, a4, 48 +; RV64IM-NEXT: srai a4, a4, 54 +; RV64IM-NEXT: srli a5, a4, 15 +; RV64IM-NEXT: andi a5, a5, 1 +; RV64IM-NEXT: add a7, a4, a5 +; RV64IM-NEXT: lh a5, 16(a1) +; RV64IM-NEXT: addi a6, zero, 95 +; RV64IM-NEXT: mul a4, a7, a6 +; RV64IM-NEXT: sub t0, a2, a4 +; RV64IM-NEXT: mul a4, a5, a3 +; RV64IM-NEXT: srli a4, a4, 16 +; RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: slli a4, a4, 48 +; RV64IM-NEXT: srai a4, a4, 54 +; RV64IM-NEXT: srli a2, a4, 15 +; RV64IM-NEXT: andi t1, a2, 1 +; RV64IM-NEXT: lh a2, 8(a1) +; RV64IM-NEXT: add t1, a4, t1 +; RV64IM-NEXT: mul a4, t1, a6 +; RV64IM-NEXT: sub t2, a5, a4 +; RV64IM-NEXT: mul a5, a2, a3 +; RV64IM-NEXT: srli a5, a5, 16 +; RV64IM-NEXT: add a5, a5, a2 +; RV64IM-NEXT: slli a5, a5, 48 +; RV64IM-NEXT: srai a5, a5, 54 +; RV64IM-NEXT: srli a4, a5, 15 +; RV64IM-NEXT: andi a4, a4, 1 +; RV64IM-NEXT: lh a1, 0(a1) +; RV64IM-NEXT: add a4, a5, a4 +; RV64IM-NEXT: mul a5, a4, a6 +; RV64IM-NEXT: sub a2, a2, a5 +; RV64IM-NEXT: mul a3, a1, a3 +; RV64IM-NEXT: srli a3, a3, 16 +; RV64IM-NEXT: add a3, a3, a1 +; RV64IM-NEXT: slli a3, a3, 48 +; RV64IM-NEXT: srai a3, a3, 54 +; RV64IM-NEXT: srli a5, a3, 15 +; RV64IM-NEXT: andi a5, a5, 1 +; RV64IM-NEXT: add a3, a3, a5 +; RV64IM-NEXT: mul a5, a3, a6 +; RV64IM-NEXT: sub a1, a1, a5 +; RV64IM-NEXT: add a1, a1, a3 +; RV64IM-NEXT: add a2, a2, a4 +; RV64IM-NEXT: add a3, t2, t1 +; RV64IM-NEXT: add a4, t0, a7 ; RV64IM-NEXT: sh a4, 6(a0) -; RV64IM-NEXT: sh a1, 4(a0) -; RV64IM-NEXT: sh a3, 2(a0) -; RV64IM-NEXT: sh a2, 0(a0) +; RV64IM-NEXT: sh a3, 4(a0) +; RV64IM-NEXT: sh a2, 2(a0) +; RV64IM-NEXT: sh a1, 0(a0) ; RV64IM-NEXT: ret %1 = srem <4 x i16> %x, %2 = sdiv <4 x i16> %x, @@ -670,33 +910,64 @@ ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) ; RV32I-NEXT: sw s3, 12(sp) -; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: sw s4, 8(sp) ; RV32I-NEXT: lh a2, 0(a1) -; RV32I-NEXT: lh a0, 12(a1) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: slli a0, a2, 15 +; RV32I-NEXT: sub a0, a2, a0 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 21 +; RV32I-NEXT: srli a3, a0, 15 +; RV32I-NEXT: andi a3, a3, 1 +; RV32I-NEXT: lh a4, 4(a1) +; RV32I-NEXT: add a0, a0, a3 +; RV32I-NEXT: slli a0, a0, 6 +; RV32I-NEXT: sub s2, a2, a0 +; RV32I-NEXT: slli a0, a4, 15 
+; RV32I-NEXT: sub a0, a4, a0 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: add a0, a0, a4 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 20 +; RV32I-NEXT: srli a2, a0, 15 +; RV32I-NEXT: andi a2, a2, 1 ; RV32I-NEXT: lh a3, 8(a1) -; RV32I-NEXT: lh a1, 4(a1) -; RV32I-NEXT: srli a4, a2, 26 -; RV32I-NEXT: add a4, a2, a4 -; RV32I-NEXT: lui a6, 16 -; RV32I-NEXT: addi a5, a6, -64 -; RV32I-NEXT: and a4, a4, a5 -; RV32I-NEXT: sub s2, a2, a4 -; RV32I-NEXT: srli a2, a1, 27 -; RV32I-NEXT: add a2, a1, a2 -; RV32I-NEXT: addi a4, a6, -32 -; RV32I-NEXT: and a2, a2, a4 -; RV32I-NEXT: sub s3, a1, a2 -; RV32I-NEXT: srli a1, a3, 29 -; RV32I-NEXT: add a1, a3, a1 -; RV32I-NEXT: addi a2, a6, -8 -; RV32I-NEXT: and a1, a1, a2 -; RV32I-NEXT: sub s1, a3, a1 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: slli a0, a0, 5 +; RV32I-NEXT: sub s3, a4, a0 +; RV32I-NEXT: slli a0, a3, 15 +; RV32I-NEXT: sub a0, a3, a0 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: add a0, a0, a3 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 18 +; RV32I-NEXT: srli a2, a0, 15 +; RV32I-NEXT: andi a2, a2, 1 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: slli a0, a0, 3 +; RV32I-NEXT: sub s4, a3, a0 +; RV32I-NEXT: lh s1, 12(a1) +; RV32I-NEXT: lui a0, 1048571 +; RV32I-NEXT: addi a1, a0, -905 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: add a0, a0, s1 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 22 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s1, a0 ; RV32I-NEXT: sh a0, 6(s0) -; RV32I-NEXT: sh s1, 4(s0) +; RV32I-NEXT: sh s4, 4(s0) ; RV32I-NEXT: sh s3, 2(s0) ; RV32I-NEXT: sh s2, 0(s0) +; RV32I-NEXT: lw s4, 8(sp) ; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) @@ -707,40 +978,59 @@ ; ; RV32IM-LABEL: dont_fold_srem_power_of_two: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lh a6, 8(a1) -; RV32IM-NEXT: lh a3, 4(a1) -; RV32IM-NEXT: lh a4, 12(a1) -; RV32IM-NEXT: lh a1, 0(a1) -; RV32IM-NEXT: lui a5, 706409 -; RV32IM-NEXT: addi a5, a5, 389 -; RV32IM-NEXT: mulh a5, a4, a5 -; RV32IM-NEXT: add a5, a5, a4 -; RV32IM-NEXT: srli a2, a5, 31 -; RV32IM-NEXT: srli a5, a5, 6 -; RV32IM-NEXT: add a2, a5, a2 +; RV32IM-NEXT: lh a2, 0(a1) +; RV32IM-NEXT: lui a3, 1048568 +; RV32IM-NEXT: addi a3, a3, 1 +; RV32IM-NEXT: mul a4, a2, a3 +; RV32IM-NEXT: srli a4, a4, 16 +; RV32IM-NEXT: add a4, a4, a2 +; RV32IM-NEXT: slli a4, a4, 16 +; RV32IM-NEXT: srai a4, a4, 21 +; RV32IM-NEXT: srli a5, a4, 15 +; RV32IM-NEXT: andi a6, a5, 1 +; RV32IM-NEXT: lh a5, 4(a1) +; RV32IM-NEXT: add a4, a4, a6 +; RV32IM-NEXT: slli a4, a4, 6 +; RV32IM-NEXT: sub a6, a2, a4 +; RV32IM-NEXT: mul a4, a5, a3 +; RV32IM-NEXT: srli a4, a4, 16 +; RV32IM-NEXT: add a4, a4, a5 +; RV32IM-NEXT: slli a4, a4, 16 +; RV32IM-NEXT: srai a4, a4, 20 +; RV32IM-NEXT: srli a2, a4, 15 +; RV32IM-NEXT: andi a7, a2, 1 +; RV32IM-NEXT: lh a2, 8(a1) +; RV32IM-NEXT: add a4, a4, a7 +; RV32IM-NEXT: slli a4, a4, 5 +; RV32IM-NEXT: sub a4, a5, a4 +; RV32IM-NEXT: mul a3, a2, a3 +; RV32IM-NEXT: srli a3, a3, 16 +; RV32IM-NEXT: add a3, a3, a2 +; RV32IM-NEXT: slli a3, a3, 16 +; RV32IM-NEXT: srai a3, a3, 18 +; RV32IM-NEXT: srli a5, a3, 15 +; RV32IM-NEXT: andi a5, a5, 1 +; RV32IM-NEXT: add a3, a3, a5 +; RV32IM-NEXT: slli a3, a3, 3 +; RV32IM-NEXT: lh a1, 12(a1) +; RV32IM-NEXT: sub a2, a2, a3 +; RV32IM-NEXT: lui a3, 1048571 +; RV32IM-NEXT: addi a3, a3, -905 +; RV32IM-NEXT: mul a3, a1, a3 
+; RV32IM-NEXT: srli a3, a3, 16 +; RV32IM-NEXT: add a3, a3, a1 +; RV32IM-NEXT: slli a3, a3, 16 +; RV32IM-NEXT: srai a3, a3, 22 +; RV32IM-NEXT: srli a5, a3, 15 +; RV32IM-NEXT: andi a5, a5, 1 +; RV32IM-NEXT: add a3, a3, a5 ; RV32IM-NEXT: addi a5, zero, 95 -; RV32IM-NEXT: mul a2, a2, a5 -; RV32IM-NEXT: sub a7, a4, a2 -; RV32IM-NEXT: srli a4, a1, 26 -; RV32IM-NEXT: add a4, a1, a4 -; RV32IM-NEXT: lui a5, 16 -; RV32IM-NEXT: addi a2, a5, -64 -; RV32IM-NEXT: and a2, a4, a2 -; RV32IM-NEXT: sub a1, a1, a2 -; RV32IM-NEXT: srli a2, a3, 27 -; RV32IM-NEXT: add a2, a3, a2 -; RV32IM-NEXT: addi a4, a5, -32 -; RV32IM-NEXT: and a2, a2, a4 -; RV32IM-NEXT: sub a2, a3, a2 -; RV32IM-NEXT: srli a3, a6, 29 -; RV32IM-NEXT: add a3, a6, a3 -; RV32IM-NEXT: addi a4, a5, -8 -; RV32IM-NEXT: and a3, a3, a4 -; RV32IM-NEXT: sub a3, a6, a3 -; RV32IM-NEXT: sh a3, 4(a0) -; RV32IM-NEXT: sh a2, 2(a0) -; RV32IM-NEXT: sh a1, 0(a0) -; RV32IM-NEXT: sh a7, 6(a0) +; RV32IM-NEXT: mul a3, a3, a5 +; RV32IM-NEXT: sub a1, a1, a3 +; RV32IM-NEXT: sh a1, 6(a0) +; RV32IM-NEXT: sh a2, 4(a0) +; RV32IM-NEXT: sh a4, 2(a0) +; RV32IM-NEXT: sh a6, 0(a0) ; RV32IM-NEXT: ret ; ; RV64I-LABEL: dont_fold_srem_power_of_two: @@ -751,33 +1041,64 @@ ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) ; RV64I-NEXT: sd s3, 8(sp) -; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: sd s4, 0(sp) ; RV64I-NEXT: lh a2, 0(a1) -; RV64I-NEXT: lh a0, 24(a1) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: slli a0, a2, 15 +; RV64I-NEXT: sub a0, a2, a0 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 53 +; RV64I-NEXT: srli a3, a0, 15 +; RV64I-NEXT: andi a3, a3, 1 +; RV64I-NEXT: lh a4, 8(a1) +; RV64I-NEXT: add a0, a0, a3 +; RV64I-NEXT: slli a0, a0, 6 +; RV64I-NEXT: sub s2, a2, a0 +; RV64I-NEXT: slli a0, a4, 15 +; RV64I-NEXT: sub a0, a4, a0 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: add a0, a0, a4 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 52 +; RV64I-NEXT: srli a2, a0, 15 +; RV64I-NEXT: andi a2, a2, 1 ; RV64I-NEXT: lh a3, 16(a1) -; RV64I-NEXT: lh a1, 8(a1) -; RV64I-NEXT: srli a4, a2, 58 -; RV64I-NEXT: add a4, a2, a4 -; RV64I-NEXT: lui a6, 16 -; RV64I-NEXT: addiw a5, a6, -64 -; RV64I-NEXT: and a4, a4, a5 -; RV64I-NEXT: sub s2, a2, a4 -; RV64I-NEXT: srli a2, a1, 59 -; RV64I-NEXT: add a2, a1, a2 -; RV64I-NEXT: addiw a4, a6, -32 -; RV64I-NEXT: and a2, a2, a4 -; RV64I-NEXT: sub s3, a1, a2 -; RV64I-NEXT: srli a1, a3, 61 -; RV64I-NEXT: add a1, a3, a1 -; RV64I-NEXT: addiw a2, a6, -8 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: sub s1, a3, a1 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: slli a0, a0, 5 +; RV64I-NEXT: sub s3, a4, a0 +; RV64I-NEXT: slli a0, a3, 15 +; RV64I-NEXT: sub a0, a3, a0 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: add a0, a0, a3 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 50 +; RV64I-NEXT: srli a2, a0, 15 +; RV64I-NEXT: andi a2, a2, 1 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: slli a0, a0, 3 +; RV64I-NEXT: sub s4, a3, a0 +; RV64I-NEXT: lh s1, 24(a1) +; RV64I-NEXT: lui a0, 1048571 +; RV64I-NEXT: addiw a1, a0, -905 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: add a0, a0, s1 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 54 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub a0, s1, a0 ; RV64I-NEXT: sh a0, 6(s0) -; RV64I-NEXT: sh s1, 4(s0) +; RV64I-NEXT: sh s4, 
4(s0) ; RV64I-NEXT: sh s3, 2(s0) ; RV64I-NEXT: sh s2, 0(s0) +; RV64I-NEXT: ld s4, 0(sp) ; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) @@ -788,46 +1109,60 @@ ; ; RV64IM-LABEL: dont_fold_srem_power_of_two: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lh a6, 16(a1) -; RV64IM-NEXT: lh a3, 8(a1) -; RV64IM-NEXT: lh a4, 0(a1) +; RV64IM-NEXT: lh a2, 0(a1) +; RV64IM-NEXT: slli a3, a2, 15 +; RV64IM-NEXT: sub a3, a2, a3 +; RV64IM-NEXT: srli a3, a3, 16 +; RV64IM-NEXT: add a3, a3, a2 +; RV64IM-NEXT: slli a3, a3, 48 +; RV64IM-NEXT: srai a3, a3, 53 +; RV64IM-NEXT: srli a4, a3, 15 +; RV64IM-NEXT: andi a4, a4, 1 +; RV64IM-NEXT: lh a5, 8(a1) +; RV64IM-NEXT: add a3, a3, a4 +; RV64IM-NEXT: slli a3, a3, 6 +; RV64IM-NEXT: sub a6, a2, a3 +; RV64IM-NEXT: slli a3, a5, 15 +; RV64IM-NEXT: sub a3, a5, a3 +; RV64IM-NEXT: srli a3, a3, 16 +; RV64IM-NEXT: add a3, a3, a5 +; RV64IM-NEXT: slli a3, a3, 48 +; RV64IM-NEXT: srai a3, a3, 52 +; RV64IM-NEXT: srli a4, a3, 15 +; RV64IM-NEXT: andi a4, a4, 1 +; RV64IM-NEXT: lh a2, 16(a1) +; RV64IM-NEXT: add a3, a3, a4 +; RV64IM-NEXT: slli a3, a3, 5 +; RV64IM-NEXT: sub a3, a5, a3 +; RV64IM-NEXT: slli a4, a2, 15 +; RV64IM-NEXT: sub a4, a2, a4 +; RV64IM-NEXT: srli a4, a4, 16 +; RV64IM-NEXT: add a4, a4, a2 +; RV64IM-NEXT: slli a4, a4, 48 +; RV64IM-NEXT: srai a4, a4, 50 +; RV64IM-NEXT: srli a5, a4, 15 +; RV64IM-NEXT: andi a5, a5, 1 +; RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: slli a4, a4, 3 ; RV64IM-NEXT: lh a1, 24(a1) -; RV64IM-NEXT: lui a5, 1045903 -; RV64IM-NEXT: addiw a5, a5, -733 -; RV64IM-NEXT: slli a5, a5, 15 -; RV64IM-NEXT: addi a5, a5, 1035 -; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: addi a5, a5, -905 -; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: addi a5, a5, -1767 -; RV64IM-NEXT: mulh a5, a1, a5 -; RV64IM-NEXT: add a5, a5, a1 -; RV64IM-NEXT: srli a2, a5, 63 -; RV64IM-NEXT: srli a5, a5, 6 -; RV64IM-NEXT: add a2, a5, a2 +; RV64IM-NEXT: sub a2, a2, a4 +; RV64IM-NEXT: lui a4, 1048571 +; RV64IM-NEXT: addiw a4, a4, -905 +; RV64IM-NEXT: mul a4, a1, a4 +; RV64IM-NEXT: srli a4, a4, 16 +; RV64IM-NEXT: add a4, a4, a1 +; RV64IM-NEXT: slli a4, a4, 48 +; RV64IM-NEXT: srai a4, a4, 54 +; RV64IM-NEXT: srli a5, a4, 15 +; RV64IM-NEXT: andi a5, a5, 1 +; RV64IM-NEXT: add a4, a4, a5 ; RV64IM-NEXT: addi a5, zero, 95 -; RV64IM-NEXT: mul a2, a2, a5 -; RV64IM-NEXT: sub a7, a1, a2 -; RV64IM-NEXT: srli a2, a4, 58 -; RV64IM-NEXT: add a2, a4, a2 -; RV64IM-NEXT: lui a5, 16 -; RV64IM-NEXT: addiw a1, a5, -64 -; RV64IM-NEXT: and a1, a2, a1 -; RV64IM-NEXT: sub a1, a4, a1 -; RV64IM-NEXT: srli a2, a3, 59 -; RV64IM-NEXT: add a2, a3, a2 -; RV64IM-NEXT: addiw a4, a5, -32 -; RV64IM-NEXT: and a2, a2, a4 -; RV64IM-NEXT: sub a2, a3, a2 -; RV64IM-NEXT: srli a3, a6, 61 -; RV64IM-NEXT: add a3, a6, a3 -; RV64IM-NEXT: addiw a4, a5, -8 -; RV64IM-NEXT: and a3, a3, a4 -; RV64IM-NEXT: sub a3, a6, a3 -; RV64IM-NEXT: sh a3, 4(a0) -; RV64IM-NEXT: sh a2, 2(a0) -; RV64IM-NEXT: sh a1, 0(a0) -; RV64IM-NEXT: sh a7, 6(a0) +; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: sub a1, a1, a4 +; RV64IM-NEXT: sh a1, 6(a0) +; RV64IM-NEXT: sh a2, 4(a0) +; RV64IM-NEXT: sh a3, 2(a0) +; RV64IM-NEXT: sh a6, 0(a0) ; RV64IM-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -843,26 +1178,55 @@ ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) ; RV32I-NEXT: sw s3, 12(sp) -; RV32I-NEXT: lh s2, 12(a1) -; RV32I-NEXT: lh s1, 8(a1) -; RV32I-NEXT: lh a2, 4(a1) -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: addi a1, zero, 654 -; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: mv s3, a0 +; RV32I-NEXT: sw s4, 8(sp) 
+; RV32I-NEXT: sw s5, 4(sp) +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lh s0, 8(a1) +; RV32I-NEXT: lui a0, 1048571 +; RV32I-NEXT: addi a1, a0, 535 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: add a0, a0, s0 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 20 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: addi a1, zero, 23 -; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lui a0, 1 -; RV32I-NEXT: addi a1, a0, 1327 -; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: sh zero, 0(s0) -; RV32I-NEXT: sh a0, 6(s0) -; RV32I-NEXT: sh s1, 4(s0) -; RV32I-NEXT: sh s3, 2(s0) +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s3, s0, a0 +; RV32I-NEXT: lh s0, 4(s1) +; RV32I-NEXT: lui s4, 3 +; RV32I-NEXT: addi a1, s4, 539 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srai a0, a0, 23 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: addi a1, zero, 654 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s5, s0, a0 +; RV32I-NEXT: lh s0, 12(s1) +; RV32I-NEXT: addi a1, s4, 87 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srai a0, a0, 26 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: lui a1, 1 +; RV32I-NEXT: addi a1, a1, 1327 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s0, a0 +; RV32I-NEXT: sh zero, 0(s2) +; RV32I-NEXT: sh a0, 6(s2) +; RV32I-NEXT: sh s5, 2(s2) +; RV32I-NEXT: sh s3, 4(s2) +; RV32I-NEXT: lw s5, 4(sp) +; RV32I-NEXT: lw s4, 8(sp) ; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) @@ -873,137 +1237,155 @@ ; ; RV32IM-LABEL: dont_fold_srem_one: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lh a2, 12(a1) -; RV32IM-NEXT: lh a3, 4(a1) -; RV32IM-NEXT: lh a1, 8(a1) -; RV32IM-NEXT: lui a4, 820904 -; RV32IM-NEXT: addi a4, a4, -1903 -; RV32IM-NEXT: mulh a4, a3, a4 -; RV32IM-NEXT: add a4, a4, a3 -; RV32IM-NEXT: srli a5, a4, 31 -; RV32IM-NEXT: srli a4, a4, 9 -; RV32IM-NEXT: add a4, a4, a5 +; RV32IM-NEXT: lh a2, 8(a1) +; RV32IM-NEXT: lui a3, 1048571 +; RV32IM-NEXT: addi a3, a3, 535 +; RV32IM-NEXT: mul a3, a2, a3 +; RV32IM-NEXT: srli a3, a3, 16 +; RV32IM-NEXT: add a3, a3, a2 +; RV32IM-NEXT: slli a3, a3, 16 +; RV32IM-NEXT: srai a3, a3, 20 +; RV32IM-NEXT: srli a4, a3, 15 +; RV32IM-NEXT: andi a4, a4, 1 +; RV32IM-NEXT: add a3, a3, a4 +; RV32IM-NEXT: addi a4, zero, 23 +; RV32IM-NEXT: mul a3, a3, a4 +; RV32IM-NEXT: lh a4, 4(a1) +; RV32IM-NEXT: sub a6, a2, a3 +; RV32IM-NEXT: lui a3, 3 +; RV32IM-NEXT: addi a5, a3, 539 +; RV32IM-NEXT: mul a5, a4, a5 +; RV32IM-NEXT: srai a5, a5, 23 +; RV32IM-NEXT: srli a2, a5, 15 +; RV32IM-NEXT: andi a2, a2, 1 +; RV32IM-NEXT: add a2, a5, a2 ; RV32IM-NEXT: addi a5, zero, 654 -; RV32IM-NEXT: mul a4, a4, a5 -; RV32IM-NEXT: sub a3, a3, a4 -; RV32IM-NEXT: lui a4, 729444 -; RV32IM-NEXT: addi a4, a4, 713 -; RV32IM-NEXT: mulh a4, a1, a4 -; RV32IM-NEXT: add a4, a4, a1 -; RV32IM-NEXT: srli a5, a4, 31 -; RV32IM-NEXT: srli a4, a4, 4 -; RV32IM-NEXT: add a4, a4, a5 -; RV32IM-NEXT: addi a5, zero, 23 -; RV32IM-NEXT: mul a4, a4, a5 -; RV32IM-NEXT: sub a1, a1, a4 -; RV32IM-NEXT: lui a4, 395996 -; RV32IM-NEXT: addi a4, a4, -2009 -; RV32IM-NEXT: mulh a4, a2, a4 -; RV32IM-NEXT: srli a5, a4, 31 -; RV32IM-NEXT: srli a4, a4, 11 -; RV32IM-NEXT: add a4, a4, a5 -; RV32IM-NEXT: lui a5, 1 -; RV32IM-NEXT: addi a5, a5, 1327 -; 
RV32IM-NEXT: mul a4, a4, a5 -; RV32IM-NEXT: sub a2, a2, a4 +; RV32IM-NEXT: lh a1, 12(a1) +; RV32IM-NEXT: mul a2, a2, a5 +; RV32IM-NEXT: sub a2, a4, a2 +; RV32IM-NEXT: addi a3, a3, 87 +; RV32IM-NEXT: mul a3, a1, a3 +; RV32IM-NEXT: srai a3, a3, 26 +; RV32IM-NEXT: srli a4, a3, 15 +; RV32IM-NEXT: andi a4, a4, 1 +; RV32IM-NEXT: add a3, a3, a4 +; RV32IM-NEXT: lui a4, 1 +; RV32IM-NEXT: addi a4, a4, 1327 +; RV32IM-NEXT: mul a3, a3, a4 +; RV32IM-NEXT: sub a1, a1, a3 ; RV32IM-NEXT: sh zero, 0(a0) -; RV32IM-NEXT: sh a2, 6(a0) -; RV32IM-NEXT: sh a1, 4(a0) -; RV32IM-NEXT: sh a3, 2(a0) +; RV32IM-NEXT: sh a1, 6(a0) +; RV32IM-NEXT: sh a2, 2(a0) +; RV32IM-NEXT: sh a6, 4(a0) ; RV32IM-NEXT: ret ; ; RV64I-LABEL: dont_fold_srem_one: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) -; RV64I-NEXT: sd s0, 32(sp) -; RV64I-NEXT: sd s1, 24(sp) -; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) -; RV64I-NEXT: lh s2, 24(a1) -; RV64I-NEXT: lh s1, 16(a1) -; RV64I-NEXT: lh a2, 8(a1) -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: addi a1, zero, 654 -; RV64I-NEXT: mv a0, a2 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: mv s3, a0 +; RV64I-NEXT: addi sp, sp, -64 +; RV64I-NEXT: sd ra, 56(sp) +; RV64I-NEXT: sd s0, 48(sp) +; RV64I-NEXT: sd s1, 40(sp) +; RV64I-NEXT: sd s2, 32(sp) +; RV64I-NEXT: sd s3, 24(sp) +; RV64I-NEXT: sd s4, 16(sp) +; RV64I-NEXT: sd s5, 8(sp) +; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s2, a0 +; RV64I-NEXT: lh s0, 16(a1) +; RV64I-NEXT: lui a0, 1048571 +; RV64I-NEXT: addiw a1, a0, 535 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: add a0, a0, s0 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 52 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: addi a1, zero, 23 -; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lui a0, 1 -; RV64I-NEXT: addiw a1, a0, 1327 -; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: sh zero, 0(s0) -; RV64I-NEXT: sh a0, 6(s0) -; RV64I-NEXT: sh s1, 4(s0) -; RV64I-NEXT: sh s3, 2(s0) -; RV64I-NEXT: ld s3, 8(sp) -; RV64I-NEXT: ld s2, 16(sp) -; RV64I-NEXT: ld s1, 24(sp) -; RV64I-NEXT: ld s0, 32(sp) -; RV64I-NEXT: ld ra, 40(sp) -; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s3, s0, a0 +; RV64I-NEXT: lh s0, 8(s1) +; RV64I-NEXT: lui s4, 3 +; RV64I-NEXT: addiw a1, s4, 539 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srai a0, a0, 23 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: addi a1, zero, 654 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s5, s0, a0 +; RV64I-NEXT: lh s0, 24(s1) +; RV64I-NEXT: addiw a1, s4, 87 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srai a0, a0, 26 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: lui a1, 1 +; RV64I-NEXT: addiw a1, a1, 1327 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub a0, s0, a0 +; RV64I-NEXT: sh zero, 0(s2) +; RV64I-NEXT: sh a0, 6(s2) +; RV64I-NEXT: sh s5, 2(s2) +; RV64I-NEXT: sh s3, 4(s2) +; RV64I-NEXT: ld s5, 8(sp) +; RV64I-NEXT: ld s4, 16(sp) +; RV64I-NEXT: ld s3, 24(sp) +; RV64I-NEXT: ld s2, 32(sp) +; RV64I-NEXT: ld s1, 40(sp) +; RV64I-NEXT: ld s0, 48(sp) +; RV64I-NEXT: ld ra, 56(sp) +; RV64I-NEXT: addi sp, sp, 64 ; RV64I-NEXT: ret ; ; RV64IM-LABEL: dont_fold_srem_one: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lh a2, 24(a1) -; RV64IM-NEXT: lh a3, 8(a1) -; 
RV64IM-NEXT: lh a1, 16(a1) -; RV64IM-NEXT: lui a4, 1043590 -; RV64IM-NEXT: addiw a4, a4, -1781 -; RV64IM-NEXT: slli a4, a4, 13 -; RV64IM-NEXT: addi a4, a4, 1069 -; RV64IM-NEXT: slli a4, a4, 12 -; RV64IM-NEXT: addi a4, a4, -1959 -; RV64IM-NEXT: slli a4, a4, 13 -; RV64IM-NEXT: addi a4, a4, 357 -; RV64IM-NEXT: mulh a4, a1, a4 -; RV64IM-NEXT: add a4, a4, a1 -; RV64IM-NEXT: srli a5, a4, 63 -; RV64IM-NEXT: srli a4, a4, 4 -; RV64IM-NEXT: add a4, a4, a5 -; RV64IM-NEXT: addi a5, zero, 23 -; RV64IM-NEXT: mul a4, a4, a5 -; RV64IM-NEXT: sub a1, a1, a4 -; RV64IM-NEXT: lui a4, 6413 -; RV64IM-NEXT: addiw a4, a4, 1265 -; RV64IM-NEXT: slli a4, a4, 13 -; RV64IM-NEXT: addi a4, a4, 1027 -; RV64IM-NEXT: slli a4, a4, 13 -; RV64IM-NEXT: addi a4, a4, 1077 -; RV64IM-NEXT: slli a4, a4, 12 -; RV64IM-NEXT: addi a4, a4, 965 -; RV64IM-NEXT: mulh a4, a3, a4 -; RV64IM-NEXT: srli a5, a4, 63 -; RV64IM-NEXT: srli a4, a4, 8 -; RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: lh a2, 16(a1) +; RV64IM-NEXT: lui a3, 1048571 +; RV64IM-NEXT: addiw a3, a3, 535 +; RV64IM-NEXT: mul a3, a2, a3 +; RV64IM-NEXT: srli a3, a3, 16 +; RV64IM-NEXT: add a3, a3, a2 +; RV64IM-NEXT: slli a3, a3, 48 +; RV64IM-NEXT: srai a3, a3, 52 +; RV64IM-NEXT: srli a4, a3, 15 +; RV64IM-NEXT: andi a4, a4, 1 +; RV64IM-NEXT: add a3, a3, a4 +; RV64IM-NEXT: addi a4, zero, 23 +; RV64IM-NEXT: mul a3, a3, a4 +; RV64IM-NEXT: lh a4, 8(a1) +; RV64IM-NEXT: sub a6, a2, a3 +; RV64IM-NEXT: lui a3, 3 +; RV64IM-NEXT: addiw a5, a3, 539 +; RV64IM-NEXT: mul a5, a4, a5 +; RV64IM-NEXT: srai a5, a5, 23 +; RV64IM-NEXT: srli a2, a5, 15 +; RV64IM-NEXT: andi a2, a2, 1 +; RV64IM-NEXT: add a2, a5, a2 ; RV64IM-NEXT: addi a5, zero, 654 -; RV64IM-NEXT: mul a4, a4, a5 -; RV64IM-NEXT: sub a3, a3, a4 -; RV64IM-NEXT: lui a4, 12375 -; RV64IM-NEXT: addiw a4, a4, -575 -; RV64IM-NEXT: slli a4, a4, 12 -; RV64IM-NEXT: addi a4, a4, 883 -; RV64IM-NEXT: slli a4, a4, 13 -; RV64IM-NEXT: addi a4, a4, -431 -; RV64IM-NEXT: slli a4, a4, 12 -; RV64IM-NEXT: addi a4, a4, 1959 -; RV64IM-NEXT: mulh a4, a2, a4 -; RV64IM-NEXT: srli a5, a4, 63 -; RV64IM-NEXT: srli a4, a4, 11 -; RV64IM-NEXT: add a4, a4, a5 -; RV64IM-NEXT: lui a5, 1 -; RV64IM-NEXT: addiw a5, a5, 1327 -; RV64IM-NEXT: mul a4, a4, a5 -; RV64IM-NEXT: sub a2, a2, a4 +; RV64IM-NEXT: lh a1, 24(a1) +; RV64IM-NEXT: mul a2, a2, a5 +; RV64IM-NEXT: sub a2, a4, a2 +; RV64IM-NEXT: addiw a3, a3, 87 +; RV64IM-NEXT: mul a3, a1, a3 +; RV64IM-NEXT: srai a3, a3, 26 +; RV64IM-NEXT: srli a4, a3, 15 +; RV64IM-NEXT: andi a4, a4, 1 +; RV64IM-NEXT: add a3, a3, a4 +; RV64IM-NEXT: lui a4, 1 +; RV64IM-NEXT: addiw a4, a4, 1327 +; RV64IM-NEXT: mul a3, a3, a4 +; RV64IM-NEXT: sub a1, a1, a3 ; RV64IM-NEXT: sh zero, 0(a0) -; RV64IM-NEXT: sh a2, 6(a0) -; RV64IM-NEXT: sh a3, 2(a0) -; RV64IM-NEXT: sh a1, 4(a0) +; RV64IM-NEXT: sh a1, 6(a0) +; RV64IM-NEXT: sh a2, 2(a0) +; RV64IM-NEXT: sh a6, 4(a0) ; RV64IM-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -1019,26 +1401,51 @@ ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) ; RV32I-NEXT: sw s3, 12(sp) -; RV32I-NEXT: lh a2, 4(a1) -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lh s2, 12(a1) -; RV32I-NEXT: lh a0, 8(a1) -; RV32I-NEXT: srli a1, a2, 17 -; RV32I-NEXT: add a1, a2, a1 -; RV32I-NEXT: lui a3, 8 -; RV32I-NEXT: and a1, a1, a3 -; RV32I-NEXT: sub s3, a2, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: lh a1, 4(a1) +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: slli a0, a1, 15 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 30 +; RV32I-NEXT: srli a2, a0, 
15 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: slli a0, a0, 15 +; RV32I-NEXT: add s3, a1, a0 +; RV32I-NEXT: lh s1, 8(s0) +; RV32I-NEXT: lui a0, 1048571 +; RV32I-NEXT: addi a1, a0, 535 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: add a0, a0, s1 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 20 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: addi a1, zero, 23 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lui a0, 1 -; RV32I-NEXT: addi a1, a0, 1327 -; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: sh zero, 0(s0) -; RV32I-NEXT: sh a0, 6(s0) -; RV32I-NEXT: sh s1, 4(s0) -; RV32I-NEXT: sh s3, 2(s0) +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s1, s1, a0 +; RV32I-NEXT: lh s0, 12(s0) +; RV32I-NEXT: lui a0, 3 +; RV32I-NEXT: addi a1, a0, 87 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srai a0, a0, 26 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: lui a1, 1 +; RV32I-NEXT: addi a1, a1, 1327 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s0, a0 +; RV32I-NEXT: sh zero, 0(s2) +; RV32I-NEXT: sh a0, 6(s2) +; RV32I-NEXT: sh s1, 4(s2) +; RV32I-NEXT: sh s3, 2(s2) ; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) @@ -1049,38 +1456,48 @@ ; ; RV32IM-LABEL: dont_fold_urem_i16_smax: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lh a2, 4(a1) -; RV32IM-NEXT: lh a3, 8(a1) -; RV32IM-NEXT: lh a1, 12(a1) -; RV32IM-NEXT: lui a4, 729444 -; RV32IM-NEXT: addi a4, a4, 713 -; RV32IM-NEXT: mulh a4, a3, a4 -; RV32IM-NEXT: add a4, a4, a3 -; RV32IM-NEXT: srli a5, a4, 31 -; RV32IM-NEXT: srli a4, a4, 4 -; RV32IM-NEXT: add a4, a4, a5 -; RV32IM-NEXT: addi a5, zero, 23 -; RV32IM-NEXT: mul a4, a4, a5 +; RV32IM-NEXT: lh a2, 8(a1) +; RV32IM-NEXT: lui a3, 1048571 +; RV32IM-NEXT: addi a3, a3, 535 +; RV32IM-NEXT: mul a3, a2, a3 +; RV32IM-NEXT: srli a3, a3, 16 +; RV32IM-NEXT: add a3, a3, a2 +; RV32IM-NEXT: slli a3, a3, 16 +; RV32IM-NEXT: srai a3, a3, 20 +; RV32IM-NEXT: srli a4, a3, 15 +; RV32IM-NEXT: andi a4, a4, 1 +; RV32IM-NEXT: add a3, a3, a4 +; RV32IM-NEXT: addi a4, zero, 23 +; RV32IM-NEXT: mul a3, a3, a4 +; RV32IM-NEXT: lh a4, 4(a1) +; RV32IM-NEXT: sub a2, a2, a3 +; RV32IM-NEXT: lui a3, 8 +; RV32IM-NEXT: addi a3, a3, -1 +; RV32IM-NEXT: mul a3, a4, a3 +; RV32IM-NEXT: srli a3, a3, 16 ; RV32IM-NEXT: sub a3, a3, a4 -; RV32IM-NEXT: lui a4, 395996 -; RV32IM-NEXT: addi a4, a4, -2009 -; RV32IM-NEXT: mulh a4, a1, a4 -; RV32IM-NEXT: srli a5, a4, 31 -; RV32IM-NEXT: srli a4, a4, 11 +; RV32IM-NEXT: slli a3, a3, 16 +; RV32IM-NEXT: srai a3, a3, 30 +; RV32IM-NEXT: srli a5, a3, 15 +; RV32IM-NEXT: add a3, a3, a5 +; RV32IM-NEXT: slli a3, a3, 15 +; RV32IM-NEXT: lh a1, 12(a1) +; RV32IM-NEXT: add a3, a4, a3 +; RV32IM-NEXT: lui a4, 3 +; RV32IM-NEXT: addi a4, a4, 87 +; RV32IM-NEXT: mul a4, a1, a4 +; RV32IM-NEXT: srai a4, a4, 26 +; RV32IM-NEXT: srli a5, a4, 15 +; RV32IM-NEXT: andi a5, a5, 1 ; RV32IM-NEXT: add a4, a4, a5 ; RV32IM-NEXT: lui a5, 1 ; RV32IM-NEXT: addi a5, a5, 1327 ; RV32IM-NEXT: mul a4, a4, a5 ; RV32IM-NEXT: sub a1, a1, a4 -; RV32IM-NEXT: srli a4, a2, 17 -; RV32IM-NEXT: add a4, a2, a4 -; RV32IM-NEXT: lui a5, 8 -; RV32IM-NEXT: and a4, a4, a5 -; RV32IM-NEXT: sub a2, a2, a4 ; RV32IM-NEXT: sh zero, 0(a0) -; RV32IM-NEXT: sh a2, 2(a0) ; RV32IM-NEXT: sh a1, 6(a0) -; RV32IM-NEXT: sh a3, 4(a0) +; RV32IM-NEXT: sh a3, 2(a0) +; RV32IM-NEXT: sh a2, 4(a0) ; RV32IM-NEXT: ret ; ; 
RV64I-LABEL: dont_fold_urem_i16_smax: @@ -1091,26 +1508,51 @@ ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) ; RV64I-NEXT: sd s3, 8(sp) -; RV64I-NEXT: lh a2, 8(a1) -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lh s2, 24(a1) -; RV64I-NEXT: lh a0, 16(a1) -; RV64I-NEXT: srli a1, a2, 49 -; RV64I-NEXT: add a1, a2, a1 -; RV64I-NEXT: lui a3, 8 -; RV64I-NEXT: and a1, a1, a3 -; RV64I-NEXT: sub s3, a2, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: lh a1, 8(a1) +; RV64I-NEXT: mv s2, a0 +; RV64I-NEXT: slli a0, a1, 15 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 62 +; RV64I-NEXT: srli a2, a0, 15 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: slli a0, a0, 15 +; RV64I-NEXT: add s3, a1, a0 +; RV64I-NEXT: lh s1, 16(s0) +; RV64I-NEXT: lui a0, 1048571 +; RV64I-NEXT: addiw a1, a0, 535 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: add a0, a0, s1 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 52 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: addi a1, zero, 23 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lui a0, 1 -; RV64I-NEXT: addiw a1, a0, 1327 -; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: sh zero, 0(s0) -; RV64I-NEXT: sh a0, 6(s0) -; RV64I-NEXT: sh s1, 4(s0) -; RV64I-NEXT: sh s3, 2(s0) +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s1, s1, a0 +; RV64I-NEXT: lh s0, 24(s0) +; RV64I-NEXT: lui a0, 3 +; RV64I-NEXT: addiw a1, a0, 87 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srai a0, a0, 26 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: lui a1, 1 +; RV64I-NEXT: addiw a1, a1, 1327 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub a0, s0, a0 +; RV64I-NEXT: sh zero, 0(s2) +; RV64I-NEXT: sh a0, 6(s2) +; RV64I-NEXT: sh s1, 4(s2) +; RV64I-NEXT: sh s3, 2(s2) ; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) @@ -1122,49 +1564,46 @@ ; RV64IM-LABEL: dont_fold_urem_i16_smax: ; RV64IM: # %bb.0: ; RV64IM-NEXT: lh a2, 8(a1) -; RV64IM-NEXT: lh a3, 24(a1) -; RV64IM-NEXT: lh a1, 16(a1) -; RV64IM-NEXT: lui a4, 1043590 -; RV64IM-NEXT: addiw a4, a4, -1781 -; RV64IM-NEXT: slli a4, a4, 13 -; RV64IM-NEXT: addi a4, a4, 1069 -; RV64IM-NEXT: slli a4, a4, 12 -; RV64IM-NEXT: addi a4, a4, -1959 -; RV64IM-NEXT: slli a4, a4, 13 -; RV64IM-NEXT: addi a4, a4, 357 -; RV64IM-NEXT: mulh a4, a1, a4 -; RV64IM-NEXT: add a4, a4, a1 -; RV64IM-NEXT: srli a5, a4, 63 -; RV64IM-NEXT: srli a4, a4, 4 -; RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: slli a3, a2, 15 +; RV64IM-NEXT: sub a3, a3, a2 +; RV64IM-NEXT: srli a3, a3, 16 +; RV64IM-NEXT: sub a3, a3, a2 +; RV64IM-NEXT: slli a3, a3, 48 +; RV64IM-NEXT: srai a3, a3, 62 +; RV64IM-NEXT: srli a4, a3, 15 +; RV64IM-NEXT: add a3, a3, a4 +; RV64IM-NEXT: slli a3, a3, 15 +; RV64IM-NEXT: lh a4, 16(a1) +; RV64IM-NEXT: add a2, a2, a3 +; RV64IM-NEXT: lui a3, 1048571 +; RV64IM-NEXT: addiw a3, a3, 535 +; RV64IM-NEXT: mul a3, a4, a3 +; RV64IM-NEXT: srli a3, a3, 16 +; RV64IM-NEXT: add a3, a3, a4 +; RV64IM-NEXT: slli a3, a3, 48 +; RV64IM-NEXT: srai a3, a3, 52 +; RV64IM-NEXT: srli a5, a3, 15 +; RV64IM-NEXT: andi a5, a5, 1 +; RV64IM-NEXT: add a3, a3, a5 ; RV64IM-NEXT: addi a5, zero, 23 -; RV64IM-NEXT: mul a4, a4, a5 -; RV64IM-NEXT: sub a1, a1, a4 -; RV64IM-NEXT: lui a4, 12375 -; RV64IM-NEXT: addiw a4, a4, -575 -; RV64IM-NEXT: slli a4, a4, 12 -; 
RV64IM-NEXT: addi a4, a4, 883 -; RV64IM-NEXT: slli a4, a4, 13 -; RV64IM-NEXT: addi a4, a4, -431 -; RV64IM-NEXT: slli a4, a4, 12 -; RV64IM-NEXT: addi a4, a4, 1959 -; RV64IM-NEXT: mulh a4, a3, a4 -; RV64IM-NEXT: srli a5, a4, 63 -; RV64IM-NEXT: srli a4, a4, 11 +; RV64IM-NEXT: mul a3, a3, a5 +; RV64IM-NEXT: lh a1, 24(a1) +; RV64IM-NEXT: sub a3, a4, a3 +; RV64IM-NEXT: lui a4, 3 +; RV64IM-NEXT: addiw a4, a4, 87 +; RV64IM-NEXT: mul a4, a1, a4 +; RV64IM-NEXT: srai a4, a4, 26 +; RV64IM-NEXT: srli a5, a4, 15 +; RV64IM-NEXT: andi a5, a5, 1 ; RV64IM-NEXT: add a4, a4, a5 ; RV64IM-NEXT: lui a5, 1 ; RV64IM-NEXT: addiw a5, a5, 1327 ; RV64IM-NEXT: mul a4, a4, a5 -; RV64IM-NEXT: sub a3, a3, a4 -; RV64IM-NEXT: srli a4, a2, 49 -; RV64IM-NEXT: add a4, a2, a4 -; RV64IM-NEXT: lui a5, 8 -; RV64IM-NEXT: and a4, a4, a5 -; RV64IM-NEXT: sub a2, a2, a4 +; RV64IM-NEXT: sub a1, a1, a4 ; RV64IM-NEXT: sh zero, 0(a0) +; RV64IM-NEXT: sh a1, 6(a0) +; RV64IM-NEXT: sh a3, 4(a0) ; RV64IM-NEXT: sh a2, 2(a0) -; RV64IM-NEXT: sh a3, 6(a0) -; RV64IM-NEXT: sh a1, 4(a0) ; RV64IM-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -1174,180 +1613,457 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind { ; RV32I-LABEL: dont_fold_srem_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -48 -; RV32I-NEXT: sw ra, 44(sp) -; RV32I-NEXT: sw s0, 40(sp) -; RV32I-NEXT: sw s1, 36(sp) -; RV32I-NEXT: sw s2, 32(sp) -; RV32I-NEXT: sw s3, 28(sp) -; RV32I-NEXT: sw s4, 24(sp) -; RV32I-NEXT: sw s5, 20(sp) -; RV32I-NEXT: sw s6, 16(sp) -; RV32I-NEXT: sw s7, 12(sp) -; RV32I-NEXT: sw s8, 8(sp) -; RV32I-NEXT: sw s9, 4(sp) -; RV32I-NEXT: lw s2, 24(a1) -; RV32I-NEXT: lw s3, 28(a1) -; RV32I-NEXT: lw s4, 16(a1) -; RV32I-NEXT: lw s5, 20(a1) +; RV32I-NEXT: addi sp, sp, -240 +; RV32I-NEXT: sw ra, 236(sp) +; RV32I-NEXT: sw s0, 232(sp) +; RV32I-NEXT: sw s1, 228(sp) +; RV32I-NEXT: sw s2, 224(sp) +; RV32I-NEXT: sw s3, 220(sp) +; RV32I-NEXT: sw s4, 216(sp) +; RV32I-NEXT: sw s5, 212(sp) +; RV32I-NEXT: sw s6, 208(sp) +; RV32I-NEXT: sw s7, 204(sp) +; RV32I-NEXT: sw s8, 200(sp) +; RV32I-NEXT: sw s9, 196(sp) +; RV32I-NEXT: sw s10, 192(sp) +; RV32I-NEXT: lw s3, 0(a1) +; RV32I-NEXT: lw s5, 4(a1) +; RV32I-NEXT: lw s4, 24(a1) +; RV32I-NEXT: lw s7, 28(a1) ; RV32I-NEXT: lw s6, 8(a1) -; RV32I-NEXT: lw s1, 12(a1) -; RV32I-NEXT: lw a3, 0(a1) -; RV32I-NEXT: lw a1, 4(a1) +; RV32I-NEXT: lw s8, 12(a1) +; RV32I-NEXT: lw s2, 16(a1) +; RV32I-NEXT: lw s1, 20(a1) ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: addi a2, zero, 1 -; RV32I-NEXT: mv a0, a3 +; RV32I-NEXT: addi a0, zero, -1 +; RV32I-NEXT: sw a0, 108(sp) +; RV32I-NEXT: sw a0, 104(sp) +; RV32I-NEXT: lui a0, 729444 +; RV32I-NEXT: addi a0, a0, 712 +; RV32I-NEXT: sw a0, 100(sp) +; RV32I-NEXT: lui a0, 364722 +; RV32I-NEXT: addi a0, a0, 357 +; RV32I-NEXT: sw a0, 96(sp) +; RV32I-NEXT: sw s1, 116(sp) +; RV32I-NEXT: sw s2, 112(sp) +; RV32I-NEXT: srai a3, s1, 31 +; RV32I-NEXT: sw a3, 124(sp) +; RV32I-NEXT: addi a0, sp, 128 +; RV32I-NEXT: addi a1, sp, 112 +; RV32I-NEXT: addi a2, sp, 96 +; RV32I-NEXT: sw a3, 120(sp) +; RV32I-NEXT: call __multi3 +; RV32I-NEXT: sw zero, 60(sp) +; RV32I-NEXT: sw zero, 56(sp) +; RV32I-NEXT: lui a0, 410452 +; RV32I-NEXT: addi a0, a0, -952 +; RV32I-NEXT: sw a0, 52(sp) +; RV32I-NEXT: lui a0, 25653 +; RV32I-NEXT: addi a0, a0, 965 +; RV32I-NEXT: sw a0, 48(sp) +; RV32I-NEXT: sw s8, 68(sp) +; RV32I-NEXT: sw s6, 64(sp) +; RV32I-NEXT: srai a3, s8, 31 +; RV32I-NEXT: sw a3, 76(sp) +; RV32I-NEXT: addi a0, sp, 80 +; RV32I-NEXT: addi a1, sp, 64 +; RV32I-NEXT: addi a2, sp, 48 +; RV32I-NEXT: sw a3, 72(sp) +; RV32I-NEXT: call 
__multi3 +; RV32I-NEXT: sw zero, 156(sp) +; RV32I-NEXT: sw zero, 152(sp) +; RV32I-NEXT: lui a0, 395996 +; RV32I-NEXT: addi a0, a0, -2010 +; RV32I-NEXT: sw a0, 148(sp) +; RV32I-NEXT: lui a0, 941649 +; RV32I-NEXT: addi a0, a0, 1959 +; RV32I-NEXT: sw a0, 144(sp) +; RV32I-NEXT: sw s7, 164(sp) +; RV32I-NEXT: sw s4, 160(sp) +; RV32I-NEXT: srai a3, s7, 31 +; RV32I-NEXT: sw a3, 172(sp) +; RV32I-NEXT: addi a0, sp, 176 +; RV32I-NEXT: addi a1, sp, 160 +; RV32I-NEXT: addi a2, sp, 144 +; RV32I-NEXT: sw a3, 168(sp) +; RV32I-NEXT: call __multi3 +; RV32I-NEXT: sw zero, 12(sp) +; RV32I-NEXT: sw zero, 8(sp) +; RV32I-NEXT: sw zero, 4(sp) +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: sw s5, 20(sp) +; RV32I-NEXT: sw s3, 16(sp) +; RV32I-NEXT: srai a3, s5, 31 +; RV32I-NEXT: sw a3, 28(sp) +; RV32I-NEXT: addi a0, sp, 32 +; RV32I-NEXT: addi a1, sp, 16 +; RV32I-NEXT: mv a2, sp +; RV32I-NEXT: sw a3, 24(sp) +; RV32I-NEXT: call __multi3 +; RV32I-NEXT: lw a0, 140(sp) +; RV32I-NEXT: lw a1, 136(sp) +; RV32I-NEXT: add a0, a0, s1 +; RV32I-NEXT: add a2, a1, s2 +; RV32I-NEXT: sltu a1, a2, a1 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: slli a0, a1, 28 +; RV32I-NEXT: srli a2, a2, 4 +; RV32I-NEXT: or a2, a2, a0 +; RV32I-NEXT: srli a0, a1, 31 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: sltu a2, a0, a2 +; RV32I-NEXT: srai a1, a1, 4 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: addi a2, zero, 23 ; RV32I-NEXT: mv a3, zero -; RV32I-NEXT: call __moddi3 -; RV32I-NEXT: mv s7, a0 -; RV32I-NEXT: mv s8, a1 +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: sltu a2, s2, a0 +; RV32I-NEXT: sub a1, s1, a1 +; RV32I-NEXT: lw a3, 92(sp) +; RV32I-NEXT: lw a4, 88(sp) +; RV32I-NEXT: sub s9, a1, a2 +; RV32I-NEXT: sub s10, s2, a0 +; RV32I-NEXT: slli a0, a3, 24 +; RV32I-NEXT: srli a1, a4, 8 +; RV32I-NEXT: or a1, a1, a0 +; RV32I-NEXT: srli a0, a3, 31 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: sltu a1, a0, a1 +; RV32I-NEXT: srai a2, a3, 8 +; RV32I-NEXT: add a1, a2, a1 ; RV32I-NEXT: addi a2, zero, 654 -; RV32I-NEXT: mv a0, s6 -; RV32I-NEXT: mv a1, s1 ; RV32I-NEXT: mv a3, zero -; RV32I-NEXT: call __moddi3 -; RV32I-NEXT: mv s6, a0 -; RV32I-NEXT: mv s9, a1 -; RV32I-NEXT: addi a2, zero, 23 -; RV32I-NEXT: mv a0, s4 -; RV32I-NEXT: mv a1, s5 +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: sub a0, s8, a1 +; RV32I-NEXT: lw a1, 188(sp) +; RV32I-NEXT: lw a2, 184(sp) +; RV32I-NEXT: sltu a3, s6, s2 +; RV32I-NEXT: sub s8, a0, a3 +; RV32I-NEXT: slli a0, a1, 21 +; RV32I-NEXT: srli a2, a2, 11 +; RV32I-NEXT: or a2, a2, a0 +; RV32I-NEXT: srli a0, a1, 31 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: sltu a2, a0, a2 +; RV32I-NEXT: srai a1, a1, 11 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: lui a2, 1 +; RV32I-NEXT: addi a2, a2, 1327 ; RV32I-NEXT: mv a3, zero -; RV32I-NEXT: call __moddi3 -; RV32I-NEXT: mv s4, a0 -; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: lui a0, 1 -; RV32I-NEXT: addi a2, a0, 1327 -; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: sltu a0, s4, a0 +; RV32I-NEXT: lw a2, 44(sp) +; RV32I-NEXT: lw a3, 40(sp) +; RV32I-NEXT: sub a1, s7, a1 +; RV32I-NEXT: sub s7, a1, a0 +; RV32I-NEXT: add a1, a2, s5 +; RV32I-NEXT: add a0, a3, s3 +; RV32I-NEXT: sltu a2, a0, a3 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: addi a2, zero, 1 ; RV32I-NEXT: mv a3, zero -; RV32I-NEXT: call __moddi3 -; RV32I-NEXT: sw a1, 28(s0) -; RV32I-NEXT: sw a0, 24(s0) -; RV32I-NEXT: sw s1, 20(s0) -; RV32I-NEXT: sw s4, 16(s0) -; RV32I-NEXT: sw s9, 12(s0) -; RV32I-NEXT: sw s6, 8(s0) -; RV32I-NEXT: sw s8, 4(s0) -; RV32I-NEXT: sw 
s7, 0(s0) -; RV32I-NEXT: lw s9, 4(sp) -; RV32I-NEXT: lw s8, 8(sp) -; RV32I-NEXT: lw s7, 12(sp) -; RV32I-NEXT: lw s6, 16(sp) -; RV32I-NEXT: lw s5, 20(sp) -; RV32I-NEXT: lw s4, 24(sp) -; RV32I-NEXT: lw s3, 28(sp) -; RV32I-NEXT: lw s2, 32(sp) -; RV32I-NEXT: lw s1, 36(sp) -; RV32I-NEXT: lw s0, 40(sp) -; RV32I-NEXT: lw ra, 44(sp) -; RV32I-NEXT: addi sp, sp, 48 +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: sltu a2, s3, a0 +; RV32I-NEXT: sub a1, s5, a1 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: sub a2, s6, s2 +; RV32I-NEXT: sub a3, s4, s1 +; RV32I-NEXT: sub a0, s3, a0 +; RV32I-NEXT: sw a0, 0(s0) +; RV32I-NEXT: sw a1, 4(s0) +; RV32I-NEXT: sw a3, 24(s0) +; RV32I-NEXT: sw a2, 8(s0) +; RV32I-NEXT: sw s7, 28(s0) +; RV32I-NEXT: sw s8, 12(s0) +; RV32I-NEXT: sw s10, 16(s0) +; RV32I-NEXT: sw s9, 20(s0) +; RV32I-NEXT: lw s10, 192(sp) +; RV32I-NEXT: lw s9, 196(sp) +; RV32I-NEXT: lw s8, 200(sp) +; RV32I-NEXT: lw s7, 204(sp) +; RV32I-NEXT: lw s6, 208(sp) +; RV32I-NEXT: lw s5, 212(sp) +; RV32I-NEXT: lw s4, 216(sp) +; RV32I-NEXT: lw s3, 220(sp) +; RV32I-NEXT: lw s2, 224(sp) +; RV32I-NEXT: lw s1, 228(sp) +; RV32I-NEXT: lw s0, 232(sp) +; RV32I-NEXT: lw ra, 236(sp) +; RV32I-NEXT: addi sp, sp, 240 ; RV32I-NEXT: ret ; ; RV32IM-LABEL: dont_fold_srem_i64: ; RV32IM: # %bb.0: -; RV32IM-NEXT: addi sp, sp, -48 -; RV32IM-NEXT: sw ra, 44(sp) -; RV32IM-NEXT: sw s0, 40(sp) -; RV32IM-NEXT: sw s1, 36(sp) -; RV32IM-NEXT: sw s2, 32(sp) -; RV32IM-NEXT: sw s3, 28(sp) -; RV32IM-NEXT: sw s4, 24(sp) -; RV32IM-NEXT: sw s5, 20(sp) -; RV32IM-NEXT: sw s6, 16(sp) -; RV32IM-NEXT: sw s7, 12(sp) -; RV32IM-NEXT: sw s8, 8(sp) -; RV32IM-NEXT: sw s9, 4(sp) +; RV32IM-NEXT: addi sp, sp, -240 +; RV32IM-NEXT: sw ra, 236(sp) +; RV32IM-NEXT: sw s0, 232(sp) +; RV32IM-NEXT: sw s1, 228(sp) +; RV32IM-NEXT: sw s2, 224(sp) +; RV32IM-NEXT: sw s3, 220(sp) +; RV32IM-NEXT: sw s4, 216(sp) +; RV32IM-NEXT: sw s5, 212(sp) +; RV32IM-NEXT: sw s6, 208(sp) +; RV32IM-NEXT: sw s7, 204(sp) +; RV32IM-NEXT: sw s8, 200(sp) +; RV32IM-NEXT: lw s3, 0(a1) +; RV32IM-NEXT: lw s7, 4(a1) ; RV32IM-NEXT: lw s2, 24(a1) -; RV32IM-NEXT: lw s3, 28(a1) -; RV32IM-NEXT: lw s4, 16(a1) -; RV32IM-NEXT: lw s5, 20(a1) -; RV32IM-NEXT: lw s6, 8(a1) -; RV32IM-NEXT: lw s1, 12(a1) -; RV32IM-NEXT: lw a3, 0(a1) -; RV32IM-NEXT: lw a1, 4(a1) +; RV32IM-NEXT: lw s5, 28(a1) +; RV32IM-NEXT: lw s4, 8(a1) +; RV32IM-NEXT: lw s6, 12(a1) +; RV32IM-NEXT: lw s8, 16(a1) +; RV32IM-NEXT: lw s1, 20(a1) ; RV32IM-NEXT: mv s0, a0 -; RV32IM-NEXT: addi a2, zero, 1 -; RV32IM-NEXT: mv a0, a3 -; RV32IM-NEXT: mv a3, zero -; RV32IM-NEXT: call __moddi3 -; RV32IM-NEXT: mv s7, a0 -; RV32IM-NEXT: mv s8, a1 -; RV32IM-NEXT: addi a2, zero, 654 -; RV32IM-NEXT: mv a0, s6 -; RV32IM-NEXT: mv a1, s1 -; RV32IM-NEXT: mv a3, zero -; RV32IM-NEXT: call __moddi3 -; RV32IM-NEXT: mv s6, a0 -; RV32IM-NEXT: mv s9, a1 -; RV32IM-NEXT: addi a2, zero, 23 -; RV32IM-NEXT: mv a0, s4 -; RV32IM-NEXT: mv a1, s5 -; RV32IM-NEXT: mv a3, zero -; RV32IM-NEXT: call __moddi3 -; RV32IM-NEXT: mv s4, a0 -; RV32IM-NEXT: mv s1, a1 -; RV32IM-NEXT: lui a0, 1 -; RV32IM-NEXT: addi a2, a0, 1327 -; RV32IM-NEXT: mv a0, s2 -; RV32IM-NEXT: mv a1, s3 -; RV32IM-NEXT: mv a3, zero -; RV32IM-NEXT: call __moddi3 -; RV32IM-NEXT: sw a1, 28(s0) -; RV32IM-NEXT: sw a0, 24(s0) -; RV32IM-NEXT: sw s1, 20(s0) -; RV32IM-NEXT: sw s4, 16(s0) -; RV32IM-NEXT: sw s9, 12(s0) -; RV32IM-NEXT: sw s6, 8(s0) -; RV32IM-NEXT: sw s8, 4(s0) -; RV32IM-NEXT: sw s7, 0(s0) -; RV32IM-NEXT: lw s9, 4(sp) -; RV32IM-NEXT: lw s8, 8(sp) -; RV32IM-NEXT: lw s7, 12(sp) -; RV32IM-NEXT: lw s6, 16(sp) -; RV32IM-NEXT: lw s5, 
20(sp) -; RV32IM-NEXT: lw s4, 24(sp) -; RV32IM-NEXT: lw s3, 28(sp) -; RV32IM-NEXT: lw s2, 32(sp) -; RV32IM-NEXT: lw s1, 36(sp) -; RV32IM-NEXT: lw s0, 40(sp) -; RV32IM-NEXT: lw ra, 44(sp) -; RV32IM-NEXT: addi sp, sp, 48 +; RV32IM-NEXT: addi a0, zero, -1 +; RV32IM-NEXT: sw a0, 116(sp) +; RV32IM-NEXT: sw a0, 112(sp) +; RV32IM-NEXT: lui a0, 729444 +; RV32IM-NEXT: addi a0, a0, 712 +; RV32IM-NEXT: sw a0, 108(sp) +; RV32IM-NEXT: lui a0, 364722 +; RV32IM-NEXT: addi a0, a0, 357 +; RV32IM-NEXT: sw a0, 104(sp) +; RV32IM-NEXT: sw s1, 124(sp) +; RV32IM-NEXT: sw s8, 120(sp) +; RV32IM-NEXT: srai a3, s1, 31 +; RV32IM-NEXT: sw a3, 132(sp) +; RV32IM-NEXT: addi a0, sp, 136 +; RV32IM-NEXT: addi a1, sp, 120 +; RV32IM-NEXT: addi a2, sp, 104 +; RV32IM-NEXT: sw a3, 128(sp) +; RV32IM-NEXT: call __multi3 +; RV32IM-NEXT: sw zero, 68(sp) +; RV32IM-NEXT: sw zero, 64(sp) +; RV32IM-NEXT: lui a0, 410452 +; RV32IM-NEXT: addi a0, a0, -952 +; RV32IM-NEXT: sw a0, 60(sp) +; RV32IM-NEXT: lui a0, 25653 +; RV32IM-NEXT: addi a0, a0, 965 +; RV32IM-NEXT: sw a0, 56(sp) +; RV32IM-NEXT: sw s6, 76(sp) +; RV32IM-NEXT: sw s4, 72(sp) +; RV32IM-NEXT: srai a3, s6, 31 +; RV32IM-NEXT: sw a3, 84(sp) +; RV32IM-NEXT: addi a0, sp, 88 +; RV32IM-NEXT: addi a1, sp, 72 +; RV32IM-NEXT: addi a2, sp, 56 +; RV32IM-NEXT: sw a3, 80(sp) +; RV32IM-NEXT: call __multi3 +; RV32IM-NEXT: sw zero, 164(sp) +; RV32IM-NEXT: sw zero, 160(sp) +; RV32IM-NEXT: lui a0, 395996 +; RV32IM-NEXT: addi a0, a0, -2010 +; RV32IM-NEXT: sw a0, 156(sp) +; RV32IM-NEXT: lui a0, 941649 +; RV32IM-NEXT: addi a0, a0, 1959 +; RV32IM-NEXT: sw a0, 152(sp) +; RV32IM-NEXT: sw s5, 172(sp) +; RV32IM-NEXT: sw s2, 168(sp) +; RV32IM-NEXT: srai a3, s5, 31 +; RV32IM-NEXT: sw a3, 180(sp) +; RV32IM-NEXT: addi a0, sp, 184 +; RV32IM-NEXT: addi a1, sp, 168 +; RV32IM-NEXT: addi a2, sp, 152 +; RV32IM-NEXT: sw a3, 176(sp) +; RV32IM-NEXT: call __multi3 +; RV32IM-NEXT: sw zero, 20(sp) +; RV32IM-NEXT: sw zero, 16(sp) +; RV32IM-NEXT: sw zero, 12(sp) +; RV32IM-NEXT: sw zero, 8(sp) +; RV32IM-NEXT: sw s7, 28(sp) +; RV32IM-NEXT: sw s3, 24(sp) +; RV32IM-NEXT: srai a3, s7, 31 +; RV32IM-NEXT: sw a3, 36(sp) +; RV32IM-NEXT: addi a0, sp, 40 +; RV32IM-NEXT: addi a1, sp, 24 +; RV32IM-NEXT: addi a2, sp, 8 +; RV32IM-NEXT: sw a3, 32(sp) +; RV32IM-NEXT: call __multi3 +; RV32IM-NEXT: lw a0, 148(sp) +; RV32IM-NEXT: lw a1, 144(sp) +; RV32IM-NEXT: add a0, a0, s1 +; RV32IM-NEXT: add a2, a1, s8 +; RV32IM-NEXT: sltu a1, a2, a1 +; RV32IM-NEXT: add a0, a0, a1 +; RV32IM-NEXT: slli a1, a0, 28 +; RV32IM-NEXT: srli a2, a2, 4 +; RV32IM-NEXT: or a1, a2, a1 +; RV32IM-NEXT: srli a2, a0, 31 +; RV32IM-NEXT: add a2, a1, a2 +; RV32IM-NEXT: addi a3, zero, 23 +; RV32IM-NEXT: mulhu a4, a2, a3 +; RV32IM-NEXT: sltu a1, a2, a1 +; RV32IM-NEXT: srai a0, a0, 4 +; RV32IM-NEXT: add a0, a0, a1 +; RV32IM-NEXT: mul a0, a0, a3 +; RV32IM-NEXT: add a0, a4, a0 +; RV32IM-NEXT: sub a0, s1, a0 +; RV32IM-NEXT: mul a1, a2, a3 +; RV32IM-NEXT: sltu a2, s8, a1 +; RV32IM-NEXT: lw a3, 100(sp) +; RV32IM-NEXT: lw a4, 96(sp) +; RV32IM-NEXT: sub a6, a0, a2 +; RV32IM-NEXT: sub a7, s8, a1 +; RV32IM-NEXT: slli a2, a3, 24 +; RV32IM-NEXT: srli a4, a4, 8 +; RV32IM-NEXT: or a2, a4, a2 +; RV32IM-NEXT: srli a4, a3, 31 +; RV32IM-NEXT: add a4, a2, a4 +; RV32IM-NEXT: sltu a2, a4, a2 +; RV32IM-NEXT: srai a3, a3, 8 +; RV32IM-NEXT: add a2, a3, a2 +; RV32IM-NEXT: addi a3, zero, 654 +; RV32IM-NEXT: mul a2, a2, a3 +; RV32IM-NEXT: mulhu a5, a4, a3 +; RV32IM-NEXT: add a2, a5, a2 +; RV32IM-NEXT: sub a2, s6, a2 +; RV32IM-NEXT: mul a3, a4, a3 +; RV32IM-NEXT: lw a4, 196(sp) +; RV32IM-NEXT: lw a5, 192(sp) 
+; RV32IM-NEXT: sltu s1, s4, a3 +; RV32IM-NEXT: sub a2, a2, s1 +; RV32IM-NEXT: slli s1, a4, 21 +; RV32IM-NEXT: srli a5, a5, 11 +; RV32IM-NEXT: or a5, a5, s1 +; RV32IM-NEXT: srli s1, a4, 31 +; RV32IM-NEXT: add s1, a5, s1 +; RV32IM-NEXT: sltu a5, s1, a5 +; RV32IM-NEXT: srai a4, a4, 11 +; RV32IM-NEXT: add a4, a4, a5 +; RV32IM-NEXT: lui a5, 1 +; RV32IM-NEXT: addi a5, a5, 1327 +; RV32IM-NEXT: mul a4, a4, a5 +; RV32IM-NEXT: mulhu a0, s1, a5 +; RV32IM-NEXT: add a0, a0, a4 +; RV32IM-NEXT: sub a0, s5, a0 +; RV32IM-NEXT: mul a4, s1, a5 +; RV32IM-NEXT: lw a5, 52(sp) +; RV32IM-NEXT: lw s1, 48(sp) +; RV32IM-NEXT: sltu a1, s2, a4 +; RV32IM-NEXT: sub a0, a0, a1 +; RV32IM-NEXT: add a1, a5, s7 +; RV32IM-NEXT: add a5, s1, s3 +; RV32IM-NEXT: sltu s1, a5, s1 +; RV32IM-NEXT: add a1, a1, s1 +; RV32IM-NEXT: sub a1, s7, a1 +; RV32IM-NEXT: sltu s1, s3, a5 +; RV32IM-NEXT: sub a1, a1, s1 +; RV32IM-NEXT: sub a3, s4, a3 +; RV32IM-NEXT: sub a4, s2, a4 +; RV32IM-NEXT: sub a5, s3, a5 +; RV32IM-NEXT: sw a5, 0(s0) +; RV32IM-NEXT: sw a4, 24(s0) +; RV32IM-NEXT: sw a3, 8(s0) +; RV32IM-NEXT: sw a1, 4(s0) +; RV32IM-NEXT: sw a7, 16(s0) +; RV32IM-NEXT: sw a0, 28(s0) +; RV32IM-NEXT: sw a2, 12(s0) +; RV32IM-NEXT: sw a6, 20(s0) +; RV32IM-NEXT: lw s8, 200(sp) +; RV32IM-NEXT: lw s7, 204(sp) +; RV32IM-NEXT: lw s6, 208(sp) +; RV32IM-NEXT: lw s5, 212(sp) +; RV32IM-NEXT: lw s4, 216(sp) +; RV32IM-NEXT: lw s3, 220(sp) +; RV32IM-NEXT: lw s2, 224(sp) +; RV32IM-NEXT: lw s1, 228(sp) +; RV32IM-NEXT: lw s0, 232(sp) +; RV32IM-NEXT: lw ra, 236(sp) +; RV32IM-NEXT: addi sp, sp, 240 ; RV32IM-NEXT: ret ; ; RV64I-LABEL: dont_fold_srem_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) -; RV64I-NEXT: sd s0, 32(sp) -; RV64I-NEXT: sd s1, 24(sp) -; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: addi sp, sp, -64 +; RV64I-NEXT: sd ra, 56(sp) +; RV64I-NEXT: sd s0, 48(sp) +; RV64I-NEXT: sd s1, 40(sp) +; RV64I-NEXT: sd s2, 32(sp) +; RV64I-NEXT: sd s3, 24(sp) +; RV64I-NEXT: sd s4, 16(sp) +; RV64I-NEXT: sd s5, 8(sp) +; RV64I-NEXT: ld s4, 0(a1) ; RV64I-NEXT: ld s2, 24(a1) +; RV64I-NEXT: ld s0, 8(a1) ; RV64I-NEXT: ld s1, 16(a1) -; RV64I-NEXT: ld a2, 8(a1) -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: addi a1, zero, 654 -; RV64I-NEXT: mv a0, a2 -; RV64I-NEXT: call __moddi3 ; RV64I-NEXT: mv s3, a0 -; RV64I-NEXT: addi a1, zero, 23 +; RV64I-NEXT: srai a1, s1, 63 +; RV64I-NEXT: lui a0, 1043590 +; RV64I-NEXT: addiw a0, a0, -1781 +; RV64I-NEXT: slli a0, a0, 13 +; RV64I-NEXT: addi a0, a0, 1069 +; RV64I-NEXT: slli a0, a0, 12 +; RV64I-NEXT: addi a0, a0, -1959 +; RV64I-NEXT: slli a0, a0, 13 +; RV64I-NEXT: addi a2, a0, 357 +; RV64I-NEXT: addi a3, zero, -1 ; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lui a0, 1 -; RV64I-NEXT: addiw a1, a0, 1327 +; RV64I-NEXT: call __multi3 +; RV64I-NEXT: add a0, a1, s1 +; RV64I-NEXT: srli a1, a0, 63 +; RV64I-NEXT: srai a0, a0, 4 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: addi a1, zero, 23 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s5, s1, a0 +; RV64I-NEXT: srai a1, s0, 63 +; RV64I-NEXT: lui a0, 6413 +; RV64I-NEXT: addiw a0, a0, 1265 +; RV64I-NEXT: slli a0, a0, 13 +; RV64I-NEXT: addi a0, a0, 1027 +; RV64I-NEXT: slli a0, a0, 13 +; RV64I-NEXT: addi a0, a0, 1077 +; RV64I-NEXT: slli a0, a0, 12 +; RV64I-NEXT: addi a2, a0, 965 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a3, zero +; RV64I-NEXT: call __multi3 +; RV64I-NEXT: srli a0, a1, 63 +; RV64I-NEXT: srai a1, a1, 8 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: addi a1, zero, 654 +; RV64I-NEXT: call 
__muldi3 +; RV64I-NEXT: sub s0, s0, a0 +; RV64I-NEXT: srai a1, s2, 63 +; RV64I-NEXT: lui a0, 12375 +; RV64I-NEXT: addiw a0, a0, -575 +; RV64I-NEXT: slli a0, a0, 12 +; RV64I-NEXT: addi a0, a0, 883 +; RV64I-NEXT: slli a0, a0, 13 +; RV64I-NEXT: addi a0, a0, -431 +; RV64I-NEXT: slli a0, a0, 12 +; RV64I-NEXT: addi a2, a0, 1959 ; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: sd zero, 0(s0) -; RV64I-NEXT: sd a0, 24(s0) -; RV64I-NEXT: sd s1, 16(s0) -; RV64I-NEXT: sd s3, 8(s0) -; RV64I-NEXT: ld s3, 8(sp) -; RV64I-NEXT: ld s2, 16(sp) -; RV64I-NEXT: ld s1, 24(sp) -; RV64I-NEXT: ld s0, 32(sp) -; RV64I-NEXT: ld ra, 40(sp) -; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: mv a3, zero +; RV64I-NEXT: call __multi3 +; RV64I-NEXT: srli a0, a1, 63 +; RV64I-NEXT: srai a1, a1, 11 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: lui a1, 1 +; RV64I-NEXT: addiw a1, a1, 1327 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s1, s2, a0 +; RV64I-NEXT: srai a1, s4, 63 +; RV64I-NEXT: mv a0, s4 +; RV64I-NEXT: mv a2, zero +; RV64I-NEXT: mv a3, zero +; RV64I-NEXT: call __multi3 +; RV64I-NEXT: add a0, a1, s4 +; RV64I-NEXT: sub a0, s4, a0 +; RV64I-NEXT: sd a0, 0(s3) +; RV64I-NEXT: sd s1, 24(s3) +; RV64I-NEXT: sd s0, 8(s3) +; RV64I-NEXT: sd s5, 16(s3) +; RV64I-NEXT: ld s5, 8(sp) +; RV64I-NEXT: ld s4, 16(sp) +; RV64I-NEXT: ld s3, 24(sp) +; RV64I-NEXT: ld s2, 32(sp) +; RV64I-NEXT: ld s1, 40(sp) +; RV64I-NEXT: ld s0, 48(sp) +; RV64I-NEXT: ld ra, 56(sp) +; RV64I-NEXT: addi sp, sp, 64 ; RV64I-NEXT: ret ; ; RV64IM-LABEL: dont_fold_srem_i64: diff --git a/llvm/test/CodeGen/RISCV/urem-lkk.ll b/llvm/test/CodeGen/RISCV/urem-lkk.ll --- a/llvm/test/CodeGen/RISCV/urem-lkk.ll +++ b/llvm/test/CodeGen/RISCV/urem-lkk.ll @@ -13,8 +13,22 @@ ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lui a0, 364242 +; RV32I-NEXT: addi a2, a0, 777 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a1, zero +; RV32I-NEXT: mv a3, zero +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: sub a0, s0, a1 +; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: srli a0, a0, 6 ; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s0, a0 +; RV32I-NEXT: lw s0, 8(sp) ; RV32I-NEXT: lw ra, 12(sp) ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret @@ -37,34 +51,41 @@ ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: sd s0, 0(sp) +; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: lui a1, 364242 +; RV64I-NEXT: addiw a1, a1, 777 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: sub a1, s0, a0 +; RV64I-NEXT: srliw a1, a1, 1 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: srli a0, a0, 6 ; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: subw a0, s0, a0 +; RV64I-NEXT: ld s0, 0(sp) ; RV64I-NEXT: ld ra, 8(sp) ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; ; RV64IM-LABEL: fold_urem_positive_odd: ; RV64IM: # %bb.0: -; RV64IM-NEXT: slli a0, a0, 32 -; RV64IM-NEXT: srli a0, a0, 32 -; RV64IM-NEXT: lui a1, 1423 -; RV64IM-NEXT: addiw a1, a1, -733 -; RV64IM-NEXT: slli a1, a1, 15 -; RV64IM-NEXT: addi a1, a1, 1035 -; RV64IM-NEXT: slli a1, a1, 13 -; RV64IM-NEXT: addi a1, a1, -1811 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, 561 -; RV64IM-NEXT: mulhu a1, a0, a1 +; RV64IM-NEXT: slli a1, a0, 32 +; RV64IM-NEXT: srli a1, a1, 32 +; RV64IM-NEXT: lui 
a2, 364242 +; RV64IM-NEXT: addiw a2, a2, 777 +; RV64IM-NEXT: mul a1, a1, a2 +; RV64IM-NEXT: srli a1, a1, 32 ; RV64IM-NEXT: sub a2, a0, a1 -; RV64IM-NEXT: srli a2, a2, 1 +; RV64IM-NEXT: srliw a2, a2, 1 ; RV64IM-NEXT: add a1, a2, a1 ; RV64IM-NEXT: srli a1, a1, 6 ; RV64IM-NEXT: addi a2, zero, 95 ; RV64IM-NEXT: mul a1, a1, a2 -; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: subw a0, a0, a1 ; RV64IM-NEXT: ret %1 = urem i32 %x, 95 ret i32 %1 @@ -76,8 +97,19 @@ ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lui a0, 1012964 +; RV32I-NEXT: addi a2, a0, -61 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a1, zero +; RV32I-NEXT: mv a3, zero +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: srli a0, a1, 10 ; RV32I-NEXT: addi a1, zero, 1060 -; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s0, a0 +; RV32I-NEXT: lw s0, 8(sp) ; RV32I-NEXT: lw ra, 12(sp) ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret @@ -97,31 +129,37 @@ ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: sd s0, 0(sp) +; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: lui a1, 62 +; RV64I-NEXT: addiw a1, a1, -711 +; RV64I-NEXT: slli a1, a1, 14 +; RV64I-NEXT: addi a1, a1, -61 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 42 ; RV64I-NEXT: addi a1, zero, 1060 -; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: subw a0, s0, a0 +; RV64I-NEXT: ld s0, 0(sp) ; RV64I-NEXT: ld ra, 8(sp) ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; ; RV64IM-LABEL: fold_urem_positive_even: ; RV64IM: # %bb.0: -; RV64IM-NEXT: slli a0, a0, 32 -; RV64IM-NEXT: srli a0, a0, 32 -; RV64IM-NEXT: lui a1, 1048020 -; RV64IM-NEXT: addiw a1, a1, -1793 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, 139 -; RV64IM-NEXT: slli a1, a1, 14 -; RV64IM-NEXT: addi a1, a1, 1793 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, -139 -; RV64IM-NEXT: mulhu a1, a0, a1 -; RV64IM-NEXT: srli a1, a1, 10 +; RV64IM-NEXT: slli a1, a0, 32 +; RV64IM-NEXT: srli a1, a1, 32 +; RV64IM-NEXT: lui a2, 62 +; RV64IM-NEXT: addiw a2, a2, -711 +; RV64IM-NEXT: slli a2, a2, 14 +; RV64IM-NEXT: addi a2, a2, -61 +; RV64IM-NEXT: mul a1, a1, a2 +; RV64IM-NEXT: srli a1, a1, 42 ; RV64IM-NEXT: addi a2, zero, 1060 ; RV64IM-NEXT: mul a1, a1, a2 -; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: subw a0, a0, a1 ; RV64IM-NEXT: ret %1 = urem i32 %x, 1060 ret i32 %1 @@ -137,13 +175,21 @@ ; RV32I-NEXT: sw s0, 8(sp) ; RV32I-NEXT: sw s1, 4(sp) ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: call __umodsi3 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: lui a0, 364242 +; RV32I-NEXT: addi a2, a0, 777 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: call __udivsi3 -; RV32I-NEXT: add a0, s1, a0 +; RV32I-NEXT: mv a1, zero +; RV32I-NEXT: mv a3, zero +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: sub a0, s0, a1 +; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: srli s1, a0, 6 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s0, a0 +; RV32I-NEXT: add a0, a0, s1 ; RV32I-NEXT: lw s1, 4(sp) ; RV32I-NEXT: lw s0, 8(sp) ; RV32I-NEXT: lw ra, 12(sp) @@ -171,16 +217,22 @@ ; RV64I-NEXT: sd ra, 24(sp) ; RV64I-NEXT: sd s0, 16(sp) ; RV64I-NEXT: sd s1, 8(sp) +; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: srli s0, a0, 32 -; RV64I-NEXT: addi a1, zero, 95 -; 
RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: lui a1, 364242 +; RV64I-NEXT: addiw a1, a1, 777 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: sub a1, s0, a0 +; RV64I-NEXT: srliw a1, a1, 1 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: srli s1, a0, 6 ; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: call __udivdi3 -; RV64I-NEXT: add a0, s1, a0 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub a0, s0, a0 +; RV64I-NEXT: addw a0, a0, s1 ; RV64I-NEXT: ld s1, 8(sp) ; RV64I-NEXT: ld s0, 16(sp) ; RV64I-NEXT: ld ra, 24(sp) @@ -189,25 +241,20 @@ ; ; RV64IM-LABEL: combine_urem_udiv: ; RV64IM: # %bb.0: -; RV64IM-NEXT: slli a0, a0, 32 -; RV64IM-NEXT: srli a0, a0, 32 -; RV64IM-NEXT: lui a1, 1423 -; RV64IM-NEXT: addiw a1, a1, -733 -; RV64IM-NEXT: slli a1, a1, 15 -; RV64IM-NEXT: addi a1, a1, 1035 -; RV64IM-NEXT: slli a1, a1, 13 -; RV64IM-NEXT: addi a1, a1, -1811 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, 561 -; RV64IM-NEXT: mulhu a1, a0, a1 +; RV64IM-NEXT: slli a1, a0, 32 +; RV64IM-NEXT: srli a1, a1, 32 +; RV64IM-NEXT: lui a2, 364242 +; RV64IM-NEXT: addiw a2, a2, 777 +; RV64IM-NEXT: mul a1, a1, a2 +; RV64IM-NEXT: srli a1, a1, 32 ; RV64IM-NEXT: sub a2, a0, a1 -; RV64IM-NEXT: srli a2, a2, 1 +; RV64IM-NEXT: srliw a2, a2, 1 ; RV64IM-NEXT: add a1, a2, a1 ; RV64IM-NEXT: srli a1, a1, 6 ; RV64IM-NEXT: addi a2, zero, 95 ; RV64IM-NEXT: mul a2, a1, a2 ; RV64IM-NEXT: sub a0, a0, a2 -; RV64IM-NEXT: add a0, a0, a1 +; RV64IM-NEXT: addw a0, a0, a1 ; RV64IM-NEXT: ret %1 = urem i32 %x, 95 %2 = udiv i32 %x, 95 @@ -248,32 +295,123 @@ define i64 @dont_fold_urem_i64(i64 %x) nounwind { ; RV32I-LABEL: dont_fold_urem_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -16 -; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: addi sp, sp, -64 +; RV32I-NEXT: sw ra, 60(sp) +; RV32I-NEXT: sw s0, 56(sp) +; RV32I-NEXT: sw s1, 52(sp) +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: sw zero, 12(sp) +; RV32I-NEXT: sw zero, 8(sp) +; RV32I-NEXT: sw zero, 28(sp) +; RV32I-NEXT: sw zero, 24(sp) +; RV32I-NEXT: lui a0, 342392 +; RV32I-NEXT: addi a0, a0, 668 +; RV32I-NEXT: sw a0, 4(sp) +; RV32I-NEXT: lui a0, 770382 +; RV32I-NEXT: addi a0, a0, 1505 +; RV32I-NEXT: sw a0, 0(sp) +; RV32I-NEXT: srli a0, a1, 1 +; RV32I-NEXT: sw a0, 20(sp) +; RV32I-NEXT: slli a0, a1, 31 +; RV32I-NEXT: srli a1, s1, 1 +; RV32I-NEXT: or a3, a1, a0 +; RV32I-NEXT: addi a0, sp, 32 +; RV32I-NEXT: addi a1, sp, 16 +; RV32I-NEXT: mv a2, sp +; RV32I-NEXT: sw a3, 16(sp) +; RV32I-NEXT: call __multi3 +; RV32I-NEXT: lw a1, 44(sp) +; RV32I-NEXT: lw a0, 40(sp) +; RV32I-NEXT: slli a2, a1, 28 +; RV32I-NEXT: srli a0, a0, 4 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: srli a1, a1, 4 ; RV32I-NEXT: addi a2, zero, 98 ; RV32I-NEXT: mv a3, zero -; RV32I-NEXT: call __umoddi3 -; RV32I-NEXT: lw ra, 12(sp) -; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: sltu a2, s1, a0 +; RV32I-NEXT: sub a1, s0, a1 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: sub a0, s1, a0 +; RV32I-NEXT: lw s1, 52(sp) +; RV32I-NEXT: lw s0, 56(sp) +; RV32I-NEXT: lw ra, 60(sp) +; RV32I-NEXT: addi sp, sp, 64 ; RV32I-NEXT: ret ; ; RV32IM-LABEL: dont_fold_urem_i64: ; RV32IM: # %bb.0: -; RV32IM-NEXT: addi sp, sp, -16 -; RV32IM-NEXT: sw ra, 12(sp) -; RV32IM-NEXT: addi a2, zero, 98 -; RV32IM-NEXT: mv a3, zero -; RV32IM-NEXT: call __umoddi3 -; RV32IM-NEXT: lw ra, 12(sp) -; RV32IM-NEXT: addi sp, sp, 16 +; RV32IM-NEXT: addi sp, sp, -64 +; 
RV32IM-NEXT: sw ra, 60(sp) +; RV32IM-NEXT: sw s0, 56(sp) +; RV32IM-NEXT: sw s1, 52(sp) +; RV32IM-NEXT: mv s0, a1 +; RV32IM-NEXT: mv s1, a0 +; RV32IM-NEXT: sw zero, 12(sp) +; RV32IM-NEXT: sw zero, 8(sp) +; RV32IM-NEXT: sw zero, 28(sp) +; RV32IM-NEXT: sw zero, 24(sp) +; RV32IM-NEXT: lui a0, 342392 +; RV32IM-NEXT: addi a0, a0, 668 +; RV32IM-NEXT: sw a0, 4(sp) +; RV32IM-NEXT: lui a0, 770382 +; RV32IM-NEXT: addi a0, a0, 1505 +; RV32IM-NEXT: sw a0, 0(sp) +; RV32IM-NEXT: srli a0, a1, 1 +; RV32IM-NEXT: sw a0, 20(sp) +; RV32IM-NEXT: slli a0, a1, 31 +; RV32IM-NEXT: srli a1, s1, 1 +; RV32IM-NEXT: or a3, a1, a0 +; RV32IM-NEXT: addi a0, sp, 32 +; RV32IM-NEXT: addi a1, sp, 16 +; RV32IM-NEXT: mv a2, sp +; RV32IM-NEXT: sw a3, 16(sp) +; RV32IM-NEXT: call __multi3 +; RV32IM-NEXT: lw a0, 44(sp) +; RV32IM-NEXT: srli a1, a0, 4 +; RV32IM-NEXT: lw a2, 40(sp) +; RV32IM-NEXT: addi a3, zero, 98 +; RV32IM-NEXT: mul a1, a1, a3 +; RV32IM-NEXT: slli a0, a0, 28 +; RV32IM-NEXT: srli a2, a2, 4 +; RV32IM-NEXT: or a0, a2, a0 +; RV32IM-NEXT: mulhu a2, a0, a3 +; RV32IM-NEXT: add a1, a2, a1 +; RV32IM-NEXT: sub a1, s0, a1 +; RV32IM-NEXT: mul a0, a0, a3 +; RV32IM-NEXT: sltu a2, s1, a0 +; RV32IM-NEXT: sub a1, a1, a2 +; RV32IM-NEXT: sub a0, s1, a0 +; RV32IM-NEXT: lw s1, 52(sp) +; RV32IM-NEXT: lw s0, 56(sp) +; RV32IM-NEXT: lw ra, 60(sp) +; RV32IM-NEXT: addi sp, sp, 64 ; RV32IM-NEXT: ret ; ; RV64I-LABEL: dont_fold_urem_i64: ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: sd s0, 0(sp) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: srli a0, a0, 1 +; RV64I-NEXT: lui a1, 2675 +; RV64I-NEXT: addiw a1, a1, -251 +; RV64I-NEXT: slli a1, a1, 13 +; RV64I-NEXT: addi a1, a1, 1839 +; RV64I-NEXT: slli a1, a1, 13 +; RV64I-NEXT: addi a1, a1, 167 +; RV64I-NEXT: slli a1, a1, 13 +; RV64I-NEXT: addi a2, a1, 1505 +; RV64I-NEXT: mv a1, zero +; RV64I-NEXT: mv a3, zero +; RV64I-NEXT: call __multi3 +; RV64I-NEXT: srli a0, a1, 4 ; RV64I-NEXT: addi a1, zero, 98 -; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub a0, s0, a0 +; RV64I-NEXT: ld s0, 0(sp) ; RV64I-NEXT: ld ra, 8(sp) ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll --- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll @@ -20,30 +20,55 @@ ; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: sw s4, 8(sp) ; RV32I-NEXT: sw s5, 4(sp) -; RV32I-NEXT: lhu s2, 12(a1) +; RV32I-NEXT: lhu s2, 0(a1) ; RV32I-NEXT: lhu s3, 8(a1) -; RV32I-NEXT: lhu s0, 4(a1) -; RV32I-NEXT: lhu a2, 0(a1) -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: call __umodsi3 -; RV32I-NEXT: mv s4, a0 -; RV32I-NEXT: addi a1, zero, 124 -; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: call __umodsi3 -; RV32I-NEXT: mv s5, a0 -; RV32I-NEXT: addi a1, zero, 98 -; RV32I-NEXT: mv a0, s3 -; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: lhu s4, 4(a1) +; RV32I-NEXT: lhu s1, 12(a1) ; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: addi a1, zero, 1373 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: sub a1, s1, a0 +; RV32I-NEXT: lui a2, 16 +; RV32I-NEXT: addi a3, a2, -2 +; RV32I-NEXT: and a1, a1, a3 +; RV32I-NEXT: srli a1, a1, 1 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: addi a1, a2, -512 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: srli a0, a0, 9 ; RV32I-NEXT: addi a1, zero, 1003 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s5, s1, a0 +; RV32I-NEXT: srli a0, s4, 2 +; RV32I-NEXT: lui 
a1, 4 +; RV32I-NEXT: addi a1, a1, 529 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 19 +; RV32I-NEXT: addi a1, zero, 124 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s4, s4, a0 +; RV32I-NEXT: srli a0, s3, 1 +; RV32I-NEXT: lui a1, 1 +; RV32I-NEXT: addi a1, a1, -1421 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 17 +; RV32I-NEXT: addi a1, zero, 98 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s1, s3, a0 +; RV32I-NEXT: lui a0, 11 +; RV32I-NEXT: addi a1, a0, -905 ; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: call __umodsi3 -; RV32I-NEXT: sh a0, 6(s1) -; RV32I-NEXT: sh s0, 4(s1) -; RV32I-NEXT: sh s5, 2(s1) -; RV32I-NEXT: sh s4, 0(s1) +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 22 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s2, a0 +; RV32I-NEXT: sh a0, 0(s0) +; RV32I-NEXT: sh s1, 4(s0) +; RV32I-NEXT: sh s4, 2(s0) +; RV32I-NEXT: sh s5, 6(s0) ; RV32I-NEXT: lw s5, 4(sp) ; RV32I-NEXT: lw s4, 8(sp) ; RV32I-NEXT: lw s3, 12(sp) @@ -56,46 +81,50 @@ ; ; RV32IM-LABEL: fold_urem_vec_1: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lhu a6, 12(a1) -; RV32IM-NEXT: lhu a3, 8(a1) -; RV32IM-NEXT: lhu a4, 0(a1) +; RV32IM-NEXT: lhu a6, 0(a1) +; RV32IM-NEXT: lhu a3, 12(a1) +; RV32IM-NEXT: lhu a7, 8(a1) ; RV32IM-NEXT: lhu a1, 4(a1) -; RV32IM-NEXT: lui a5, 364242 -; RV32IM-NEXT: addi a5, a5, 777 -; RV32IM-NEXT: mulhu a5, a4, a5 -; RV32IM-NEXT: sub a2, a4, a5 +; RV32IM-NEXT: addi a5, zero, 1373 +; RV32IM-NEXT: mul a5, a3, a5 +; RV32IM-NEXT: srli a5, a5, 16 +; RV32IM-NEXT: sub a2, a3, a5 +; RV32IM-NEXT: lui a4, 16 +; RV32IM-NEXT: addi a4, a4, -2 +; RV32IM-NEXT: and a2, a2, a4 ; RV32IM-NEXT: srli a2, a2, 1 ; RV32IM-NEXT: add a2, a2, a5 -; RV32IM-NEXT: srli a2, a2, 6 +; RV32IM-NEXT: srli a2, a2, 9 +; RV32IM-NEXT: addi a4, zero, 1003 +; RV32IM-NEXT: mul a2, a2, a4 +; RV32IM-NEXT: sub a2, a3, a2 +; RV32IM-NEXT: srli a3, a1, 2 +; RV32IM-NEXT: lui a4, 4 +; RV32IM-NEXT: addi a4, a4, 529 +; RV32IM-NEXT: mul a3, a3, a4 +; RV32IM-NEXT: srli a3, a3, 19 +; RV32IM-NEXT: addi a4, zero, 124 +; RV32IM-NEXT: mul a3, a3, a4 +; RV32IM-NEXT: sub a1, a1, a3 +; RV32IM-NEXT: srli a3, a7, 1 +; RV32IM-NEXT: lui a4, 1 +; RV32IM-NEXT: addi a4, a4, -1421 +; RV32IM-NEXT: mul a3, a3, a4 +; RV32IM-NEXT: srli a3, a3, 17 +; RV32IM-NEXT: addi a4, zero, 98 +; RV32IM-NEXT: mul a3, a3, a4 +; RV32IM-NEXT: sub a3, a7, a3 +; RV32IM-NEXT: lui a4, 11 +; RV32IM-NEXT: addi a4, a4, -905 +; RV32IM-NEXT: mul a4, a6, a4 +; RV32IM-NEXT: srli a4, a4, 22 ; RV32IM-NEXT: addi a5, zero, 95 -; RV32IM-NEXT: mul a2, a2, a5 -; RV32IM-NEXT: sub a2, a4, a2 -; RV32IM-NEXT: srli a4, a1, 2 -; RV32IM-NEXT: lui a5, 135300 -; RV32IM-NEXT: addi a5, a5, 529 -; RV32IM-NEXT: mulhu a4, a4, a5 -; RV32IM-NEXT: srli a4, a4, 2 -; RV32IM-NEXT: addi a5, zero, 124 -; RV32IM-NEXT: mul a4, a4, a5 -; RV32IM-NEXT: sub a1, a1, a4 -; RV32IM-NEXT: lui a4, 342392 -; RV32IM-NEXT: addi a4, a4, 669 -; RV32IM-NEXT: mulhu a4, a3, a4 -; RV32IM-NEXT: srli a4, a4, 5 -; RV32IM-NEXT: addi a5, zero, 98 -; RV32IM-NEXT: mul a4, a4, a5 -; RV32IM-NEXT: sub a3, a3, a4 -; RV32IM-NEXT: lui a4, 267633 -; RV32IM-NEXT: addi a4, a4, -1809 -; RV32IM-NEXT: mulhu a4, a6, a4 -; RV32IM-NEXT: srli a4, a4, 8 -; RV32IM-NEXT: addi a5, zero, 1003 ; RV32IM-NEXT: mul a4, a4, a5 ; RV32IM-NEXT: sub a4, a6, a4 -; RV32IM-NEXT: sh a4, 6(a0) +; RV32IM-NEXT: sh a4, 0(a0) ; RV32IM-NEXT: sh a3, 4(a0) ; RV32IM-NEXT: sh a1, 2(a0) -; RV32IM-NEXT: sh a2, 0(a0) +; RV32IM-NEXT: sh a2, 6(a0) ; RV32IM-NEXT: ret ; ; RV64I-LABEL: fold_urem_vec_1: @@ -108,30 +137,55 @@ ; RV64I-NEXT: sd s3, 
24(sp) ; RV64I-NEXT: sd s4, 16(sp) ; RV64I-NEXT: sd s5, 8(sp) -; RV64I-NEXT: lhu s2, 24(a1) +; RV64I-NEXT: lhu s2, 0(a1) ; RV64I-NEXT: lhu s3, 16(a1) -; RV64I-NEXT: lhu s0, 8(a1) -; RV64I-NEXT: lhu a2, 0(a1) -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, a2 -; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: mv s4, a0 -; RV64I-NEXT: addi a1, zero, 124 -; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: mv s5, a0 -; RV64I-NEXT: addi a1, zero, 98 -; RV64I-NEXT: mv a0, s3 -; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: lhu s4, 8(a1) +; RV64I-NEXT: lhu s1, 24(a1) ; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: addi a1, zero, 1373 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: sub a1, s1, a0 +; RV64I-NEXT: lui a2, 16 +; RV64I-NEXT: addiw a3, a2, -2 +; RV64I-NEXT: and a1, a1, a3 +; RV64I-NEXT: srli a1, a1, 1 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: addiw a1, a2, -512 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: srli a0, a0, 9 ; RV64I-NEXT: addi a1, zero, 1003 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s5, s1, a0 +; RV64I-NEXT: srli a0, s4, 2 +; RV64I-NEXT: lui a1, 4 +; RV64I-NEXT: addiw a1, a1, 529 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 19 +; RV64I-NEXT: addi a1, zero, 124 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s4, s4, a0 +; RV64I-NEXT: srli a0, s3, 1 +; RV64I-NEXT: lui a1, 1 +; RV64I-NEXT: addiw a1, a1, -1421 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 17 +; RV64I-NEXT: addi a1, zero, 98 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s1, s3, a0 +; RV64I-NEXT: lui a0, 11 +; RV64I-NEXT: addiw a1, a0, -905 ; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: sh a0, 6(s1) -; RV64I-NEXT: sh s0, 4(s1) -; RV64I-NEXT: sh s5, 2(s1) -; RV64I-NEXT: sh s4, 0(s1) +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 22 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub a0, s2, a0 +; RV64I-NEXT: sh a0, 0(s0) +; RV64I-NEXT: sh s1, 4(s0) +; RV64I-NEXT: sh s4, 2(s0) +; RV64I-NEXT: sh s5, 6(s0) ; RV64I-NEXT: ld s5, 8(sp) ; RV64I-NEXT: ld s4, 16(sp) ; RV64I-NEXT: ld s3, 24(sp) @@ -144,71 +198,50 @@ ; ; RV64IM-LABEL: fold_urem_vec_1: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lhu a6, 24(a1) -; RV64IM-NEXT: lhu a3, 16(a1) -; RV64IM-NEXT: lhu a4, 8(a1) -; RV64IM-NEXT: lhu a1, 0(a1) -; RV64IM-NEXT: lui a5, 1423 -; RV64IM-NEXT: addiw a5, a5, -733 -; RV64IM-NEXT: slli a5, a5, 15 -; RV64IM-NEXT: addi a5, a5, 1035 -; RV64IM-NEXT: slli a5, a5, 13 -; RV64IM-NEXT: addi a5, a5, -1811 -; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: addi a5, a5, 561 -; RV64IM-NEXT: mulhu a5, a1, a5 -; RV64IM-NEXT: sub a2, a1, a5 +; RV64IM-NEXT: lhu a6, 0(a1) +; RV64IM-NEXT: lhu a3, 24(a1) +; RV64IM-NEXT: lhu a7, 16(a1) +; RV64IM-NEXT: lhu a1, 8(a1) +; RV64IM-NEXT: addi a5, zero, 1373 +; RV64IM-NEXT: mul a5, a3, a5 +; RV64IM-NEXT: srli a5, a5, 16 +; RV64IM-NEXT: sub a2, a3, a5 +; RV64IM-NEXT: lui a4, 16 +; RV64IM-NEXT: addiw a4, a4, -2 +; RV64IM-NEXT: and a2, a2, a4 ; RV64IM-NEXT: srli a2, a2, 1 ; RV64IM-NEXT: add a2, a2, a5 -; RV64IM-NEXT: srli a2, a2, 6 +; RV64IM-NEXT: srli a2, a2, 9 +; RV64IM-NEXT: addi a4, zero, 1003 +; RV64IM-NEXT: mul a2, a2, a4 +; RV64IM-NEXT: sub a2, a3, a2 +; RV64IM-NEXT: srli a3, a1, 2 +; RV64IM-NEXT: lui a4, 4 +; RV64IM-NEXT: addiw a4, a4, 529 +; RV64IM-NEXT: mul a3, a3, a4 +; RV64IM-NEXT: srli a3, a3, 19 +; RV64IM-NEXT: addi a4, zero, 124 +; RV64IM-NEXT: mul a3, a3, a4 +; RV64IM-NEXT: sub a1, a1, a3 +; RV64IM-NEXT: srli a3, a7, 1 +; 
RV64IM-NEXT: lui a4, 1 +; RV64IM-NEXT: addiw a4, a4, -1421 +; RV64IM-NEXT: mul a3, a3, a4 +; RV64IM-NEXT: srli a3, a3, 17 +; RV64IM-NEXT: addi a4, zero, 98 +; RV64IM-NEXT: mul a3, a3, a4 +; RV64IM-NEXT: sub a3, a7, a3 +; RV64IM-NEXT: lui a4, 11 +; RV64IM-NEXT: addiw a4, a4, -905 +; RV64IM-NEXT: mul a4, a6, a4 +; RV64IM-NEXT: srli a4, a4, 22 ; RV64IM-NEXT: addi a5, zero, 95 -; RV64IM-NEXT: mul a2, a2, a5 -; RV64IM-NEXT: sub a1, a1, a2 -; RV64IM-NEXT: srli a2, a4, 2 -; RV64IM-NEXT: lui a5, 264 -; RV64IM-NEXT: addiw a5, a5, 1057 -; RV64IM-NEXT: slli a5, a5, 15 -; RV64IM-NEXT: addi a5, a5, 1057 -; RV64IM-NEXT: slli a5, a5, 15 -; RV64IM-NEXT: addi a5, a5, 1057 -; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: addi a5, a5, 133 -; RV64IM-NEXT: mulhu a2, a2, a5 -; RV64IM-NEXT: srli a2, a2, 3 -; RV64IM-NEXT: addi a5, zero, 124 -; RV64IM-NEXT: mul a2, a2, a5 -; RV64IM-NEXT: sub a2, a4, a2 -; RV64IM-NEXT: srli a4, a3, 1 -; RV64IM-NEXT: lui a5, 2675 -; RV64IM-NEXT: addiw a5, a5, -251 -; RV64IM-NEXT: slli a5, a5, 13 -; RV64IM-NEXT: addi a5, a5, 1839 -; RV64IM-NEXT: slli a5, a5, 13 -; RV64IM-NEXT: addi a5, a5, 167 -; RV64IM-NEXT: slli a5, a5, 13 -; RV64IM-NEXT: addi a5, a5, 1505 -; RV64IM-NEXT: mulhu a4, a4, a5 -; RV64IM-NEXT: srli a4, a4, 4 -; RV64IM-NEXT: addi a5, zero, 98 -; RV64IM-NEXT: mul a4, a4, a5 -; RV64IM-NEXT: sub a3, a3, a4 -; RV64IM-NEXT: lui a4, 8364 -; RV64IM-NEXT: addiw a4, a4, -1977 -; RV64IM-NEXT: slli a4, a4, 12 -; RV64IM-NEXT: addi a4, a4, 1907 -; RV64IM-NEXT: slli a4, a4, 12 -; RV64IM-NEXT: addi a4, a4, 453 -; RV64IM-NEXT: slli a4, a4, 12 -; RV64IM-NEXT: addi a4, a4, 1213 -; RV64IM-NEXT: mulhu a4, a6, a4 -; RV64IM-NEXT: srli a4, a4, 7 -; RV64IM-NEXT: addi a5, zero, 1003 ; RV64IM-NEXT: mul a4, a4, a5 ; RV64IM-NEXT: sub a4, a6, a4 -; RV64IM-NEXT: sh a4, 6(a0) +; RV64IM-NEXT: sh a4, 0(a0) ; RV64IM-NEXT: sh a3, 4(a0) -; RV64IM-NEXT: sh a2, 2(a0) -; RV64IM-NEXT: sh a1, 0(a0) +; RV64IM-NEXT: sh a1, 2(a0) +; RV64IM-NEXT: sh a2, 6(a0) ; RV64IM-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -225,30 +258,47 @@ ; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: sw s4, 8(sp) ; RV32I-NEXT: sw s5, 4(sp) +; RV32I-NEXT: sw s6, 0(sp) ; RV32I-NEXT: lhu s2, 12(a1) ; RV32I-NEXT: lhu s3, 8(a1) -; RV32I-NEXT: lhu s0, 4(a1) -; RV32I-NEXT: lhu a2, 0(a1) -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: call __umodsi3 -; RV32I-NEXT: mv s4, a0 -; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: lhu s4, 4(a1) +; RV32I-NEXT: lhu s1, 0(a1) ; RV32I-NEXT: mv s5, a0 +; RV32I-NEXT: lui a0, 11 +; RV32I-NEXT: addi s0, a0, -905 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 22 ; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s6, s1, a0 +; RV32I-NEXT: mv a0, s4 +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 22 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s4, s4, a0 ; RV32I-NEXT: mv a0, s3 -; RV32I-NEXT: call __umodsi3 -; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 22 ; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s1, s3, a0 ; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: call __umodsi3 -; RV32I-NEXT: sh a0, 6(s1) -; RV32I-NEXT: sh s0, 4(s1) -; RV32I-NEXT: sh s5, 2(s1) -; RV32I-NEXT: sh s4, 0(s1) +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 22 +; 
RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s2, a0 +; RV32I-NEXT: sh a0, 6(s5) +; RV32I-NEXT: sh s1, 4(s5) +; RV32I-NEXT: sh s4, 2(s5) +; RV32I-NEXT: sh s6, 0(s5) +; RV32I-NEXT: lw s6, 0(sp) ; RV32I-NEXT: lw s5, 4(sp) ; RV32I-NEXT: lw s4, 8(sp) ; RV32I-NEXT: lw s3, 12(sp) @@ -262,44 +312,32 @@ ; RV32IM-LABEL: fold_urem_vec_2: ; RV32IM: # %bb.0: ; RV32IM-NEXT: lhu a6, 12(a1) -; RV32IM-NEXT: lhu a7, 8(a1) +; RV32IM-NEXT: lhu a3, 8(a1) ; RV32IM-NEXT: lhu a4, 0(a1) ; RV32IM-NEXT: lhu a1, 4(a1) -; RV32IM-NEXT: lui a5, 364242 -; RV32IM-NEXT: addi a5, a5, 777 -; RV32IM-NEXT: mulhu a2, a4, a5 -; RV32IM-NEXT: sub a3, a4, a2 -; RV32IM-NEXT: srli a3, a3, 1 -; RV32IM-NEXT: add a2, a3, a2 -; RV32IM-NEXT: srli a2, a2, 6 -; RV32IM-NEXT: addi a3, zero, 95 -; RV32IM-NEXT: mul a2, a2, a3 -; RV32IM-NEXT: sub t0, a4, a2 -; RV32IM-NEXT: mulhu a4, a1, a5 -; RV32IM-NEXT: sub a2, a1, a4 -; RV32IM-NEXT: srli a2, a2, 1 -; RV32IM-NEXT: add a2, a2, a4 -; RV32IM-NEXT: srli a2, a2, 6 -; RV32IM-NEXT: mul a2, a2, a3 -; RV32IM-NEXT: sub a1, a1, a2 -; RV32IM-NEXT: mulhu a2, a7, a5 -; RV32IM-NEXT: sub a4, a7, a2 -; RV32IM-NEXT: srli a4, a4, 1 -; RV32IM-NEXT: add a2, a4, a2 -; RV32IM-NEXT: srli a2, a2, 6 -; RV32IM-NEXT: mul a2, a2, a3 -; RV32IM-NEXT: sub a2, a7, a2 -; RV32IM-NEXT: mulhu a4, a6, a5 -; RV32IM-NEXT: sub a5, a6, a4 -; RV32IM-NEXT: srli a5, a5, 1 -; RV32IM-NEXT: add a4, a5, a4 -; RV32IM-NEXT: srli a4, a4, 6 -; RV32IM-NEXT: mul a3, a4, a3 -; RV32IM-NEXT: sub a3, a6, a3 -; RV32IM-NEXT: sh a3, 6(a0) -; RV32IM-NEXT: sh a2, 4(a0) +; RV32IM-NEXT: lui a5, 11 +; RV32IM-NEXT: addi a5, a5, -905 +; RV32IM-NEXT: mul a2, a4, a5 +; RV32IM-NEXT: srli a2, a2, 22 +; RV32IM-NEXT: addi a7, zero, 95 +; RV32IM-NEXT: mul a2, a2, a7 +; RV32IM-NEXT: sub a2, a4, a2 +; RV32IM-NEXT: mul a4, a1, a5 +; RV32IM-NEXT: srli a4, a4, 22 +; RV32IM-NEXT: mul a4, a4, a7 +; RV32IM-NEXT: sub a1, a1, a4 +; RV32IM-NEXT: mul a4, a3, a5 +; RV32IM-NEXT: srli a4, a4, 22 +; RV32IM-NEXT: mul a4, a4, a7 +; RV32IM-NEXT: sub a3, a3, a4 +; RV32IM-NEXT: mul a4, a6, a5 +; RV32IM-NEXT: srli a4, a4, 22 +; RV32IM-NEXT: mul a4, a4, a7 +; RV32IM-NEXT: sub a4, a6, a4 +; RV32IM-NEXT: sh a4, 6(a0) +; RV32IM-NEXT: sh a3, 4(a0) ; RV32IM-NEXT: sh a1, 2(a0) -; RV32IM-NEXT: sh t0, 0(a0) +; RV32IM-NEXT: sh a2, 0(a0) ; RV32IM-NEXT: ret ; ; RV64I-LABEL: fold_urem_vec_2: @@ -312,30 +350,47 @@ ; RV64I-NEXT: sd s3, 24(sp) ; RV64I-NEXT: sd s4, 16(sp) ; RV64I-NEXT: sd s5, 8(sp) +; RV64I-NEXT: sd s6, 0(sp) ; RV64I-NEXT: lhu s2, 24(a1) ; RV64I-NEXT: lhu s3, 16(a1) -; RV64I-NEXT: lhu s0, 8(a1) -; RV64I-NEXT: lhu a2, 0(a1) -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, a2 -; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: mv s4, a0 -; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: lhu s4, 8(a1) +; RV64I-NEXT: lhu s1, 0(a1) ; RV64I-NEXT: mv s5, a0 +; RV64I-NEXT: lui a0, 11 +; RV64I-NEXT: addiw s0, a0, -905 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 22 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s6, s1, a0 +; RV64I-NEXT: mv a0, s4 +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 22 ; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s4, s4, a0 ; RV64I-NEXT: mv a0, s3 -; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 22 ; RV64I-NEXT: 
addi a1, zero, 95 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s1, s3, a0 ; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: sh a0, 6(s1) -; RV64I-NEXT: sh s0, 4(s1) -; RV64I-NEXT: sh s5, 2(s1) -; RV64I-NEXT: sh s4, 0(s1) +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 22 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub a0, s2, a0 +; RV64I-NEXT: sh a0, 6(s5) +; RV64I-NEXT: sh s1, 4(s5) +; RV64I-NEXT: sh s4, 2(s5) +; RV64I-NEXT: sh s6, 0(s5) +; RV64I-NEXT: ld s6, 0(sp) ; RV64I-NEXT: ld s5, 8(sp) ; RV64I-NEXT: ld s4, 16(sp) ; RV64I-NEXT: ld s3, 24(sp) @@ -349,50 +404,32 @@ ; RV64IM-LABEL: fold_urem_vec_2: ; RV64IM: # %bb.0: ; RV64IM-NEXT: lhu a6, 24(a1) -; RV64IM-NEXT: lhu a7, 16(a1) -; RV64IM-NEXT: lhu a4, 8(a1) -; RV64IM-NEXT: lhu a1, 0(a1) -; RV64IM-NEXT: lui a5, 1423 -; RV64IM-NEXT: addiw a5, a5, -733 -; RV64IM-NEXT: slli a5, a5, 15 -; RV64IM-NEXT: addi a5, a5, 1035 -; RV64IM-NEXT: slli a5, a5, 13 -; RV64IM-NEXT: addi a5, a5, -1811 -; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: addi a5, a5, 561 -; RV64IM-NEXT: mulhu a2, a1, a5 -; RV64IM-NEXT: sub a3, a1, a2 -; RV64IM-NEXT: srli a3, a3, 1 -; RV64IM-NEXT: add a2, a3, a2 -; RV64IM-NEXT: srli a2, a2, 6 -; RV64IM-NEXT: addi a3, zero, 95 -; RV64IM-NEXT: mul a2, a2, a3 -; RV64IM-NEXT: sub t0, a1, a2 -; RV64IM-NEXT: mulhu a2, a4, a5 -; RV64IM-NEXT: sub a1, a4, a2 -; RV64IM-NEXT: srli a1, a1, 1 -; RV64IM-NEXT: add a1, a1, a2 -; RV64IM-NEXT: srli a1, a1, 6 -; RV64IM-NEXT: mul a1, a1, a3 -; RV64IM-NEXT: sub a1, a4, a1 -; RV64IM-NEXT: mulhu a2, a7, a5 -; RV64IM-NEXT: sub a4, a7, a2 -; RV64IM-NEXT: srli a4, a4, 1 -; RV64IM-NEXT: add a2, a4, a2 -; RV64IM-NEXT: srli a2, a2, 6 -; RV64IM-NEXT: mul a2, a2, a3 -; RV64IM-NEXT: sub a2, a7, a2 -; RV64IM-NEXT: mulhu a4, a6, a5 -; RV64IM-NEXT: sub a5, a6, a4 -; RV64IM-NEXT: srli a5, a5, 1 -; RV64IM-NEXT: add a4, a5, a4 -; RV64IM-NEXT: srli a4, a4, 6 -; RV64IM-NEXT: mul a3, a4, a3 -; RV64IM-NEXT: sub a3, a6, a3 -; RV64IM-NEXT: sh a3, 6(a0) -; RV64IM-NEXT: sh a2, 4(a0) +; RV64IM-NEXT: lhu a3, 16(a1) +; RV64IM-NEXT: lhu a4, 0(a1) +; RV64IM-NEXT: lhu a1, 8(a1) +; RV64IM-NEXT: lui a5, 11 +; RV64IM-NEXT: addiw a5, a5, -905 +; RV64IM-NEXT: mul a2, a4, a5 +; RV64IM-NEXT: srli a2, a2, 22 +; RV64IM-NEXT: addi a7, zero, 95 +; RV64IM-NEXT: mul a2, a2, a7 +; RV64IM-NEXT: sub a2, a4, a2 +; RV64IM-NEXT: mul a4, a1, a5 +; RV64IM-NEXT: srli a4, a4, 22 +; RV64IM-NEXT: mul a4, a4, a7 +; RV64IM-NEXT: sub a1, a1, a4 +; RV64IM-NEXT: mul a4, a3, a5 +; RV64IM-NEXT: srli a4, a4, 22 +; RV64IM-NEXT: mul a4, a4, a7 +; RV64IM-NEXT: sub a3, a3, a4 +; RV64IM-NEXT: mul a4, a6, a5 +; RV64IM-NEXT: srli a4, a4, 22 +; RV64IM-NEXT: mul a4, a4, a7 +; RV64IM-NEXT: sub a4, a6, a4 +; RV64IM-NEXT: sh a4, 6(a0) +; RV64IM-NEXT: sh a3, 4(a0) ; RV64IM-NEXT: sh a1, 2(a0) -; RV64IM-NEXT: sh t0, 0(a0) +; RV64IM-NEXT: sh a2, 0(a0) ; RV64IM-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -414,52 +451,53 @@ ; RV32I-NEXT: sw s6, 16(sp) ; RV32I-NEXT: sw s7, 12(sp) ; RV32I-NEXT: sw s8, 8(sp) -; RV32I-NEXT: sw s9, 4(sp) ; RV32I-NEXT: lhu s2, 0(a1) ; RV32I-NEXT: lhu s3, 4(a1) ; RV32I-NEXT: lhu s4, 8(a1) ; RV32I-NEXT: lhu s1, 12(a1) -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: call __umodsi3 ; RV32I-NEXT: mv s5, a0 -; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: mv a0, s4 -; RV32I-NEXT: call __umodsi3 -; RV32I-NEXT: mv s6, a0 -; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: mv a0, s3 -; RV32I-NEXT: call __umodsi3 -; 
RV32I-NEXT: mv s7, a0 -; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: call __umodsi3 -; RV32I-NEXT: mv s8, a0 -; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: lui a0, 11 +; RV32I-NEXT: addi s0, a0, -905 ; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: call __udivsi3 -; RV32I-NEXT: mv s9, a0 +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli s6, a0, 22 ; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s6 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s8, s1, a0 ; RV32I-NEXT: mv a0, s4 -; RV32I-NEXT: call __udivsi3 -; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli s7, a0, 22 ; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s7 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s4, s4, a0 ; RV32I-NEXT: mv a0, s3 -; RV32I-NEXT: call __udivsi3 -; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli s1, a0, 22 ; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s3, s3, a0 ; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: call __udivsi3 -; RV32I-NEXT: add a0, s8, a0 -; RV32I-NEXT: add a1, s7, s1 -; RV32I-NEXT: add a2, s6, s4 -; RV32I-NEXT: add a3, s5, s9 -; RV32I-NEXT: sh a3, 6(s0) -; RV32I-NEXT: sh a2, 4(s0) -; RV32I-NEXT: sh a1, 2(s0) -; RV32I-NEXT: sh a0, 0(s0) -; RV32I-NEXT: lw s9, 4(sp) +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli s0, a0, 22 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s2, a0 +; RV32I-NEXT: add a0, a0, s0 +; RV32I-NEXT: add a1, s3, s1 +; RV32I-NEXT: add a2, s4, s7 +; RV32I-NEXT: add a3, s8, s6 +; RV32I-NEXT: sh a3, 6(s5) +; RV32I-NEXT: sh a2, 4(s5) +; RV32I-NEXT: sh a1, 2(s5) +; RV32I-NEXT: sh a0, 0(s5) ; RV32I-NEXT: lw s8, 8(sp) ; RV32I-NEXT: lw s7, 12(sp) ; RV32I-NEXT: lw s6, 16(sp) @@ -476,173 +514,144 @@ ; RV32IM-LABEL: combine_urem_udiv: ; RV32IM: # %bb.0: ; RV32IM-NEXT: lhu a6, 0(a1) -; RV32IM-NEXT: lhu a7, 4(a1) +; RV32IM-NEXT: lhu a3, 4(a1) ; RV32IM-NEXT: lhu a4, 12(a1) ; RV32IM-NEXT: lhu a1, 8(a1) -; RV32IM-NEXT: lui a5, 364242 -; RV32IM-NEXT: addi a5, a5, 777 -; RV32IM-NEXT: mulhu a2, a4, a5 -; RV32IM-NEXT: sub a3, a4, a2 -; RV32IM-NEXT: srli a3, a3, 1 -; RV32IM-NEXT: add a2, a3, a2 -; RV32IM-NEXT: srli t3, a2, 6 -; RV32IM-NEXT: addi t0, zero, 95 -; RV32IM-NEXT: mul a3, t3, t0 -; RV32IM-NEXT: sub t1, a4, a3 -; RV32IM-NEXT: mulhu a4, a1, a5 -; RV32IM-NEXT: sub a3, a1, a4 -; RV32IM-NEXT: srli a3, a3, 1 -; RV32IM-NEXT: add a3, a3, a4 -; RV32IM-NEXT: srli a3, a3, 6 -; RV32IM-NEXT: mul a4, a3, t0 -; RV32IM-NEXT: sub t2, a1, a4 -; RV32IM-NEXT: mulhu a4, a7, a5 -; RV32IM-NEXT: sub a1, a7, a4 -; RV32IM-NEXT: srli a1, a1, 1 -; RV32IM-NEXT: add a1, a1, a4 -; RV32IM-NEXT: srli a1, a1, 6 -; RV32IM-NEXT: mul a4, a1, t0 -; RV32IM-NEXT: sub a4, a7, a4 -; RV32IM-NEXT: mulhu a5, a6, a5 -; RV32IM-NEXT: sub a2, a6, a5 -; RV32IM-NEXT: srli a2, a2, 1 -; RV32IM-NEXT: add a2, a2, a5 -; RV32IM-NEXT: srli a2, a2, 6 -; RV32IM-NEXT: mul a5, a2, t0 +; RV32IM-NEXT: lui a5, 11 +; RV32IM-NEXT: addi a5, a5, -905 +; RV32IM-NEXT: mul a2, a4, a5 +; RV32IM-NEXT: srli t0, a2, 22 +; RV32IM-NEXT: addi a7, zero, 95 +; RV32IM-NEXT: mul a2, t0, a7 +; RV32IM-NEXT: sub t1, a4, a2 +; RV32IM-NEXT: mul a4, a1, a5 +; RV32IM-NEXT: srli a4, a4, 22 +; RV32IM-NEXT: mul a2, a4, a7 +; RV32IM-NEXT: sub t2, a1, a2 +; RV32IM-NEXT: mul a2, a3, a5 +; RV32IM-NEXT: srli a2, a2, 22 +; RV32IM-NEXT: mul a1, a2, a7 +; RV32IM-NEXT: sub a1, a3, a1 +; RV32IM-NEXT: 
mul a3, a6, a5 +; RV32IM-NEXT: srli a3, a3, 22 +; RV32IM-NEXT: mul a5, a3, a7 ; RV32IM-NEXT: sub a5, a6, a5 -; RV32IM-NEXT: add a2, a5, a2 -; RV32IM-NEXT: add a1, a4, a1 -; RV32IM-NEXT: add a3, t2, a3 -; RV32IM-NEXT: add a4, t1, t3 +; RV32IM-NEXT: add a3, a5, a3 +; RV32IM-NEXT: add a1, a1, a2 +; RV32IM-NEXT: add a2, t2, a4 +; RV32IM-NEXT: add a4, t1, t0 ; RV32IM-NEXT: sh a4, 6(a0) -; RV32IM-NEXT: sh a3, 4(a0) +; RV32IM-NEXT: sh a2, 4(a0) ; RV32IM-NEXT: sh a1, 2(a0) -; RV32IM-NEXT: sh a2, 0(a0) +; RV32IM-NEXT: sh a3, 0(a0) ; RV32IM-NEXT: ret ; ; RV64I-LABEL: combine_urem_udiv: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -96 -; RV64I-NEXT: sd ra, 88(sp) -; RV64I-NEXT: sd s0, 80(sp) -; RV64I-NEXT: sd s1, 72(sp) -; RV64I-NEXT: sd s2, 64(sp) -; RV64I-NEXT: sd s3, 56(sp) -; RV64I-NEXT: sd s4, 48(sp) -; RV64I-NEXT: sd s5, 40(sp) -; RV64I-NEXT: sd s6, 32(sp) -; RV64I-NEXT: sd s7, 24(sp) -; RV64I-NEXT: sd s8, 16(sp) -; RV64I-NEXT: sd s9, 8(sp) +; RV64I-NEXT: addi sp, sp, -80 +; RV64I-NEXT: sd ra, 72(sp) +; RV64I-NEXT: sd s0, 64(sp) +; RV64I-NEXT: sd s1, 56(sp) +; RV64I-NEXT: sd s2, 48(sp) +; RV64I-NEXT: sd s3, 40(sp) +; RV64I-NEXT: sd s4, 32(sp) +; RV64I-NEXT: sd s5, 24(sp) +; RV64I-NEXT: sd s6, 16(sp) +; RV64I-NEXT: sd s7, 8(sp) +; RV64I-NEXT: sd s8, 0(sp) ; RV64I-NEXT: lhu s2, 0(a1) ; RV64I-NEXT: lhu s3, 8(a1) ; RV64I-NEXT: lhu s4, 16(a1) ; RV64I-NEXT: lhu s1, 24(a1) -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: call __umoddi3 ; RV64I-NEXT: mv s5, a0 -; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, s4 -; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: mv s6, a0 -; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, s3 -; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: mv s7, a0 -; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: mv s8, a0 -; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: lui a0, 11 +; RV64I-NEXT: addiw s0, a0, -905 ; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: call __udivdi3 -; RV64I-NEXT: mv s9, a0 +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli s6, a0, 22 ; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s6 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s8, s1, a0 ; RV64I-NEXT: mv a0, s4 -; RV64I-NEXT: call __udivdi3 -; RV64I-NEXT: mv s4, a0 +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli s7, a0, 22 ; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s7 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s4, s4, a0 ; RV64I-NEXT: mv a0, s3 -; RV64I-NEXT: call __udivdi3 -; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli s1, a0, 22 ; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s3, s3, a0 ; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: call __udivdi3 -; RV64I-NEXT: add a0, s8, a0 -; RV64I-NEXT: add a1, s7, s1 -; RV64I-NEXT: add a2, s6, s4 -; RV64I-NEXT: add a3, s5, s9 -; RV64I-NEXT: sh a3, 6(s0) -; RV64I-NEXT: sh a2, 4(s0) -; RV64I-NEXT: sh a1, 2(s0) -; RV64I-NEXT: sh a0, 0(s0) -; RV64I-NEXT: ld s9, 8(sp) -; RV64I-NEXT: ld s8, 16(sp) -; RV64I-NEXT: ld s7, 24(sp) -; RV64I-NEXT: ld s6, 32(sp) -; RV64I-NEXT: ld s5, 40(sp) -; RV64I-NEXT: ld s4, 48(sp) -; RV64I-NEXT: ld s3, 56(sp) -; RV64I-NEXT: ld s2, 64(sp) -; RV64I-NEXT: ld s1, 72(sp) -; RV64I-NEXT: ld s0, 80(sp) -; RV64I-NEXT: ld ra, 88(sp) -; RV64I-NEXT: addi sp, sp, 96 +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli s0, a0, 22 +; RV64I-NEXT: addi a1, zero, 95 +; 
RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub a0, s2, a0 +; RV64I-NEXT: add a0, a0, s0 +; RV64I-NEXT: add a1, s3, s1 +; RV64I-NEXT: add a2, s4, s7 +; RV64I-NEXT: add a3, s8, s6 +; RV64I-NEXT: sh a3, 6(s5) +; RV64I-NEXT: sh a2, 4(s5) +; RV64I-NEXT: sh a1, 2(s5) +; RV64I-NEXT: sh a0, 0(s5) +; RV64I-NEXT: ld s8, 0(sp) +; RV64I-NEXT: ld s7, 8(sp) +; RV64I-NEXT: ld s6, 16(sp) +; RV64I-NEXT: ld s5, 24(sp) +; RV64I-NEXT: ld s4, 32(sp) +; RV64I-NEXT: ld s3, 40(sp) +; RV64I-NEXT: ld s2, 48(sp) +; RV64I-NEXT: ld s1, 56(sp) +; RV64I-NEXT: ld s0, 64(sp) +; RV64I-NEXT: ld ra, 72(sp) +; RV64I-NEXT: addi sp, sp, 80 ; RV64I-NEXT: ret ; ; RV64IM-LABEL: combine_urem_udiv: ; RV64IM: # %bb.0: ; RV64IM-NEXT: lhu a6, 0(a1) -; RV64IM-NEXT: lhu a7, 8(a1) -; RV64IM-NEXT: lhu a4, 16(a1) -; RV64IM-NEXT: lhu a1, 24(a1) -; RV64IM-NEXT: lui a5, 1423 -; RV64IM-NEXT: addiw a5, a5, -733 -; RV64IM-NEXT: slli a5, a5, 15 -; RV64IM-NEXT: addi a5, a5, 1035 -; RV64IM-NEXT: slli a5, a5, 13 -; RV64IM-NEXT: addi a5, a5, -1811 -; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: addi a5, a5, 561 -; RV64IM-NEXT: mulhu a2, a1, a5 -; RV64IM-NEXT: sub a3, a1, a2 -; RV64IM-NEXT: srli a3, a3, 1 -; RV64IM-NEXT: add a2, a3, a2 -; RV64IM-NEXT: srli t3, a2, 6 -; RV64IM-NEXT: addi t0, zero, 95 -; RV64IM-NEXT: mul a3, t3, t0 -; RV64IM-NEXT: sub t1, a1, a3 -; RV64IM-NEXT: mulhu a3, a4, a5 -; RV64IM-NEXT: sub a1, a4, a3 -; RV64IM-NEXT: srli a1, a1, 1 -; RV64IM-NEXT: add a1, a1, a3 -; RV64IM-NEXT: srli a1, a1, 6 -; RV64IM-NEXT: mul a3, a1, t0 -; RV64IM-NEXT: sub t2, a4, a3 -; RV64IM-NEXT: mulhu a4, a7, a5 -; RV64IM-NEXT: sub a3, a7, a4 -; RV64IM-NEXT: srli a3, a3, 1 -; RV64IM-NEXT: add a3, a3, a4 -; RV64IM-NEXT: srli a3, a3, 6 -; RV64IM-NEXT: mul a4, a3, t0 -; RV64IM-NEXT: sub a4, a7, a4 -; RV64IM-NEXT: mulhu a5, a6, a5 -; RV64IM-NEXT: sub a2, a6, a5 -; RV64IM-NEXT: srli a2, a2, 1 -; RV64IM-NEXT: add a2, a2, a5 -; RV64IM-NEXT: srli a2, a2, 6 -; RV64IM-NEXT: mul a5, a2, t0 +; RV64IM-NEXT: lhu a3, 8(a1) +; RV64IM-NEXT: lhu a4, 24(a1) +; RV64IM-NEXT: lhu a1, 16(a1) +; RV64IM-NEXT: lui a5, 11 +; RV64IM-NEXT: addiw a5, a5, -905 +; RV64IM-NEXT: mul a2, a4, a5 +; RV64IM-NEXT: srli t0, a2, 22 +; RV64IM-NEXT: addi a7, zero, 95 +; RV64IM-NEXT: mul a2, t0, a7 +; RV64IM-NEXT: sub t1, a4, a2 +; RV64IM-NEXT: mul a4, a1, a5 +; RV64IM-NEXT: srli a4, a4, 22 +; RV64IM-NEXT: mul a2, a4, a7 +; RV64IM-NEXT: sub t2, a1, a2 +; RV64IM-NEXT: mul a2, a3, a5 +; RV64IM-NEXT: srli a2, a2, 22 +; RV64IM-NEXT: mul a1, a2, a7 +; RV64IM-NEXT: sub a1, a3, a1 +; RV64IM-NEXT: mul a3, a6, a5 +; RV64IM-NEXT: srli a3, a3, 22 +; RV64IM-NEXT: mul a5, a3, a7 ; RV64IM-NEXT: sub a5, a6, a5 -; RV64IM-NEXT: add a2, a5, a2 -; RV64IM-NEXT: add a3, a4, a3 -; RV64IM-NEXT: add a1, t2, a1 -; RV64IM-NEXT: add a4, t1, t3 +; RV64IM-NEXT: add a3, a5, a3 +; RV64IM-NEXT: add a1, a1, a2 +; RV64IM-NEXT: add a2, t2, a4 +; RV64IM-NEXT: add a4, t1, t0 ; RV64IM-NEXT: sh a4, 6(a0) -; RV64IM-NEXT: sh a1, 4(a0) -; RV64IM-NEXT: sh a3, 2(a0) -; RV64IM-NEXT: sh a2, 0(a0) +; RV64IM-NEXT: sh a2, 4(a0) +; RV64IM-NEXT: sh a1, 2(a0) +; RV64IM-NEXT: sh a3, 0(a0) ; RV64IM-NEXT: ret %1 = urem <4 x i16> %x, %2 = udiv <4 x i16> %x, @@ -660,21 +669,28 @@ ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) ; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: sw s4, 8(sp) ; RV32I-NEXT: lhu s2, 8(a1) ; RV32I-NEXT: lhu s3, 4(a1) -; RV32I-NEXT: lhu s1, 0(a1) -; RV32I-NEXT: lhu a2, 12(a1) -; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lhu s4, 0(a1) +; RV32I-NEXT: lhu s0, 12(a1) +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lui a0, 
11 +; RV32I-NEXT: addi a1, a0, -905 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 22 ; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: call __umodsi3 -; RV32I-NEXT: andi a1, s1, 63 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s0, a0 +; RV32I-NEXT: andi a1, s4, 63 ; RV32I-NEXT: andi a2, s3, 31 ; RV32I-NEXT: andi a3, s2, 7 -; RV32I-NEXT: sh a0, 6(s0) -; RV32I-NEXT: sh a3, 4(s0) -; RV32I-NEXT: sh a2, 2(s0) -; RV32I-NEXT: sh a1, 0(s0) +; RV32I-NEXT: sh a3, 4(s1) +; RV32I-NEXT: sh a2, 2(s1) +; RV32I-NEXT: sh a1, 0(s1) +; RV32I-NEXT: sh a0, 6(s1) +; RV32I-NEXT: lw s4, 8(sp) ; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) @@ -689,15 +705,12 @@ ; RV32IM-NEXT: lhu a3, 4(a1) ; RV32IM-NEXT: lhu a4, 12(a1) ; RV32IM-NEXT: lhu a1, 0(a1) -; RV32IM-NEXT: lui a5, 364242 -; RV32IM-NEXT: addi a5, a5, 777 -; RV32IM-NEXT: mulhu a5, a4, a5 -; RV32IM-NEXT: sub a2, a4, a5 -; RV32IM-NEXT: srli a2, a2, 1 -; RV32IM-NEXT: add a2, a2, a5 -; RV32IM-NEXT: srli a2, a2, 6 -; RV32IM-NEXT: addi a5, zero, 95 -; RV32IM-NEXT: mul a2, a2, a5 +; RV32IM-NEXT: lui a5, 11 +; RV32IM-NEXT: addi a5, a5, -905 +; RV32IM-NEXT: mul a5, a4, a5 +; RV32IM-NEXT: srli a5, a5, 22 +; RV32IM-NEXT: addi a2, zero, 95 +; RV32IM-NEXT: mul a2, a5, a2 ; RV32IM-NEXT: sub a2, a4, a2 ; RV32IM-NEXT: andi a1, a1, 63 ; RV32IM-NEXT: andi a3, a3, 31 @@ -716,21 +729,28 @@ ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) ; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: sd s4, 0(sp) ; RV64I-NEXT: lhu s2, 16(a1) ; RV64I-NEXT: lhu s3, 8(a1) -; RV64I-NEXT: lhu s1, 0(a1) -; RV64I-NEXT: lhu a2, 24(a1) -; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lhu s4, 0(a1) +; RV64I-NEXT: lhu s0, 24(a1) +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lui a0, 11 +; RV64I-NEXT: addiw a1, a0, -905 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 22 ; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, a2 -; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: andi a1, s1, 63 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub a0, s0, a0 +; RV64I-NEXT: andi a1, s4, 63 ; RV64I-NEXT: andi a2, s3, 31 ; RV64I-NEXT: andi a3, s2, 7 -; RV64I-NEXT: sh a0, 6(s0) -; RV64I-NEXT: sh a3, 4(s0) -; RV64I-NEXT: sh a2, 2(s0) -; RV64I-NEXT: sh a1, 0(s0) +; RV64I-NEXT: sh a3, 4(s1) +; RV64I-NEXT: sh a2, 2(s1) +; RV64I-NEXT: sh a1, 0(s1) +; RV64I-NEXT: sh a0, 6(s1) +; RV64I-NEXT: ld s4, 0(sp) ; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) @@ -743,31 +763,22 @@ ; RV64IM: # %bb.0: ; RV64IM-NEXT: lhu a6, 16(a1) ; RV64IM-NEXT: lhu a3, 8(a1) -; RV64IM-NEXT: lhu a4, 0(a1) -; RV64IM-NEXT: lhu a1, 24(a1) -; RV64IM-NEXT: lui a5, 1423 -; RV64IM-NEXT: addiw a5, a5, -733 -; RV64IM-NEXT: slli a5, a5, 15 -; RV64IM-NEXT: addi a5, a5, 1035 -; RV64IM-NEXT: slli a5, a5, 13 -; RV64IM-NEXT: addi a5, a5, -1811 -; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: addi a5, a5, 561 -; RV64IM-NEXT: mulhu a5, a1, a5 -; RV64IM-NEXT: sub a2, a1, a5 -; RV64IM-NEXT: srli a2, a2, 1 -; RV64IM-NEXT: add a2, a2, a5 -; RV64IM-NEXT: srli a2, a2, 6 -; RV64IM-NEXT: addi a5, zero, 95 -; RV64IM-NEXT: mul a2, a2, a5 -; RV64IM-NEXT: sub a1, a1, a2 -; RV64IM-NEXT: andi a2, a4, 63 +; RV64IM-NEXT: lhu a4, 24(a1) +; RV64IM-NEXT: lhu a1, 0(a1) +; RV64IM-NEXT: lui a5, 11 +; RV64IM-NEXT: addiw a5, a5, -905 +; RV64IM-NEXT: mul a5, a4, a5 +; RV64IM-NEXT: srli a5, a5, 22 +; RV64IM-NEXT: addi a2, zero, 95 +; RV64IM-NEXT: mul a2, a5, a2 +; RV64IM-NEXT: sub a2, a4, a2 +; RV64IM-NEXT: andi a1, a1, 63 ; RV64IM-NEXT: andi a3, 
a3, 31 ; RV64IM-NEXT: andi a4, a6, 7 ; RV64IM-NEXT: sh a4, 4(a0) ; RV64IM-NEXT: sh a3, 2(a0) -; RV64IM-NEXT: sh a2, 0(a0) -; RV64IM-NEXT: sh a1, 6(a0) +; RV64IM-NEXT: sh a1, 0(a0) +; RV64IM-NEXT: sh a2, 6(a0) ; RV64IM-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -783,26 +794,50 @@ ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) ; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: sw s4, 8(sp) ; RV32I-NEXT: lhu s2, 12(a1) +; RV32I-NEXT: lhu s3, 4(a1) ; RV32I-NEXT: lhu s1, 8(a1) -; RV32I-NEXT: lhu a2, 4(a1) ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: addi a1, zero, 654 -; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: call __umodsi3 -; RV32I-NEXT: mv s3, a0 -; RV32I-NEXT: addi a1, zero, 23 +; RV32I-NEXT: lui a0, 6 +; RV32I-NEXT: addi a1, a0, 1069 ; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: call __umodsi3 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lui a0, 1 -; RV32I-NEXT: addi a1, a0, 1327 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: sub a1, s1, a0 +; RV32I-NEXT: lui a2, 16 +; RV32I-NEXT: addi a3, a2, -2 +; RV32I-NEXT: and a1, a1, a3 +; RV32I-NEXT: srli a1, a1, 1 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: addi a1, a2, -16 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: srli a0, a0, 4 +; RV32I-NEXT: addi a1, zero, 23 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s4, s1, a0 +; RV32I-NEXT: lui a0, 13 +; RV32I-NEXT: addi a1, a0, -1941 +; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 25 +; RV32I-NEXT: addi a1, zero, 654 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s1, s3, a0 +; RV32I-NEXT: lui a0, 3 +; RV32I-NEXT: addi a1, a0, 87 ; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 26 +; RV32I-NEXT: lui a1, 1 +; RV32I-NEXT: addi a1, a1, 1327 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s2, a0 ; RV32I-NEXT: sh zero, 0(s0) ; RV32I-NEXT: sh a0, 6(s0) -; RV32I-NEXT: sh s1, 4(s0) -; RV32I-NEXT: sh s3, 2(s0) +; RV32I-NEXT: sh s1, 2(s0) +; RV32I-NEXT: sh s4, 4(s0) +; RV32I-NEXT: lw s4, 8(sp) ; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) @@ -813,36 +848,42 @@ ; ; RV32IM-LABEL: dont_fold_urem_one: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lhu a2, 4(a1) -; RV32IM-NEXT: lhu a3, 12(a1) -; RV32IM-NEXT: lhu a1, 8(a1) -; RV32IM-NEXT: srli a4, a2, 1 -; RV32IM-NEXT: lui a5, 820904 -; RV32IM-NEXT: addi a5, a5, -1903 -; RV32IM-NEXT: mulhu a4, a4, a5 -; RV32IM-NEXT: srli a4, a4, 8 -; RV32IM-NEXT: addi a5, zero, 654 -; RV32IM-NEXT: mul a4, a4, a5 -; RV32IM-NEXT: sub a2, a2, a4 -; RV32IM-NEXT: lui a4, 729444 -; RV32IM-NEXT: addi a4, a4, 713 -; RV32IM-NEXT: mulhu a4, a1, a4 -; RV32IM-NEXT: srli a4, a4, 4 -; RV32IM-NEXT: addi a5, zero, 23 -; RV32IM-NEXT: mul a4, a4, a5 -; RV32IM-NEXT: sub a1, a1, a4 -; RV32IM-NEXT: lui a4, 395996 -; RV32IM-NEXT: addi a4, a4, -2009 -; RV32IM-NEXT: mulhu a4, a3, a4 -; RV32IM-NEXT: srli a4, a4, 11 -; RV32IM-NEXT: lui a5, 1 -; RV32IM-NEXT: addi a5, a5, 1327 -; RV32IM-NEXT: mul a4, a4, a5 -; RV32IM-NEXT: sub a3, a3, a4 +; RV32IM-NEXT: lhu a6, 12(a1) +; RV32IM-NEXT: lhu a3, 8(a1) +; RV32IM-NEXT: lhu a1, 4(a1) +; RV32IM-NEXT: lui a4, 6 +; RV32IM-NEXT: addi a4, a4, 1069 +; RV32IM-NEXT: mul a4, a3, a4 +; RV32IM-NEXT: srli a4, a4, 16 +; RV32IM-NEXT: sub a5, a3, a4 +; RV32IM-NEXT: lui a2, 16 +; RV32IM-NEXT: addi a2, a2, -2 +; RV32IM-NEXT: and a2, a5, a2 +; RV32IM-NEXT: srli a2, a2, 1 +; RV32IM-NEXT: add a2, a2, a4 +; RV32IM-NEXT: srli a2, a2, 4 +; RV32IM-NEXT: addi a4, zero, 23 +; RV32IM-NEXT: mul a2, a2, a4 +; RV32IM-NEXT: sub a2, a3, a2 +; RV32IM-NEXT: 
lui a3, 13 +; RV32IM-NEXT: addi a3, a3, -1941 +; RV32IM-NEXT: mul a3, a1, a3 +; RV32IM-NEXT: srli a3, a3, 25 +; RV32IM-NEXT: addi a4, zero, 654 +; RV32IM-NEXT: mul a3, a3, a4 +; RV32IM-NEXT: sub a1, a1, a3 +; RV32IM-NEXT: lui a3, 3 +; RV32IM-NEXT: addi a3, a3, 87 +; RV32IM-NEXT: mul a3, a6, a3 +; RV32IM-NEXT: srli a3, a3, 26 +; RV32IM-NEXT: lui a4, 1 +; RV32IM-NEXT: addi a4, a4, 1327 +; RV32IM-NEXT: mul a3, a3, a4 +; RV32IM-NEXT: sub a3, a6, a3 ; RV32IM-NEXT: sh zero, 0(a0) ; RV32IM-NEXT: sh a3, 6(a0) -; RV32IM-NEXT: sh a1, 4(a0) -; RV32IM-NEXT: sh a2, 2(a0) +; RV32IM-NEXT: sh a1, 2(a0) +; RV32IM-NEXT: sh a2, 4(a0) ; RV32IM-NEXT: ret ; ; RV64I-LABEL: dont_fold_urem_one: @@ -853,26 +894,50 @@ ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) ; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: sd s4, 0(sp) ; RV64I-NEXT: lhu s2, 24(a1) +; RV64I-NEXT: lhu s3, 8(a1) ; RV64I-NEXT: lhu s1, 16(a1) -; RV64I-NEXT: lhu a2, 8(a1) ; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: addi a1, zero, 654 -; RV64I-NEXT: mv a0, a2 -; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: mv s3, a0 -; RV64I-NEXT: addi a1, zero, 23 +; RV64I-NEXT: lui a0, 6 +; RV64I-NEXT: addiw a1, a0, 1069 ; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lui a0, 1 -; RV64I-NEXT: addiw a1, a0, 1327 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: sub a1, s1, a0 +; RV64I-NEXT: lui a2, 16 +; RV64I-NEXT: addiw a3, a2, -2 +; RV64I-NEXT: and a1, a1, a3 +; RV64I-NEXT: srli a1, a1, 1 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: addiw a1, a2, -16 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: srli a0, a0, 4 +; RV64I-NEXT: addi a1, zero, 23 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s4, s1, a0 +; RV64I-NEXT: lui a0, 13 +; RV64I-NEXT: addiw a1, a0, -1941 +; RV64I-NEXT: mv a0, s3 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 25 +; RV64I-NEXT: addi a1, zero, 654 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s1, s3, a0 +; RV64I-NEXT: lui a0, 3 +; RV64I-NEXT: addiw a1, a0, 87 ; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 26 +; RV64I-NEXT: lui a1, 1 +; RV64I-NEXT: addiw a1, a1, 1327 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub a0, s2, a0 ; RV64I-NEXT: sh zero, 0(s0) ; RV64I-NEXT: sh a0, 6(s0) -; RV64I-NEXT: sh s1, 4(s0) -; RV64I-NEXT: sh s3, 2(s0) +; RV64I-NEXT: sh s1, 2(s0) +; RV64I-NEXT: sh s4, 4(s0) +; RV64I-NEXT: ld s4, 0(sp) ; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) @@ -883,57 +948,42 @@ ; ; RV64IM-LABEL: dont_fold_urem_one: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lhu a2, 24(a1) -; RV64IM-NEXT: lhu a3, 8(a1) -; RV64IM-NEXT: lhu a1, 16(a1) -; RV64IM-NEXT: lui a4, 3206 -; RV64IM-NEXT: addiw a4, a4, -1781 -; RV64IM-NEXT: slli a4, a4, 13 -; RV64IM-NEXT: addi a4, a4, 1069 -; RV64IM-NEXT: slli a4, a4, 12 -; RV64IM-NEXT: addi a4, a4, -1959 -; RV64IM-NEXT: slli a4, a4, 14 -; RV64IM-NEXT: addi a4, a4, 713 -; RV64IM-NEXT: mulhu a4, a1, a4 -; RV64IM-NEXT: sub a5, a1, a4 -; RV64IM-NEXT: srli a5, a5, 1 -; RV64IM-NEXT: add a4, a5, a4 -; RV64IM-NEXT: srli a4, a4, 4 -; RV64IM-NEXT: addi a5, zero, 23 -; RV64IM-NEXT: mul a4, a4, a5 -; RV64IM-NEXT: sub a1, a1, a4 -; RV64IM-NEXT: srli a4, a3, 1 -; RV64IM-NEXT: lui a5, 6413 -; RV64IM-NEXT: addiw a5, a5, 1265 -; RV64IM-NEXT: slli a5, a5, 13 -; RV64IM-NEXT: addi a5, a5, 1027 -; RV64IM-NEXT: slli a5, a5, 13 -; RV64IM-NEXT: addi a5, a5, 1077 -; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: addi a5, a5, 965 -; RV64IM-NEXT: mulhu a4, a4, a5 -; RV64IM-NEXT: 
srli a4, a4, 7 -; RV64IM-NEXT: addi a5, zero, 654 -; RV64IM-NEXT: mul a4, a4, a5 -; RV64IM-NEXT: sub a3, a3, a4 -; RV64IM-NEXT: lui a4, 1044567 -; RV64IM-NEXT: addiw a4, a4, -575 -; RV64IM-NEXT: slli a4, a4, 12 -; RV64IM-NEXT: addi a4, a4, 883 -; RV64IM-NEXT: slli a4, a4, 14 -; RV64IM-NEXT: addi a4, a4, -861 -; RV64IM-NEXT: slli a4, a4, 12 -; RV64IM-NEXT: addi a4, a4, -179 -; RV64IM-NEXT: mulhu a4, a2, a4 -; RV64IM-NEXT: srli a4, a4, 12 -; RV64IM-NEXT: lui a5, 1 -; RV64IM-NEXT: addiw a5, a5, 1327 -; RV64IM-NEXT: mul a4, a4, a5 -; RV64IM-NEXT: sub a2, a2, a4 +; RV64IM-NEXT: lhu a6, 24(a1) +; RV64IM-NEXT: lhu a3, 16(a1) +; RV64IM-NEXT: lhu a1, 8(a1) +; RV64IM-NEXT: lui a4, 6 +; RV64IM-NEXT: addiw a4, a4, 1069 +; RV64IM-NEXT: mul a4, a3, a4 +; RV64IM-NEXT: srli a4, a4, 16 +; RV64IM-NEXT: sub a5, a3, a4 +; RV64IM-NEXT: lui a2, 16 +; RV64IM-NEXT: addiw a2, a2, -2 +; RV64IM-NEXT: and a2, a5, a2 +; RV64IM-NEXT: srli a2, a2, 1 +; RV64IM-NEXT: add a2, a2, a4 +; RV64IM-NEXT: srli a2, a2, 4 +; RV64IM-NEXT: addi a4, zero, 23 +; RV64IM-NEXT: mul a2, a2, a4 +; RV64IM-NEXT: sub a2, a3, a2 +; RV64IM-NEXT: lui a3, 13 +; RV64IM-NEXT: addiw a3, a3, -1941 +; RV64IM-NEXT: mul a3, a1, a3 +; RV64IM-NEXT: srli a3, a3, 25 +; RV64IM-NEXT: addi a4, zero, 654 +; RV64IM-NEXT: mul a3, a3, a4 +; RV64IM-NEXT: sub a1, a1, a3 +; RV64IM-NEXT: lui a3, 3 +; RV64IM-NEXT: addiw a3, a3, 87 +; RV64IM-NEXT: mul a3, a6, a3 +; RV64IM-NEXT: srli a3, a3, 26 +; RV64IM-NEXT: lui a4, 1 +; RV64IM-NEXT: addiw a4, a4, 1327 +; RV64IM-NEXT: mul a3, a3, a4 +; RV64IM-NEXT: sub a3, a6, a3 ; RV64IM-NEXT: sh zero, 0(a0) -; RV64IM-NEXT: sh a2, 6(a0) -; RV64IM-NEXT: sh a3, 2(a0) -; RV64IM-NEXT: sh a1, 4(a0) +; RV64IM-NEXT: sh a3, 6(a0) +; RV64IM-NEXT: sh a1, 2(a0) +; RV64IM-NEXT: sh a2, 4(a0) ; RV64IM-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -952,180 +1002,545 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind { ; RV32I-LABEL: dont_fold_urem_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -48 -; RV32I-NEXT: sw ra, 44(sp) -; RV32I-NEXT: sw s0, 40(sp) -; RV32I-NEXT: sw s1, 36(sp) -; RV32I-NEXT: sw s2, 32(sp) -; RV32I-NEXT: sw s3, 28(sp) -; RV32I-NEXT: sw s4, 24(sp) -; RV32I-NEXT: sw s5, 20(sp) -; RV32I-NEXT: sw s6, 16(sp) -; RV32I-NEXT: sw s7, 12(sp) -; RV32I-NEXT: sw s8, 8(sp) -; RV32I-NEXT: sw s9, 4(sp) -; RV32I-NEXT: lw s2, 24(a1) -; RV32I-NEXT: lw s3, 28(a1) -; RV32I-NEXT: lw s4, 16(a1) -; RV32I-NEXT: lw s5, 20(a1) -; RV32I-NEXT: lw s6, 8(a1) -; RV32I-NEXT: lw s1, 12(a1) -; RV32I-NEXT: lw a3, 0(a1) -; RV32I-NEXT: lw a1, 4(a1) -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: addi a2, zero, 1 -; RV32I-NEXT: mv a0, a3 -; RV32I-NEXT: mv a3, zero -; RV32I-NEXT: call __umoddi3 -; RV32I-NEXT: mv s7, a0 -; RV32I-NEXT: mv s8, a1 +; RV32I-NEXT: addi sp, sp, -464 +; RV32I-NEXT: sw ra, 460(sp) +; RV32I-NEXT: sw s0, 456(sp) +; RV32I-NEXT: sw s1, 452(sp) +; RV32I-NEXT: sw s2, 448(sp) +; RV32I-NEXT: sw s3, 444(sp) +; RV32I-NEXT: sw s4, 440(sp) +; RV32I-NEXT: sw s5, 436(sp) +; RV32I-NEXT: sw s6, 432(sp) +; RV32I-NEXT: sw s7, 428(sp) +; RV32I-NEXT: sw s8, 424(sp) +; RV32I-NEXT: sw s9, 420(sp) +; RV32I-NEXT: sw s10, 416(sp) +; RV32I-NEXT: sw s11, 412(sp) +; RV32I-NEXT: lw a2, 0(a1) +; RV32I-NEXT: sw a2, 20(sp) +; RV32I-NEXT: lw a2, 4(a1) +; RV32I-NEXT: sw a2, 16(sp) +; RV32I-NEXT: lw s6, 24(a1) +; RV32I-NEXT: lw s0, 28(a1) +; RV32I-NEXT: lw s7, 16(a1) +; RV32I-NEXT: lw s1, 20(a1) +; RV32I-NEXT: lw s8, 8(a1) +; RV32I-NEXT: lw s10, 12(a1) +; RV32I-NEXT: mv s5, a0 +; RV32I-NEXT: sw zero, 180(sp) +; RV32I-NEXT: sw zero, 176(sp) +; RV32I-NEXT: sw 
zero, 196(sp) +; RV32I-NEXT: sw zero, 192(sp) +; RV32I-NEXT: lui a0, 410452 +; RV32I-NEXT: addi a0, a0, -952 +; RV32I-NEXT: sw a0, 172(sp) +; RV32I-NEXT: lui a0, 25653 +; RV32I-NEXT: addi a0, a0, 965 +; RV32I-NEXT: sw a0, 168(sp) +; RV32I-NEXT: srli a0, s10, 1 +; RV32I-NEXT: sw a0, 188(sp) +; RV32I-NEXT: slli a0, s10, 31 +; RV32I-NEXT: srli a1, s8, 1 +; RV32I-NEXT: or a3, a1, a0 +; RV32I-NEXT: addi a0, sp, 200 +; RV32I-NEXT: addi a1, sp, 184 +; RV32I-NEXT: addi a2, sp, 168 +; RV32I-NEXT: sw a3, 184(sp) +; RV32I-NEXT: call __multi3 +; RV32I-NEXT: sw zero, 276(sp) +; RV32I-NEXT: sw zero, 272(sp) +; RV32I-NEXT: sw zero, 292(sp) +; RV32I-NEXT: sw zero, 288(sp) +; RV32I-NEXT: lui a0, 410312 +; RV32I-NEXT: addi a0, a0, 1424 +; RV32I-NEXT: sw a0, 268(sp) +; RV32I-NEXT: lui a0, 729444 +; RV32I-NEXT: addi a0, a0, 713 +; RV32I-NEXT: sw a0, 264(sp) +; RV32I-NEXT: sw s1, 284(sp) +; RV32I-NEXT: mv s3, s1 +; RV32I-NEXT: sw s1, 8(sp) +; RV32I-NEXT: addi a0, sp, 296 +; RV32I-NEXT: addi a1, sp, 280 +; RV32I-NEXT: addi a2, sp, 264 +; RV32I-NEXT: sw s7, 280(sp) +; RV32I-NEXT: call __multi3 +; RV32I-NEXT: sw zero, 372(sp) +; RV32I-NEXT: sw zero, 368(sp) +; RV32I-NEXT: sw zero, 388(sp) +; RV32I-NEXT: sw zero, 384(sp) +; RV32I-NEXT: lui a0, 791991 +; RV32I-NEXT: addi a0, a0, 77 +; RV32I-NEXT: sw a0, 364(sp) +; RV32I-NEXT: lui a0, 834723 +; RV32I-NEXT: addi a0, a0, -179 +; RV32I-NEXT: sw a0, 360(sp) +; RV32I-NEXT: sw s0, 380(sp) +; RV32I-NEXT: mv s4, s0 +; RV32I-NEXT: sw s0, 12(sp) +; RV32I-NEXT: addi a0, sp, 392 +; RV32I-NEXT: addi a1, sp, 376 +; RV32I-NEXT: addi a2, sp, 360 +; RV32I-NEXT: sw s6, 376(sp) +; RV32I-NEXT: call __multi3 +; RV32I-NEXT: sw zero, 132(sp) +; RV32I-NEXT: sw zero, 128(sp) +; RV32I-NEXT: sw zero, 124(sp) +; RV32I-NEXT: sw zero, 120(sp) +; RV32I-NEXT: sw zero, 148(sp) +; RV32I-NEXT: sw zero, 144(sp) +; RV32I-NEXT: lw s0, 208(sp) +; RV32I-NEXT: sub a0, s8, s0 +; RV32I-NEXT: sw a0, 136(sp) +; RV32I-NEXT: lw s1, 212(sp) +; RV32I-NEXT: sltu a0, s8, s0 +; RV32I-NEXT: sub a1, s10, s1 +; RV32I-NEXT: sub a3, a1, a0 +; RV32I-NEXT: addi a0, sp, 152 +; RV32I-NEXT: addi a1, sp, 136 +; RV32I-NEXT: addi a2, sp, 120 +; RV32I-NEXT: sw a3, 140(sp) +; RV32I-NEXT: call __multi3 +; RV32I-NEXT: sw zero, 228(sp) +; RV32I-NEXT: sw zero, 224(sp) +; RV32I-NEXT: lui a0, 524288 +; RV32I-NEXT: sw a0, 220(sp) +; RV32I-NEXT: sw zero, 216(sp) +; RV32I-NEXT: sw zero, 244(sp) +; RV32I-NEXT: sw zero, 240(sp) +; RV32I-NEXT: lw s2, 304(sp) +; RV32I-NEXT: sub a0, s7, s2 +; RV32I-NEXT: sw a0, 232(sp) +; RV32I-NEXT: lw s9, 308(sp) +; RV32I-NEXT: sltu a0, s7, s2 +; RV32I-NEXT: sub a1, s3, s9 +; RV32I-NEXT: sub a3, a1, a0 +; RV32I-NEXT: addi a0, sp, 248 +; RV32I-NEXT: addi a1, sp, 232 +; RV32I-NEXT: addi a2, sp, 216 +; RV32I-NEXT: sw a3, 236(sp) +; RV32I-NEXT: call __multi3 +; RV32I-NEXT: sw zero, 324(sp) +; RV32I-NEXT: sw zero, 320(sp) +; RV32I-NEXT: sw zero, 316(sp) +; RV32I-NEXT: sw zero, 312(sp) +; RV32I-NEXT: sw zero, 340(sp) +; RV32I-NEXT: sw zero, 336(sp) +; RV32I-NEXT: lw s3, 400(sp) +; RV32I-NEXT: sub a0, s6, s3 +; RV32I-NEXT: sw a0, 328(sp) +; RV32I-NEXT: lw s11, 404(sp) +; RV32I-NEXT: sltu a0, s6, s3 +; RV32I-NEXT: sub a1, s4, s11 +; RV32I-NEXT: sub a3, a1, a0 +; RV32I-NEXT: addi a0, sp, 344 +; RV32I-NEXT: addi a1, sp, 328 +; RV32I-NEXT: addi a2, sp, 312 +; RV32I-NEXT: sw a3, 332(sp) +; RV32I-NEXT: call __multi3 +; RV32I-NEXT: lw a0, 164(sp) +; RV32I-NEXT: lw a1, 160(sp) +; RV32I-NEXT: add a0, a0, s1 +; RV32I-NEXT: add a2, a1, s0 +; RV32I-NEXT: sltu a1, a2, a1 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: slli a0, a1, 
25 +; RV32I-NEXT: srli a2, a2, 7 +; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: srli a1, a1, 7 ; RV32I-NEXT: addi a2, zero, 654 -; RV32I-NEXT: mv a0, s6 -; RV32I-NEXT: mv a1, s1 ; RV32I-NEXT: mv a3, zero -; RV32I-NEXT: call __umoddi3 -; RV32I-NEXT: mv s6, a0 -; RV32I-NEXT: mv s9, a1 +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: sub a0, s10, a1 +; RV32I-NEXT: lw a1, 260(sp) +; RV32I-NEXT: lw a2, 256(sp) +; RV32I-NEXT: sltu a3, s8, s4 +; RV32I-NEXT: sub s10, a0, a3 +; RV32I-NEXT: add a0, a1, s9 +; RV32I-NEXT: add a1, a2, s2 +; RV32I-NEXT: sltu a2, a1, a2 +; RV32I-NEXT: add a2, a0, a2 +; RV32I-NEXT: slli a0, a2, 28 +; RV32I-NEXT: srli a1, a1, 4 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: srli a1, a2, 4 ; RV32I-NEXT: addi a2, zero, 23 -; RV32I-NEXT: mv a0, s4 -; RV32I-NEXT: mv a1, s5 ; RV32I-NEXT: mv a3, zero -; RV32I-NEXT: call __umoddi3 -; RV32I-NEXT: mv s4, a0 -; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: lui a0, 1 -; RV32I-NEXT: addi a2, a0, 1327 -; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: mv s9, a0 +; RV32I-NEXT: lw a0, 8(sp) +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: lw a1, 356(sp) +; RV32I-NEXT: lw a2, 352(sp) +; RV32I-NEXT: sltu a3, s7, s9 +; RV32I-NEXT: sub s2, a0, a3 +; RV32I-NEXT: add a0, a1, s11 +; RV32I-NEXT: add a1, a2, s3 +; RV32I-NEXT: sltu a2, a1, a2 +; RV32I-NEXT: add a2, a0, a2 +; RV32I-NEXT: slli a0, a2, 20 +; RV32I-NEXT: srli a1, a1, 12 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: srli a1, a2, 12 +; RV32I-NEXT: lui a2, 1 +; RV32I-NEXT: addi a2, a2, 1327 ; RV32I-NEXT: mv a3, zero -; RV32I-NEXT: call __umoddi3 -; RV32I-NEXT: sw a1, 28(s0) -; RV32I-NEXT: sw a0, 24(s0) -; RV32I-NEXT: sw s1, 20(s0) -; RV32I-NEXT: sw s4, 16(s0) -; RV32I-NEXT: sw s9, 12(s0) -; RV32I-NEXT: sw s6, 8(s0) -; RV32I-NEXT: sw s8, 4(s0) -; RV32I-NEXT: sw s7, 0(s0) -; RV32I-NEXT: lw s9, 4(sp) -; RV32I-NEXT: lw s8, 8(sp) -; RV32I-NEXT: lw s7, 12(sp) -; RV32I-NEXT: lw s6, 16(sp) -; RV32I-NEXT: lw s5, 20(sp) -; RV32I-NEXT: lw s4, 24(sp) -; RV32I-NEXT: lw s3, 28(sp) -; RV32I-NEXT: lw s2, 32(sp) -; RV32I-NEXT: lw s1, 36(sp) -; RV32I-NEXT: lw s0, 40(sp) -; RV32I-NEXT: lw ra, 44(sp) -; RV32I-NEXT: addi sp, sp, 48 +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: mv s11, a0 +; RV32I-NEXT: sltu a0, s6, a0 +; RV32I-NEXT: lw a2, 12(sp) +; RV32I-NEXT: sub a1, a2, a1 +; RV32I-NEXT: sub s3, a1, a0 +; RV32I-NEXT: addi a2, zero, 1 +; RV32I-NEXT: lw s1, 20(sp) +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: lw s0, 16(sp) +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: mv a3, zero +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: sltu a2, s1, a0 +; RV32I-NEXT: sub a1, s0, a1 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: sub a2, s8, s4 +; RV32I-NEXT: sub a3, s7, s9 +; RV32I-NEXT: sub a4, s6, s11 +; RV32I-NEXT: sub a0, s1, a0 +; RV32I-NEXT: sw a0, 0(s5) +; RV32I-NEXT: sw a1, 4(s5) +; RV32I-NEXT: sw a4, 24(s5) +; RV32I-NEXT: sw a3, 16(s5) +; RV32I-NEXT: sw s3, 28(s5) +; RV32I-NEXT: sw s2, 20(s5) +; RV32I-NEXT: sw a2, 8(s5) +; RV32I-NEXT: sw s10, 12(s5) +; RV32I-NEXT: lw s11, 412(sp) +; RV32I-NEXT: lw s10, 416(sp) +; RV32I-NEXT: lw s9, 420(sp) +; RV32I-NEXT: lw s8, 424(sp) +; RV32I-NEXT: lw s7, 428(sp) +; RV32I-NEXT: lw s6, 432(sp) +; RV32I-NEXT: lw s5, 436(sp) +; RV32I-NEXT: lw s4, 440(sp) +; RV32I-NEXT: lw s3, 444(sp) +; RV32I-NEXT: lw s2, 448(sp) +; RV32I-NEXT: lw s1, 452(sp) +; RV32I-NEXT: lw s0, 456(sp) +; RV32I-NEXT: lw ra, 460(sp) +; RV32I-NEXT: addi sp, sp, 464 ; RV32I-NEXT: ret ; ; RV32IM-LABEL: dont_fold_urem_i64: ; RV32IM: # %bb.0: -; RV32IM-NEXT: addi sp, sp, -48 -; RV32IM-NEXT: 
sw ra, 44(sp) -; RV32IM-NEXT: sw s0, 40(sp) -; RV32IM-NEXT: sw s1, 36(sp) -; RV32IM-NEXT: sw s2, 32(sp) -; RV32IM-NEXT: sw s3, 28(sp) -; RV32IM-NEXT: sw s4, 24(sp) -; RV32IM-NEXT: sw s5, 20(sp) -; RV32IM-NEXT: sw s6, 16(sp) -; RV32IM-NEXT: sw s7, 12(sp) -; RV32IM-NEXT: sw s8, 8(sp) -; RV32IM-NEXT: sw s9, 4(sp) +; RV32IM-NEXT: addi sp, sp, -448 +; RV32IM-NEXT: sw ra, 444(sp) +; RV32IM-NEXT: sw s0, 440(sp) +; RV32IM-NEXT: sw s1, 436(sp) +; RV32IM-NEXT: sw s2, 432(sp) +; RV32IM-NEXT: sw s3, 428(sp) +; RV32IM-NEXT: sw s4, 424(sp) +; RV32IM-NEXT: sw s5, 420(sp) +; RV32IM-NEXT: sw s6, 416(sp) +; RV32IM-NEXT: sw s7, 412(sp) +; RV32IM-NEXT: sw s8, 408(sp) +; RV32IM-NEXT: sw s9, 404(sp) +; RV32IM-NEXT: sw s10, 400(sp) +; RV32IM-NEXT: sw s11, 396(sp) ; RV32IM-NEXT: lw s2, 24(a1) -; RV32IM-NEXT: lw s3, 28(a1) -; RV32IM-NEXT: lw s4, 16(a1) -; RV32IM-NEXT: lw s5, 20(a1) +; RV32IM-NEXT: lw s5, 28(a1) +; RV32IM-NEXT: lw s3, 16(a1) +; RV32IM-NEXT: lw s1, 20(a1) ; RV32IM-NEXT: lw s6, 8(a1) -; RV32IM-NEXT: lw s1, 12(a1) -; RV32IM-NEXT: lw a3, 0(a1) -; RV32IM-NEXT: lw a1, 4(a1) -; RV32IM-NEXT: mv s0, a0 -; RV32IM-NEXT: addi a2, zero, 1 -; RV32IM-NEXT: mv a0, a3 -; RV32IM-NEXT: mv a3, zero -; RV32IM-NEXT: call __umoddi3 -; RV32IM-NEXT: mv s7, a0 -; RV32IM-NEXT: mv s8, a1 -; RV32IM-NEXT: addi a2, zero, 654 -; RV32IM-NEXT: mv a0, s6 -; RV32IM-NEXT: mv a1, s1 -; RV32IM-NEXT: mv a3, zero -; RV32IM-NEXT: call __umoddi3 -; RV32IM-NEXT: mv s6, a0 -; RV32IM-NEXT: mv s9, a1 -; RV32IM-NEXT: addi a2, zero, 23 -; RV32IM-NEXT: mv a0, s4 -; RV32IM-NEXT: mv a1, s5 -; RV32IM-NEXT: mv a3, zero -; RV32IM-NEXT: call __umoddi3 -; RV32IM-NEXT: mv s4, a0 -; RV32IM-NEXT: mv s1, a1 -; RV32IM-NEXT: lui a0, 1 -; RV32IM-NEXT: addi a2, a0, 1327 -; RV32IM-NEXT: mv a0, s2 -; RV32IM-NEXT: mv a1, s3 -; RV32IM-NEXT: mv a3, zero -; RV32IM-NEXT: call __umoddi3 -; RV32IM-NEXT: sw a1, 28(s0) -; RV32IM-NEXT: sw a0, 24(s0) -; RV32IM-NEXT: sw s1, 20(s0) -; RV32IM-NEXT: sw s4, 16(s0) -; RV32IM-NEXT: sw s9, 12(s0) -; RV32IM-NEXT: sw s6, 8(s0) -; RV32IM-NEXT: sw s8, 4(s0) -; RV32IM-NEXT: sw s7, 0(s0) -; RV32IM-NEXT: lw s9, 4(sp) -; RV32IM-NEXT: lw s8, 8(sp) -; RV32IM-NEXT: lw s7, 12(sp) -; RV32IM-NEXT: lw s6, 16(sp) -; RV32IM-NEXT: lw s5, 20(sp) -; RV32IM-NEXT: lw s4, 24(sp) -; RV32IM-NEXT: lw s3, 28(sp) -; RV32IM-NEXT: lw s2, 32(sp) -; RV32IM-NEXT: lw s1, 36(sp) -; RV32IM-NEXT: lw s0, 40(sp) -; RV32IM-NEXT: lw ra, 44(sp) -; RV32IM-NEXT: addi sp, sp, 48 +; RV32IM-NEXT: lw s7, 12(a1) +; RV32IM-NEXT: mv s9, a0 +; RV32IM-NEXT: sw zero, 164(sp) +; RV32IM-NEXT: sw zero, 160(sp) +; RV32IM-NEXT: sw zero, 180(sp) +; RV32IM-NEXT: sw zero, 176(sp) +; RV32IM-NEXT: lui a0, 410452 +; RV32IM-NEXT: addi a0, a0, -952 +; RV32IM-NEXT: sw a0, 156(sp) +; RV32IM-NEXT: lui a0, 25653 +; RV32IM-NEXT: addi a0, a0, 965 +; RV32IM-NEXT: sw a0, 152(sp) +; RV32IM-NEXT: srli a0, s7, 1 +; RV32IM-NEXT: sw a0, 172(sp) +; RV32IM-NEXT: slli a0, s7, 31 +; RV32IM-NEXT: srli a1, s6, 1 +; RV32IM-NEXT: or a3, a1, a0 +; RV32IM-NEXT: addi a0, sp, 184 +; RV32IM-NEXT: addi a1, sp, 168 +; RV32IM-NEXT: addi a2, sp, 152 +; RV32IM-NEXT: sw a3, 168(sp) +; RV32IM-NEXT: call __multi3 +; RV32IM-NEXT: sw zero, 260(sp) +; RV32IM-NEXT: sw zero, 256(sp) +; RV32IM-NEXT: sw zero, 276(sp) +; RV32IM-NEXT: sw zero, 272(sp) +; RV32IM-NEXT: lui a0, 410312 +; RV32IM-NEXT: addi a0, a0, 1424 +; RV32IM-NEXT: sw a0, 252(sp) +; RV32IM-NEXT: lui a0, 729444 +; RV32IM-NEXT: addi a0, a0, 713 +; RV32IM-NEXT: sw a0, 248(sp) +; RV32IM-NEXT: sw s1, 268(sp) +; RV32IM-NEXT: sw s1, 4(sp) +; RV32IM-NEXT: addi a0, sp, 280 +; 
RV32IM-NEXT: addi a1, sp, 264 +; RV32IM-NEXT: addi a2, sp, 248 +; RV32IM-NEXT: sw s3, 264(sp) +; RV32IM-NEXT: call __multi3 +; RV32IM-NEXT: sw zero, 356(sp) +; RV32IM-NEXT: sw zero, 352(sp) +; RV32IM-NEXT: sw zero, 372(sp) +; RV32IM-NEXT: sw zero, 368(sp) +; RV32IM-NEXT: lui a0, 791991 +; RV32IM-NEXT: addi a0, a0, 77 +; RV32IM-NEXT: sw a0, 348(sp) +; RV32IM-NEXT: lui a0, 834723 +; RV32IM-NEXT: addi a0, a0, -179 +; RV32IM-NEXT: sw a0, 344(sp) +; RV32IM-NEXT: sw s5, 364(sp) +; RV32IM-NEXT: addi a0, sp, 376 +; RV32IM-NEXT: addi a1, sp, 360 +; RV32IM-NEXT: addi a2, sp, 344 +; RV32IM-NEXT: sw s2, 360(sp) +; RV32IM-NEXT: call __multi3 +; RV32IM-NEXT: sw zero, 116(sp) +; RV32IM-NEXT: sw zero, 112(sp) +; RV32IM-NEXT: sw zero, 108(sp) +; RV32IM-NEXT: sw zero, 104(sp) +; RV32IM-NEXT: sw zero, 132(sp) +; RV32IM-NEXT: sw zero, 128(sp) +; RV32IM-NEXT: lw s11, 192(sp) +; RV32IM-NEXT: sub a0, s6, s11 +; RV32IM-NEXT: sw a0, 120(sp) +; RV32IM-NEXT: lw s8, 196(sp) +; RV32IM-NEXT: sltu a0, s6, s11 +; RV32IM-NEXT: sub a1, s7, s8 +; RV32IM-NEXT: sub a3, a1, a0 +; RV32IM-NEXT: addi a0, sp, 136 +; RV32IM-NEXT: addi a1, sp, 120 +; RV32IM-NEXT: addi a2, sp, 104 +; RV32IM-NEXT: sw a3, 124(sp) +; RV32IM-NEXT: call __multi3 +; RV32IM-NEXT: sw zero, 212(sp) +; RV32IM-NEXT: sw zero, 208(sp) +; RV32IM-NEXT: lui a0, 524288 +; RV32IM-NEXT: sw a0, 204(sp) +; RV32IM-NEXT: sw zero, 200(sp) +; RV32IM-NEXT: sw zero, 228(sp) +; RV32IM-NEXT: sw zero, 224(sp) +; RV32IM-NEXT: lw s0, 288(sp) +; RV32IM-NEXT: sub a0, s3, s0 +; RV32IM-NEXT: sw a0, 216(sp) +; RV32IM-NEXT: lw s10, 292(sp) +; RV32IM-NEXT: sltu a0, s3, s0 +; RV32IM-NEXT: sub a1, s1, s10 +; RV32IM-NEXT: sub a3, a1, a0 +; RV32IM-NEXT: addi a0, sp, 232 +; RV32IM-NEXT: addi a1, sp, 216 +; RV32IM-NEXT: addi a2, sp, 200 +; RV32IM-NEXT: sw a3, 220(sp) +; RV32IM-NEXT: call __multi3 +; RV32IM-NEXT: sw zero, 308(sp) +; RV32IM-NEXT: sw zero, 304(sp) +; RV32IM-NEXT: sw zero, 300(sp) +; RV32IM-NEXT: sw zero, 296(sp) +; RV32IM-NEXT: sw zero, 324(sp) +; RV32IM-NEXT: sw zero, 320(sp) +; RV32IM-NEXT: lw s1, 384(sp) +; RV32IM-NEXT: sub a0, s2, s1 +; RV32IM-NEXT: sw a0, 312(sp) +; RV32IM-NEXT: lw s4, 388(sp) +; RV32IM-NEXT: sltu a0, s2, s1 +; RV32IM-NEXT: sub a1, s5, s4 +; RV32IM-NEXT: sub a3, a1, a0 +; RV32IM-NEXT: addi a0, sp, 328 +; RV32IM-NEXT: addi a1, sp, 312 +; RV32IM-NEXT: addi a2, sp, 296 +; RV32IM-NEXT: sw a3, 316(sp) +; RV32IM-NEXT: call __multi3 +; RV32IM-NEXT: lw a0, 148(sp) +; RV32IM-NEXT: lw a1, 144(sp) +; RV32IM-NEXT: add a0, a0, s8 +; RV32IM-NEXT: add a2, a1, s11 +; RV32IM-NEXT: sltu a1, a2, a1 +; RV32IM-NEXT: add a0, a0, a1 +; RV32IM-NEXT: srli a1, a0, 7 +; RV32IM-NEXT: addi a3, zero, 654 +; RV32IM-NEXT: mul a1, a1, a3 +; RV32IM-NEXT: slli a0, a0, 25 +; RV32IM-NEXT: srli a2, a2, 7 +; RV32IM-NEXT: or a0, a2, a0 +; RV32IM-NEXT: mulhu a2, a0, a3 +; RV32IM-NEXT: add a1, a2, a1 +; RV32IM-NEXT: sub a1, s7, a1 +; RV32IM-NEXT: mul a0, a0, a3 +; RV32IM-NEXT: lw a2, 244(sp) +; RV32IM-NEXT: lw a3, 240(sp) +; RV32IM-NEXT: sltu a4, s6, a0 +; RV32IM-NEXT: sub a1, a1, a4 +; RV32IM-NEXT: add a2, a2, s10 +; RV32IM-NEXT: add a4, a3, s0 +; RV32IM-NEXT: sltu a3, a4, a3 +; RV32IM-NEXT: add a2, a2, a3 +; RV32IM-NEXT: srli a3, a2, 4 +; RV32IM-NEXT: addi a5, zero, 23 +; RV32IM-NEXT: mul a3, a3, a5 +; RV32IM-NEXT: slli a2, a2, 28 +; RV32IM-NEXT: srli a4, a4, 4 +; RV32IM-NEXT: or a2, a4, a2 +; RV32IM-NEXT: mulhu a4, a2, a5 +; RV32IM-NEXT: add a3, a4, a3 +; RV32IM-NEXT: lw a4, 4(sp) +; RV32IM-NEXT: sub a3, a4, a3 +; RV32IM-NEXT: mul a2, a2, a5 +; RV32IM-NEXT: lw a4, 340(sp) +; RV32IM-NEXT: lw a5, 
336(sp) +; RV32IM-NEXT: sltu s0, s3, a2 +; RV32IM-NEXT: sub a3, a3, s0 +; RV32IM-NEXT: add a4, a4, s4 +; RV32IM-NEXT: add s1, a5, s1 +; RV32IM-NEXT: sltu a5, s1, a5 +; RV32IM-NEXT: add a4, a4, a5 +; RV32IM-NEXT: srli a5, a4, 12 +; RV32IM-NEXT: lui s0, 1 +; RV32IM-NEXT: addi s0, s0, 1327 +; RV32IM-NEXT: mul a5, a5, s0 +; RV32IM-NEXT: slli a4, a4, 20 +; RV32IM-NEXT: srli s1, s1, 12 +; RV32IM-NEXT: or a4, s1, a4 +; RV32IM-NEXT: mulhu s1, a4, s0 +; RV32IM-NEXT: add a5, s1, a5 +; RV32IM-NEXT: sub a5, s5, a5 +; RV32IM-NEXT: mul a4, a4, s0 +; RV32IM-NEXT: sltu s1, s2, a4 +; RV32IM-NEXT: sub a5, a5, s1 +; RV32IM-NEXT: sub a0, s6, a0 +; RV32IM-NEXT: sub a2, s3, a2 +; RV32IM-NEXT: sub a4, s2, a4 +; RV32IM-NEXT: sw zero, 4(s9) +; RV32IM-NEXT: sw zero, 0(s9) +; RV32IM-NEXT: sw a4, 24(s9) +; RV32IM-NEXT: sw a2, 16(s9) +; RV32IM-NEXT: sw a5, 28(s9) +; RV32IM-NEXT: sw a3, 20(s9) +; RV32IM-NEXT: sw a0, 8(s9) +; RV32IM-NEXT: sw a1, 12(s9) +; RV32IM-NEXT: lw s11, 396(sp) +; RV32IM-NEXT: lw s10, 400(sp) +; RV32IM-NEXT: lw s9, 404(sp) +; RV32IM-NEXT: lw s8, 408(sp) +; RV32IM-NEXT: lw s7, 412(sp) +; RV32IM-NEXT: lw s6, 416(sp) +; RV32IM-NEXT: lw s5, 420(sp) +; RV32IM-NEXT: lw s4, 424(sp) +; RV32IM-NEXT: lw s3, 428(sp) +; RV32IM-NEXT: lw s2, 432(sp) +; RV32IM-NEXT: lw s1, 436(sp) +; RV32IM-NEXT: lw s0, 440(sp) +; RV32IM-NEXT: lw ra, 444(sp) +; RV32IM-NEXT: addi sp, sp, 448 ; RV32IM-NEXT: ret ; ; RV64I-LABEL: dont_fold_urem_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) -; RV64I-NEXT: sd s0, 32(sp) -; RV64I-NEXT: sd s1, 24(sp) -; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: addi sp, sp, -64 +; RV64I-NEXT: sd ra, 56(sp) +; RV64I-NEXT: sd s0, 48(sp) +; RV64I-NEXT: sd s1, 40(sp) +; RV64I-NEXT: sd s2, 32(sp) +; RV64I-NEXT: sd s3, 24(sp) +; RV64I-NEXT: sd s4, 16(sp) +; RV64I-NEXT: sd s5, 8(sp) ; RV64I-NEXT: ld s2, 24(a1) -; RV64I-NEXT: ld s1, 16(a1) -; RV64I-NEXT: ld a2, 8(a1) -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: addi a1, zero, 654 -; RV64I-NEXT: mv a0, a2 -; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: ld s5, 16(a1) +; RV64I-NEXT: ld s1, 8(a1) ; RV64I-NEXT: mv s3, a0 +; RV64I-NEXT: srli a0, s1, 1 +; RV64I-NEXT: lui a1, 6413 +; RV64I-NEXT: addiw a1, a1, 1265 +; RV64I-NEXT: slli a1, a1, 13 +; RV64I-NEXT: addi a1, a1, 1027 +; RV64I-NEXT: slli a1, a1, 13 +; RV64I-NEXT: addi a1, a1, 1077 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a2, a1, 965 +; RV64I-NEXT: mv a1, zero +; RV64I-NEXT: mv a3, zero +; RV64I-NEXT: call __multi3 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: sub a0, s1, a1 +; RV64I-NEXT: mv a1, zero +; RV64I-NEXT: mv a2, zero +; RV64I-NEXT: mv a3, zero +; RV64I-NEXT: call __multi3 +; RV64I-NEXT: add a0, a1, s0 +; RV64I-NEXT: srli a0, a0, 7 +; RV64I-NEXT: addi a1, zero, 654 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s4, s1, a0 +; RV64I-NEXT: lui a0, 3206 +; RV64I-NEXT: addiw a0, a0, -1781 +; RV64I-NEXT: slli a0, a0, 13 +; RV64I-NEXT: addi a0, a0, 1069 +; RV64I-NEXT: slli a0, a0, 12 +; RV64I-NEXT: addi a0, a0, -1959 +; RV64I-NEXT: slli a0, a0, 14 +; RV64I-NEXT: addi a2, a0, 713 +; RV64I-NEXT: mv a0, s5 +; RV64I-NEXT: mv a1, zero +; RV64I-NEXT: mv a3, zero +; RV64I-NEXT: call __multi3 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: sub a0, s5, a1 +; RV64I-NEXT: addi a1, zero, -1 +; RV64I-NEXT: slli a2, a1, 63 +; RV64I-NEXT: mv a1, zero +; RV64I-NEXT: mv a3, zero +; RV64I-NEXT: call __multi3 +; RV64I-NEXT: add a0, a1, s0 +; RV64I-NEXT: srli a0, a0, 4 ; RV64I-NEXT: addi a1, zero, 23 -; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: mv 
s1, a0 -; RV64I-NEXT: lui a0, 1 -; RV64I-NEXT: addiw a1, a0, 1327 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s1, s5, a0 +; RV64I-NEXT: lui a0, 1044567 +; RV64I-NEXT: addiw a0, a0, -575 +; RV64I-NEXT: slli a0, a0, 12 +; RV64I-NEXT: addi a0, a0, 883 +; RV64I-NEXT: slli a0, a0, 14 +; RV64I-NEXT: addi a0, a0, -861 +; RV64I-NEXT: slli a0, a0, 12 +; RV64I-NEXT: addi a2, a0, -179 ; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: sd zero, 0(s0) -; RV64I-NEXT: sd a0, 24(s0) -; RV64I-NEXT: sd s1, 16(s0) -; RV64I-NEXT: sd s3, 8(s0) -; RV64I-NEXT: ld s3, 8(sp) -; RV64I-NEXT: ld s2, 16(sp) -; RV64I-NEXT: ld s1, 24(sp) -; RV64I-NEXT: ld s0, 32(sp) -; RV64I-NEXT: ld ra, 40(sp) -; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: mv a1, zero +; RV64I-NEXT: mv a3, zero +; RV64I-NEXT: call __multi3 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: sub a0, s2, a1 +; RV64I-NEXT: mv a1, zero +; RV64I-NEXT: mv a2, zero +; RV64I-NEXT: mv a3, zero +; RV64I-NEXT: call __multi3 +; RV64I-NEXT: add a0, a1, s0 +; RV64I-NEXT: srli a0, a0, 12 +; RV64I-NEXT: lui a1, 1 +; RV64I-NEXT: addiw a1, a1, 1327 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub a0, s2, a0 +; RV64I-NEXT: sd zero, 0(s3) +; RV64I-NEXT: sd a0, 24(s3) +; RV64I-NEXT: sd s1, 16(s3) +; RV64I-NEXT: sd s4, 8(s3) +; RV64I-NEXT: ld s5, 8(sp) +; RV64I-NEXT: ld s4, 16(sp) +; RV64I-NEXT: ld s3, 24(sp) +; RV64I-NEXT: ld s2, 32(sp) +; RV64I-NEXT: ld s1, 40(sp) +; RV64I-NEXT: ld s0, 48(sp) +; RV64I-NEXT: ld ra, 56(sp) +; RV64I-NEXT: addi sp, sp, 64 ; RV64I-NEXT: ret ; ; RV64IM-LABEL: dont_fold_urem_i64: diff --git a/llvm/test/CodeGen/SPARC/rem.ll b/llvm/test/CodeGen/SPARC/rem.ll --- a/llvm/test/CodeGen/SPARC/rem.ll +++ b/llvm/test/CodeGen/SPARC/rem.ll @@ -30,13 +30,31 @@ ; CHECK-LABEL: test3: ; CHECK: .cfi_startproc ; CHECK-NEXT: ! %bb.0: ! 
%entry -; CHECK-NEXT: sethi 2545, %o1 -; CHECK-NEXT: or %o1, 379, %o1 -; CHECK-NEXT: mulx %o0, %o1, %o0 -; CHECK-NEXT: udivx %o0, 1021, %o1 -; CHECK-NEXT: mulx %o1, 1021, %o1 -; CHECK-NEXT: retl -; CHECK-NEXT: sub %o0, %o1, %o0 +; CHECK-NEXT: save %sp, -176, %sp +; CHECK-NEXT: .cfi_def_cfa_register %fp +; CHECK-NEXT: .cfi_window_save +; CHECK-NEXT: .cfi_register %o7, %i7 +; CHECK-NEXT: sethi 2545, %i1 +; CHECK-NEXT: or %i1, 379, %i1 +; CHECK-NEXT: mulx %i0, %i1, %i0 +; CHECK-NEXT: sethi 1331003, %i1 +; CHECK-NEXT: or %i1, 435, %i1 +; CHECK-NEXT: sethi 12324, %i2 +; CHECK-NEXT: or %i2, 108, %i2 +; CHECK-NEXT: sllx %i2, 32, %i2 +; CHECK-NEXT: or %i2, %i1, %o3 +; CHECK-NEXT: mov 0, %o0 +; CHECK-NEXT: mov %i0, %o1 +; CHECK-NEXT: call __multi3 +; CHECK-NEXT: mov %o0, %o2 +; CHECK-NEXT: sub %i0, %o0, %i1 +; CHECK-NEXT: srlx %i1, 1, %i1 +; CHECK-NEXT: add %i1, %o0, %i1 +; CHECK-NEXT: srlx %i1, 9, %i1 +; CHECK-NEXT: mulx %i1, 1021, %i1 +; CHECK-NEXT: sub %i0, %i1, %i0 +; CHECK-NEXT: ret +; CHECK-NEXT: restore entry: %mul = mul i64 %b, 2606459 %rem = urem i64 %mul, 1021 diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll --- a/llvm/test/CodeGen/X86/divide-by-constant.ll +++ b/llvm/test/CodeGen/X86/divide-by-constant.ll @@ -294,19 +294,82 @@ define i64 @PR23590(i64 %x) nounwind { ; X32-LABEL: PR23590: ; X32: # %bb.0: # %entry -; X32-NEXT: subl $12, %esp -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $12345 # imm = 0x3039 -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: calll __umoddi3 -; X32-NEXT: addl $16, %esp -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $7 -; X32-NEXT: pushl %edx -; X32-NEXT: pushl %eax -; X32-NEXT: calll __udivdi3 -; X32-NEXT: addl $28, %esp +; X32-NEXT: pushl %ebp +; X32-NEXT: pushl %ebx +; X32-NEXT: pushl %edi +; X32-NEXT: pushl %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl $417841695, %ebx # imm = 0x18E7C21F +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %edi, %ebp +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl $1425045447, %edx # imm = 0x54F077C7 +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %edx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: adcl %ebx, %edi +; X32-NEXT: setb %al +; X32-NEXT: movzbl %al, %ebp +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl $1425045447, %edx # imm = 0x54F077C7 +; X32-NEXT: mull %edx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %edi, %eax +; X32-NEXT: adcl %ebp, %ebx +; X32-NEXT: shrdl $12, %ebx, %eax +; X32-NEXT: movl $12345, %edx # imm = 0x3039 +; X32-NEXT: mull %edx +; X32-NEXT: shrl $12, %ebx +; X32-NEXT: imull $12345, %ebx, %edi # imm = 0x3039 +; X32-NEXT: addl %edx, %edi +; X32-NEXT: subl %eax, %esi +; X32-NEXT: sbbl %edi, %ecx +; X32-NEXT: movl $-1840700269, %ebp # imm = 0x92492493 +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl $613566756, %edx # imm = 0x24924924 +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %edx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: adcl %edi, %ebx +; X32-NEXT: setb %al +; X32-NEXT: movzbl %al, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl 
$613566756, %edx # imm = 0x24924924 +; X32-NEXT: mull %edx +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: adcl %edi, %edx +; X32-NEXT: subl %eax, %esi +; X32-NEXT: sbbl %edx, %ecx +; X32-NEXT: movl %ecx, %edi +; X32-NEXT: shrl %edi +; X32-NEXT: shldl $31, %esi, %ecx +; X32-NEXT: addl %eax, %ecx +; X32-NEXT: adcl %edx, %edi +; X32-NEXT: shrdl $2, %edi, %ecx +; X32-NEXT: shrl $2, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %edi, %edx +; X32-NEXT: popl %esi +; X32-NEXT: popl %edi +; X32-NEXT: popl %ebx +; X32-NEXT: popl %ebp ; X32-NEXT: retl ; ; X64-FAST-LABEL: PR23590: @@ -355,27 +418,40 @@ ; X32-NEXT: pushl %ebx ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi -; X32-NEXT: subl $12, %esp -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $-294967296 # imm = 0xEE6B2800 -; X32-NEXT: pushl %ebp -; X32-NEXT: pushl %ebx -; X32-NEXT: calll __udivdi3 -; X32-NEXT: addl $16, %esp -; X32-NEXT: movl %eax, %esi +; X32-NEXT: pushl %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl %esi, %edi +; X32-NEXT: shldl $21, %ecx, %edi +; X32-NEXT: movl $-400107883, %ebx # imm = 0xE826D695 +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: shrl $11, %esi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl (%esp), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl $1125899, %edx # imm = 0x112E0B +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %edx ; X32-NEXT: movl %edx, %edi -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $-294967296 # imm = 0xEE6B2800 -; X32-NEXT: pushl %ebp -; X32-NEXT: pushl %ebx -; X32-NEXT: calll __umoddi3 -; X32-NEXT: addl $16, %esp -; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: adcl %ebp, %edi +; X32-NEXT: setb %al +; X32-NEXT: movzbl %al, %ebx ; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl %edi, %edx -; X32-NEXT: addl $12, %esp +; X32-NEXT: movl $1125899, %edx # imm = 0x112E0B +; X32-NEXT: mull %edx +; X32-NEXT: addl %edi, %eax +; X32-NEXT: adcl %ebx, %edx +; X32-NEXT: shrdl $9, %edx, %eax +; X32-NEXT: imull $-294967296, %eax, %esi # imm = 0xEE6B2800 +; X32-NEXT: subl %esi, %ecx +; X32-NEXT: shrl $9, %edx +; X32-NEXT: addl $4, %esp ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: popl %ebx @@ -409,27 +485,56 @@ ; X32-NEXT: pushl %ebx ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi -; X32-NEXT: subl $12, %esp -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: pushl %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $-294967296 # imm = 0xEE6B2800 -; X32-NEXT: pushl %ebp -; X32-NEXT: pushl %ebx -; X32-NEXT: calll __divdi3 -; X32-NEXT: addl $16, %esp -; X32-NEXT: movl %eax, %esi +; X32-NEXT: movl %ebp, %esi +; X32-NEXT: sarl $31, %esi +; X32-NEXT: movl $651596979, %edi # imm = 0x26D694B3 +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebx, %edi +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl $288230376, %edx # imm = 0x112E0BE8 +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %edx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %edi, %eax +; X32-NEXT: adcl %ebp, %ebx +; X32-NEXT: setb %al +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; 
X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl $288230376, %edx # imm = 0x112E0BE8 +; X32-NEXT: mull %edx ; X32-NEXT: movl %edx, %edi -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $-294967296 # imm = 0xEE6B2800 -; X32-NEXT: pushl %ebp -; X32-NEXT: pushl %ebx -; X32-NEXT: calll __moddi3 -; X32-NEXT: addl $16, %esp -; X32-NEXT: movl %eax, %ecx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: adcl (%esp), %edi # 4-byte Folded Reload ; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl %edi, %edx -; X32-NEXT: addl $12, %esp +; X32-NEXT: movl $651596979, %edx # imm = 0x26D694B3 +; X32-NEXT: mull %edx +; X32-NEXT: imull $288230376, %esi, %ebx # imm = 0x112E0BE8 +; X32-NEXT: addl %edx, %ebx +; X32-NEXT: imull $651596979, %esi, %esi # imm = 0x26D694B3 +; X32-NEXT: addl %ebx, %esi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: adcl %edi, %esi +; X32-NEXT: movl %esi, %edx +; X32-NEXT: sarl $28, %edx +; X32-NEXT: shrdl $28, %esi, %eax +; X32-NEXT: shrl $31, %esi +; X32-NEXT: addl %eax, %esi +; X32-NEXT: adcl $0, %edx +; X32-NEXT: imull $-294967296, %esi, %eax # imm = 0xEE6B2800 +; X32-NEXT: subl %eax, %ecx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: addl $4, %esp ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll --- a/llvm/test/CodeGen/X86/divmod128.ll +++ b/llvm/test/CodeGen/X86/divmod128.ll @@ -7,25 +7,97 @@ define i64 @mod128(i128 %x) nounwind { ; X86-64-LABEL: mod128: ; X86-64: # %bb.0: -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $3, %edx -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __modti3 -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: pushq %rbx +; X86-64-NEXT: movq %rdi, %r9 +; X86-64-NEXT: movabsq $6148914691236517206, %r10 # imm = 0x5555555555555556 +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %r10 +; X86-64-NEXT: movq %rdx, %r8 +; X86-64-NEXT: movq %rsi, %rax +; X86-64-NEXT: mulq %r10 +; X86-64-NEXT: movq %rdx, %r11 +; X86-64-NEXT: movq %rax, %rdi +; X86-64-NEXT: addq %r8, %rdi +; X86-64-NEXT: adcq $0, %r11 +; X86-64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; X86-64-NEXT: movq %r9, %rax +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: movq %rdx, %r8 +; X86-64-NEXT: addq %rdi, %rax +; X86-64-NEXT: adcq %r11, %r8 +; X86-64-NEXT: setb %al +; X86-64-NEXT: movzbl %al, %ebx +; X86-64-NEXT: movq %rsi, %rax +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: movq %rdx, %r11 +; X86-64-NEXT: movq %rax, %rdi +; X86-64-NEXT: addq %r8, %rdi +; X86-64-NEXT: adcq %rbx, %r11 +; X86-64-NEXT: sarq $63, %rsi +; X86-64-NEXT: imulq %rsi, %rcx +; X86-64-NEXT: movq %rsi, %rax +; X86-64-NEXT: mulq %r10 +; X86-64-NEXT: addq %rcx, %rdx +; X86-64-NEXT: imulq %r10, %rsi +; X86-64-NEXT: addq %rdx, %rsi +; X86-64-NEXT: addq %rdi, %rax +; X86-64-NEXT: adcq %r11, %rsi +; X86-64-NEXT: shrq $63, %rsi +; X86-64-NEXT: addq %rax, %rsi +; X86-64-NEXT: leaq (%rsi,%rsi,2), %rax +; X86-64-NEXT: subq %rax, %r9 +; X86-64-NEXT: movq %r9, %rax +; X86-64-NEXT: popq %rbx ; X86-64-NEXT: retq ; ; WIN64-LABEL: mod128: ; WIN64: # %bb.0: -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $3, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __modti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: pushq %rsi +; WIN64-NEXT: pushq %rdi +; WIN64-NEXT: pushq %rbx +; WIN64-NEXT: movq 
%rdx, %r8 +; WIN64-NEXT: movq %rcx, %r10 +; WIN64-NEXT: movabsq $6148914691236517206, %rsi # imm = 0x5555555555555556 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: mulq %rsi +; WIN64-NEXT: movq %rdx, %r9 +; WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: mulq %rsi +; WIN64-NEXT: movq %rdx, %r11 +; WIN64-NEXT: movq %rax, %rcx +; WIN64-NEXT: addq %r9, %rcx +; WIN64-NEXT: adcq $0, %r11 +; WIN64-NEXT: movabsq $6148914691236517205, %rdi # imm = 0x5555555555555555 +; WIN64-NEXT: movq %r10, %rax +; WIN64-NEXT: mulq %rdi +; WIN64-NEXT: movq %rdx, %r9 +; WIN64-NEXT: addq %rcx, %rax +; WIN64-NEXT: adcq %r11, %r9 +; WIN64-NEXT: setb %al +; WIN64-NEXT: movzbl %al, %ebx +; WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: mulq %rdi +; WIN64-NEXT: movq %rdx, %r11 +; WIN64-NEXT: movq %rax, %rcx +; WIN64-NEXT: addq %r9, %rcx +; WIN64-NEXT: adcq %rbx, %r11 +; WIN64-NEXT: sarq $63, %r8 +; WIN64-NEXT: imulq %r8, %rdi +; WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: mulq %rsi +; WIN64-NEXT: addq %rdi, %rdx +; WIN64-NEXT: imulq %rsi, %r8 +; WIN64-NEXT: addq %rdx, %r8 +; WIN64-NEXT: addq %rcx, %rax +; WIN64-NEXT: adcq %r11, %r8 +; WIN64-NEXT: shrq $63, %r8 +; WIN64-NEXT: addq %rax, %r8 +; WIN64-NEXT: leaq (%r8,%r8,2), %rax +; WIN64-NEXT: subq %rax, %r10 +; WIN64-NEXT: movq %r10, %rax +; WIN64-NEXT: popq %rbx +; WIN64-NEXT: popq %rdi +; WIN64-NEXT: popq %rsi ; WIN64-NEXT: retq @@ -37,25 +109,89 @@ define i64 @div128(i128 %x) nounwind { ; X86-64-LABEL: div128: ; X86-64: # %bb.0: -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $3, %edx -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __divti3 -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: movq %rdi, %r9 +; X86-64-NEXT: movabsq $6148914691236517206, %r10 # imm = 0x5555555555555556 +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %r10 +; X86-64-NEXT: movq %rdx, %r8 +; X86-64-NEXT: movq %rsi, %rax +; X86-64-NEXT: mulq %r10 +; X86-64-NEXT: movq %rdx, %r11 +; X86-64-NEXT: movq %rax, %rdi +; X86-64-NEXT: addq %r8, %rdi +; X86-64-NEXT: adcq $0, %r11 +; X86-64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; X86-64-NEXT: movq %r9, %rax +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: movq %rdx, %r8 +; X86-64-NEXT: addq %rdi, %rax +; X86-64-NEXT: adcq %r11, %r8 +; X86-64-NEXT: setb %al +; X86-64-NEXT: movzbl %al, %r11d +; X86-64-NEXT: movq %rsi, %rax +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: movq %rdx, %r9 +; X86-64-NEXT: movq %rax, %rdi +; X86-64-NEXT: addq %r8, %rdi +; X86-64-NEXT: adcq %r11, %r9 +; X86-64-NEXT: sarq $63, %rsi +; X86-64-NEXT: imulq %rsi, %rcx +; X86-64-NEXT: movq %rsi, %rax +; X86-64-NEXT: mulq %r10 +; X86-64-NEXT: addq %rcx, %rdx +; X86-64-NEXT: imulq %r10, %rsi +; X86-64-NEXT: addq %rdx, %rsi +; X86-64-NEXT: addq %rdi, %rax +; X86-64-NEXT: adcq %r9, %rsi +; X86-64-NEXT: shrq $63, %rsi +; X86-64-NEXT: addq %rsi, %rax ; X86-64-NEXT: retq ; ; WIN64-LABEL: div128: ; WIN64: # %bb.0: -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $3, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __divti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: pushq %rsi +; WIN64-NEXT: pushq %rdi +; WIN64-NEXT: pushq %rbx +; WIN64-NEXT: movq %rdx, %r8 +; WIN64-NEXT: movq %rcx, %r10 +; WIN64-NEXT: movabsq $6148914691236517206, %rbx # imm = 0x5555555555555556 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: mulq %rbx +; WIN64-NEXT: movq %rdx, %r9 +; 
WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: mulq %rbx +; WIN64-NEXT: movq %rdx, %r11 +; WIN64-NEXT: movq %rax, %rcx +; WIN64-NEXT: addq %r9, %rcx +; WIN64-NEXT: adcq $0, %r11 +; WIN64-NEXT: movabsq $6148914691236517205, %rdi # imm = 0x5555555555555555 +; WIN64-NEXT: movq %r10, %rax +; WIN64-NEXT: mulq %rdi +; WIN64-NEXT: movq %rdx, %r9 +; WIN64-NEXT: addq %rcx, %rax +; WIN64-NEXT: adcq %r11, %r9 +; WIN64-NEXT: setb %al +; WIN64-NEXT: movzbl %al, %esi +; WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: mulq %rdi +; WIN64-NEXT: movq %rdx, %r10 +; WIN64-NEXT: movq %rax, %rcx +; WIN64-NEXT: addq %r9, %rcx +; WIN64-NEXT: adcq %rsi, %r10 +; WIN64-NEXT: sarq $63, %r8 +; WIN64-NEXT: imulq %r8, %rdi +; WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: mulq %rbx +; WIN64-NEXT: addq %rdi, %rdx +; WIN64-NEXT: imulq %rbx, %r8 +; WIN64-NEXT: addq %rdx, %r8 +; WIN64-NEXT: addq %rcx, %rax +; WIN64-NEXT: adcq %r10, %r8 +; WIN64-NEXT: shrq $63, %r8 +; WIN64-NEXT: addq %r8, %rax +; WIN64-NEXT: popq %rbx +; WIN64-NEXT: popq %rdi +; WIN64-NEXT: popq %rsi ; WIN64-NEXT: retq @@ -67,25 +203,68 @@ define i64 @umod128(i128 %x) nounwind { ; X86-64-LABEL: umod128: ; X86-64: # %bb.0: -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $3, %edx -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __umodti3 -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: movq %rsi, %r8 +; X86-64-NEXT: movq %rdi, %r10 +; X86-64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: movq %rdx, %r9 +; X86-64-NEXT: movq %rsi, %rax +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: movq %rdx, %rsi +; X86-64-NEXT: movq %rax, %rcx +; X86-64-NEXT: addq %r9, %rcx +; X86-64-NEXT: adcq $0, %rsi +; X86-64-NEXT: movabsq $-6148914691236517206, %r9 # imm = 0xAAAAAAAAAAAAAAAA +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %r9 +; X86-64-NEXT: movq %rdx, %rdi +; X86-64-NEXT: addq %rcx, %rax +; X86-64-NEXT: adcq %rsi, %rdi +; X86-64-NEXT: setb %al +; X86-64-NEXT: movzbl %al, %ecx +; X86-64-NEXT: movq %r8, %rax +; X86-64-NEXT: mulq %r9 +; X86-64-NEXT: addq %rdi, %rax +; X86-64-NEXT: adcq %rcx, %rdx +; X86-64-NEXT: shldq $63, %rax, %rdx +; X86-64-NEXT: leaq (%rdx,%rdx,2), %rax +; X86-64-NEXT: subq %rax, %r10 +; X86-64-NEXT: movq %r10, %rax ; X86-64-NEXT: retq ; ; WIN64-LABEL: umod128: ; WIN64: # %bb.0: -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $3, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __umodti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: pushq %rsi +; WIN64-NEXT: movq %rdx, %r8 +; WIN64-NEXT: movq %rcx, %r10 +; WIN64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB +; WIN64-NEXT: movq %r10, %rax +; WIN64-NEXT: mulq %rcx +; WIN64-NEXT: movq %rdx, %r9 +; WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: mulq %rcx +; WIN64-NEXT: movq %rdx, %r11 +; WIN64-NEXT: movq %rax, %rcx +; WIN64-NEXT: addq %r9, %rcx +; WIN64-NEXT: adcq $0, %r11 +; WIN64-NEXT: movabsq $-6148914691236517206, %rsi # imm = 0xAAAAAAAAAAAAAAAA +; WIN64-NEXT: movq %r10, %rax +; WIN64-NEXT: mulq %rsi +; WIN64-NEXT: movq %rdx, %r9 +; WIN64-NEXT: addq %rcx, %rax +; WIN64-NEXT: adcq %r11, %r9 +; WIN64-NEXT: setb %al +; WIN64-NEXT: movzbl %al, %ecx +; WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: mulq %rsi +; WIN64-NEXT: addq %r9, %rax +; WIN64-NEXT: adcq %rcx, %rdx +; WIN64-NEXT: 
shldq $63, %rax, %rdx +; WIN64-NEXT: leaq (%rdx,%rdx,2), %rax +; WIN64-NEXT: subq %rax, %r10 +; WIN64-NEXT: movq %r10, %rax +; WIN64-NEXT: popq %rsi ; WIN64-NEXT: retq @@ -97,25 +276,61 @@ define i64 @udiv128(i128 %x) nounwind { ; X86-64-LABEL: udiv128: ; X86-64: # %bb.0: -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $3, %edx -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __udivti3 -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: movq %rsi, %r8 +; X86-64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: movq %rdx, %r9 +; X86-64-NEXT: movq %rsi, %rax +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: movq %rdx, %rsi +; X86-64-NEXT: movq %rax, %rcx +; X86-64-NEXT: addq %r9, %rcx +; X86-64-NEXT: adcq $0, %rsi +; X86-64-NEXT: movabsq $-6148914691236517206, %r9 # imm = 0xAAAAAAAAAAAAAAAA +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %r9 +; X86-64-NEXT: movq %rdx, %rdi +; X86-64-NEXT: addq %rcx, %rax +; X86-64-NEXT: adcq %rsi, %rdi +; X86-64-NEXT: setb %al +; X86-64-NEXT: movzbl %al, %ecx +; X86-64-NEXT: movq %r8, %rax +; X86-64-NEXT: mulq %r9 +; X86-64-NEXT: addq %rdi, %rax +; X86-64-NEXT: adcq %rdx, %rcx +; X86-64-NEXT: shrdq $1, %rcx, %rax ; X86-64-NEXT: retq ; ; WIN64-LABEL: udiv128: ; WIN64: # %bb.0: -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $3, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __udivti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: pushq %rsi +; WIN64-NEXT: movq %rdx, %r8 +; WIN64-NEXT: movq %rcx, %r10 +; WIN64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB +; WIN64-NEXT: movq %r10, %rax +; WIN64-NEXT: mulq %rcx +; WIN64-NEXT: movq %rdx, %r9 +; WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: mulq %rcx +; WIN64-NEXT: movq %rdx, %r11 +; WIN64-NEXT: movq %rax, %rcx +; WIN64-NEXT: addq %r9, %rcx +; WIN64-NEXT: adcq $0, %r11 +; WIN64-NEXT: movabsq $-6148914691236517206, %rsi # imm = 0xAAAAAAAAAAAAAAAA +; WIN64-NEXT: movq %r10, %rax +; WIN64-NEXT: mulq %rsi +; WIN64-NEXT: movq %rdx, %r9 +; WIN64-NEXT: addq %rcx, %rax +; WIN64-NEXT: adcq %r11, %r9 +; WIN64-NEXT: setb %al +; WIN64-NEXT: movzbl %al, %ecx +; WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: mulq %rsi +; WIN64-NEXT: addq %r9, %rax +; WIN64-NEXT: adcq %rdx, %rcx +; WIN64-NEXT: shrdq $1, %rcx, %rax +; WIN64-NEXT: popq %rsi ; WIN64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/i128-sdiv.ll b/llvm/test/CodeGen/X86/i128-sdiv.ll --- a/llvm/test/CodeGen/X86/i128-sdiv.ll +++ b/llvm/test/CodeGen/X86/i128-sdiv.ll @@ -1,155 +1,112 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefixes=CHECK,X86 -; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefixes=CHECK,X64 - +; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s ; Make sure none of these crash, and that the power-of-two transformations ; trigger correctly. 
-define i128 @test1(i128 %x) nounwind { -; X86-LABEL: test1: -; X86: # %bb.0: -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: sarl $31, %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: shrl $30, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: addl %edx, %edi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx -; X86-NEXT: adcl {{[0-9]+}}(%esp), %esi -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: shrdl $2, %ecx, %esi -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: sarl $2, %edx -; X86-NEXT: sarl $31, %ecx -; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %esi, (%eax) -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: retl $4 -; -; X64-LABEL: test1: -; X64: # %bb.0: -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: sarq $63, %rax -; X64-NEXT: movq %rax, %rdx -; X64-NEXT: shrq $62, %rdx -; X64-NEXT: addq %rdi, %rax -; X64-NEXT: adcq %rsi, %rdx -; X64-NEXT: movq %rdx, %rax -; X64-NEXT: sarq $2, %rax -; X64-NEXT: sarq $63, %rdx -; X64-NEXT: retq +define i128 @test1(i128 %x) { +; CHECK-LABEL: test1: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rsi, %rax +; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq %rax, %rdx +; CHECK-NEXT: shrq $62, %rdx +; CHECK-NEXT: addq %rdi, %rax +; CHECK-NEXT: adcq %rsi, %rdx +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: sarq $2, %rax +; CHECK-NEXT: sarq $63, %rdx +; CHECK-NEXT: retq %tmp = sdiv i128 %x, 73786976294838206464 ret i128 %tmp } -define i128 @test2(i128 %x) nounwind { -; X86-LABEL: test2: -; X86: # %bb.0: -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: sarl $31, %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: shrl $30, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: addl %edx, %edi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx -; X86-NEXT: adcl {{[0-9]+}}(%esp), %esi -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: shrdl $2, %ecx, %esi -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: sarl $31, %edx -; X86-NEXT: sarl $2, %ecx -; X86-NEXT: xorl %edi, %edi -; X86-NEXT: negl %esi -; X86-NEXT: movl $0, %ebx -; X86-NEXT: sbbl %ecx, %ebx -; X86-NEXT: movl $0, %ecx -; X86-NEXT: sbbl %edx, %ecx -; X86-NEXT: sbbl %edx, %edi -; X86-NEXT: movl %esi, (%eax) -; X86-NEXT: movl %ebx, 4(%eax) -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: retl $4 -; -; X64-LABEL: test2: -; X64: # %bb.0: -; X64-NEXT: movq %rsi, %rcx -; X64-NEXT: sarq $63, %rcx -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: shrq $62, %rax -; X64-NEXT: addq %rdi, %rcx -; X64-NEXT: adcq %rsi, %rax -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: sarq $63, %rcx -; X64-NEXT: sarq $2, %rax -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: negq %rax -; X64-NEXT: sbbq %rcx, %rdx -; X64-NEXT: retq +define i128 @test2(i128 %x) { +; CHECK-LABEL: test2: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rsi, %rcx +; CHECK-NEXT: sarq $63, %rcx +; CHECK-NEXT: movq %rcx, %rax +; CHECK-NEXT: shrq $62, %rax +; CHECK-NEXT: addq %rdi, %rcx +; CHECK-NEXT: adcq %rsi, %rax +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: sarq $63, %rcx +; CHECK-NEXT: sarq $2, %rax +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: negq %rax +; CHECK-NEXT: sbbq %rcx, %rdx +; CHECK-NEXT: retq %tmp = sdiv i128 %x, -73786976294838206464 ret i128 %tmp } -define i128 @test3(i128 %x) nounwind 
{ -; X86-LABEL: test3: -; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $16, %esp -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl %esp, %eax -; X86-NEXT: pushl $-1 -; X86-NEXT: pushl $-5 -; X86-NEXT: pushl $-1 -; X86-NEXT: pushl $-3 -; X86-NEXT: pushl 24(%ebp) -; X86-NEXT: pushl 20(%ebp) -; X86-NEXT: pushl 16(%ebp) -; X86-NEXT: pushl 12(%ebp) -; X86-NEXT: pushl %eax -; X86-NEXT: calll __divti3 -; X86-NEXT: addl $32, %esp -; X86-NEXT: movl (%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 12(%esi) -; X86-NEXT: movl %edx, 8(%esi) -; X86-NEXT: movl %ecx, 4(%esi) -; X86-NEXT: movl %eax, (%esi) -; X86-NEXT: movl %esi, %eax -; X86-NEXT: leal -8(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebp -; X86-NEXT: retl $4 -; -; X64-LABEL: test3: -; X64: # %bb.0: -; X64-NEXT: pushq %rax -; X64-NEXT: movq $-3, %rdx -; X64-NEXT: movq $-5, %rcx -; X64-NEXT: callq __divti3 -; X64-NEXT: popq %rcx -; X64-NEXT: retq +define i128 @test3(i128 %x) { +; CHECK-LABEL: test3: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: .cfi_offset %rbx, -24 +; CHECK-NEXT: .cfi_offset %r14, -16 +; CHECK-NEXT: movq %rsi, %r10 +; CHECK-NEXT: movq %rdi, %r9 +; CHECK-NEXT: movabsq $6917529027641081855, %r14 # imm = 0x5FFFFFFFFFFFFFFF +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: mulq %r14 +; CHECK-NEXT: movq %rdx, %r8 +; CHECK-NEXT: movq %rsi, %rax +; CHECK-NEXT: mulq %r14 +; CHECK-NEXT: movq %rdx, %rcx +; CHECK-NEXT: movq %rax, %rdi +; CHECK-NEXT: addq %r8, %rdi +; CHECK-NEXT: adcq $0, %rcx +; CHECK-NEXT: movabsq $-9223372036854775808, %rbx # imm = 0x8000000000000000 +; CHECK-NEXT: movq %r9, %rax +; CHECK-NEXT: mulq %rbx +; CHECK-NEXT: movq %rdx, %rsi +; CHECK-NEXT: addq %rdi, %rax +; CHECK-NEXT: adcq %rcx, %rsi +; CHECK-NEXT: setb %al +; CHECK-NEXT: movzbl %al, %ecx +; CHECK-NEXT: movq %r10, %rax +; CHECK-NEXT: mulq %rbx +; CHECK-NEXT: movq %rdx, %r11 +; CHECK-NEXT: movq %rax, %rdi +; CHECK-NEXT: addq %rsi, %rdi +; CHECK-NEXT: adcq %rcx, %r11 +; CHECK-NEXT: andq %r10, %rbx +; CHECK-NEXT: movq %r10, %rsi +; CHECK-NEXT: sarq $63, %rsi +; CHECK-NEXT: movq %rsi, %rax +; CHECK-NEXT: mulq %r14 +; CHECK-NEXT: movq %rax, %r8 +; CHECK-NEXT: addq %rbx, %rdx +; CHECK-NEXT: imulq %r14, %rsi +; CHECK-NEXT: addq %rdx, %rsi +; CHECK-NEXT: movq $-1, %rcx +; CHECK-NEXT: movq %r9, %rax +; CHECK-NEXT: mulq %rcx +; CHECK-NEXT: movq %rdx, %rcx +; CHECK-NEXT: subq %r9, %rcx +; CHECK-NEXT: subq %r10, %rcx +; CHECK-NEXT: addq %r8, %rax +; CHECK-NEXT: adcq %rsi, %rcx +; CHECK-NEXT: addq %rdi, %rax +; CHECK-NEXT: adcq %r11, %rcx +; CHECK-NEXT: movq %rcx, %rdx +; CHECK-NEXT: sarq $63, %rdx +; CHECK-NEXT: movq %rcx, %rax +; CHECK-NEXT: shrq $63, %rax +; CHECK-NEXT: sarq %rcx +; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: adcq $0, %rdx +; CHECK-NEXT: movq %rcx, %rax +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq %tmp = sdiv i128 %x, -73786976294838206467 ret i128 %tmp } diff --git a/llvm/test/CodeGen/X86/i128-udiv.ll b/llvm/test/CodeGen/X86/i128-udiv.ll --- a/llvm/test/CodeGen/X86/i128-udiv.ll +++ b/llvm/test/CodeGen/X86/i128-udiv.ll @@ -33,46 +33,74 @@ ; X86-LABEL: test2: ; X86: # %bb.0: ; X86-NEXT: 
pushl %ebp -; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $16, %esp -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl %esp, %eax -; X86-NEXT: pushl $-1 -; X86-NEXT: pushl $-4 -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $0 -; X86-NEXT: pushl 24(%ebp) -; X86-NEXT: pushl 20(%ebp) -; X86-NEXT: pushl 16(%ebp) -; X86-NEXT: pushl 12(%ebp) ; X86-NEXT: pushl %eax -; X86-NEXT: calll __udivti3 -; X86-NEXT: addl $32, %esp -; X86-NEXT: movl (%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 12(%esi) -; X86-NEXT: movl %edx, 8(%esi) -; X86-NEXT: movl %ecx, 4(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: shrdl $2, %esi, %ecx +; X86-NEXT: movl $4, %edi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: shrl $2, %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebx, %edi +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl $17, %ebx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %esi, %eax -; X86-NEXT: leal -8(%ebp), %esp +; X86-NEXT: mull %ebx +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: adcl (%esp), %edx # 4-byte Folded Reload +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %ebp, (%eax) +; X86-NEXT: setb %cl +; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: movl %ecx, 4(%eax) +; X86-NEXT: movl $0, 12(%eax) +; X86-NEXT: movl $0, 8(%eax) +; X86-NEXT: addl $4, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx ; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: test2: ; X64: # %bb.0: -; X64-NEXT: pushq %rax -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: movq $-4, %rcx -; X64-NEXT: callq __udivti3 -; X64-NEXT: popq %rcx +; X64-NEXT: shrq $2, %rsi +; X64-NEXT: movl $17, %ecx +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: addq %r8, %rcx +; X64-NEXT: adcq $0, %rdi +; X64-NEXT: movl $4, %r8d +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: adcq %rdi, %rsi +; X64-NEXT: setb %al +; X64-NEXT: movzbl %al, %ecx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: mulq %r8 +; X64-NEXT: addq %rsi, %rax +; X64-NEXT: adcq %rcx, %rdx ; X64-NEXT: retq %tmp = udiv i128 %x, -73786976294838206464 ret i128 %tmp @@ -82,46 +110,131 @@ ; X86-LABEL: test3: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $16, %esp -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl %esp, %eax -; X86-NEXT: pushl $-1 -; X86-NEXT: pushl $-5 -; X86-NEXT: pushl $-1 -; X86-NEXT: pushl $-3 -; X86-NEXT: pushl 24(%ebp) -; X86-NEXT: pushl 20(%ebp) -; X86-NEXT: pushl 16(%ebp) -; X86-NEXT: pushl 12(%ebp) -; X86-NEXT: pushl %eax -; X86-NEXT: calll __udivti3 -; X86-NEXT: addl $32, %esp -; X86-NEXT: movl (%esp), %eax +; X86-NEXT: subl $24, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 
12(%esi) -; X86-NEXT: movl %edx, 8(%esi) -; X86-NEXT: movl %ecx, 4(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl $1073741824, %ebx # imm = 0x40000000 +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl $1073741824, %ebx # imm = 0x40000000 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %esi, %ecx +; X86-NEXT: adcl %edx, %edi +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, %edx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %eax, %edx +; X86-NEXT: addl %esi, %edx +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: adcl $0, %eax +; X86-NEXT: setb %bl +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl $1073741824, %edx # imm = 0x40000000 +; X86-NEXT: mull %edx +; X86-NEXT: addl %esi, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %ebp, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzbl %bl, %ecx +; X86-NEXT: adcl %edx, %ecx +; X86-NEXT: movl $5, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %esi, %eax -; X86-NEXT: leal -8(%ebp), %esp +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ebx, %esi +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl $5, %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl $5, %edx +; X86-NEXT: mull %edx +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: adcl $0, %esi +; X86-NEXT: setb %al +; X86-NEXT: movl %ebp, %ebx +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: addl {{[0-9]+}}(%esp), %edx +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: setb %dl +; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: addb $255, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: addb $255, %dl +; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: addb $255, %al +; X86-NEXT: adcl %ebp, %edi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: setb %al +; X86-NEXT: addl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: shrl $30, %ecx +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl $0, 12(%eax) +; X86-NEXT: movl $0, 8(%eax) +; X86-NEXT: movl $0, 4(%eax) +; X86-NEXT: addl $24, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx ; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: test3: ; X64: # %bb.0: -; X64-NEXT: pushq %rax -; X64-NEXT: movq $-3, %rdx -; X64-NEXT: movq $-5, %rcx -; X64-NEXT: callq __udivti3 -; X64-NEXT: popq %rcx +; 
X64-NEXT: movq %rsi, %r8 +; X64-NEXT: movl $5, %ecx +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: addq %r9, %rcx +; X64-NEXT: adcq $0, %rsi +; X64-NEXT: movabsq $4611686018427387905, %r9 # imm = 0x4000000000000001 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: adcq %rsi, %rdi +; X64-NEXT: setb %al +; X64-NEXT: movzbl %al, %ecx +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %r9 +; X64-NEXT: addq %rdi, %rax +; X64-NEXT: adcq %rdx, %rcx +; X64-NEXT: shrq $62, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: xorl %edx, %edx ; X64-NEXT: retq %tmp = udiv i128 %x, -73786976294838206467 ret i128 %tmp diff --git a/llvm/test/CodeGen/X86/pr44812.ll b/llvm/test/CodeGen/X86/pr44812.ll --- a/llvm/test/CodeGen/X86/pr44812.ll +++ b/llvm/test/CodeGen/X86/pr44812.ll @@ -4,18 +4,28 @@ define <2 x i32> @foo(<2 x i32> %tmp) { ; CHECK-LABEL: foo: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: leal 7(%eax), %ecx -; CHECK-NEXT: testl %eax, %eax -; CHECK-NEXT: cmovnsl %eax, %ecx -; CHECK-NEXT: sarl $3, %ecx +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: .cfi_offset %esi, -8 +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl $-2147483647, %edx # imm = 0x80000001 +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: imull %edx +; CHECK-NEXT: movl %edx, %esi +; CHECK-NEXT: addl %ecx, %esi +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: shrl $31, %eax +; CHECK-NEXT: sarl $2, %esi +; CHECK-NEXT: addl %eax, %esi ; CHECK-NEXT: movl $1717986919, %eax # imm = 0x66666667 ; CHECK-NEXT: imull {{[0-9]+}}(%esp) ; CHECK-NEXT: movl %edx, %eax ; CHECK-NEXT: shrl $31, %eax ; CHECK-NEXT: sarl $2, %edx ; CHECK-NEXT: addl %edx, %eax -; CHECK-NEXT: movl %ecx, %edx +; CHECK-NEXT: movl %esi, %edx +; CHECK-NEXT: popl %esi +; CHECK-NEXT: .cfi_def_cfa_offset 4 ; CHECK-NEXT: retl entry: %tmp1 = sdiv <2 x i32> %tmp, diff --git a/llvm/test/CodeGen/X86/rotate-extract-vector.ll b/llvm/test/CodeGen/X86/rotate-extract-vector.ll --- a/llvm/test/CodeGen/X86/rotate-extract-vector.ll +++ b/llvm/test/CodeGen/X86/rotate-extract-vector.ll @@ -56,28 +56,75 @@ define <2 x i64> @vrolq_extract_udiv(<2 x i64> %i) nounwind { ; X86-LABEL: vrolq_extract_udiv: ; X86: # %bb.0: -; X86-NEXT: subl $44, %esp -; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: vmovss %xmm0, (%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $3, {{[0-9]+}}(%esp) -; X86-NEXT: calll __udivdi3 -; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: vextractps $2, %xmm0, (%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $3, {{[0-9]+}}(%esp) -; X86-NEXT: vmovd %eax, %xmm0 -; X86-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 -; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-NEXT: calll __udivdi3 -; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: vmovd %xmm0, %ecx +; X86-NEXT: movl $-1431655765, %ebx # imm = 0xAAAAAAAB +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: vpextrd $1, %xmm0, 
%edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %esi, %ebp +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl $-1431655766, %edx # imm = 0xAAAAAAAA +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: adcl %ebx, %ecx +; X86-NEXT: setb %al +; X86-NEXT: movzbl %al, %esi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl $-1431655766, %edx # imm = 0xAAAAAAAA +; X86-NEXT: mull %edx +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: shrl %ecx +; X86-NEXT: shldl $31, %eax, %edx +; X86-NEXT: vmovd %edx, %xmm1 +; X86-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 +; X86-NEXT: vpextrd $2, %xmm0, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl $-1431655765, %ebx # imm = 0xAAAAAAAB +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: vpextrd $3, %xmm0, %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %esi, %ebp +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl $-1431655766, %ecx # imm = 0xAAAAAAAA +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: adcl %ebx, %ecx +; X86-NEXT: setb %al +; X86-NEXT: movzbl %al, %esi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl $-1431655766, %edx # imm = 0xAAAAAAAA +; X86-NEXT: mull %edx +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: shrdl $1, %edx, %eax +; X86-NEXT: vpinsrd $2, %eax, %xmm1, %xmm0 +; X86-NEXT: shrl %edx ; X86-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 ; X86-NEXT: vprolq $57, %zmm0, %zmm0 ; X86-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; X86-NEXT: addl $44, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: vzeroupper ; X86-NEXT: retl ; @@ -207,46 +254,121 @@ define <2 x i64> @no_extract_udiv(<2 x i64> %i) nounwind { ; X86-LABEL: no_extract_udiv: ; X86: # %bb.0: -; X86-NEXT: subl $60, %esp -; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: vmovss %xmm0, (%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $3, {{[0-9]+}}(%esp) -; X86-NEXT: calll __udivdi3 -; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: vextractps $2, %xmm0, (%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $3, {{[0-9]+}}(%esp) -; X86-NEXT: vmovd %eax, %xmm0 -; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-NEXT: calll __udivdi3 -; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 -; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: vmovss %xmm0, (%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $770, {{[0-9]+}}(%esp) # imm = 0x302 -; X86-NEXT: calll __udivdi3 -; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: vextractps $2, %xmm0, (%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $770, {{[0-9]+}}(%esp) # imm = 0x302 -; X86-NEXT: vmovd %eax, %xmm0 -; X86-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 -; X86-NEXT: vmovdqu %xmm0, 
{{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-NEXT: calll __udivdi3 -; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 -; X86-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 -; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload -; X86-NEXT: vpsllq $56, %xmm1, %xmm1 -; X86-NEXT: vpor %xmm0, %xmm1, %xmm0 -; X86-NEXT: addl $60, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $20, %esp +; X86-NEXT: vpextrd $2, %xmm0, %ecx +; X86-NEXT: movl $-1431655765, %ebx # imm = 0xAAAAAAAB +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: vpextrd $3, %xmm0, %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %edi, %ebp +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl $-1431655766, %edx # imm = 0xAAAAAAAA +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: adcl %ebx, %edx +; X86-NEXT: imull $-1431655766, %esi, %eax # imm = 0xAAAAAAAA +; X86-NEXT: addl %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: vmovd %xmm0, %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl $-1431655765, %ecx # imm = 0xAAAAAAAB +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: vpextrd $1, %xmm0, %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %esi, %ebp +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl $-1431655766, %edx # imm = 0xAAAAAAAA +; X86-NEXT: mull %edx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: adcl %ecx, %edx +; X86-NEXT: imull $-1431655766, %edi, %eax # imm = 0xAAAAAAAA +; X86-NEXT: addl %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $-602410997, %esi # imm = 0xDC17F00B +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl $-1439092939, %ecx # imm = 0xAA392F35 +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: setb %bl +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl $-1439092939, %edx # imm = 0xAA392F35 +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movzbl %bl, %ecx +; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: movl %esi, %ebp +; X86-NEXT: shldl $23, %eax, %esi +; X86-NEXT: shrl $9, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl $-602410997, %edi # imm = 0xDC17F00B +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl $-1439092939, %ecx # imm = 0xAA392F35 +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: vmovd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte 
Folded Reload +; X86-NEXT: # xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vpinsrd $2, {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; X86-NEXT: vmovd %esi, %xmm1 +; X86-NEXT: vpinsrd $1, %ebp, %xmm1, %xmm1 +; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: setb %bl +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movl $-1439092939, %edx # imm = 0xAA392F35 +; X86-NEXT: mull %edx +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movzbl %bl, %ecx +; X86-NEXT: adcl %ecx, %edx +; X86-NEXT: shrdl $9, %edx, %eax +; X86-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; X86-NEXT: shrl $9, %edx +; X86-NEXT: vpinsrd $3, %edx, %xmm1, %xmm1 +; X86-NEXT: vpsllq $55, %xmm0, %xmm0 +; X86-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X86-NEXT: addl $20, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: no_extract_udiv: @@ -261,7 +383,8 @@ ; X64-NEXT: mulq %rdi ; X64-NEXT: vmovq %rdx, %xmm0 ; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X64-NEXT: vpsrlq $1, %xmm0, %xmm0 +; X64-NEXT: vpsllq $55, %xmm0, %xmm0 +; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; X64-NEXT: movabsq $-6180857105216966645, %rdi # imm = 0xAA392F35DC17F00B ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %rdi @@ -271,7 +394,6 @@ ; X64-NEXT: vmovq %rdx, %xmm2 ; X64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; X64-NEXT: vpsrlq $9, %xmm1, %xmm1 -; X64-NEXT: vpsllq $56, %xmm0, %xmm0 ; X64-NEXT: vpor %xmm1, %xmm0, %xmm0 ; X64-NEXT: retq %lhs_div = udiv <2 x i64> %i, diff --git a/llvm/test/CodeGen/X86/rotate-extract.ll b/llvm/test/CodeGen/X86/rotate-extract.ll --- a/llvm/test/CodeGen/X86/rotate-extract.ll +++ b/llvm/test/CodeGen/X86/rotate-extract.ll @@ -232,31 +232,31 @@ ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: imull $171, %eax, %ecx +; X86-NEXT: shlb $3, %ch +; X86-NEXT: andb $-16, %ch ; X86-NEXT: imull $79, %eax, %edx ; X86-NEXT: subb %dh, %al ; X86-NEXT: shrb %al ; X86-NEXT: addb %dh, %al ; X86-NEXT: shrb $5, %al -; X86-NEXT: shlb $3, %ch -; X86-NEXT: orb %al, %ch -; X86-NEXT: andb $-9, %ch -; X86-NEXT: movb %ch, %al +; X86-NEXT: orb %ch, %al +; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl ; ; X64-LABEL: no_extract_udiv: ; X64: # %bb.0: -; X64-NEXT: movzbl %dil, %ecx -; X64-NEXT: imull $171, %ecx, %eax -; X64-NEXT: shrl $8, %eax -; X64-NEXT: imull $79, %ecx, %edx +; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: imull $171, %eax, %ecx +; X64-NEXT: shrl $8, %ecx +; X64-NEXT: shlb $3, %cl +; X64-NEXT: andb $-16, %cl +; X64-NEXT: imull $79, %eax, %edx ; X64-NEXT: shrl $8, %edx -; X64-NEXT: subb %dl, %cl -; X64-NEXT: shrb %cl -; X64-NEXT: addb %dl, %cl -; X64-NEXT: shrb $5, %cl -; X64-NEXT: shlb $3, %al +; X64-NEXT: subb %dl, %al +; X64-NEXT: shrb %al +; X64-NEXT: addb %dl, %al +; X64-NEXT: shrb $5, %al ; X64-NEXT: orb %cl, %al -; X64-NEXT: andb $-9, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %lhs_div = udiv i8 %i, 3 diff --git a/llvm/test/CodeGen/X86/srem-vector-lkk.ll b/llvm/test/CodeGen/X86/srem-vector-lkk.ll --- a/llvm/test/CodeGen/X86/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/X86/srem-vector-lkk.ll @@ -6,103 +6,34 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) { ; SSE-LABEL: fold_srem_vec_1: ; SSE: # %bb.0: -; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $32081, %ecx, %ecx # imm = 0x7D51 -; SSE-NEXT: shrl $16, %ecx -; SSE-NEXT: subl %eax, %ecx -; SSE-NEXT: movzwl %cx, %ecx -; SSE-NEXT: movswl %cx, %edx 
-; SSE-NEXT: shrl $15, %ecx -; SSE-NEXT: sarl $9, %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: imull $-1003, %edx, %ecx # imm = 0xFC15 -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: movd %xmm0, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: imull $-21385, %edx, %edx # imm = 0xAC77 -; SSE-NEXT: shrl $16, %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: movzwl %dx, %edx -; SSE-NEXT: movswl %dx, %esi -; SSE-NEXT: shrl $15, %edx -; SSE-NEXT: sarl $6, %esi -; SSE-NEXT: addl %edx, %esi -; SSE-NEXT: imull $95, %esi, %edx -; SSE-NEXT: subl %edx, %ecx -; SSE-NEXT: movd %ecx, %xmm1 -; SSE-NEXT: pextrw $1, %xmm0, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: imull $-16913, %edx, %edx # imm = 0xBDEF -; SSE-NEXT: movl %edx, %esi -; SSE-NEXT: shrl $31, %esi -; SSE-NEXT: sarl $21, %edx -; SSE-NEXT: addl %esi, %edx -; SSE-NEXT: imull $-124, %edx, %edx -; SSE-NEXT: subl %edx, %ecx -; SSE-NEXT: pinsrw $1, %ecx, %xmm1 -; SSE-NEXT: pextrw $2, %xmm0, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: imull $2675, %edx, %edx # imm = 0xA73 -; SSE-NEXT: movl %edx, %esi -; SSE-NEXT: shrl $31, %esi -; SSE-NEXT: sarl $18, %edx -; SSE-NEXT: addl %esi, %edx -; SSE-NEXT: imull $98, %edx, %edx -; SSE-NEXT: subl %edx, %ecx -; SSE-NEXT: pinsrw $2, %ecx, %xmm1 -; SSE-NEXT: pinsrw $3, %eax, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pmovsxwd %xmm0, %xmm1 +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: packusdw %xmm1, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = <1,0,0,65535,u,u,u,u> +; SSE-NEXT: pmullw %xmm0, %xmm2 +; SSE-NEXT: paddw %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: psrlw $15, %xmm1 +; SSE-NEXT: pmulhw {{.*}}(%rip), %xmm2 +; SSE-NEXT: paddw %xmm1, %xmm2 +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm2 +; SSE-NEXT: psubw %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: fold_srem_vec_1: ; AVX: # %bb.0: -; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $32081, %ecx, %ecx # imm = 0x7D51 -; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: subl %eax, %ecx -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: shrl $15, %ecx -; AVX-NEXT: sarl $9, %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: imull $-1003, %edx, %ecx # imm = 0xFC15 -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vmovd %xmm0, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: imull $-21385, %edx, %edx # imm = 0xAC77 -; AVX-NEXT: shrl $16, %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: movswl %dx, %esi -; AVX-NEXT: shrl $15, %edx -; AVX-NEXT: sarl $6, %esi -; AVX-NEXT: addl %edx, %esi -; AVX-NEXT: imull $95, %esi, %edx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vpextrw $1, %xmm0, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: imull $-16913, %edx, %edx # imm = 0xBDEF -; AVX-NEXT: movl %edx, %esi -; AVX-NEXT: shrl $31, %esi -; AVX-NEXT: sarl $21, %edx -; AVX-NEXT: addl %esi, %edx -; AVX-NEXT: imull $-124, %edx, %edx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $2, %xmm0, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: imull $2675, %edx, %edx # imm = 0xA73 -; AVX-NEXT: movl %edx, %esi -; AVX-NEXT: shrl $31, %esi -; AVX-NEXT: sarl $18, %edx -; AVX-NEXT: addl %esi, %edx -; AVX-NEXT: imull $98, %edx, %edx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm0 -; AVX-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 
+; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm2 +; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2 +; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -111,8 +42,10 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) { ; SSE-LABEL: fold_srem_vec_2: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] -; SSE-NEXT: pmulhw %xmm0, %xmm1 +; SSE-NEXT: pmovsxwd %xmm0, %xmm1 +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: packusdw %xmm1, %xmm1 ; SSE-NEXT: paddw %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: psrlw $15, %xmm2 @@ -122,16 +55,34 @@ ; SSE-NEXT: psubw %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: fold_srem_vec_2: -; AVX: # %bb.0: -; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2 -; AVX-NEXT: vpsraw $6, %xmm1, %xmm1 -; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: fold_srem_vec_2: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $15, %xmm1, %xmm2 +; AVX1-NEXT: vpsraw $6, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: fold_srem_vec_2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4294945911,4294945911,4294945911,4294945911] +; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlw $15, %xmm1, %xmm2 +; AVX2-NEXT: vpsraw $6, %xmm1, %xmm1 +; AVX2-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq %1 = srem <4 x i16> %x, ret <4 x i16> %1 } @@ -141,30 +92,51 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) { ; SSE-LABEL: combine_srem_sdiv: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] -; SSE-NEXT: pmulhw %xmm0, %xmm1 +; SSE-NEXT: pmovsxwd %xmm0, %xmm1 +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: packusdw %xmm1, %xmm1 ; SSE-NEXT: paddw %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: psrlw $15, %xmm2 ; SSE-NEXT: psraw $6, %xmm1 ; SSE-NEXT: paddw %xmm2, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = <95,95,95,95,u,u,u,u> ; SSE-NEXT: pmullw %xmm1, %xmm2 ; SSE-NEXT: psubw %xmm2, %xmm0 ; SSE-NEXT: paddw %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_srem_sdiv: -; AVX: # %bb.0: -; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2 -; AVX-NEXT: vpsraw $6, %xmm1, %xmm1 -; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm2 -; AVX-NEXT: vpsubw %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_srem_sdiv: 
+; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $15, %xmm1, %xmm2 +; AVX1-NEXT: vpsraw $6, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm2 +; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_srem_sdiv: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4294945911,4294945911,4294945911,4294945911] +; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlw $15, %xmm1, %xmm2 +; AVX2-NEXT: vpsraw $6, %xmm1, %xmm1 +; AVX2-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm2 +; AVX2-NEXT: vpsubw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq %1 = srem <4 x i16> %x, %2 = sdiv <4 x i16> %x, %3 = add <4 x i16> %1, %2 @@ -175,79 +147,31 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { ; SSE-LABEL: dont_fold_srem_power_of_two: ; SSE: # %bb.0: -; SSE-NEXT: pextrw $1, %xmm0, %eax -; SSE-NEXT: leal 31(%rax), %ecx -; SSE-NEXT: testw %ax, %ax -; SSE-NEXT: cmovnsl %eax, %ecx -; SSE-NEXT: andl $-32, %ecx -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: movd %xmm0, %ecx -; SSE-NEXT: leal 63(%rcx), %edx -; SSE-NEXT: testw %cx, %cx -; SSE-NEXT: cmovnsl %ecx, %edx -; SSE-NEXT: andl $-64, %edx -; SSE-NEXT: subl %edx, %ecx -; SSE-NEXT: movd %ecx, %xmm1 -; SSE-NEXT: pinsrw $1, %eax, %xmm1 -; SSE-NEXT: pextrw $2, %xmm0, %eax -; SSE-NEXT: leal 7(%rax), %ecx -; SSE-NEXT: testw %ax, %ax -; SSE-NEXT: cmovnsl %eax, %ecx -; SSE-NEXT: andl $-8, %ecx -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $2, %eax, %xmm1 -; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $-21385, %ecx, %ecx # imm = 0xAC77 -; SSE-NEXT: shrl $16, %ecx -; SSE-NEXT: addl %eax, %ecx -; SSE-NEXT: movzwl %cx, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: shrl $15, %ecx -; SSE-NEXT: sarl $6, %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: imull $95, %edx, %ecx -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $3, %eax, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pmovsxwd %xmm0, %xmm1 +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: packusdw %xmm1, %xmm1 +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrlw $15, %xmm2 +; SSE-NEXT: pmulhw {{.*}}(%rip), %xmm1 +; SSE-NEXT: paddw %xmm2, %xmm1 +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1 +; SSE-NEXT: psubw %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: dont_fold_srem_power_of_two: ; AVX: # %bb.0: -; AVX-NEXT: vpextrw $1, %xmm0, %eax -; AVX-NEXT: leal 31(%rax), %ecx -; AVX-NEXT: testw %ax, %ax -; AVX-NEXT: cmovnsl %eax, %ecx -; AVX-NEXT: andl $-32, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vmovd %xmm0, %ecx -; AVX-NEXT: leal 63(%rcx), %edx -; AVX-NEXT: testw %cx, %cx -; AVX-NEXT: cmovnsl %ecx, %edx -; AVX-NEXT: andl $-64, %edx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $2, %xmm0, %eax -; AVX-NEXT: leal 7(%rax), %ecx -; AVX-NEXT: testw %ax, %ax -; AVX-NEXT: cmovnsl %eax, %ecx -; AVX-NEXT: andl $-8, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrw $2, %eax, 
%xmm1, %xmm1 -; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $-21385, %ecx, %ecx # imm = 0xAC77 -; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: shrl $15, %ecx -; AVX-NEXT: sarl $6, %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: imull $95, %edx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 +; AVX-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2 +; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -257,83 +181,40 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { ; SSE-LABEL: dont_fold_srem_one: ; SSE: # %bb.0: -; SSE-NEXT: pextrw $2, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217 -; SSE-NEXT: shrl $16, %ecx -; SSE-NEXT: addl %eax, %ecx -; SSE-NEXT: movzwl %cx, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: shrl $15, %ecx -; SSE-NEXT: sarl $4, %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: leal (%rdx,%rdx,2), %ecx -; SSE-NEXT: shll $3, %ecx -; SSE-NEXT: subl %ecx, %edx -; SSE-NEXT: addl %eax, %edx -; SSE-NEXT: pextrw $1, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $12827, %ecx, %ecx # imm = 0x321B -; SSE-NEXT: movl %ecx, %esi -; SSE-NEXT: shrl $31, %esi -; SSE-NEXT: sarl $23, %ecx -; SSE-NEXT: addl %esi, %ecx -; SSE-NEXT: imull $654, %ecx, %ecx # imm = 0x28E -; SSE-NEXT: subl %ecx, %eax ; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pinsrw $1, %eax, %xmm1 -; SSE-NEXT: pinsrw $2, %edx, %xmm1 -; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057 -; SSE-NEXT: movl %ecx, %edx -; SSE-NEXT: shrl $31, %edx -; SSE-NEXT: sarl $26, %ecx -; SSE-NEXT: addl %edx, %ecx -; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $3, %eax, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2],xmm2[3,4,5,6,7] +; SSE-NEXT: pmovsxwd %xmm0, %xmm3 +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm3 +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: packusdw %xmm3, %xmm3 +; SSE-NEXT: paddw %xmm2, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = +; SSE-NEXT: pmulhw %xmm3, %xmm2 +; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7] +; SSE-NEXT: psrlw $15, %xmm3 +; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3],xmm1[4,5,6,7] +; SSE-NEXT: paddw %xmm2, %xmm3 +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm3 +; SSE-NEXT: psubw %xmm3, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: dont_fold_srem_one: ; AVX: # %bb.0: -; AVX-NEXT: vpextrw $2, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217 -; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: shrl $15, %ecx -; AVX-NEXT: sarl $4, %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: leal (%rdx,%rdx,2), %ecx -; AVX-NEXT: shll $3, %ecx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: addl %eax, %edx -; AVX-NEXT: vpextrw $1, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $12827, %ecx, %ecx # imm = 0x321B -; AVX-NEXT: movl 
%ecx, %esi -; AVX-NEXT: shrl $31, %esi -; AVX-NEXT: sarl $23, %ecx -; AVX-NEXT: addl %esi, %ecx -; AVX-NEXT: imull $654, %ecx, %ecx # imm = 0x28E -; AVX-NEXT: subl %ecx, %eax ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057 -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: shrl $31, %edx -; AVX-NEXT: sarl $26, %ecx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7] +; AVX-NEXT: vpmovsxwd %xmm0, %xmm3 +; AVX-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3 +; AVX-NEXT: vpsrld $16, %xmm3, %xmm3 +; AVX-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 +; AVX-NEXT: vpaddw %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm2, %xmm3 +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3,4,5,6,7] +; AVX-NEXT: vpsrlw $15, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4,5,6,7] +; AVX-NEXT: vpaddw %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -343,77 +224,40 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { ; SSE-LABEL: dont_fold_urem_i16_smax: ; SSE: # %bb.0: -; SSE-NEXT: pextrw $2, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217 -; SSE-NEXT: shrl $16, %ecx -; SSE-NEXT: addl %eax, %ecx -; SSE-NEXT: movzwl %cx, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: shrl $15, %ecx -; SSE-NEXT: sarl $4, %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: leal (%rdx,%rdx,2), %ecx -; SSE-NEXT: shll $3, %ecx -; SSE-NEXT: subl %ecx, %edx -; SSE-NEXT: addl %eax, %edx -; SSE-NEXT: pextrw $1, %xmm0, %eax -; SSE-NEXT: leal 32767(%rax), %ecx -; SSE-NEXT: testw %ax, %ax -; SSE-NEXT: cmovnsl %eax, %ecx -; SSE-NEXT: andl $-32768, %ecx # imm = 0x8000 -; SSE-NEXT: addl %eax, %ecx -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pinsrw $1, %ecx, %xmm1 -; SSE-NEXT: pinsrw $2, %edx, %xmm1 -; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057 -; SSE-NEXT: movl %ecx, %edx -; SSE-NEXT: shrl $31, %edx -; SSE-NEXT: sarl $26, %ecx -; SSE-NEXT: addl %edx, %ecx -; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $3, %eax, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pmovsxwd %xmm0, %xmm1 +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: packusdw %xmm1, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = <1,65535,1,0,u,u,u,u> +; SSE-NEXT: pmullw %xmm0, %xmm2 +; SSE-NEXT: paddw %xmm1, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = +; SSE-NEXT: pmulhw %xmm2, %xmm1 +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7] +; SSE-NEXT: psrlw $15, %xmm2 +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1,2,3],xmm3[4,5,6,7] +; SSE-NEXT: paddw %xmm1, %xmm3 +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm3 +; SSE-NEXT: psubw %xmm3, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: dont_fold_urem_i16_smax: ; AVX: # %bb.0: -; AVX-NEXT: vpextrw $2, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217 -; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: 
movswl %cx, %edx -; AVX-NEXT: shrl $15, %ecx -; AVX-NEXT: sarl $4, %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: leal (%rdx,%rdx,2), %ecx -; AVX-NEXT: shll $3, %ecx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: addl %eax, %edx -; AVX-NEXT: vpextrw $1, %xmm0, %eax -; AVX-NEXT: leal 32767(%rax), %ecx -; AVX-NEXT: testw %ax, %ax -; AVX-NEXT: cmovnsl %eax, %ecx -; AVX-NEXT: andl $-32768, %ecx # imm = 0x8000 -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057 -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: shrl $31, %edx -; AVX-NEXT: sarl $26, %ecx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 +; AVX-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm2 +; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm1, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpsrlw $15, %xmm1, %xmm1 +; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3],xmm3[4,5,6,7] +; AVX-NEXT: vpaddw %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -423,133 +267,153 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) { ; SSE-LABEL: dont_fold_srem_i64: ; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movq %xmm1, %rcx -; SSE-NEXT: movabsq $-5614226457215950491, %rdx # imm = 0xB21642C8590B2165 -; SSE-NEXT: movq %rcx, %rax -; SSE-NEXT: imulq %rdx -; SSE-NEXT: addq %rcx, %rdx -; SSE-NEXT: movq %rdx, %rax -; SSE-NEXT: shrq $63, %rax -; SSE-NEXT: sarq $4, %rdx -; SSE-NEXT: addq %rax, %rdx -; SSE-NEXT: leaq (%rdx,%rdx,2), %rax -; SSE-NEXT: shlq $3, %rax -; SSE-NEXT: subq %rax, %rdx -; SSE-NEXT: addq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm1 -; SSE-NEXT: pextrq $1, %xmm2, %rcx -; SSE-NEXT: movabsq $6966426675817289639, %rdx # imm = 0x60ADB826E5E517A7 -; SSE-NEXT: movq %rcx, %rax -; SSE-NEXT: imulq %rdx -; SSE-NEXT: movq %rdx, %rax -; SSE-NEXT: shrq $63, %rax -; SSE-NEXT: sarq $11, %rdx -; SSE-NEXT: addq %rax, %rdx -; SSE-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F -; SSE-NEXT: subq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm2 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: pextrq $1, %xmm0, %rcx -; SSE-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 -; SSE-NEXT: movq %rcx, %rax -; SSE-NEXT: imulq %rdx -; SSE-NEXT: movq %rdx, %rax -; SSE-NEXT: shrq $63, %rax -; SSE-NEXT: sarq $8, %rdx -; SSE-NEXT: addq %rax, %rdx -; SSE-NEXT: imulq $654, %rdx, %rax # imm = 0x28E -; SSE-NEXT: subq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm0 -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; SSE-NEXT: pextrq $1, %xmm1, %rax +; SSE-NEXT: movabsq $6966426675817289639, %rcx # imm = 0x60ADB826E5E517A7 +; SSE-NEXT: imulq %rcx +; SSE-NEXT: movq %rdx, %xmm2 +; SSE-NEXT: movq %xmm1, %rax +; SSE-NEXT: movabsq $-5614226457215950491, %rcx # imm = 0xB21642C8590B2165 +; SSE-NEXT: imulq %rcx +; SSE-NEXT: movq %rdx, %xmm3 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; 
SSE-NEXT: movq {{.*#+}} xmm2 = xmm1[0],zero +; SSE-NEXT: paddq %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: psrlq $11, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: psrlq $4, %xmm4 +; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [576460752303423488,4503599627370496] +; SSE-NEXT: pxor %xmm3, %xmm4 +; SSE-NEXT: psrlq $63, %xmm2 +; SSE-NEXT: paddq %xmm4, %xmm2 +; SSE-NEXT: psubq %xmm3, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [23,5423] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pmuludq %xmm3, %xmm4 +; SSE-NEXT: psrlq $32, %xmm2 +; SSE-NEXT: pmuludq %xmm3, %xmm2 +; SSE-NEXT: psllq $32, %xmm2 +; SSE-NEXT: paddq %xmm4, %xmm2 +; SSE-NEXT: psubq %xmm2, %xmm1 +; SSE-NEXT: pextrq $1, %xmm0, %rax +; SSE-NEXT: movabsq $7220743857598845893, %rcx # imm = 0x64353C48064353C5 +; SSE-NEXT: imulq %rcx +; SSE-NEXT: movq %rdx, %xmm2 +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] +; SSE-NEXT: movq {{.*#+}} xmm3 = xmm0[0],zero +; SSE-NEXT: paddq %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: psrlq $8, %xmm2 +; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,36028797018963968] +; SSE-NEXT: pxor %xmm4, %xmm2 +; SSE-NEXT: psrlq $63, %xmm3 +; SSE-NEXT: pxor %xmm5, %xmm5 +; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm3[4,5,6,7] +; SSE-NEXT: paddq %xmm2, %xmm5 +; SSE-NEXT: psubq %xmm4, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,654] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pmuludq %xmm2, %xmm3 +; SSE-NEXT: psrlq $32, %xmm5 +; SSE-NEXT: pmuludq %xmm2, %xmm5 +; SSE-NEXT: psllq $32, %xmm5 +; SSE-NEXT: paddq %xmm3, %xmm5 +; SSE-NEXT: psubq %xmm5, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: dont_fold_srem_i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovq %xmm1, %rcx -; AVX1-NEXT: movabsq $-5614226457215950491, %rdx # imm = 0xB21642C8590B2165 -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: imulq %rdx -; AVX1-NEXT: addq %rcx, %rdx -; AVX1-NEXT: movq %rdx, %rax -; AVX1-NEXT: shrq $63, %rax -; AVX1-NEXT: sarq $4, %rdx -; AVX1-NEXT: addq %rax, %rdx -; AVX1-NEXT: leaq (%rdx,%rdx,2), %rax -; AVX1-NEXT: shlq $3, %rax -; AVX1-NEXT: subq %rax, %rdx -; AVX1-NEXT: addq %rcx, %rdx +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: movabsq $6966426675817289639, %rcx # imm = 0x60ADB826E5E517A7 +; AVX1-NEXT: imulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm2 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: movabsq $-5614226457215950491, %rcx # imm = 0xB21642C8590B2165 +; AVX1-NEXT: imulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlq $11, %xmm2, %xmm4 +; AVX1-NEXT: vpsrlq $4, %xmm2, %xmm5 +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [576460752303423488,4503599627370496] +; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpsrlq $63, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpsubq %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [23,5423] +; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm5 +; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2 
+; AVX1-NEXT: vpsubq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: movabsq $7220743857598845893, %rcx # imm = 0x64353C48064353C5 +; AVX1-NEXT: imulq %rcx ; AVX1-NEXT: vmovq %rdx, %xmm2 -; AVX1-NEXT: vpextrq $1, %xmm1, %rcx -; AVX1-NEXT: movabsq $6966426675817289639, %rdx # imm = 0x60ADB826E5E517A7 -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: imulq %rdx -; AVX1-NEXT: movq %rdx, %rax -; AVX1-NEXT: shrq $63, %rax -; AVX1-NEXT: sarq $11, %rdx -; AVX1-NEXT: addq %rax, %rdx -; AVX1-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F -; AVX1-NEXT: subq %rax, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-NEXT: vpextrq $1, %xmm0, %rcx -; AVX1-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: imulq %rdx -; AVX1-NEXT: movq %rdx, %rax -; AVX1-NEXT: shrq $63, %rax -; AVX1-NEXT: sarq $8, %rdx -; AVX1-NEXT: addq %rax, %rdx -; AVX1-NEXT: imulq $654, %rdx, %rax # imm = 0x28E -; AVX1-NEXT: subq %rax, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm0 -; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] +; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlq $8, %xmm2, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,36028797018963968] +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpsrlq $63, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,654] +; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: dont_fold_srem_i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovq %xmm1, %rcx -; AVX2-NEXT: movabsq $-5614226457215950491, %rdx # imm = 0xB21642C8590B2165 -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: imulq %rdx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movq %rdx, %rax -; AVX2-NEXT: shrq $63, %rax -; AVX2-NEXT: sarq $4, %rdx -; AVX2-NEXT: addq %rax, %rdx -; AVX2-NEXT: leaq (%rdx,%rdx,2), %rax -; AVX2-NEXT: shlq $3, %rax -; AVX2-NEXT: subq %rax, %rdx -; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: movabsq $6966426675817289639, %rcx # imm = 0x60ADB826E5E517A7 +; AVX2-NEXT: imulq %rcx +; AVX2-NEXT: vmovq %rdx, %xmm2 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: movabsq $-5614226457215950491, %rcx # imm = 0xB21642C8590B2165 +; AVX2-NEXT: imulq %rcx +; AVX2-NEXT: vmovq %rdx, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: movabsq $7220743857598845893, %rcx # imm = 0x64353C48064353C5 +; AVX2-NEXT: imulq %rcx ; AVX2-NEXT: vmovq %rdx, %xmm2 -; AVX2-NEXT: vpextrq $1, %xmm1, %rcx -; AVX2-NEXT: movabsq $6966426675817289639, %rdx # imm = 0x60ADB826E5E517A7 -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: imulq %rdx -; AVX2-NEXT: movq %rdx, %rax -; AVX2-NEXT: shrq $63, %rax -; AVX2-NEXT: sarq $11, %rdx -; AVX2-NEXT: addq %rax, %rdx -; AVX2-NEXT: imulq $5423, %rdx, %rax # imm = 
0x152F -; AVX2-NEXT: subq %rax, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-NEXT: vpextrq $1, %xmm0, %rcx -; AVX2-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: imulq %rdx -; AVX2-NEXT: movq %rdx, %rax -; AVX2-NEXT: shrq $63, %rax -; AVX2-NEXT: sarq $8, %rdx -; AVX2-NEXT: addq %rax, %rdx -; AVX2-NEXT: imulq $654, %rdx, %rax # imm = 0x28E -; AVX2-NEXT: subq %rax, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm0 -; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpsrlq $63, %ymm1, %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] +; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [9223372036854775808,36028797018963968,576460752303423488,4503599627370496] +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,654,23,5423] +; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm3 +; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1 +; AVX2-NEXT: vpaddq %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %1 = srem <4 x i64> %x, ret <4 x i64> %1 diff --git a/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll b/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll --- a/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll @@ -295,17 +295,51 @@ define i1 @t64_3_2(i64 %X) nounwind { ; X86-LABEL: t64_3_2: ; X86: # %bb.0: -; X86-NEXT: subl $12, %esp -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $3 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll __umoddi3 -; X86-NEXT: addl $16, %esp -; X86-NEXT: xorl $2, %eax -; X86-NEXT: orl %edx, %eax +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl $-1431655765, %ebx # imm = 0xAAAAAAAB +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %edi, %ebp +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl $-1431655766, %edx # imm = 0xAAAAAAAA +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: adcl %ebx, %edi +; X86-NEXT: setb %al +; X86-NEXT: movzbl %al, %ebp +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl $-1431655766, %edx # imm = 0xAAAAAAAA +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %edi, %eax +; X86-NEXT: adcl %ebp, %ebx +; X86-NEXT: shrdl $1, %ebx, %eax +; X86-NEXT: movl $3, %edx +; X86-NEXT: mull %edx +; X86-NEXT: shrl %ebx +; X86-NEXT: leal (%ebx,%ebx,2), %edi +; X86-NEXT: addl %edx, %edi +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: sbbl %edi, %esi +; X86-NEXT: xorl $2, %ecx +; X86-NEXT: orl %esi, %ecx ; X86-NEXT: sete %al -; X86-NEXT: addl $12, %esp +; 
X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: t64_3_2: diff --git a/llvm/test/CodeGen/X86/urem-vector-lkk.ll b/llvm/test/CodeGen/X86/urem-vector-lkk.ll --- a/llvm/test/CodeGen/X86/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/X86/urem-vector-lkk.ll @@ -6,81 +6,42 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) { ; SSE-LABEL: fold_urem_vec_1: ; SSE: # %bb.0: -; SSE-NEXT: pextrw $1, %xmm0, %eax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: shrl $2, %ecx -; SSE-NEXT: imull $16913, %ecx, %ecx # imm = 0x4211 -; SSE-NEXT: shrl $19, %ecx -; SSE-NEXT: imull $124, %ecx, %ecx -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: movd %xmm0, %ecx -; SSE-NEXT: movzwl %cx, %edx -; SSE-NEXT: imull $44151, %edx, %edx # imm = 0xAC77 -; SSE-NEXT: shrl $22, %edx -; SSE-NEXT: imull $95, %edx, %edx -; SSE-NEXT: subl %edx, %ecx -; SSE-NEXT: movd %ecx, %xmm1 -; SSE-NEXT: pinsrw $1, %eax, %xmm1 -; SSE-NEXT: pextrw $2, %xmm0, %eax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: shrl %ecx -; SSE-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73 -; SSE-NEXT: shrl $17, %ecx -; SSE-NEXT: imull $98, %ecx, %ecx -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $2, %eax, %xmm1 -; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: imull $1373, %eax, %ecx # imm = 0x55D -; SSE-NEXT: shrl $16, %ecx -; SSE-NEXT: movl %eax, %edx -; SSE-NEXT: subl %ecx, %edx -; SSE-NEXT: movzwl %dx, %edx -; SSE-NEXT: shrl %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: shrl $9, %edx -; SSE-NEXT: imull $1003, %edx, %ecx # imm = 0x3EB -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $3, %eax, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = +; SSE-NEXT: pmulhuw %xmm0, %xmm1 +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7] +; SSE-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: packusdw %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: psubw %xmm1, %xmm2 +; SSE-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm2 +; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: packusdw %xmm2, %xmm2 +; SSE-NEXT: paddw %xmm1, %xmm2 +; SSE-NEXT: pmulhuw {{.*}}(%rip), %xmm2 +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm2 +; SSE-NEXT: psubw %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: fold_urem_vec_1: ; AVX: # %bb.0: -; AVX-NEXT: vpextrw $1, %xmm0, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: shrl $2, %ecx -; AVX-NEXT: imull $16913, %ecx, %ecx # imm = 0x4211 -; AVX-NEXT: shrl $19, %ecx -; AVX-NEXT: imull $124, %ecx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vmovd %xmm0, %ecx -; AVX-NEXT: movzwl %cx, %edx -; AVX-NEXT: imull $44151, %edx, %edx # imm = 0xAC77 -; AVX-NEXT: shrl $22, %edx -; AVX-NEXT: imull $95, %edx, %edx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $2, %xmm0, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: shrl %ecx -; AVX-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73 -; AVX-NEXT: shrl $17, %ecx -; AVX-NEXT: imull $98, %ecx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: imull $1373, %eax, %ecx # imm = 0x55D -; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: movl %eax, %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: shrl %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: shrl $9, %edx -; AVX-NEXT: imull 
$1003, %edx, %ecx # imm = 0x3EB -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 +; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7] +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpaddw %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -89,20 +50,34 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) { ; SSE-LABEL: fold_urem_vec_2: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] -; SSE-NEXT: pmulhuw %xmm0, %xmm1 -; SSE-NEXT: psrlw $6, %xmm1 +; SSE-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm1 +; SSE-NEXT: psrld $22, %xmm1 +; SSE-NEXT: packusdw %xmm1, %xmm1 ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1 ; SSE-NEXT: psubw %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: fold_urem_vec_2: -; AVX: # %bb.0: -; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpsrlw $6, %xmm1, %xmm1 -; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: fold_urem_vec_2: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpsrld $22, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: fold_urem_vec_2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [44151,44151,44151,44151] +; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpsrld $22, %xmm1, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq %1 = urem <4 x i16> %x, ret <4 x i16> %1 } @@ -112,23 +87,38 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) { ; SSE-LABEL: combine_urem_udiv: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] -; SSE-NEXT: pmulhuw %xmm0, %xmm1 -; SSE-NEXT: psrlw $6, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95] +; SSE-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm1 +; SSE-NEXT: psrld $22, %xmm1 +; SSE-NEXT: packusdw %xmm1, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = <95,95,95,95,u,u,u,u> ; SSE-NEXT: pmullw %xmm1, %xmm2 ; SSE-NEXT: psubw %xmm2, %xmm0 ; SSE-NEXT: paddw %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_urem_udiv: -; AVX: # %bb.0: -; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpsrlw $6, %xmm1, %xmm1 -; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm2 -; AVX-NEXT: vpsubw %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddw 
%xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_urem_udiv: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpsrld $22, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm2 +; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_urem_udiv: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [44151,44151,44151,44151] +; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpsrld $22, %xmm1, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm2 +; AVX2-NEXT: vpsubw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq %1 = urem <4 x i16> %x, %2 = udiv <4 x i16> %x, %3 = add <4 x i16> %1, %2 @@ -139,41 +129,27 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) { ; SSE-LABEL: dont_fold_urem_power_of_two: ; SSE: # %bb.0: -; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77 -; SSE-NEXT: shrl $22, %ecx -; SSE-NEXT: imull $95, %ecx, %ecx -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pextrw $1, %xmm0, %ecx -; SSE-NEXT: andl $31, %ecx -; SSE-NEXT: movd %xmm0, %edx -; SSE-NEXT: andl $63, %edx -; SSE-NEXT: movd %edx, %xmm1 -; SSE-NEXT: pinsrw $1, %ecx, %xmm1 -; SSE-NEXT: pextrw $2, %xmm0, %ecx -; SSE-NEXT: andl $7, %ecx -; SSE-NEXT: pinsrw $2, %ecx, %xmm1 -; SSE-NEXT: pinsrw $3, %eax, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: packusdw %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrlw $6, %xmm2 +; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7] +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm2 +; SSE-NEXT: psubw %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: dont_fold_urem_power_of_two: ; AVX: # %bb.0: -; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77 -; AVX-NEXT: shrl $22, %ecx -; AVX-NEXT: imull $95, %ecx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpextrw $1, %xmm0, %ecx -; AVX-NEXT: andl $31, %ecx -; AVX-NEXT: vmovd %xmm0, %edx -; AVX-NEXT: andl $63, %edx -; AVX-NEXT: vmovd %edx, %xmm1 -; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $2, %xmm0, %ecx -; AVX-NEXT: andl $7, %ecx -; AVX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm0 -; AVX-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpsrlw $6, %xmm1, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7] +; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -183,65 +159,39 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) { ; SSE-LABEL: dont_fold_urem_one: ; SSE: # %bb.0: -; SSE-NEXT: pextrw $2, %xmm0, %eax -; SSE-NEXT: imull $25645, %eax, %ecx # imm = 0x642D -; SSE-NEXT: shrl $16, %ecx -; SSE-NEXT: movl %eax, %edx -; SSE-NEXT: subl %ecx, %edx -; SSE-NEXT: movzwl %dx, %edx -; SSE-NEXT: shrl %edx -; 
SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: shrl $4, %edx -; SSE-NEXT: leal (%rdx,%rdx,2), %ecx -; SSE-NEXT: shll $3, %ecx -; SSE-NEXT: subl %ecx, %edx -; SSE-NEXT: addl %eax, %edx -; SSE-NEXT: pextrw $1, %xmm0, %eax -; SSE-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B -; SSE-NEXT: shrl $25, %ecx -; SSE-NEXT: imull $654, %ecx, %ecx # imm = 0x28E -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pinsrw $1, %eax, %xmm1 -; SSE-NEXT: pinsrw $2, %edx, %xmm1 -; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: imull $12375, %eax, %ecx # imm = 0x3057 -; SSE-NEXT: shrl $26, %ecx -; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $3, %eax, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: packusdw %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: psubw %xmm1, %xmm2 +; SSE-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm2 +; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: packusdw %xmm2, %xmm2 +; SSE-NEXT: paddw %xmm1, %xmm2 +; SSE-NEXT: pmulhuw {{.*}}(%rip), %xmm2 +; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm2 +; SSE-NEXT: psubw %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: dont_fold_urem_one: ; AVX: # %bb.0: -; AVX-NEXT: vpextrw $2, %xmm0, %eax -; AVX-NEXT: imull $25645, %eax, %ecx # imm = 0x642D -; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: movl %eax, %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: shrl %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: shrl $4, %edx -; AVX-NEXT: leal (%rdx,%rdx,2), %ecx -; AVX-NEXT: shll $3, %ecx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: addl %eax, %edx -; AVX-NEXT: vpextrw $1, %xmm0, %eax -; AVX-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B -; AVX-NEXT: shrl $25, %ecx -; AVX-NEXT: imull $654, %ecx, %ecx # imm = 0x28E -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: imull $12375, %eax, %ecx # imm = 0x3057 -; AVX-NEXT: shrl $26, %ecx -; AVX-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpaddw %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -260,119 +210,135 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) { ; SSE-LABEL: dont_fold_urem_i64: ; SSE: # %bb.0: -; SSE-NEXT: movq %xmm1, %rcx -; SSE-NEXT: movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9 -; SSE-NEXT: movq %rcx, %rax -; SSE-NEXT: mulq %rdx -; SSE-NEXT: movq %rcx, %rax -; 
SSE-NEXT: subq %rdx, %rax +; SSE-NEXT: pextrq $1, %xmm1, %rax +; SSE-NEXT: movabsq $-4513890722074972339, %rcx # imm = 0xC15B704DCBCA2F4D +; SSE-NEXT: mulq %rcx +; SSE-NEXT: movq %rdx, %xmm2 +; SSE-NEXT: movq %xmm1, %rax +; SSE-NEXT: movabsq $7218291159277650633, %rcx # imm = 0x642C8590B21642C9 +; SSE-NEXT: mulq %rcx +; SSE-NEXT: movq %rdx, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: psubq %xmm3, %xmm4 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: movq %xmm4, %rax ; SSE-NEXT: shrq %rax -; SSE-NEXT: addq %rdx, %rax -; SSE-NEXT: shrq $4, %rax -; SSE-NEXT: leaq (%rax,%rax,2), %rdx -; SSE-NEXT: shlq $3, %rdx -; SSE-NEXT: subq %rdx, %rax -; SSE-NEXT: addq %rcx, %rax ; SSE-NEXT: movq %rax, %xmm2 -; SSE-NEXT: pextrq $1, %xmm1, %rcx -; SSE-NEXT: movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D -; SSE-NEXT: movq %rcx, %rax -; SSE-NEXT: mulq %rdx -; SSE-NEXT: shrq $12, %rdx -; SSE-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F -; SSE-NEXT: subq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm1 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: pextrq $1, %xmm0, %rcx -; SSE-NEXT: movq %rcx, %rax +; SSE-NEXT: paddq %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: psrlq $12, %xmm3 +; SSE-NEXT: psrlq $4, %xmm2 +; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [23,5423] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pmuludq %xmm3, %xmm4 +; SSE-NEXT: psrlq $32, %xmm2 +; SSE-NEXT: pmuludq %xmm3, %xmm2 +; SSE-NEXT: psllq $32, %xmm2 +; SSE-NEXT: paddq %xmm4, %xmm2 +; SSE-NEXT: psubq %xmm2, %xmm1 +; SSE-NEXT: pextrq $1, %xmm0, %rax ; SSE-NEXT: shrq %rax -; SSE-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 -; SSE-NEXT: mulq %rdx -; SSE-NEXT: shrq $7, %rdx -; SSE-NEXT: imulq $654, %rdx, %rax # imm = 0x28E -; SSE-NEXT: subq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm0 -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movabsq $7220743857598845893, %rcx # imm = 0x64353C48064353C5 +; SSE-NEXT: mulq %rcx +; SSE-NEXT: movq %rdx, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] +; SSE-NEXT: psrlq $7, %xmm2 +; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1,654] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pmuludq %xmm3, %xmm4 +; SSE-NEXT: psrlq $32, %xmm2 +; SSE-NEXT: pmuludq %xmm3, %xmm2 +; SSE-NEXT: psllq $32, %xmm2 +; SSE-NEXT: paddq %xmm4, %xmm2 +; SSE-NEXT: psubq %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: dont_fold_urem_i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovq %xmm1, %rcx -; AVX1-NEXT: movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9 -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: mulq %rdx -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: subq %rdx, %rax -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: addq %rdx, %rax -; AVX1-NEXT: shrq $4, %rax -; AVX1-NEXT: leaq (%rax,%rax,2), %rdx -; AVX1-NEXT: shlq $3, %rdx -; AVX1-NEXT: subq %rdx, %rax -; AVX1-NEXT: addq %rcx, %rax -; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vpextrq $1, %xmm1, %rcx -; AVX1-NEXT: movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: mulq %rdx -; AVX1-NEXT: shrq $12, %rdx -; AVX1-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F -; AVX1-NEXT: subq %rax, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-NEXT: vpextrq $1, 
%xmm0, %rcx -; AVX1-NEXT: movq %rcx, %rax +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: movabsq $-4513890722074972339, %rcx # imm = 0xC15B704DCBCA2F4D +; AVX1-NEXT: mulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm2 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: movabsq $7218291159277650633, %rcx # imm = 0x642C8590B21642C9 +; AVX1-NEXT: mulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vmovq %xmm3, %rax ; AVX1-NEXT: shrq %rax -; AVX1-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 -; AVX1-NEXT: mulq %rdx -; AVX1-NEXT: shrq $7, %rdx -; AVX1-NEXT: imulq $654, %rdx, %rax # imm = 0x28E -; AVX1-NEXT: subq %rax, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm0 -; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; AVX1-NEXT: vmovq %rax, %xmm3 +; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlq $12, %xmm2, %xmm3 +; AVX1-NEXT: vpsrlq $4, %xmm2, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm3 +; AVX1-NEXT: vpextrq $1, %xmm3, %rax +; AVX1-NEXT: movabsq $7220743857598845893, %rcx # imm = 0x64353C48064353C5 +; AVX1-NEXT: mulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-NEXT: vpsrlq $7, %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3,4,5,6,7] +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [23,5423] +; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm5 +; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm3 +; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3 +; AVX1-NEXT: vpaddq %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,654] +; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: dont_fold_urem_i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovq %xmm1, %rcx -; AVX2-NEXT: movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9 -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: mulq %rdx -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: subq %rdx, %rax +; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm2, %rax +; AVX2-NEXT: movabsq $-4513890722074972339, %rcx # imm = 0xC15B704DCBCA2F4D +; AVX2-NEXT: mulq %rcx +; AVX2-NEXT: vmovq %rdx, %xmm3 +; AVX2-NEXT: vmovq %xmm2, %rax +; AVX2-NEXT: movabsq $7218291159277650633, %rcx # imm = 0x642C8590B21642C9 +; AVX2-NEXT: mulq %rcx +; AVX2-NEXT: vmovq %rdx, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm2[0],xmm3[0] +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: movabsq $7220743857598845893, %rcx # imm = 0x64353C48064353C5 +; AVX2-NEXT: mulq %rcx +; AVX2-NEXT: vmovq %rdx, %xmm1 +; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-NEXT: vpsubq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vmovq %xmm2, %rax ; AVX2-NEXT: shrq %rax -; AVX2-NEXT: addq %rdx, %rax -; AVX2-NEXT: shrq $4, %rax -; AVX2-NEXT: leaq (%rax,%rax,2), %rdx -; AVX2-NEXT: shlq $3, %rdx -; AVX2-NEXT: subq 
%rdx, %rax -; AVX2-NEXT: addq %rcx, %rax ; AVX2-NEXT: vmovq %rax, %xmm2 -; AVX2-NEXT: vpextrq $1, %xmm1, %rcx -; AVX2-NEXT: movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: mulq %rdx -; AVX2-NEXT: shrq $12, %rdx -; AVX2-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F -; AVX2-NEXT: subq %rax, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-NEXT: vpextrq $1, %xmm0, %rcx -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: shrq %rax -; AVX2-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 -; AVX2-NEXT: mulq %rdx -; AVX2-NEXT: shrq $7, %rdx -; AVX2-NEXT: imulq $654, %rdx, %rax # imm = 0x28E -; AVX2-NEXT: subq %rax, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm0 -; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,654,23,5423] +; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm3 +; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1 +; AVX2-NEXT: vpaddq %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %1 = urem <4 x i64> %x, ret <4 x i64> %1 -} \ No newline at end of file +} diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll @@ -15,21 +15,22 @@ ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 ; SSE2-NEXT: imulq %rcx -; SSE2-NEXT: movq %rdx, %rax -; SSE2-NEXT: shrq $63, %rax -; SSE2-NEXT: sarq %rdx -; SSE2-NEXT: addq %rax, %rdx ; SSE2-NEXT: movq %rdx, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: imulq %rcx -; SSE2-NEXT: movq %rdx, %rax -; SSE2-NEXT: shrq $63, %rax -; SSE2-NEXT: sarq %rdx -; SSE2-NEXT: addq %rax, %rdx ; SSE2-NEXT: movq %rdx, %xmm0 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrad $1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrlq $1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: psrlq $63, %xmm1 +; SSE2-NEXT: paddq %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_div7_2i64: @@ -37,40 +38,69 @@ ; SSE41-NEXT: pextrq $1, %xmm0, %rax ; SSE41-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 ; SSE41-NEXT: imulq %rcx -; SSE41-NEXT: movq %rdx, %rax -; SSE41-NEXT: shrq $63, %rax -; SSE41-NEXT: sarq %rdx -; SSE41-NEXT: addq %rax, %rdx ; SSE41-NEXT: movq %rdx, %xmm1 ; SSE41-NEXT: movq %xmm0, %rax ; SSE41-NEXT: imulq %rcx -; SSE41-NEXT: movq %rdx, %rax -; SSE41-NEXT: shrq $63, %rax -; SSE41-NEXT: sarq %rdx -; SSE41-NEXT: addq %rax, %rdx ; SSE41-NEXT: movq %rdx, %xmm0 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrad $1, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psrlq $1, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} 
xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; SSE41-NEXT: psrlq $63, %xmm0 +; SSE41-NEXT: paddq %xmm2, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: test_div7_2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpextrq $1, %xmm0, %rax -; AVX-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 -; AVX-NEXT: imulq %rcx -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx -; AVX-NEXT: vmovq %rdx, %xmm1 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: imulq %rcx -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx -; AVX-NEXT: vmovq %rdx, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: retq +; AVX1-LABEL: test_div7_2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 +; AVX1-NEXT: imulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm1 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: imulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpsrad $1, %xmm0, %xmm1 +; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; AVX1-NEXT: vpsrlq $63, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2NOBW-LABEL: test_div7_2i64: +; AVX2NOBW: # %bb.0: +; AVX2NOBW-NEXT: vpextrq $1, %xmm0, %rax +; AVX2NOBW-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 +; AVX2NOBW-NEXT: imulq %rcx +; AVX2NOBW-NEXT: vmovq %rdx, %xmm1 +; AVX2NOBW-NEXT: vmovq %xmm0, %rax +; AVX2NOBW-NEXT: imulq %rcx +; AVX2NOBW-NEXT: vmovq %rdx, %xmm0 +; AVX2NOBW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2NOBW-NEXT: vpsrad $1, %xmm0, %xmm1 +; AVX2NOBW-NEXT: vpsrlq $1, %xmm0, %xmm2 +; AVX2NOBW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; AVX2NOBW-NEXT: vpsrlq $63, %xmm0, %xmm0 +; AVX2NOBW-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; AVX2NOBW-NEXT: retq +; +; AVX512BW-LABEL: test_div7_2i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax +; AVX512BW-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 +; AVX512BW-NEXT: imulq %rcx +; AVX512BW-NEXT: vmovq %rdx, %xmm1 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: imulq %rcx +; AVX512BW-NEXT: vmovq %rdx, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: vpsrlq $63, %xmm0, %xmm1 +; AVX512BW-NEXT: vpsraq $1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %res = sdiv <2 x i64> %a, ret <2 x i64> %res } @@ -415,89 +445,111 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind { ; SSE2-LABEL: test_rem7_2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movq %xmm0, %rcx -; SSE2-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925 -; SSE2-NEXT: movq %rcx, %rax -; SSE2-NEXT: imulq %rsi -; SSE2-NEXT: movq %rdx, %rax -; SSE2-NEXT: shrq $63, %rax -; SSE2-NEXT: sarq %rdx -; SSE2-NEXT: addq %rax, %rdx -; SSE2-NEXT: leaq (,%rdx,8), %rax -; SSE2-NEXT: subq %rax, %rdx -; SSE2-NEXT: addq %rcx, %rdx +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 +; SSE2-NEXT: imulq %rcx ; SSE2-NEXT: movq %rdx, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rcx -; SSE2-NEXT: movq %rcx, %rax -; SSE2-NEXT: imulq %rsi -; SSE2-NEXT: movq %rdx, %rax -; SSE2-NEXT: shrq $63, %rax -; SSE2-NEXT: sarq %rdx -; 
SSE2-NEXT: addq %rax, %rdx -; SSE2-NEXT: leaq (,%rdx,8), %rax -; SSE2-NEXT: subq %rax, %rdx -; SSE2-NEXT: addq %rcx, %rdx -; SSE2-NEXT: movq %rdx, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: imulq %rcx +; SSE2-NEXT: movq %rdx, %xmm2 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrad $1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: psrlq $1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: psrlq $63, %xmm1 +; SSE2-NEXT: paddq %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psllq $3, %xmm2 +; SSE2-NEXT: psubq %xmm2, %xmm1 +; SSE2-NEXT: paddq %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_rem7_2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pextrq $1, %xmm0, %rcx -; SSE41-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925 -; SSE41-NEXT: movq %rcx, %rax -; SSE41-NEXT: imulq %rsi -; SSE41-NEXT: movq %rdx, %rax -; SSE41-NEXT: shrq $63, %rax -; SSE41-NEXT: sarq %rdx -; SSE41-NEXT: addq %rax, %rdx -; SSE41-NEXT: leaq (,%rdx,8), %rax -; SSE41-NEXT: subq %rax, %rdx -; SSE41-NEXT: addq %rcx, %rdx +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 +; SSE41-NEXT: imulq %rcx ; SSE41-NEXT: movq %rdx, %xmm1 -; SSE41-NEXT: movq %xmm0, %rcx -; SSE41-NEXT: movq %rcx, %rax -; SSE41-NEXT: imulq %rsi -; SSE41-NEXT: movq %rdx, %rax -; SSE41-NEXT: shrq $63, %rax -; SSE41-NEXT: sarq %rdx -; SSE41-NEXT: addq %rax, %rdx -; SSE41-NEXT: leaq (,%rdx,8), %rax -; SSE41-NEXT: subq %rax, %rdx -; SSE41-NEXT: addq %rcx, %rdx -; SSE41-NEXT: movq %rdx, %xmm0 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: imulq %rcx +; SSE41-NEXT: movq %rdx, %xmm2 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psrad $1, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrlq $1, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3],xmm3[4,5],xmm1[6,7] +; SSE41-NEXT: psrlq $63, %xmm2 +; SSE41-NEXT: paddq %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psllq $3, %xmm1 +; SSE41-NEXT: psubq %xmm1, %xmm2 +; SSE41-NEXT: paddq %xmm2, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: test_rem7_2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpextrq $1, %xmm0, %rcx -; AVX-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925 -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: imulq %rsi -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx -; AVX-NEXT: leaq (,%rdx,8), %rax -; AVX-NEXT: subq %rax, %rdx -; AVX-NEXT: addq %rcx, %rdx -; AVX-NEXT: vmovq %rdx, %xmm1 -; AVX-NEXT: vmovq %xmm0, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: imulq %rsi -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx -; AVX-NEXT: leaq (,%rdx,8), %rax -; AVX-NEXT: subq %rax, %rdx -; AVX-NEXT: addq %rcx, %rdx -; AVX-NEXT: vmovq %rdx, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: retq +; AVX1-LABEL: test_rem7_2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 +; AVX1-NEXT: imulq 
%rcx +; AVX1-NEXT: vmovq %rdx, %xmm1 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: imulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm2 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vpsrad $1, %xmm1, %xmm2 +; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; AVX1-NEXT: vpsrlq $63, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpsllq $3, %xmm1, %xmm2 +; AVX1-NEXT: vpsubq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2NOBW-LABEL: test_rem7_2i64: +; AVX2NOBW: # %bb.0: +; AVX2NOBW-NEXT: vpextrq $1, %xmm0, %rax +; AVX2NOBW-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 +; AVX2NOBW-NEXT: imulq %rcx +; AVX2NOBW-NEXT: vmovq %rdx, %xmm1 +; AVX2NOBW-NEXT: vmovq %xmm0, %rax +; AVX2NOBW-NEXT: imulq %rcx +; AVX2NOBW-NEXT: vmovq %rdx, %xmm2 +; AVX2NOBW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2NOBW-NEXT: vpsrad $1, %xmm1, %xmm2 +; AVX2NOBW-NEXT: vpsrlq $1, %xmm1, %xmm3 +; AVX2NOBW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3] +; AVX2NOBW-NEXT: vpsrlq $63, %xmm1, %xmm1 +; AVX2NOBW-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX2NOBW-NEXT: vpsllq $3, %xmm1, %xmm2 +; AVX2NOBW-NEXT: vpsubq %xmm2, %xmm1, %xmm1 +; AVX2NOBW-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2NOBW-NEXT: retq +; +; AVX512BW-LABEL: test_rem7_2i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax +; AVX512BW-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 +; AVX512BW-NEXT: imulq %rcx +; AVX512BW-NEXT: vmovq %rdx, %xmm1 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: imulq %rcx +; AVX512BW-NEXT: vmovq %rdx, %xmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512BW-NEXT: vpsrlq $63, %xmm1, %xmm2 +; AVX512BW-NEXT: vpsraq $1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddq %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpsllq $3, %xmm1, %xmm2 +; AVX512BW-NEXT: vpsubq %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %res = srem <2 x i64> %a, ret <2 x i64> %res } diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll @@ -14,73 +14,80 @@ ; AVX1-NEXT: vpextrq $1, %xmm1, %rax ; AVX1-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 ; AVX1-NEXT: imulq %rcx -; AVX1-NEXT: movq %rdx, %rax -; AVX1-NEXT: shrq $63, %rax -; AVX1-NEXT: sarq %rdx -; AVX1-NEXT: addq %rax, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm2 ; AVX1-NEXT: vmovq %xmm1, %rax ; AVX1-NEXT: imulq %rcx -; AVX1-NEXT: movq %rdx, %rax -; AVX1-NEXT: shrq $63, %rax -; AVX1-NEXT: sarq %rdx -; AVX1-NEXT: addq %rax, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm1 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vpsrad $1, %xmm1, %xmm2 +; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; AVX1-NEXT: vpsrlq $63, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax ; AVX1-NEXT: imulq %rcx -; AVX1-NEXT: movq %rdx, %rax -; AVX1-NEXT: shrq $63, %rax -; AVX1-NEXT: sarq %rdx -; AVX1-NEXT: addq %rax, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm2 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: imulq %rcx -; AVX1-NEXT: movq %rdx, %rax -; AVX1-NEXT: shrq $63, %rax -; AVX1-NEXT: sarq %rdx -; AVX1-NEXT: addq %rax, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm0 ; 
AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vpsrad $1, %xmm0, %xmm2 +; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; AVX1-NEXT: vpsrlq $63, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: test_div7_4i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 -; AVX2-NEXT: imulq %rcx -; AVX2-NEXT: movq %rdx, %rax -; AVX2-NEXT: shrq $63, %rax -; AVX2-NEXT: sarq %rdx -; AVX2-NEXT: addq %rax, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm2 -; AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: imulq %rcx -; AVX2-NEXT: movq %rdx, %rax -; AVX2-NEXT: shrq $63, %rax -; AVX2-NEXT: sarq %rdx -; AVX2-NEXT: addq %rax, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: imulq %rcx -; AVX2-NEXT: movq %rdx, %rax -; AVX2-NEXT: shrq $63, %rax -; AVX2-NEXT: sarq %rdx -; AVX2-NEXT: addq %rax, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm2 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: imulq %rcx -; AVX2-NEXT: movq %rdx, %rax -; AVX2-NEXT: shrq $63, %rax -; AVX2-NEXT: sarq %rdx -; AVX2-NEXT: addq %rax, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2NOBW-LABEL: test_div7_4i64: +; AVX2NOBW: # %bb.0: +; AVX2NOBW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2NOBW-NEXT: vpextrq $1, %xmm1, %rax +; AVX2NOBW-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 +; AVX2NOBW-NEXT: imulq %rcx +; AVX2NOBW-NEXT: vmovq %rdx, %xmm2 +; AVX2NOBW-NEXT: vmovq %xmm1, %rax +; AVX2NOBW-NEXT: imulq %rcx +; AVX2NOBW-NEXT: vmovq %rdx, %xmm1 +; AVX2NOBW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2NOBW-NEXT: vpextrq $1, %xmm0, %rax +; AVX2NOBW-NEXT: imulq %rcx +; AVX2NOBW-NEXT: vmovq %rdx, %xmm2 +; AVX2NOBW-NEXT: vmovq %xmm0, %rax +; AVX2NOBW-NEXT: imulq %rcx +; AVX2NOBW-NEXT: vmovq %rdx, %xmm0 +; AVX2NOBW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2NOBW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2NOBW-NEXT: vpsrad $1, %ymm0, %ymm1 +; AVX2NOBW-NEXT: vpsrlq $1, %ymm0, %ymm2 +; AVX2NOBW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX2NOBW-NEXT: vpsrlq $63, %ymm0, %ymm0 +; AVX2NOBW-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; AVX2NOBW-NEXT: retq +; +; AVX512BW-LABEL: test_div7_4i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpextrq $1, %xmm1, %rax +; AVX512BW-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 +; AVX512BW-NEXT: imulq %rcx +; AVX512BW-NEXT: vmovq %rdx, %xmm2 +; AVX512BW-NEXT: vmovq %xmm1, %rax +; AVX512BW-NEXT: imulq %rcx +; AVX512BW-NEXT: vmovq %rdx, %xmm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax +; AVX512BW-NEXT: imulq %rcx +; AVX512BW-NEXT: vmovq %rdx, %xmm2 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: imulq %rcx +; AVX512BW-NEXT: vmovq %rdx, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpsrlq $63, %ymm0, %ymm1 +; AVX512BW-NEXT: vpsraq $1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: retq %res 
= sdiv <4 x i64> %a, ret <4 x i64> %res } @@ -351,108 +358,95 @@ ; AVX1-LABEL: test_rem7_4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpextrq $1, %xmm1, %rcx -; AVX1-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925 -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: imulq %rsi -; AVX1-NEXT: movq %rdx, %rax -; AVX1-NEXT: shrq $63, %rax -; AVX1-NEXT: sarq %rdx -; AVX1-NEXT: addq %rax, %rdx -; AVX1-NEXT: leaq (,%rdx,8), %rax -; AVX1-NEXT: subq %rax, %rdx -; AVX1-NEXT: addq %rcx, %rdx +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 +; AVX1-NEXT: imulq %rcx ; AVX1-NEXT: vmovq %rdx, %xmm2 -; AVX1-NEXT: vmovq %xmm1, %rcx -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: imulq %rsi -; AVX1-NEXT: movq %rdx, %rax -; AVX1-NEXT: shrq $63, %rax -; AVX1-NEXT: sarq %rdx -; AVX1-NEXT: addq %rax, %rdx -; AVX1-NEXT: leaq (,%rdx,8), %rax -; AVX1-NEXT: subq %rax, %rdx -; AVX1-NEXT: addq %rcx, %rdx -; AVX1-NEXT: vmovq %rdx, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-NEXT: vpextrq $1, %xmm0, %rcx -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: imulq %rsi -; AVX1-NEXT: movq %rdx, %rax -; AVX1-NEXT: shrq $63, %rax -; AVX1-NEXT: sarq %rdx -; AVX1-NEXT: addq %rax, %rdx -; AVX1-NEXT: leaq (,%rdx,8), %rax -; AVX1-NEXT: subq %rax, %rdx -; AVX1-NEXT: addq %rcx, %rdx +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: imulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vpsrad $1, %xmm2, %xmm3 +; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] +; AVX1-NEXT: vpsrlq $63, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsllq $3, %xmm2, %xmm3 +; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: imulq %rcx ; AVX1-NEXT: vmovq %rdx, %xmm2 -; AVX1-NEXT: vmovq %xmm0, %rcx -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: imulq %rsi -; AVX1-NEXT: movq %rdx, %rax -; AVX1-NEXT: shrq $63, %rax -; AVX1-NEXT: sarq %rdx -; AVX1-NEXT: addq %rax, %rdx -; AVX1-NEXT: leaq (,%rdx,8), %rax -; AVX1-NEXT: subq %rax, %rdx -; AVX1-NEXT: addq %rcx, %rdx -; AVX1-NEXT: vmovq %rdx, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: imulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vpsrad $1, %xmm2, %xmm3 +; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] +; AVX1-NEXT: vpsrlq $63, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsllq $3, %xmm2, %xmm3 +; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: test_rem7_4i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpextrq $1, %xmm1, %rcx -; AVX2-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925 -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: imulq %rsi -; AVX2-NEXT: movq %rdx, %rax -; AVX2-NEXT: shrq $63, %rax -; AVX2-NEXT: sarq %rdx -; AVX2-NEXT: addq %rax, %rdx -; AVX2-NEXT: leaq (,%rdx,8), %rax -; AVX2-NEXT: subq %rax, %rdx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm2 -; AVX2-NEXT: vmovq %xmm1, %rcx -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: imulq %rsi -; AVX2-NEXT: movq %rdx, 
%rax -; AVX2-NEXT: shrq $63, %rax -; AVX2-NEXT: sarq %rdx -; AVX2-NEXT: addq %rax, %rdx -; AVX2-NEXT: leaq (,%rdx,8), %rax -; AVX2-NEXT: subq %rax, %rdx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX2-NEXT: vpextrq $1, %xmm0, %rcx -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: imulq %rsi -; AVX2-NEXT: movq %rdx, %rax -; AVX2-NEXT: shrq $63, %rax -; AVX2-NEXT: sarq %rdx -; AVX2-NEXT: addq %rax, %rdx -; AVX2-NEXT: leaq (,%rdx,8), %rax -; AVX2-NEXT: subq %rax, %rdx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm2 -; AVX2-NEXT: vmovq %xmm0, %rcx -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: imulq %rsi -; AVX2-NEXT: movq %rdx, %rax -; AVX2-NEXT: shrq $63, %rax -; AVX2-NEXT: sarq %rdx -; AVX2-NEXT: addq %rax, %rdx -; AVX2-NEXT: leaq (,%rdx,8), %rax -; AVX2-NEXT: subq %rax, %rdx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2NOBW-LABEL: test_rem7_4i64: +; AVX2NOBW: # %bb.0: +; AVX2NOBW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2NOBW-NEXT: vpextrq $1, %xmm1, %rax +; AVX2NOBW-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 +; AVX2NOBW-NEXT: imulq %rcx +; AVX2NOBW-NEXT: vmovq %rdx, %xmm2 +; AVX2NOBW-NEXT: vmovq %xmm1, %rax +; AVX2NOBW-NEXT: imulq %rcx +; AVX2NOBW-NEXT: vmovq %rdx, %xmm1 +; AVX2NOBW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2NOBW-NEXT: vpextrq $1, %xmm0, %rax +; AVX2NOBW-NEXT: imulq %rcx +; AVX2NOBW-NEXT: vmovq %rdx, %xmm2 +; AVX2NOBW-NEXT: vmovq %xmm0, %rax +; AVX2NOBW-NEXT: imulq %rcx +; AVX2NOBW-NEXT: vmovq %rdx, %xmm3 +; AVX2NOBW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX2NOBW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX2NOBW-NEXT: vpsrad $1, %ymm1, %ymm2 +; AVX2NOBW-NEXT: vpsrlq $1, %ymm1, %ymm3 +; AVX2NOBW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] +; AVX2NOBW-NEXT: vpsrlq $63, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpaddq %ymm1, %ymm2, %ymm1 +; AVX2NOBW-NEXT: vpsllq $3, %ymm1, %ymm2 +; AVX2NOBW-NEXT: vpsubq %ymm2, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX2NOBW-NEXT: retq +; +; AVX512BW-LABEL: test_rem7_4i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpextrq $1, %xmm1, %rax +; AVX512BW-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 +; AVX512BW-NEXT: imulq %rcx +; AVX512BW-NEXT: vmovq %rdx, %xmm2 +; AVX512BW-NEXT: vmovq %xmm1, %rax +; AVX512BW-NEXT: imulq %rcx +; AVX512BW-NEXT: vmovq %rdx, %xmm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax +; AVX512BW-NEXT: imulq %rcx +; AVX512BW-NEXT: vmovq %rdx, %xmm2 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: imulq %rcx +; AVX512BW-NEXT: vmovq %rdx, %xmm3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512BW-NEXT: vpsrlq $63, %ymm1, %ymm2 +; AVX512BW-NEXT: vpsraq $1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddq %ymm2, %ymm1, %ymm1 +; AVX512BW-NEXT: vpsllq $3, %ymm1, %ymm2 +; AVX512BW-NEXT: vpsubq %ymm2, %ymm1, %ymm1 +; AVX512BW-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: retq %res = srem <4 x i64> %a, ret <4 x i64> %res } diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll +++ 
b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll @@ -9,73 +9,44 @@ define <8 x i64> @test_div7_8i64(<8 x i64> %a) nounwind { ; AVX-LABEL: test_div7_8i64: ; AVX: # %bb.0: -; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; AVX-NEXT: vpextrq $1, %xmm1, %rax -; AVX-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 -; AVX-NEXT: imulq %rcx -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx -; AVX-NEXT: vmovq %rdx, %xmm2 -; AVX-NEXT: vmovq %xmm1, %rax -; AVX-NEXT: imulq %rcx -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx -; AVX-NEXT: vmovq %rdx, %xmm1 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; AVX-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX-NEXT: vpextrq $1, %xmm2, %rax +; AVX-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 ; AVX-NEXT: imulq %rcx -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx ; AVX-NEXT: vmovq %rdx, %xmm3 ; AVX-NEXT: vmovq %xmm2, %rax ; AVX-NEXT: imulq %rcx -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx ; AVX-NEXT: vmovq %rdx, %xmm2 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vpextrq $1, %xmm1, %rax +; AVX-NEXT: imulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm3 +; AVX-NEXT: vmovq %xmm1, %rax +; AVX-NEXT: imulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm1 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX-NEXT: vpextrq $1, %xmm2, %rax ; AVX-NEXT: imulq %rcx -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx ; AVX-NEXT: vmovq %rdx, %xmm3 ; AVX-NEXT: vmovq %xmm2, %rax ; AVX-NEXT: imulq %rcx -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx ; AVX-NEXT: vmovq %rdx, %xmm2 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; AVX-NEXT: vpextrq $1, %xmm0, %rax ; AVX-NEXT: imulq %rcx -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx ; AVX-NEXT: vmovq %rdx, %xmm3 ; AVX-NEXT: vmovq %xmm0, %rax ; AVX-NEXT: imulq %rcx -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx ; AVX-NEXT: vmovq %rdx, %xmm0 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX-NEXT: vpsrlq $63, %zmm0, %zmm1 +; AVX-NEXT: vpsraq $1, %zmm0, %zmm0 +; AVX-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX-NEXT: retq %res = sdiv <8 x i64> %a, ret <8 x i64> %res @@ -291,105 +262,47 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind { ; AVX-LABEL: test_rem7_8i64: ; AVX: # %bb.0: -; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; AVX-NEXT: vpextrq $1, %xmm1, %rcx -; AVX-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925 -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: imulq %rsi -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx -; AVX-NEXT: leaq (,%rdx,8), %rax -; AVX-NEXT: subq %rax, %rdx -; AVX-NEXT: addq %rcx, %rdx -; AVX-NEXT: vmovq %rdx, %xmm2 -; AVX-NEXT: vmovq %xmm1, %rcx -; AVX-NEXT: movq 
%rcx, %rax -; AVX-NEXT: imulq %rsi -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx -; AVX-NEXT: leaq (,%rdx,8), %rax -; AVX-NEXT: subq %rax, %rdx -; AVX-NEXT: addq %rcx, %rdx -; AVX-NEXT: vmovq %rdx, %xmm1 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; AVX-NEXT: vpextrq $1, %xmm2, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: imulq %rsi -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx -; AVX-NEXT: leaq (,%rdx,8), %rax -; AVX-NEXT: subq %rax, %rdx -; AVX-NEXT: addq %rcx, %rdx +; AVX-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX-NEXT: vpextrq $1, %xmm2, %rax +; AVX-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 +; AVX-NEXT: imulq %rcx ; AVX-NEXT: vmovq %rdx, %xmm3 -; AVX-NEXT: vmovq %xmm2, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: imulq %rsi -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx -; AVX-NEXT: leaq (,%rdx,8), %rax -; AVX-NEXT: subq %rax, %rdx -; AVX-NEXT: addq %rcx, %rdx +; AVX-NEXT: vmovq %xmm2, %rax +; AVX-NEXT: imulq %rcx ; AVX-NEXT: vmovq %rdx, %xmm2 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vpextrq $1, %xmm1, %rax +; AVX-NEXT: imulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm3 +; AVX-NEXT: vmovq %xmm1, %rax +; AVX-NEXT: imulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm1 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX-NEXT: vpextrq $1, %xmm2, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: imulq %rsi -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx -; AVX-NEXT: leaq (,%rdx,8), %rax -; AVX-NEXT: subq %rax, %rdx -; AVX-NEXT: addq %rcx, %rdx +; AVX-NEXT: vpextrq $1, %xmm2, %rax +; AVX-NEXT: imulq %rcx ; AVX-NEXT: vmovq %rdx, %xmm3 -; AVX-NEXT: vmovq %xmm2, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: imulq %rsi -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx -; AVX-NEXT: leaq (,%rdx,8), %rax -; AVX-NEXT: subq %rax, %rdx -; AVX-NEXT: addq %rcx, %rdx +; AVX-NEXT: vmovq %xmm2, %rax +; AVX-NEXT: imulq %rcx ; AVX-NEXT: vmovq %rdx, %xmm2 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX-NEXT: vpextrq $1, %xmm0, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: imulq %rsi -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx -; AVX-NEXT: leaq (,%rdx,8), %rax -; AVX-NEXT: subq %rax, %rdx -; AVX-NEXT: addq %rcx, %rdx +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: imulq %rcx ; AVX-NEXT: vmovq %rdx, %xmm3 -; AVX-NEXT: vmovq %xmm0, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: imulq %rsi -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx -; AVX-NEXT: leaq (,%rdx,8), %rax -; AVX-NEXT: subq %rax, %rdx -; AVX-NEXT: addq %rcx, %rdx -; AVX-NEXT: vmovq %rdx, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: imulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm4 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX-NEXT: vinserti128 $1, %xmm2, 
%ymm3, %ymm2 +; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX-NEXT: vpsrlq $63, %zmm1, %zmm2 +; AVX-NEXT: vpsraq $1, %zmm1, %zmm1 +; AVX-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; AVX-NEXT: vpsllq $3, %zmm1, %zmm2 +; AVX-NEXT: vpsubq %zmm2, %zmm1, %zmm1 +; AVX-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX-NEXT: retq %res = srem <8 x i64> %a, ret <8 x i64> %res diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll @@ -12,66 +12,50 @@ define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind { ; SSE2-LABEL: test_div7_2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movq %xmm0, %rcx -; SSE2-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493 -; SSE2-NEXT: movq %rcx, %rax -; SSE2-NEXT: mulq %rsi -; SSE2-NEXT: subq %rdx, %rcx -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: addq %rdx, %rcx -; SSE2-NEXT: movq %rcx, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rcx -; SSE2-NEXT: movq %rcx, %rax -; SSE2-NEXT: mulq %rsi -; SSE2-NEXT: subq %rdx, %rcx -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: addq %rdx, %rcx -; SSE2-NEXT: movq %rcx, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: psrlq $2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493 +; SSE2-NEXT: mulq %rcx +; SSE2-NEXT: movq %rdx, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: mulq %rcx +; SSE2-NEXT: movq %rdx, %xmm2 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: psrlq $1, %xmm0 +; SSE2-NEXT: paddq %xmm1, %xmm0 +; SSE2-NEXT: psrlq $2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_div7_2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pextrq $1, %xmm0, %rcx -; SSE41-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493 -; SSE41-NEXT: movq %rcx, %rax -; SSE41-NEXT: mulq %rsi -; SSE41-NEXT: subq %rdx, %rcx -; SSE41-NEXT: shrq %rcx -; SSE41-NEXT: addq %rdx, %rcx -; SSE41-NEXT: movq %rcx, %xmm1 -; SSE41-NEXT: movq %xmm0, %rcx -; SSE41-NEXT: movq %rcx, %rax -; SSE41-NEXT: mulq %rsi -; SSE41-NEXT: subq %rdx, %rcx -; SSE41-NEXT: shrq %rcx -; SSE41-NEXT: addq %rdx, %rcx -; SSE41-NEXT: movq %rcx, %xmm0 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493 +; SSE41-NEXT: mulq %rcx +; SSE41-NEXT: movq %rdx, %xmm1 +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: mulq %rcx +; SSE41-NEXT: movq %rdx, %xmm2 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE41-NEXT: psubq %xmm2, %xmm0 +; SSE41-NEXT: psrlq $1, %xmm0 +; SSE41-NEXT: paddq %xmm2, %xmm0 ; SSE41-NEXT: psrlq $2, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_div7_2i64: ; AVX: # %bb.0: -; AVX-NEXT: vpextrq $1, %xmm0, %rcx -; AVX-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493 -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: subq %rdx, %rcx -; AVX-NEXT: shrq %rcx -; AVX-NEXT: addq %rdx, %rcx -; AVX-NEXT: vmovq %rcx, %xmm1 -; AVX-NEXT: vmovq %xmm0, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: subq %rdx, %rcx -; AVX-NEXT: shrq %rcx -; AVX-NEXT: addq %rdx, %rcx -; AVX-NEXT: vmovq %rcx, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: movabsq 
$2635249153387078803, %rcx # imm = 0x2492492492492493 +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm1 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm2 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlq $1, %xmm0, %xmm0 +; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpsrlq $2, %xmm0, %xmm0 ; AVX-NEXT: retq %res = udiv <2 x i64> %a, @@ -430,94 +414,66 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind { ; SSE2-LABEL: test_rem7_2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movq %xmm0, %rcx -; SSE2-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493 -; SSE2-NEXT: movq %rcx, %rax -; SSE2-NEXT: mulq %rsi -; SSE2-NEXT: movq %rcx, %rax -; SSE2-NEXT: subq %rdx, %rax -; SSE2-NEXT: shrq %rax -; SSE2-NEXT: addq %rdx, %rax -; SSE2-NEXT: shrq $2, %rax -; SSE2-NEXT: leaq (,%rax,8), %rdx -; SSE2-NEXT: subq %rdx, %rax -; SSE2-NEXT: addq %rcx, %rax -; SSE2-NEXT: movq %rax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rcx -; SSE2-NEXT: movq %rcx, %rax -; SSE2-NEXT: mulq %rsi -; SSE2-NEXT: movq %rcx, %rax -; SSE2-NEXT: subq %rdx, %rax -; SSE2-NEXT: shrq %rax -; SSE2-NEXT: addq %rdx, %rax -; SSE2-NEXT: shrq $2, %rax -; SSE2-NEXT: leaq (,%rax,8), %rdx -; SSE2-NEXT: subq %rdx, %rax -; SSE2-NEXT: addq %rcx, %rax -; SSE2-NEXT: movq %rax, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493 +; SSE2-NEXT: mulq %rcx +; SSE2-NEXT: movq %rdx, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: mulq %rcx +; SSE2-NEXT: movq %rdx, %xmm1 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psubq %xmm2, %xmm1 +; SSE2-NEXT: psrlq $1, %xmm1 +; SSE2-NEXT: paddq %xmm2, %xmm1 +; SSE2-NEXT: psrlq $2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psllq $3, %xmm2 +; SSE2-NEXT: psubq %xmm2, %xmm1 +; SSE2-NEXT: paddq %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_rem7_2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pextrq $1, %xmm0, %rcx -; SSE41-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493 -; SSE41-NEXT: movq %rcx, %rax -; SSE41-NEXT: mulq %rsi -; SSE41-NEXT: movq %rcx, %rax -; SSE41-NEXT: subq %rdx, %rax -; SSE41-NEXT: shrq %rax -; SSE41-NEXT: addq %rdx, %rax -; SSE41-NEXT: shrq $2, %rax -; SSE41-NEXT: leaq (,%rax,8), %rdx -; SSE41-NEXT: subq %rdx, %rax -; SSE41-NEXT: addq %rcx, %rax -; SSE41-NEXT: movq %rax, %xmm1 -; SSE41-NEXT: movq %xmm0, %rcx -; SSE41-NEXT: movq %rcx, %rax -; SSE41-NEXT: mulq %rsi -; SSE41-NEXT: movq %rcx, %rax -; SSE41-NEXT: subq %rdx, %rax -; SSE41-NEXT: shrq %rax -; SSE41-NEXT: addq %rdx, %rax -; SSE41-NEXT: shrq $2, %rax -; SSE41-NEXT: leaq (,%rax,8), %rdx -; SSE41-NEXT: subq %rdx, %rax -; SSE41-NEXT: addq %rcx, %rax -; SSE41-NEXT: movq %rax, %xmm0 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493 +; SSE41-NEXT: mulq %rcx +; SSE41-NEXT: movq %rdx, %xmm1 +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: mulq %rcx +; SSE41-NEXT: movq %rdx, %xmm2 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psubq %xmm2, %xmm1 +; SSE41-NEXT: psrlq $1, %xmm1 +; SSE41-NEXT: paddq %xmm2, %xmm1 +; SSE41-NEXT: 
psrlq $2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psllq $3, %xmm2 +; SSE41-NEXT: psubq %xmm2, %xmm1 +; SSE41-NEXT: paddq %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_rem7_2i64: ; AVX: # %bb.0: -; AVX-NEXT: vpextrq $1, %xmm0, %rcx -; AVX-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493 -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: shrq %rax -; AVX-NEXT: addq %rdx, %rax -; AVX-NEXT: shrq $2, %rax -; AVX-NEXT: leaq (,%rax,8), %rdx -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: addq %rcx, %rax -; AVX-NEXT: vmovq %rax, %xmm1 -; AVX-NEXT: vmovq %xmm0, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: shrq %rax -; AVX-NEXT: addq %rdx, %rax -; AVX-NEXT: shrq $2, %rax -; AVX-NEXT: leaq (,%rax,8), %rdx -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: addq %rcx, %rax -; AVX-NEXT: vmovq %rax, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493 +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm1 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm2 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vpsrlq $1, %xmm2, %xmm2 +; AVX-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vpsrlq $2, %xmm1, %xmm1 +; AVX-NEXT: vpsllq $3, %xmm1, %xmm2 +; AVX-NEXT: vpsubq %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %res = urem <2 x i64> %a, ret <2 x i64> %res diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll @@ -10,39 +10,29 @@ define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind { ; AVX1-LABEL: test_div7_4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpextrq $1, %xmm0, %rcx -; AVX1-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493 -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: mulq %rsi -; AVX1-NEXT: subq %rdx, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: addq %rdx, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm1 -; AVX1-NEXT: vmovq %xmm0, %rcx -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: mulq %rsi -; AVX1-NEXT: subq %rdx, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: addq %rdx, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm2 +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493 +; AVX1-NEXT: mulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm1 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: mulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm2 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpsrlq $2, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rcx -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: mulq %rsi -; AVX1-NEXT: subq %rdx, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: addq %rdx, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm2 -; AVX1-NEXT: vmovq %xmm0, %rcx -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: mulq %rsi -; AVX1-NEXT: subq %rdx, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: addq %rdx, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vpextrq $1, %xmm0, 
%rax +; AVX1-NEXT: mulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm2 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: mulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlq $2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -50,38 +40,25 @@ ; AVX2-LABEL: test_div7_4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpextrq $1, %xmm1, %rcx -; AVX2-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493 -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: mulq %rsi -; AVX2-NEXT: subq %rdx, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm2 -; AVX2-NEXT: vmovq %xmm1, %rcx -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: mulq %rsi -; AVX2-NEXT: subq %rdx, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493 +; AVX2-NEXT: mulq %rcx +; AVX2-NEXT: vmovq %rdx, %xmm2 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: mulq %rcx +; AVX2-NEXT: vmovq %rdx, %xmm1 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX2-NEXT: vpextrq $1, %xmm0, %rcx -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: mulq %rsi -; AVX2-NEXT: subq %rdx, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm2 -; AVX2-NEXT: vmovq %xmm0, %rcx -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: mulq %rsi -; AVX2-NEXT: subq %rdx, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: mulq %rcx +; AVX2-NEXT: vmovq %rdx, %xmm2 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: mulq %rcx +; AVX2-NEXT: vmovq %rdx, %xmm3 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlq $1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlq $2, %ymm0, %ymm0 ; AVX2-NEXT: retq %res = udiv <4 x i64> %a, @@ -370,115 +347,64 @@ ; AVX1-LABEL: test_rem7_4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpextrq $1, %xmm1, %rcx -; AVX1-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493 -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: mulq %rsi -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: subq %rdx, %rax -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: addq %rdx, %rax -; AVX1-NEXT: shrq $2, %rax -; AVX1-NEXT: leaq (,%rax,8), %rdx -; AVX1-NEXT: subq %rdx, %rax -; AVX1-NEXT: addq %rcx, %rax -; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vmovq %xmm1, %rcx -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: mulq %rsi -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: subq %rdx, %rax -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: addq %rdx, %rax -; AVX1-NEXT: shrq $2, %rax -; AVX1-NEXT: leaq (,%rax,8), %rdx -; AVX1-NEXT: subq %rdx, %rax -; AVX1-NEXT: addq %rcx, %rax -; AVX1-NEXT: vmovq %rax, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-NEXT: vpextrq $1, %xmm0, %rcx -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: mulq %rsi -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: subq %rdx, %rax -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: addq %rdx, %rax -; AVX1-NEXT: shrq $2, %rax -; 
AVX1-NEXT: leaq (,%rax,8), %rdx -; AVX1-NEXT: subq %rdx, %rax -; AVX1-NEXT: addq %rcx, %rax -; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vmovq %xmm0, %rcx -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: mulq %rsi -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: subq %rdx, %rax -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: addq %rdx, %rax -; AVX1-NEXT: shrq $2, %rax -; AVX1-NEXT: leaq (,%rax,8), %rdx -; AVX1-NEXT: subq %rdx, %rax -; AVX1-NEXT: addq %rcx, %rax -; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493 +; AVX1-NEXT: mulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm2 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: mulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vpsubq %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpsrlq $1, %xmm3, %xmm3 +; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlq $2, %xmm2, %xmm2 +; AVX1-NEXT: vpsllq $3, %xmm2, %xmm3 +; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: mulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm2 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: mulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpsrlq $1, %xmm3, %xmm3 +; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlq $2, %xmm2, %xmm2 +; AVX1-NEXT: vpsllq $3, %xmm2, %xmm3 +; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_rem7_4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpextrq $1, %xmm1, %rcx -; AVX2-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493 -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: mulq %rsi -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: subq %rdx, %rax -; AVX2-NEXT: shrq %rax -; AVX2-NEXT: addq %rdx, %rax -; AVX2-NEXT: shrq $2, %rax -; AVX2-NEXT: leaq (,%rax,8), %rdx -; AVX2-NEXT: subq %rdx, %rax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: vmovq %rax, %xmm2 -; AVX2-NEXT: vmovq %xmm1, %rcx -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: mulq %rsi -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: subq %rdx, %rax -; AVX2-NEXT: shrq %rax -; AVX2-NEXT: addq %rdx, %rax -; AVX2-NEXT: shrq $2, %rax -; AVX2-NEXT: leaq (,%rax,8), %rdx -; AVX2-NEXT: subq %rdx, %rax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: vmovq %rax, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493 +; AVX2-NEXT: mulq %rcx +; AVX2-NEXT: vmovq %rdx, %xmm2 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: mulq %rcx +; AVX2-NEXT: vmovq %rdx, %xmm1 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX2-NEXT: vpextrq $1, %xmm0, %rcx -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: mulq %rsi -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: subq %rdx, %rax -; AVX2-NEXT: shrq %rax -; AVX2-NEXT: addq %rdx, %rax -; AVX2-NEXT: shrq $2, %rax -; AVX2-NEXT: leaq (,%rax,8), %rdx -; AVX2-NEXT: subq %rdx, %rax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: vmovq %rax, %xmm2 -; AVX2-NEXT: vmovq %xmm0, %rcx -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: mulq %rsi -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: subq %rdx, %rax -; AVX2-NEXT: shrq %rax -; AVX2-NEXT: addq %rdx, %rax -; AVX2-NEXT: shrq $2, %rax -; AVX2-NEXT: leaq (,%rax,8), %rdx -; AVX2-NEXT: 
subq %rdx, %rax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: vmovq %rax, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: mulq %rcx +; AVX2-NEXT: vmovq %rdx, %xmm2 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: mulq %rcx +; AVX2-NEXT: vmovq %rdx, %xmm3 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpsrlq $1, %ymm2, %ymm2 +; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpsrlq $2, %ymm1, %ymm1 +; AVX2-NEXT: vpsllq $3, %ymm1, %ymm2 +; AVX2-NEXT: vpsubq %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %res = urem <4 x i64> %a, ret <4 x i64> %res diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll @@ -9,73 +9,44 @@ define <8 x i64> @test_div7_8i64(<8 x i64> %a) nounwind { ; AVX-LABEL: test_div7_8i64: ; AVX: # %bb.0: -; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; AVX-NEXT: vpextrq $1, %xmm1, %rcx -; AVX-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493 -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: subq %rdx, %rcx -; AVX-NEXT: shrq %rcx -; AVX-NEXT: addq %rdx, %rcx -; AVX-NEXT: vmovq %rcx, %xmm2 -; AVX-NEXT: vmovq %xmm1, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: subq %rdx, %rcx -; AVX-NEXT: shrq %rcx -; AVX-NEXT: addq %rdx, %rcx -; AVX-NEXT: vmovq %rcx, %xmm1 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; AVX-NEXT: vpextrq $1, %xmm2, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: subq %rdx, %rcx -; AVX-NEXT: shrq %rcx -; AVX-NEXT: addq %rdx, %rcx -; AVX-NEXT: vmovq %rcx, %xmm3 -; AVX-NEXT: vmovq %xmm2, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: subq %rdx, %rcx -; AVX-NEXT: shrq %rcx -; AVX-NEXT: addq %rdx, %rcx -; AVX-NEXT: vmovq %rcx, %xmm2 +; AVX-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX-NEXT: vpextrq $1, %xmm2, %rax +; AVX-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493 +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm3 +; AVX-NEXT: vmovq %xmm2, %rax +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm2 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vpextrq $1, %xmm1, %rax +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm3 +; AVX-NEXT: vmovq %xmm1, %rax +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm1 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX-NEXT: vpextrq $1, %xmm2, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: subq %rdx, %rcx -; AVX-NEXT: shrq %rcx -; AVX-NEXT: addq %rdx, %rcx -; AVX-NEXT: vmovq %rcx, %xmm3 -; AVX-NEXT: vmovq %xmm2, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: subq %rdx, %rcx -; AVX-NEXT: shrq %rcx -; AVX-NEXT: addq %rdx, %rcx -; AVX-NEXT: vmovq %rcx, %xmm2 +; AVX-NEXT: vpextrq $1, %xmm2, %rax +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm3 +; AVX-NEXT: vmovq %xmm2, %rax +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm2 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = 
xmm2[0],xmm3[0] -; AVX-NEXT: vpextrq $1, %xmm0, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: subq %rdx, %rcx -; AVX-NEXT: shrq %rcx -; AVX-NEXT: addq %rdx, %rcx -; AVX-NEXT: vmovq %rcx, %xmm3 -; AVX-NEXT: vmovq %xmm0, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: subq %rdx, %rcx -; AVX-NEXT: shrq %rcx -; AVX-NEXT: addq %rdx, %rcx -; AVX-NEXT: vmovq %rcx, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm3 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm4 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX-NEXT: vpsrlq $1, %zmm0, %zmm0 +; AVX-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX-NEXT: vpsrlq $2, %zmm0, %zmm0 ; AVX-NEXT: retq %res = udiv <8 x i64> %a, @@ -295,113 +266,48 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind { ; AVX-LABEL: test_rem7_8i64: ; AVX: # %bb.0: -; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; AVX-NEXT: vpextrq $1, %xmm1, %rcx -; AVX-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493 -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: shrq %rax -; AVX-NEXT: addq %rdx, %rax -; AVX-NEXT: shrq $2, %rax -; AVX-NEXT: leaq (,%rax,8), %rdx -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: addq %rcx, %rax -; AVX-NEXT: vmovq %rax, %xmm2 -; AVX-NEXT: vmovq %xmm1, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: shrq %rax -; AVX-NEXT: addq %rdx, %rax -; AVX-NEXT: shrq $2, %rax -; AVX-NEXT: leaq (,%rax,8), %rdx -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: addq %rcx, %rax -; AVX-NEXT: vmovq %rax, %xmm1 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; AVX-NEXT: vpextrq $1, %xmm2, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: shrq %rax -; AVX-NEXT: addq %rdx, %rax -; AVX-NEXT: shrq $2, %rax -; AVX-NEXT: leaq (,%rax,8), %rdx -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: addq %rcx, %rax -; AVX-NEXT: vmovq %rax, %xmm3 -; AVX-NEXT: vmovq %xmm2, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: shrq %rax -; AVX-NEXT: addq %rdx, %rax -; AVX-NEXT: shrq $2, %rax -; AVX-NEXT: leaq (,%rax,8), %rdx -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: addq %rcx, %rax -; AVX-NEXT: vmovq %rax, %xmm2 +; AVX-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX-NEXT: vpextrq $1, %xmm2, %rax +; AVX-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493 +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm3 +; AVX-NEXT: vmovq %xmm2, %rax +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm2 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vpextrq $1, %xmm1, %rax +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm3 +; AVX-NEXT: vmovq %xmm1, %rax +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm1 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; 
AVX-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX-NEXT: vpextrq $1, %xmm2, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: shrq %rax -; AVX-NEXT: addq %rdx, %rax -; AVX-NEXT: shrq $2, %rax -; AVX-NEXT: leaq (,%rax,8), %rdx -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: addq %rcx, %rax -; AVX-NEXT: vmovq %rax, %xmm3 -; AVX-NEXT: vmovq %xmm2, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: shrq %rax -; AVX-NEXT: addq %rdx, %rax -; AVX-NEXT: shrq $2, %rax -; AVX-NEXT: leaq (,%rax,8), %rdx -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: addq %rcx, %rax -; AVX-NEXT: vmovq %rax, %xmm2 +; AVX-NEXT: vpextrq $1, %xmm2, %rax +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm3 +; AVX-NEXT: vmovq %xmm2, %rax +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm2 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX-NEXT: vpextrq $1, %xmm0, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: shrq %rax -; AVX-NEXT: addq %rdx, %rax -; AVX-NEXT: shrq $2, %rax -; AVX-NEXT: leaq (,%rax,8), %rdx -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: addq %rcx, %rax -; AVX-NEXT: vmovq %rax, %xmm3 -; AVX-NEXT: vmovq %xmm0, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: shrq %rax -; AVX-NEXT: addq %rdx, %rax -; AVX-NEXT: shrq $2, %rax -; AVX-NEXT: leaq (,%rax,8), %rdx -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: addq %rcx, %rax -; AVX-NEXT: vmovq %rax, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm3 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm4 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX-NEXT: vpsubq %zmm1, %zmm0, %zmm2 +; AVX-NEXT: vpsrlq $1, %zmm2, %zmm2 +; AVX-NEXT: vpaddq %zmm1, %zmm2, %zmm1 +; AVX-NEXT: vpsrlq $2, %zmm1, %zmm1 +; AVX-NEXT: vpsllq $3, %zmm1, %zmm2 +; AVX-NEXT: vpsubq %zmm2, %zmm1, %zmm1 +; AVX-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX-NEXT: retq %res = urem <8 x i64> %a, ret <8 x i64> %res diff --git a/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll b/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll --- a/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll @@ -6,17 +6,16 @@ ; X64-LABEL: test_udiv7_v2i32: ; X64: # %bb.0: ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: pmuludq %xmm1, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; X64-NEXT: pmuludq %xmm1, %xmm3 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] +; X64-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] ; X64-NEXT: psubd %xmm2, %xmm0 -; X64-NEXT: psrld $1, %xmm0 -; X64-NEXT: paddd %xmm2, %xmm0 +; X64-NEXT: pxor %xmm2, %xmm2 +; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-NEXT: psrlq $1, %xmm0 +; 
X64-NEXT: psrlq $32, %xmm1 +; X64-NEXT: paddd %xmm0, %xmm1 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; X64-NEXT: psrld $2, %xmm0 ; X64-NEXT: movq %xmm0, (%rsi) ; X64-NEXT: retq @@ -26,18 +25,18 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: pmuludq %xmm1, %xmm2 -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; X86-NEXT: movdqa %xmm0, %xmm3 -; X86-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] -; X86-NEXT: pmuludq %xmm1, %xmm3 -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X86-NEXT: movdqa %xmm0, %xmm1 +; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0,0,1,1] +; X86-NEXT: pmuludq {{\.LCPI.*}}, %xmm1 +; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; X86-NEXT: psrlq $32, %xmm1 ; X86-NEXT: psubd %xmm2, %xmm0 -; X86-NEXT: psrld $1, %xmm0 -; X86-NEXT: paddd %xmm2, %xmm0 +; X86-NEXT: pxor %xmm2, %xmm2 +; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X86-NEXT: psllq $31, %xmm0 +; X86-NEXT: psrlq $32, %xmm0 +; X86-NEXT: paddd %xmm1, %xmm0 +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: psrld $2, %xmm0 ; X86-NEXT: movq %xmm0, (%eax) ; X86-NEXT: retl @@ -51,18 +50,17 @@ ; X64-LABEL: test_urem7_v2i32: ; X64: # %bb.0: ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: pmuludq %xmm1, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; X64-NEXT: pmuludq %xmm1, %xmm3 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X64-NEXT: movdqa %xmm0, %xmm1 -; X64-NEXT: psubd %xmm2, %xmm1 -; X64-NEXT: psrld $1, %xmm1 -; X64-NEXT: paddd %xmm2, %xmm1 +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] +; X64-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; X64-NEXT: movdqa %xmm0, %xmm3 +; X64-NEXT: psubd %xmm2, %xmm3 +; X64-NEXT: pxor %xmm2, %xmm2 +; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X64-NEXT: psrlq $1, %xmm3 +; X64-NEXT: psrlq $32, %xmm1 +; X64-NEXT: paddd %xmm3, %xmm1 +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X64-NEXT: psrld $2, %xmm1 ; X64-NEXT: movdqa %xmm1, %xmm2 ; X64-NEXT: pslld $3, %xmm2 @@ -76,19 +74,19 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: pmuludq %xmm1, %xmm2 -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; X86-NEXT: movdqa %xmm0, %xmm3 -; X86-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] -; X86-NEXT: pmuludq %xmm1, %xmm3 -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; X86-NEXT: movdqa %xmm0, %xmm1 -; X86-NEXT: psubd %xmm2, %xmm1 -; X86-NEXT: psrld $1, %xmm1 -; X86-NEXT: paddd %xmm2, %xmm1 +; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0,0,1,1] +; X86-NEXT: pmuludq {{\.LCPI.*}}, %xmm1 +; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; X86-NEXT: psrlq $32, %xmm1 +; X86-NEXT: movdqa %xmm0, %xmm3 +; X86-NEXT: psubd %xmm2, %xmm3 +; X86-NEXT: pxor %xmm2, %xmm2 +; X86-NEXT: punpckldq 
{{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X86-NEXT: psllq $31, %xmm3 +; X86-NEXT: psrlq $32, %xmm3 +; X86-NEXT: paddd %xmm1, %xmm3 +; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] ; X86-NEXT: psrld $2, %xmm1 ; X86-NEXT: movdqa %xmm1, %xmm2 ; X86-NEXT: pslld $3, %xmm2 @@ -106,25 +104,27 @@ ; X64-LABEL: test_sdiv7_v2i32: ; X64: # %bb.0: ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: pmuludq %xmm1, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; X64-NEXT: pmuludq %xmm1, %xmm3 -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; X64-NEXT: pxor %xmm3, %xmm3 -; X64-NEXT: pcmpgtd %xmm0, %xmm3 -; X64-NEXT: pand %xmm1, %xmm3 -; X64-NEXT: paddd %xmm0, %xmm3 -; X64-NEXT: psubd %xmm3, %xmm2 -; X64-NEXT: paddd %xmm0, %xmm2 -; X64-NEXT: movdqa %xmm2, %xmm0 +; X64-NEXT: pxor %xmm1, %xmm1 +; X64-NEXT: pcmpgtd %xmm0, %xmm1 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3] +; X64-NEXT: movdqa {{.*#+}} xmm3 = [18446744071868851347,18446744071868851347] +; X64-NEXT: pmuludq %xmm3, %xmm2 +; X64-NEXT: movdqa %xmm0, %xmm4 +; X64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; X64-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295] +; X64-NEXT: pmuludq %xmm4, %xmm1 +; X64-NEXT: paddq %xmm2, %xmm1 +; X64-NEXT: psllq $32, %xmm1 +; X64-NEXT: pmuludq %xmm3, %xmm4 +; X64-NEXT: paddq %xmm1, %xmm4 +; X64-NEXT: psrlq $32, %xmm4 +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] +; X64-NEXT: paddd %xmm0, %xmm1 +; X64-NEXT: movdqa %xmm1, %xmm0 ; X64-NEXT: psrld $31, %xmm0 -; X64-NEXT: psrad $2, %xmm2 -; X64-NEXT: paddd %xmm0, %xmm2 -; X64-NEXT: movq %xmm2, (%rsi) +; X64-NEXT: psrad $2, %xmm1 +; X64-NEXT: paddd %xmm0, %xmm1 +; X64-NEXT: movq %xmm1, (%rsi) ; X64-NEXT: retq ; ; X86-LABEL: test_sdiv7_v2i32: @@ -132,26 +132,27 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: pmuludq %xmm1, %xmm2 -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; X86-NEXT: movdqa %xmm0, %xmm3 -; X86-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] -; X86-NEXT: pmuludq %xmm1, %xmm3 -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; X86-NEXT: pxor %xmm3, %xmm3 -; X86-NEXT: pcmpgtd %xmm0, %xmm3 -; X86-NEXT: pand %xmm1, %xmm3 -; X86-NEXT: paddd %xmm0, %xmm3 -; X86-NEXT: psubd %xmm3, %xmm2 -; X86-NEXT: paddd %xmm0, %xmm2 -; X86-NEXT: movdqa %xmm2, %xmm0 +; X86-NEXT: pxor %xmm1, %xmm1 +; X86-NEXT: pcmpgtd %xmm0, %xmm1 +; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3] +; X86-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,4294967295,2454267027,4294967295] +; X86-NEXT: pmuludq %xmm3, %xmm2 +; X86-NEXT: movdqa %xmm0, %xmm4 +; X86-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; X86-NEXT: pcmpeqd %xmm1, %xmm1 +; X86-NEXT: pmuludq %xmm4, %xmm1 +; X86-NEXT: paddq %xmm2, %xmm1 +; X86-NEXT: psllq $32, %xmm1 +; X86-NEXT: pmuludq %xmm3, %xmm4 +; X86-NEXT: paddq %xmm1, %xmm4 +; X86-NEXT: psrlq $32, %xmm4 +; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] +; X86-NEXT: paddd %xmm0, %xmm1 +; X86-NEXT: movdqa %xmm1, %xmm0 ; X86-NEXT: psrld $31, %xmm0 -; X86-NEXT: psrad $2, %xmm2 -; X86-NEXT: paddd %xmm0, 
%xmm2 -; X86-NEXT: movq %xmm2, (%eax) +; X86-NEXT: psrad $2, %xmm1 +; X86-NEXT: paddd %xmm0, %xmm1 +; X86-NEXT: movq %xmm1, (%eax) ; X86-NEXT: retl %a = load <2 x i32>, <2 x i32>* %x %b = sdiv <2 x i32> %a, @@ -163,29 +164,31 @@ ; X64-LABEL: test_srem7_v2i32: ; X64: # %bb.0: ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: pmuludq %xmm1, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; X64-NEXT: pmuludq %xmm1, %xmm3 -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; X64-NEXT: pxor %xmm3, %xmm3 -; X64-NEXT: pcmpgtd %xmm0, %xmm3 -; X64-NEXT: pand %xmm1, %xmm3 -; X64-NEXT: paddd %xmm0, %xmm3 -; X64-NEXT: psubd %xmm3, %xmm2 -; X64-NEXT: paddd %xmm0, %xmm2 -; X64-NEXT: movdqa %xmm2, %xmm1 -; X64-NEXT: psrld $31, %xmm1 -; X64-NEXT: psrad $2, %xmm2 -; X64-NEXT: paddd %xmm1, %xmm2 -; X64-NEXT: movdqa %xmm2, %xmm1 -; X64-NEXT: pslld $3, %xmm1 -; X64-NEXT: psubd %xmm1, %xmm2 -; X64-NEXT: paddd %xmm0, %xmm2 -; X64-NEXT: movq %xmm2, (%rsi) +; X64-NEXT: pxor %xmm1, %xmm1 +; X64-NEXT: pcmpgtd %xmm0, %xmm1 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3] +; X64-NEXT: movdqa {{.*#+}} xmm3 = [18446744071868851347,18446744071868851347] +; X64-NEXT: pmuludq %xmm3, %xmm2 +; X64-NEXT: movdqa %xmm0, %xmm4 +; X64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; X64-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295] +; X64-NEXT: pmuludq %xmm4, %xmm1 +; X64-NEXT: paddq %xmm2, %xmm1 +; X64-NEXT: psllq $32, %xmm1 +; X64-NEXT: pmuludq %xmm3, %xmm4 +; X64-NEXT: paddq %xmm1, %xmm4 +; X64-NEXT: psrlq $32, %xmm4 +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] +; X64-NEXT: paddd %xmm0, %xmm1 +; X64-NEXT: movdqa %xmm1, %xmm2 +; X64-NEXT: psrld $31, %xmm2 +; X64-NEXT: psrad $2, %xmm1 +; X64-NEXT: paddd %xmm2, %xmm1 +; X64-NEXT: movdqa %xmm1, %xmm2 +; X64-NEXT: pslld $3, %xmm2 +; X64-NEXT: psubd %xmm2, %xmm1 +; X64-NEXT: paddd %xmm0, %xmm1 +; X64-NEXT: movq %xmm1, (%rsi) ; X64-NEXT: retq ; ; X86-LABEL: test_srem7_v2i32: @@ -193,30 +196,31 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: pmuludq %xmm1, %xmm2 -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; X86-NEXT: movdqa %xmm0, %xmm3 -; X86-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] -; X86-NEXT: pmuludq %xmm1, %xmm3 -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; X86-NEXT: pxor %xmm3, %xmm3 -; X86-NEXT: pcmpgtd %xmm0, %xmm3 -; X86-NEXT: pand %xmm1, %xmm3 -; X86-NEXT: paddd %xmm0, %xmm3 -; X86-NEXT: psubd %xmm3, %xmm2 -; X86-NEXT: paddd %xmm0, %xmm2 -; X86-NEXT: movdqa %xmm2, %xmm1 -; X86-NEXT: psrld $31, %xmm1 -; X86-NEXT: psrad $2, %xmm2 -; X86-NEXT: paddd %xmm1, %xmm2 -; X86-NEXT: movdqa %xmm2, %xmm1 -; X86-NEXT: pslld $3, %xmm1 -; X86-NEXT: psubd %xmm1, %xmm2 -; X86-NEXT: paddd %xmm0, %xmm2 -; X86-NEXT: movq %xmm2, (%eax) +; X86-NEXT: pxor %xmm1, %xmm1 +; X86-NEXT: pcmpgtd %xmm0, %xmm1 +; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3] +; X86-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,4294967295,2454267027,4294967295] +; X86-NEXT: pmuludq %xmm3, %xmm2 +; X86-NEXT: movdqa %xmm0, %xmm4 +; X86-NEXT: punpckldq {{.*#+}} 
xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; X86-NEXT: pcmpeqd %xmm1, %xmm1 +; X86-NEXT: pmuludq %xmm4, %xmm1 +; X86-NEXT: paddq %xmm2, %xmm1 +; X86-NEXT: psllq $32, %xmm1 +; X86-NEXT: pmuludq %xmm3, %xmm4 +; X86-NEXT: paddq %xmm1, %xmm4 +; X86-NEXT: psrlq $32, %xmm4 +; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] +; X86-NEXT: paddd %xmm0, %xmm1 +; X86-NEXT: movdqa %xmm1, %xmm2 +; X86-NEXT: psrld $31, %xmm2 +; X86-NEXT: psrad $2, %xmm1 +; X86-NEXT: paddd %xmm2, %xmm1 +; X86-NEXT: movdqa %xmm1, %xmm2 +; X86-NEXT: pslld $3, %xmm2 +; X86-NEXT: psubd %xmm2, %xmm1 +; X86-NEXT: paddd %xmm0, %xmm1 +; X86-NEXT: movq %xmm1, (%eax) ; X86-NEXT: retl %a = load <2 x i32>, <2 x i32>* %x %b = srem <2 x i32> %a, diff --git a/llvm/test/CodeGen/X86/vshli-simplify-demanded-bits.ll b/llvm/test/CodeGen/X86/vshli-simplify-demanded-bits.ll --- a/llvm/test/CodeGen/X86/vshli-simplify-demanded-bits.ll +++ b/llvm/test/CodeGen/X86/vshli-simplify-demanded-bits.ll @@ -8,29 +8,31 @@ define <8 x i8> @vshli_target_constant(<8 x i16> %arg, <8 x i32> %arg1) { ; CHECK-LABEL: vshli_target_constant: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; CHECK-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3] -; CHECK-NEXT: pmuludq %xmm0, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; CHECK-NEXT: pmuludq %xmm0, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] -; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,3,3] +; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [2863311531,2863311531] +; CHECK-NEXT: pmuludq %xmm4, %xmm3 +; CHECK-NEXT: psrlq $33, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3] +; CHECK-NEXT: pmuludq %xmm4, %xmm0 +; CHECK-NEXT: psrlq $33, %xmm0 +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,3,3] +; CHECK-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-NEXT: psrlq $33, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] +; CHECK-NEXT: pmuludq %xmm4, %xmm2 +; CHECK-NEXT: psrlq $33, %xmm2 +; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] ; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: pslld $15, %xmm2 +; CHECK-NEXT: pslld $16, %xmm2 ; CHECK-NEXT: psrad $16, %xmm2 -; CHECK-NEXT: pslld $15, %xmm4 -; CHECK-NEXT: psrad $16, %xmm4 -; CHECK-NEXT: packssdw %xmm2, %xmm4 -; CHECK-NEXT: pand {{.*}}(%rip), %xmm4 -; CHECK-NEXT: pxor %xmm0, %xmm0 -; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; CHECK-NEXT: pmullw %xmm4, %xmm1 +; CHECK-NEXT: pslld $16, %xmm0 +; CHECK-NEXT: psrad $16, %xmm0 +; CHECK-NEXT: packssdw %xmm2, %xmm0 +; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 +; CHECK-NEXT: pxor %xmm2, %xmm2 +; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; CHECK-NEXT: pmullw %xmm0, %xmm1 ; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; CHECK-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]